# Protein conformational changes between conditions


## Getting the data

In [1]:
import pandas as pd
import numpy as np

# OsmitcStress.xlsx is the file that contains the data already filtered for the osmotic stress experiment
df = pd.read_excel('data/OsmoticStress.xlsx')

In [2]:
df = df[['Uniprot_ID', 'Peptide_sequence', 'Log2FC(LiP_norm)', 'Qvalue(LiP)']]

In [3]:
df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),Qvalue(LiP)
0,P15703,KPNTSGTSDVEK,-2.176707,0.003686
1,P15703,SYTSTVK,0.285029,0.034862
2,P15703,NDLTASQLSDK,-0.49824,0.039093
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193,0.042584
4,P15703,SVVADISDSDGK,-0.434615,0.048552


In [4]:
# Reads data from the .Fasta file

from Bio import SeqIO

def read_fasta_to_dict(file_path):
    sequences = {}
    for record in SeqIO.parse(file_path, "fasta"):
        uniprot_id = record.id.split('|')[1]
        sequences[uniprot_id] = str(record.seq)
    return sequences

fasta_file_path = 'data/UP000002311_559292.fasta'
sequences = read_fasta_to_dict(fasta_file_path)

In [5]:
df["full_sequence"] = df['Uniprot_ID'].map(sequences)

In [6]:
df["full_sequence"]= df["full_sequence"].astype(str)
df["Peptide_sequence"]= df["Peptide_sequence"]. astype(str)

In [7]:
df.shape

(32709, 5)

In [8]:
# Just get the sequences bellow length of 1000 to ease out memory usage

trimmed_df = df[df['full_sequence'].apply(lambda x: len(x)) < 1000]

In [9]:
trimmed_df.shape

(28008, 5)

In [10]:
def binary_position(row):
    full_len = len(row['full_sequence'])
    short_len = len(row['Peptide_sequence'])
    start_index = row['full_sequence'].find(row['Peptide_sequence'])

    positions = np.zeros(full_len, dtype=int)

    if start_index != -1:
        positions[start_index:start_index + short_len] = 1
    return positions.tolist()

# Creation of the binary positions
trimmed_df['Binary_Positions'] = trimmed_df.apply(binary_position, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [11]:
trimmed_df

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),Qvalue(LiP),full_sequence,Binary_Positions
0,P15703,KPNTSGTSDVEK,-2.176707,0.003686,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P15703,SYTSTVK,0.285029,0.034862,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P15703,NDLTASQLSDK,-0.498240,0.039093,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193,0.042584,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P15703,SVVADISDSDGK,-0.434615,0.048552,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
32704,P38887,FITVPLVVASR,-0.008441,0.376924,MILKLVHCLVALTGLIFAKPYQQQQAVLAPSQDVPLRDIHIGDINF...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
32705,P53093,GTLDETVLQTLK,0.006678,0.381210,MSYGREDTTIEPDFIEPDAPLAASGGVADNIGGTMQNSGSRGTLDE...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
32706,Q04772,DVSNSKPNVR,-0.015149,0.381696,MSSDGMNRDVSNSKPNVRFAAPQRLSVAHPAISSPLHMPMSKSSRK...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
32707,P21192,VLEEQEEVAQK,-0.010109,0.382355,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
max_length = 1000

# Padding binary position to length of 1000
trimmed_df['Padded_Binary_Positions'] = trimmed_df['Binary_Positions'].apply(lambda x: np.pad(x, (0, max_length - len(x)), 'constant'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
trimmed_df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),Qvalue(LiP),full_sequence,Binary_Positions,Padded_Binary_Positions
0,P15703,KPNTSGTSDVEK,-2.176707,0.003686,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P15703,SYTSTVK,0.285029,0.034862,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P15703,NDLTASQLSDK,-0.49824,0.039093,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193,0.042584,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P15703,SVVADISDSDGK,-0.434615,0.048552,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
trimmed_df = trimmed_df.sort_values(by='Qvalue(LiP)', ascending=True)
trimmed_df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),Qvalue(LiP),full_sequence,Binary_Positions,Padded_Binary_Positions
0,P15703,KPNTSGTSDVEK,-2.176707,0.003686,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101,P06169,TTYVTQRPVYLGLPANLVDLNVPAK,2.264383,0.003686,MSEITLGKYLFERLKQVNVNTVFGLPGDFNLSLLDKIYEVEGMRWA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18,P38174,NLCGHSIAPYR,1.216913,0.003686,MTDAEIENSPASDLKELNLENEGVEQQDQAKADESDPVESKKKKNK...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
280,P00359,VINDAFGIEEGLMTTVHSLTATQK,2.790874,0.003686,MVRVAINGFGRIGRLVMRIALSRPNVEVVALNDPFITNDYAAYMFK...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
331,P0CH08;P0CH09,TITLEVESSDTIDNVK,-0.647026,0.004177,,"[0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
trimmed_df = trimmed_df.head(5000)
trimmed_df.reset_index(inplace=True)

In [16]:
print(trimmed_df.shape)

(5000, 8)


In [17]:
# Prevent formating problem
import json
trimmed_df['Padded_Binary_Positions'] = trimmed_df['Padded_Binary_Positions'].apply(lambda x: json.dumps(x.tolist()))

In [18]:
trimmed_df.to_csv('data/OsmoticStress_with_binary_positions_padded_5000.csv', index=False)