# Protein conformational changes between conditions


## Getting the data

In [13]:
import pandas as pd
import numpy as np

# OsmoticStress.xlsx contains the data already filtered for the osmotic stress experiment
# Source: TODO add source
df = pd.read_excel('data/OsmoticStress.xlsx')

In [14]:
# Tally the missing entries column by column and report them
nan_per_column = df.isnull().sum()
print(
    "Number of NaN values in each column:\n",
    nan_per_column,
)


Number of NaN values in each column:
 Uniprot_ID                0
Gene_name                 8
Systematic_gene_name    864
Protein_Description       0
Peptide_sequence          0
Log2FC(LiP_raw)           0
Log2FC(LiP_norm)          0
Pvalue(LiP)               0
Qvalue(LiP)               0
Log2FC(P.Abundance)     864
Pvalue(P.Abundance)       0
Qvalue(P.Abundance)       0
dtype: int64


In [15]:
# Remove rows with any missing values.
# NOTE(review): dropna() with no `subset` discards a row when *any* column is
# NaN. The NaNs reported above are in Gene_name, Systematic_gene_name and
# Log2FC(P.Abundance), none of which are among the three columns kept later
# (Uniprot_ID, Peptide_sequence, Log2FC(LiP_norm)). If those rows should be
# retained, dropna(subset=[...]) on the kept columns would do it - confirm intent.
df = df.dropna()

In [16]:
# Re-count missing entries per column after dropping NaN values
nan_per_column = df.isnull().sum()
print(
    "Number of NaN values in each column:\n",
    nan_per_column,
)

Number of NaN values in each column:
 Uniprot_ID              0
Gene_name               0
Systematic_gene_name    0
Protein_Description     0
Peptide_sequence        0
Log2FC(LiP_raw)         0
Log2FC(LiP_norm)        0
Pvalue(LiP)             0
Qvalue(LiP)             0
Log2FC(P.Abundance)     0
Pvalue(P.Abundance)     0
Qvalue(P.Abundance)     0
dtype: int64


In [17]:
# Keep only the columns used downstream. The explicit .copy() makes `df` an
# independent frame rather than a selection derived from the previous one, so
# the column assignments in later cells (full_sequence, Binary_Positions) do
# not trigger pandas' SettingWithCopyWarning.
df = df[['Uniprot_ID', 'Peptide_sequence', 'Log2FC(LiP_norm)']].copy()

In [18]:
# Preview the first rows of the reduced three-column frame
df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm)
0,P15703,KPNTSGTSDVEK,-2.176707
1,P15703,SYTSTVK,0.285029
2,P15703,NDLTASQLSDK,-0.49824
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193
4,P15703,SVVADISDSDGK,-0.434615


In [8]:
from Bio import SeqIO

def read_fasta_to_dict(file_path):
    """Read a FASTA file into a ``{identifier: sequence}`` dictionary.

    Parameters
    ----------
    file_path : str or path-like
        Location of the FASTA file, passed straight to ``SeqIO.parse``.

    Returns
    -------
    dict[str, str]
        Maps each record's identifier to its sequence as a plain string.
        For pipe-delimited record ids (e.g. ``sp|P15703|BGL2_YEAST``) the
        identifier is the second field; for ids without a ``|`` the whole
        record id is used instead of raising ``IndexError``.
        If two records resolve to the same identifier, the later one wins.
    """
    sequences = {}
    for record in SeqIO.parse(file_path, "fasta"):
        fields = record.id.split('|')
        # Pipe-delimited header -> second field; anything else -> the id as-is.
        uniprot_id = fields[1] if len(fields) > 1 else record.id
        sequences[uniprot_id] = str(record.seq)
    return sequences

# Parse the FASTA file once; `sequences` maps the identifier extracted by
# read_fasta_to_dict to that record's sequence string.
fasta_file_path = 'data/UP000002311_559292.fasta'
sequences = read_fasta_to_dict(fasta_file_path)

In [9]:
# Look up each row's protein sequence by Uniprot_ID. IDs that are not keys of
# `sequences` come back as NaN (pandas Series.map behaviour with a dict).
df["full_sequence"] = df['Uniprot_ID'].map(sequences)

In [10]:
# Deliberately no astype(str) on the sequence columns: the cast would turn a
# missing full_sequence (Uniprot_ID with no entry in `sequences`) into the
# literal text "nan", which would then be encoded as a 3-residue protein.
# Missing values are reported here and handled inside binary_position instead.
n_unmapped = int(df["full_sequence"].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} rows have no full_sequence; their Binary_Positions will be []")

def binary_position(row):
    """Encode where a peptide sits inside its protein as a 0/1 list.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'full_sequence'`` and ``'Peptide_sequence'``.

    Returns
    -------
    list[int]
        One entry per residue of ``full_sequence``: 1 for residues covered by
        the first occurrence of the peptide (``str.find``), 0 elsewhere.
        All zeros if the peptide does not occur in the protein.
        An empty list if either sequence is missing (not a string).
    """
    full_seq = row['full_sequence']
    peptide = row['Peptide_sequence']

    # A missing value (NaN/None) has no length; there is nothing to encode.
    if not isinstance(full_seq, str) or not isinstance(peptide, str):
        return []

    positions = np.zeros(len(full_seq), dtype=int)

    start_index = full_seq.find(peptide)
    if start_index != -1:
        positions[start_index:start_index + len(peptide)] = 1
    return positions.tolist()

df['Binary_Positions'] = df.apply(binary_position, axis=1)

# TODO: what other position encodings can be used?

In [11]:
# Preview the frame with the added full_sequence and Binary_Positions columns
df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm),full_sequence,Binary_Positions
0,P15703,KPNTSGTSDVEK,-2.176707,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P15703,SYTSTVK,0.285029,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P15703,NDLTASQLSDK,-0.49824,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P15703,SVVADISDSDGK,-0.434615,MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# Persist the annotated peptide table; index=False leaves the row index out of the file.
# NOTE: Binary_Positions holds Python lists, which a CSV stores as text such as
# "[0, 0, 1]" - they must be parsed back into lists when the file is reloaded.
df.to_csv('data/OsmoticStress_with_binary_positions.csv', index=False)