# Protein conformational changes between conditions


## Getting the data

In [29]:
import pandas as pd
import numpy as np

df = pd.read_excel('OsmoticStress.xlsx')

In [30]:
df = df[['Uniprot_ID', 'Peptide_sequence', 'Log2FC(LiP_norm)']]

In [31]:
df.head()

Unnamed: 0,Uniprot_ID,Peptide_sequence,Log2FC(LiP_norm)
0,P15703,KPNTSGTSDVEK,-2.176707
1,P15703,SYTSTVK,0.285029
2,P15703,NDLTASQLSDK,-0.49824
3,P15703,EAFDEDWKPNTSGTSDVEK,0.348193
4,P15703,SVVADISDSDGK,-0.434615


In [32]:
from Bio import SeqIO

def read_fasta_to_dict(file_path):
    sequences = {}
    for record in SeqIO.parse(file_path, "fasta"):
        # Splitting the record.id to extract the UniProt ID if necessary
        uniprot_id = record.id.split('|')[1]  # Adjust this as per your FASTA file's format
        sequences[uniprot_id] = str(record.seq)
    return sequences

# Provide the path to your FASTA file
fasta_file_path = 'UP000002311_559292.fasta'
sequences = read_fasta_to_dict(fasta_file_path)

In [33]:
df["full_sequence"] = df['Uniprot_ID'].map(sequences)

In [37]:
# Function to create a binary array for the short peptide's position within the full peptide
df["full_sequence"]= df["full_sequence"].astype(str)
df["Peptide_sequence"]= df["Peptide_sequence"]. astype(str)

def binary_position(row):
    full_len = len(row['full_sequence'])
    short_len = len(row['Peptide_sequence'])
    start_index = row['full_sequence'].find(row['Peptide_sequence'])
    # Create an array of zeros with length equal to the full peptide
    positions = np.zeros(full_len, dtype=int)
    # If the short peptide is found, set positions to 1 for its length from the start index
    if start_index != -1:
        positions[start_index:start_index + short_len] = 1
    return positions.tolist()
# Apply the function to each row and create a new column for the binary positions
df['Binary_Positions'] = df.apply(binary_position, axis=1)
print(df)


      Uniprot_ID     Peptide_sequence  Log2FC(LiP_norm)  \
0         P15703         KPNTSGTSDVEK         -2.176707   
1         P15703              SYTSTVK          0.285029   
2         P15703          NDLTASQLSDK         -0.498240   
3         P15703  EAFDEDWKPNTSGTSDVEK          0.348193   
4         P15703         SVVADISDSDGK         -0.434615   
...          ...                  ...               ...   
32704     P38887          FITVPLVVASR         -0.008441   
32705     P53093         GTLDETVLQTLK          0.006678   
32706     Q04772           DVSNSKPNVR         -0.015149   
32707     P21192          VLEEQEEVAQK         -0.010109   
32708     Q03687     DSNDTSDSPQDDQVGK         -0.192194   

                                           full_sequence  \
0      MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...   
1      MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...   
2      MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...   
3      MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTS

In [43]:
df.iloc[0,:]

Uniprot_ID                                                     P15703
Peptide_sequence                                         KPNTSGTSDVEK
Log2FC(LiP_norm)                                             -2.17671
full_sequence       MRFSTTLATAATALFFTASQVSAIGELAFNLGVKNNDGTCKSTSDY...
Binary_Positions    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [None]:
df.to_csv(gdrive_path + "/processed_yeast_df.csv")

In [None]:
#TODO Split training and testing data

## Getting the model from hugging face


In [None]:
from huggingface_hub import notebook_login

notebook_login()


Choose one of the following checkpoints:


| Checkpoint name | Num layers | Num parameters |
|------------------------------|----|----------|
| `esm2_t48_15B_UR50D`         | 48 | 15B     |
| `esm2_t36_3B_UR50D`          | 36 | 3B      |
| `esm2_t33_650M_UR50D`        | 33 | 650M    |
| `esm2_t30_150M_UR50D`        | 30 | 150M    |
| `esm2_t12_35M_UR50D`         | 12 | 35M     |
| `esm2_t6_8M_UR50D`           | 6  | 8M      |

In [None]:
model_checkpoint = "facebook/esm2_t6_8M_UR50D"

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)