<a href="https://colab.research.google.com/github/faezesarlakifar/test/blob/main/esm_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Protein Embeddings
### Due to memory limitations, we processed the [AlgPred 2.0 dataset](https://webs.iiitd.edu.in/raghava/algpred2/stand.html) in four separate notebooks, extracting [Facebook ESM](https://github.com/facebookresearch/esm) embeddings for each subset of the data independently. For clarity, the code from those four notebooks has been consolidated here to provide a complete overview of our workflow, even though the embeddings were generated in stages.<br><br> By splitting the workload across notebooks, we were able to work around the memory constraints and successfully extract ESM embeddings for the full AlgPred 2.0 dataset. ❤

In [None]:
# @markdown configs
# Clone the ESM repository so that esm/scripts/extract.py exists locally; the
# extraction cells in this notebook invoke that script by relative path.
!git clone https://github.com/facebookresearch/esm.git
# Install the esm package from the same repository (needed for `import esm`).
!pip install git+https://github.com/facebookresearch/esm.git
# NOTE(review): torch is presumably already present on Colab images — confirm
# whether this extra install is still needed.
!pip install torch

In [None]:
# @markdown import necessaries
from google.colab import drive
from tqdm.notebook import tqdm
import esm
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# @markdown mount google drive
# Mount at /content/drive: the FASTA directory used by this notebook is
# '/content/drive/MyDrive/...', so mounting at '/content/gdrive' would leave
# every input path unresolved.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Drive folder holding the FASTA inputs. The trailing slash matters because
# file names are appended to it with plain string concatenation.
path = "/content/drive/MyDrive/allergen-detection/fasta-files/"

In [None]:
# @title extract embeddings for train positive data
# Runs the ESM extraction script on the train-positive FASTA file, with
# embeddings/algpred2_train_positive as the output location.
# NOTE(review): later cells only read the layer-33 mean representation; layers
# 0/32 and `per_tok` look unused in this notebook — confirm before trimming.
input_file = path+'algpred2_train_positive.fasta'
!python esm/scripts/extract.py esm2_t33_650M_UR50D $input_file \
embeddings/algpred2_train_positive --repr_layers 0 32 33 --include mean per_tok

Transferred model to GPU
Read /content/drive/MyDrive/allergen-detection/fasta-files/algpred2_train_positive.fasta<br>
with **8060** sequences in **539** batches

In [None]:
# @title extract embeddings for train negative data
# Runs the ESM extraction script on the train-negative FASTA file, with
# embeddings/algpred2_train_negative as the output location.
# NOTE(review): later cells only read the layer-33 mean representation; layers
# 0/32 and `per_tok` look unused in this notebook — confirm before trimming.
input_file = path+'algpred2_train_negative.fasta'
!python esm/scripts/extract.py esm2_t33_650M_UR50D $input_file \
embeddings/algpred2_train_negative --repr_layers 0 32 33 --include mean per_tok

Transferred model to GPU
Read /content/drive/MyDrive/allergen-detection/fasta-files/algpred2_train_negative.fasta<br>
with **8060** sequences in **1002** batches

In [None]:
# @title extract embeddings for validation positive data
# Runs the ESM extraction script on the validation-positive FASTA file, with
# embeddings/algpred2_validation_positive as the output location.
# NOTE(review): later cells only read the layer-33 mean representation; layers
# 0/32 and `per_tok` look unused in this notebook — confirm before trimming.
input_file = path+'algpred2_validation_positive.fasta'
!python esm/scripts/extract.py esm2_t33_650M_UR50D $input_file \
embeddings/algpred2_validation_positive --repr_layers 0 32 33 --include mean per_tok

Transferred model to GPU
Read /content/drive/MyDrive/allergen-detection/fasta-files/algpred2_validation_positive.fasta<br>
with **2015** sequences in **118** batches

In [None]:
# @title extract embeddings for validation negative data
# Runs the ESM extraction script on the validation-negative FASTA file, with
# embeddings/algpred2_validation_negative as the output location.
# NOTE(review): later cells only read the layer-33 mean representation; layers
# 0/32 and `per_tok` look unused in this notebook — confirm before trimming.
input_file = path+'algpred2_validation_negative.fasta'
!python esm/scripts/extract.py esm2_t33_650M_UR50D $input_file \
embeddings/algpred2_validation_negative --repr_layers 0 32 33 --include mean per_tok

Transferred model to GPU
Read /content/drive/MyDrive/allergen-detection/fasta-files/algpred2_validation_negative.fasta<br>
with **2015** sequences in **250** batches

In [None]:
# @title load_embbedings helper function
def load_embbedings(fasta_path, emb_path, toxin_label):
  """Load the layer-33 mean ESM embedding of every sequence in a FASTA file.

  Args:
    fasta_path: FASTA file whose headers name the sequences to load.
    emb_path: directory containing one `<header>.pt` file per sequence; each
      file is expected to hold a dict with a 'mean_representations' entry
      indexed by layer number.
    toxin_label: label value recorded once for every loaded sequence.

  Returns:
    A tuple `(Xs, ys)`: `Xs` is a NumPy array with one embedding per row, in
    FASTA order, and `ys` is a list with `toxin_label` repeated once per row.
  """
  ys = []
  Xs = []

  # Read from the arguments, not from notebook globals: the previous body used
  # `input_file` / `embedding_path`, so the result silently depended on
  # whichever cell had assigned those names last.
  for header, _seq in esm.data.read_fasta(fasta_path):
    fn = f'{emb_path}/{header}.pt'
    embs = torch.load(fn)
    Xs.append(embs['mean_representations'][33])
    ys.append(toxin_label)
  Xs = torch.stack(Xs, dim=0).numpy()

  return Xs, ys

## Save the train positive data as a .csv file

In [None]:
# Directory with the per-sequence .pt files for the train-positive set.
embedding_path = "embeddings/algpred2_train_positive"

In [None]:
# `input_file` still holds whatever FASTA the most recent extraction cell set,
# so point it at the train-positive file before loading the matching embeddings.
input_file = path + 'algpred2_train_positive.fasta'
Xs, ys = load_embbedings(input_file, embedding_path, 1.0)

In [None]:
# Width of a single embedding vector (number of feature columns).
Xs.shape[1]

1280

In [None]:
# One row per sequence: embedding features plus a trailing 'Label' column.
df_train_positive = pd.DataFrame(Xs).assign(Label=ys)
df_train_positive.to_csv(path_or_buf='df_train_positive.csv')
df_train_positive.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,0.017011,-0.030445,-0.076579,0.064098,-0.009584,-0.113924,0.052694,-0.041591,-0.04789,0.065836,...,0.00046,-0.0872,-0.003533,-0.019282,0.004049,0.057629,-0.065134,-0.042381,-0.003722,1.0
1,-0.04699,-0.058921,-0.112702,-0.027254,-0.057771,-0.027416,0.047696,-0.171788,0.020205,0.111519,...,-0.041634,0.002714,0.020408,-0.131773,0.097146,0.035977,-0.231252,-0.074935,0.047514,1.0
2,0.040395,-0.006669,-0.076083,-0.001854,-0.026292,-0.058505,0.037825,0.020744,-0.014593,0.019407,...,0.028197,-0.00593,-0.028209,0.050372,0.117581,0.032476,0.015124,-0.076775,0.105106,1.0
3,-0.054366,-0.076783,-0.089232,-0.047373,-0.066595,-0.061191,0.004398,-0.112409,0.025635,0.110483,...,-0.039761,-0.010691,0.011734,-0.101002,0.13077,0.078484,-0.18491,-0.078393,0.02751,1.0
4,-0.050533,-0.076656,-0.093471,-0.054474,-0.065348,-0.058643,0.001537,-0.116433,0.02158,0.112339,...,-0.037913,-0.006154,0.00537,-0.115029,0.143506,0.079029,-0.191161,-0.078071,0.028236,1.0


In [None]:
# Writes the same file again (the cell that built the frame already saved it).
df_train_positive.to_csv(path_or_buf='df_train_positive.csv')

## Save the train negative data as a .csv file

In [None]:
# Directory with the per-sequence .pt files for the train-negative set.
embedding_path = "embeddings/algpred2_train_negative"

In [None]:
# `input_file` still holds whatever FASTA an earlier cell set, so point it at
# the train-negative file before loading the matching embeddings.
input_file = path + 'algpred2_train_negative.fasta'
Xs, ys = load_embbedings(input_file, embedding_path, 0)

In [None]:
# One row per sequence: embedding features plus a trailing 'Label' column.
df_train_negative = pd.DataFrame(Xs).assign(Label=ys)
df_train_negative.to_csv(path_or_buf='df_train_negative.csv')
df_train_negative.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,0.07283,-0.028645,0.051227,0.067016,-0.048081,-0.068333,-0.02482,0.058451,0.096248,-0.018448,...,0.006939,-0.064909,0.120399,0.006641,-0.016284,0.020331,0.012317,-0.030993,-0.030012,0
1,0.079035,-0.030169,-0.06867,-0.047601,0.02699,0.07753,0.041112,-0.034504,-0.042318,0.052084,...,-0.126265,-0.054226,0.045931,0.065614,-0.03416,0.031799,-0.036175,-0.094126,0.001437,0
2,0.08749,-0.025145,-0.072481,-0.052096,0.008663,-0.040584,0.090343,0.017101,-0.090984,0.039985,...,-0.146274,-0.020185,-0.074541,0.081277,0.079511,0.02189,-0.053278,-0.049421,0.038386,0
3,-0.025563,0.00233,0.025772,-0.015071,-0.071527,0.123731,0.043271,-0.031532,-0.046841,-0.039799,...,-0.044043,0.000331,0.109708,-0.026834,0.066347,0.058534,-0.046456,-0.033516,0.082908,0
4,0.030823,-0.0146,0.059927,0.076293,0.064594,-0.134257,0.034322,0.028235,-0.024199,0.022223,...,0.040488,-0.042896,0.009447,0.027206,0.067797,0.121291,-0.048117,-0.084549,0.019162,0


In [None]:
# Writes the same file again (the cell that built the frame already saved it).
df_train_negative.to_csv(path_or_buf='df_train_negative.csv')

## Save the validation negative data as a .csv file

In [None]:
# Directory with the per-sequence .pt files for the validation-negative set.
embedding_path = "embeddings/algpred2_validation_negative"

In [None]:
# Name the validation-negative FASTA explicitly instead of relying on
# `input_file` being left over from an earlier cell.
input_file = path + 'algpred2_validation_negative.fasta'
Xs, ys = load_embbedings(input_file, embedding_path, 0)
# One row per sequence: embedding features plus a trailing 'Label' column.
df_validation_negative = pd.DataFrame(Xs)
df_validation_negative['Label'] = ys
df_validation_negative.to_csv('df_validation_negative.csv')
df_validation_negative.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,0.004677,-0.052888,-0.007795,0.009523,0.020367,-0.02192,0.020655,-0.155942,0.023892,0.030005,...,0.02779,-0.038936,0.032309,-0.038206,-0.052182,0.073241,-0.110275,-0.008077,0.014144,0
1,-0.024765,0.057699,-0.014629,0.106731,-0.027154,-0.005547,0.063337,-0.230972,0.008309,0.101275,...,-0.002994,-0.063221,0.009837,-0.077741,-0.045143,0.063148,-0.073535,-0.049646,-0.007093,0
2,-0.030412,0.013076,-0.04123,0.031167,-0.037301,-0.078382,0.095941,0.01812,-0.081935,0.084584,...,-0.043988,-0.086649,-0.002378,-0.032624,0.07477,0.082218,-0.062845,-0.001786,0.056165,0
3,-0.009307,-0.060038,0.006437,0.06493,-0.116684,-0.063954,0.054519,-0.038748,-0.046068,0.043275,...,-0.027185,-0.04554,0.042025,0.014919,-0.008175,0.091617,-0.109595,0.00123,0.036975,0
4,-0.167446,0.014414,-0.075528,0.057897,-0.02593,0.086558,0.079545,-0.078385,0.013562,-0.005843,...,0.059329,0.101454,0.177743,-0.031807,0.063472,-0.024722,0.04888,-0.103623,0.024687,0


In [None]:
# Writes the same file again (the cell that built the frame already saved it).
df_validation_negative.to_csv(path_or_buf='df_validation_negative.csv')

## Save the validation positive data as a .csv file

In [None]:
# Directory with the per-sequence .pt files for the validation-positive set.
embedding_path = "embeddings/algpred2_validation_positive"

In [None]:
# `input_file` was last assigned by the validation-negative extraction cell;
# point it at the validation-positive FASTA so headers match the embeddings.
input_file = path + 'algpred2_validation_positive.fasta'
Xs, ys = load_embbedings(input_file, embedding_path, 1)
# One row per sequence: embedding features plus a trailing 'Label' column.
df_validation_positive = pd.DataFrame(Xs)
df_validation_positive['Label'] = ys
df_validation_positive.to_csv('df_validation_positive.csv')
df_validation_positive.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,0.00304,0.017125,-0.04463,0.018953,-0.026359,-0.011301,-0.03821,-0.0303,-0.006179,-0.031763,...,0.067436,-0.043971,0.049848,-0.020259,0.015893,0.157528,-0.073251,-0.013437,0.001237,1
1,0.009885,0.007735,-0.055399,0.010226,-0.031136,-0.021302,-0.052895,-0.04598,0.001671,-0.033147,...,0.027894,-0.046595,0.055568,-0.012621,0.006327,0.155229,-0.080183,0.017658,-0.008036,1
2,-0.012057,-0.05992,-0.046038,0.120857,-0.025764,-0.010265,0.042683,-0.175396,0.080291,0.035952,...,0.031651,0.001406,0.108662,-0.038689,-0.046039,-0.026731,-0.147501,-0.007822,-0.066521,1
3,-0.029771,-0.16174,-0.109914,0.083685,0.104613,-0.203691,0.124318,0.174724,-0.114174,0.092469,...,0.031522,-0.134996,-0.060767,-0.063418,0.06233,0.060911,-0.118915,0.167166,0.107324,1
4,-0.031435,-0.145662,-0.104183,0.054887,0.084018,-0.207865,0.113749,0.203442,-0.120808,0.06698,...,0.03649,-0.131744,-0.063849,-0.060193,0.089299,0.06381,-0.120718,0.165145,0.098017,1


In [None]:
# Writes the same file again (the cell that built the frame already saved it).
df_validation_positive.to_csv(path_or_buf='df_validation_positive.csv')

## Merge train dataframes into a single dataframe

In [None]:
# Stack both classes and shuffle the rows. The fixed seed makes the row order
# (and therefore df_train.csv) identical on every run of the notebook.
df_train = (
    pd.concat([df_train_positive, df_train_negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the shuffled training frame.
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,4917,0.007024,-0.065006,-0.028041,0.111512,0.010963,0.050427,0.117898,-0.123601,0.096952,...,-0.064849,0.025921,0.108775,-0.035739,-0.08095,0.042518,-0.047726,-0.0287,0.069091,1.0
1,4868,-0.016422,0.017157,0.065902,-0.045502,-0.020965,0.019144,-0.008605,0.107032,0.024592,...,0.036166,-0.074265,0.072319,-0.044529,0.162673,0.107594,0.038635,-0.063711,0.072911,0.0
2,1774,-0.067091,0.036359,-0.012731,0.002579,0.015604,-0.16338,0.009639,-0.003973,-0.108023,...,0.014872,-0.137274,0.086011,-0.035893,0.001509,0.090051,-0.043187,0.003108,0.034345,1.0
3,6342,0.003069,-0.012822,-0.020514,0.109759,-0.0717,0.016879,0.031743,0.002377,0.093078,...,0.098554,0.046076,0.111619,-0.035462,0.100664,0.14782,0.014518,-0.152727,-0.041294,1.0
4,1277,0.045561,-0.089433,-0.03923,0.128283,-0.066253,-0.094849,0.142744,-0.094645,0.017515,...,0.039378,-0.177669,-0.003415,0.036854,-0.080315,-0.023137,0.03175,0.065749,0.07015,0.0


In [None]:
# 'Unnamed: 0' only exists when the per-class frames were read back from CSV.
# errors='ignore' keeps the cell working for frames built in memory, and makes
# it safe to run more than once.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,0.007024,-0.065006,-0.028041,0.111512,0.010963,0.050427,0.117898,-0.123601,0.096952,0.053966,...,-0.064849,0.025921,0.108775,-0.035739,-0.08095,0.042518,-0.047726,-0.0287,0.069091,1.0
1,-0.016422,0.017157,0.065902,-0.045502,-0.020965,0.019144,-0.008605,0.107032,0.024592,0.059676,...,0.036166,-0.074265,0.072319,-0.044529,0.162673,0.107594,0.038635,-0.063711,0.072911,0.0
2,-0.067091,0.036359,-0.012731,0.002579,0.015604,-0.16338,0.009639,-0.003973,-0.108023,0.069293,...,0.014872,-0.137274,0.086011,-0.035893,0.001509,0.090051,-0.043187,0.003108,0.034345,1.0
3,0.003069,-0.012822,-0.020514,0.109759,-0.0717,0.016879,0.031743,0.002377,0.093078,-0.009462,...,0.098554,0.046076,0.111619,-0.035462,0.100664,0.14782,0.014518,-0.152727,-0.041294,1.0
4,0.045561,-0.089433,-0.03923,0.128283,-0.066253,-0.094849,0.142744,-0.094645,0.017515,0.11139,...,0.039378,-0.177669,-0.003415,0.036854,-0.080315,-0.023137,0.03175,0.065749,0.07015,0.0


In [None]:
# Persist the merged, shuffled training set.
df_train.to_csv(path_or_buf='df_train.csv')

## Merge validation dataframes into a single dataframe

In [None]:
# Stack both classes and shuffle the rows. The fixed seed makes the row order
# (and therefore df_test.csv) identical on every run of the notebook.
df_test = (
    pd.concat([df_validation_positive, df_validation_negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the shuffled validation frame.
df_test.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,-0.022788,0.102351,-0.004759,0.056018,0.041625,-0.091652,-0.132269,0.186708,0.009317,-0.013024,...,0.058825,-0.126656,0.029039,-0.043456,0.176962,0.10751,0.044039,-0.119478,-0.009796,0
1,-0.028571,-0.017322,-0.037169,0.027144,0.002909,-0.105564,0.061668,-0.166686,-0.007113,0.075881,...,-0.024243,0.019002,0.03587,-0.109937,0.029677,0.09228,-0.143162,-0.092381,0.013006,1
2,0.067068,-0.074117,0.011278,0.056075,-0.099463,-0.078861,0.021225,-0.012659,0.03501,-0.041538,...,-0.014853,0.003386,0.025327,-0.010808,0.033378,0.120301,-0.004345,-0.005355,0.035297,0
3,-0.029574,-0.04999,-0.052827,-0.066867,0.070059,-0.028007,-0.04895,-0.046356,-0.16547,0.093307,...,0.026353,-0.027442,-0.003394,-0.07409,-0.126639,-0.044253,-0.181208,0.106327,0.195119,0
4,0.016483,-0.101985,0.016983,0.041254,-0.012146,-0.078855,-0.021125,-0.142708,0.039837,0.017908,...,-0.008563,-0.002452,-0.037658,-0.044058,0.018399,0.075191,-0.139385,0.036831,0.094892,1


In [None]:
# 'Unnamed: 0' only exists when the per-class frames were read back from CSV.
# errors='ignore' keeps the cell working for frames built in memory, and makes
# it safe to run more than once.
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,Label
0,-0.022788,0.102351,-0.004759,0.056018,0.041625,-0.091652,-0.132269,0.186708,0.009317,-0.013024,...,0.058825,-0.126656,0.029039,-0.043456,0.176962,0.10751,0.044039,-0.119478,-0.009796,0
1,-0.028571,-0.017322,-0.037169,0.027144,0.002909,-0.105564,0.061668,-0.166686,-0.007113,0.075881,...,-0.024243,0.019002,0.03587,-0.109937,0.029677,0.09228,-0.143162,-0.092381,0.013006,1
2,0.067068,-0.074117,0.011278,0.056075,-0.099463,-0.078861,0.021225,-0.012659,0.03501,-0.041538,...,-0.014853,0.003386,0.025327,-0.010808,0.033378,0.120301,-0.004345,-0.005355,0.035297,0
3,-0.029574,-0.04999,-0.052827,-0.066867,0.070059,-0.028007,-0.04895,-0.046356,-0.16547,0.093307,...,0.026353,-0.027442,-0.003394,-0.07409,-0.126639,-0.044253,-0.181208,0.106327,0.195119,0
4,0.016483,-0.101985,0.016983,0.041254,-0.012146,-0.078855,-0.021125,-0.142708,0.039837,0.017908,...,-0.008563,-0.002452,-0.037658,-0.044058,0.018399,0.075191,-0.139385,0.036831,0.094892,1


In [None]:
# Persist the merged, shuffled validation set.
df_test.to_csv(path_or_buf='df_test.csv')