## package installation and load packages


In [None]:
### packages required 
!pip install fair-esm 
!pip install torch
!pip install tensorflow
!pip install sklearn
!pip install h5py
!pip install joblib

In [None]:
import esm
import pandas as pd
import numpy as np
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv1D
from keras.layers import Dropout, AveragePooling1D, MaxPooling1D
from keras.models import Sequential,Model, load_model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint,LearningRateScheduler, EarlyStopping
import keras
from keras import backend as K
import tensorflow as tf
# Report whether TensorFlow can see a GPU; when it can, make only the first GPU visible.
if not tf.test.gpu_device_name():
    print("No GPU found")
else:
    print('GPU found')
    tf.config.experimental.set_visible_devices(
        tf.config.list_physical_devices('GPU')[0],
        'GPU',
    )

No GPU found


### peptide embeddings with different pretrained models
https://github.com/facebookresearch/esm

Explanation of the memory usage of the following models

For sequence length > 900:

the 2560 output dimension model might need 24 GB of GPU memory;

the 5120 output dimension model needs even more (in our attempts, 40 GB of GPU memory was not enough).


In [None]:
def esm_embeddings_320(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=6):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 6,
      the last layer of 'esm2_t6_8M_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


In [None]:
def esm_embeddings_480(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=12):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 12,
      the last layer of 'esm2_t12_35M_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


In [None]:
def esm_embeddings_640(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=30):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 30,
      the last layer of 'esm2_t30_150M_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


In [None]:
def esm_embeddings_1280(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=33):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 33,
      the last layer of 'esm2_t33_650M_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row; the padding index must come from the
  # alphabet passed in, not from a notebook-level `alphabet` variable
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


In [None]:
def esm_embeddings_2560(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=36):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 36,
      the last layer of 'esm2_t36_3B_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row; the padding index must come from the
  # alphabet passed in, not from a notebook-level `alphabet` variable
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


In [None]:
def esm_embeddings_5120(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=48):
  """Embed peptides with an ESM-2 model and mean-pool the residues of each sequence.

  NOTICE: ESM embedding uses a lot of memory; if a sequence is too long, or too
  many sequences are converted in a single call, the job may be killed.

  Args:
    esm2: ESM-2 model. It is switched to eval mode, moved to the selected device and
      called as ``esm2(tokens, repr_layers=[repr_layer], return_contacts=False)``.
    esm2_alphabet: alphabet belonging to ``esm2``; provides the batch converter
      and ``padding_idx``.
    peptide_sequence_list: list of ``(name, sequence)`` tuples (the batch converter input format).
    repr_layer: layer whose output is used as the representation. Defaults to 48,
      the last layer of 'esm2_t48_15B_UR50D'.

  Returns:
    pandas.DataFrame with one row per input sequence and one column per embedding dimension.
  """
  import torch
  import pandas as pd
  import gc

  # use the GPU when available, otherwise the CPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  esm2 = esm2.eval().to(device)

  batch_converter = esm2_alphabet.get_batch_converter()

  # tokenize the peptide sequence list
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  # number of non-padding tokens in every row; the padding index must come from the
  # alphabet passed in, not from a notebook-level `alphabet` variable
  batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

  batch_tokens = batch_tokens.to(device)

  # Extract per-residue representations on the selected device, then bring them back to the CPU
  with torch.no_grad():
    results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
  token_representations = results["representations"][repr_layer].cpu()

  # Per-sequence representation = mean over the residue tokens.
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
  #       the last counted token is dropped as well (presumably the end-of-sequence token).
  sequence_representations = [
    token_representations[i, 1 : tokens_len - 1].mean(0)
    for i, tokens_len in enumerate(batch_lens)
  ]

  # one column per sequence, then transpose so that every sequence becomes one row
  embeddings_results = pd.DataFrame(
    {i: seq_rep.tolist() for i, seq_rep in enumerate(sequence_representations)}
  ).T

  # release the large intermediates before returning
  del batch_labels, batch_strs, batch_tokens, results, token_representations
  torch.cuda.empty_cache()
  gc.collect()
  return embeddings_results


### load packages

In [None]:
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv1D
from keras.layers import Dropout, AveragePooling1D, MaxPooling1D
from keras.models import Sequential,Model, load_model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint,LearningRateScheduler, EarlyStopping
import keras
from keras import backend as K
import tensorflow as tf
# Report whether TensorFlow can see a GPU; when it can, make only the first GPU visible.
if not tf.test.gpu_device_name():
    print("No GPU found")
else:
    print('GPU found')
    tf.config.experimental.set_visible_devices(
        tf.config.list_physical_devices('GPU')[0],
        'GPU',
    )

No GPU found


## Load your sample and our developed model for local running

In [None]:
# collect the output
def assign_activity(predicted_class):
    """Translate predicted class indices into allergen labels.

    Every entry equal to 0 becomes 'Allergen'; any other value becomes
    'Non-allergen'. Entries are read by position index, and the labels are
    returned as a list in the same order.
    """
    return [
        'Allergen' if predicted_class[position] == 0 else 'Non-allergen'
        for position in range(len(predicted_class))
    ]

### 320 feature dimension embedding test

In [None]:
# embedding your sample
import numpy as np
import pandas as pd
import esm
# Select the ESM model for the embeddings (other models are listed at https://github.com/facebookresearch/esm).
# NOTICE: if you choose another model, the classifier loaded later may not be compatible;
#         also revise the layer used for feature extraction in the esm_embeddings function.
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()


# load the whole dataset; na_filter=False stops pandas from turning sequences such as "NA" into missing values
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)

# generate the peptide embeddings, one sequence per call
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) tuples; the sequence is used as its own name
    tuple_sequence = (seq, seq)
    peptide_sequence_list = [tuple_sequence]
    one_seq_embeddings = esm_embeddings_320(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
# concatenate once at the end: growing a DataFrame with pd.concat inside the loop is quadratic
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()

In [None]:
import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles the file and can run arbitrary code; only load scaler files you trust.
scaler = joblib.load('best_model_grid_320_server.joblib')
normalized_embeddings_results = scaler.transform(embeddings_results)

# load the saved best model
saved_model = load_model('best_model_grid_320_server.h5')
# model outputs, one row per sequence (assumed to be a 2-D array of per-class scores)
predicted_protability = saved_model.predict(normalized_embeddings_results,batch_size=1)
# predicted class = column with the highest score (the first one on ties)
predicted_class = np.argmax(predicted_protability, axis=1)

predicted_class = assign_activity(predicted_class)  # class 0 -> 'Allergen', anything else -> 'Non-allergen'
print(predicted_class)
report = {"sequence": sequence_list, "activity": predicted_class}
report_df = pd.DataFrame(report)
report_df.to_excel("320_report.xlsx")


['Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen']


### 480 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm
# Select the ESM model for the embeddings (other models are listed at https://github.com/facebookresearch/esm).
# NOTICE: if you choose another model, the classifier loaded later may not be compatible;
#         also revise the layer used for feature extraction in the esm_embeddings function.
model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()


# load the whole dataset; na_filter=False stops pandas from turning sequences such as "NA" into missing values
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)

# generate the peptide embeddings, one sequence per call
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) tuples; the sequence is used as its own name
    tuple_sequence = (seq, seq)
    peptide_sequence_list = [tuple_sequence]
    one_seq_embeddings = esm_embeddings_480(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
# concatenate once at the end: growing a DataFrame with pd.concat inside the loop is quadratic
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
# keep a copy of the embeddings on disk
embeddings_results.to_csv('whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv')

In [None]:
import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles the file and can run arbitrary code; only load scaler files you trust.
scaler = joblib.load('best_model_grid_480_server.joblib')
normalized_embeddings_results = scaler.transform(embeddings_results)

# load the saved best model
saved_model = load_model('best_model_grid_480_server.h5')
# model outputs, one row per sequence (assumed to be a 2-D array of per-class scores)
predicted_protability = saved_model.predict(normalized_embeddings_results,batch_size=1)
# predicted class = column with the highest score (the first one on ties)
predicted_class = np.argmax(predicted_protability, axis=1)

predicted_class = assign_activity(predicted_class)  # class 0 -> 'Allergen', anything else -> 'Non-allergen'
print(predicted_class)
report = {"sequence": sequence_list, "activity": predicted_class}
report_df = pd.DataFrame(report)
report_df.to_excel("480_report.xlsx")


['Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen']


### 640 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm
# Select the ESM model for the embeddings (other models are listed at https://github.com/facebookresearch/esm).
# NOTICE: if you choose another model, the classifier loaded later may not be compatible;
#         also revise the layer used for feature extraction in the esm_embeddings function.
model, alphabet = esm.pretrained.esm2_t30_150M_UR50D()


# load the whole dataset; na_filter=False stops pandas from turning sequences such as "NA" into missing values
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)

# generate the peptide embeddings, one sequence per call
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) tuples; the sequence is used as its own name
    tuple_sequence = (seq, seq)
    peptide_sequence_list = [tuple_sequence]
    one_seq_embeddings = esm_embeddings_640(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
# concatenate once at the end: growing a DataFrame with pd.concat inside the loop is quadratic
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
# keep a copy of the embeddings on disk
embeddings_results.to_csv('whole_sample_dataset_esm2_t30_150M_UR50D_unified_640_dimension.csv')

In [None]:
import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles the file and can run arbitrary code; only load scaler files you trust.
scaler = joblib.load('best_model_grid_640_server.joblib')
normalized_embeddings_results = scaler.transform(embeddings_results)

# load the saved best model
saved_model = load_model('best_model_grid_640_server.h5')
# model outputs, one row per sequence (assumed to be a 2-D array of per-class scores)
predicted_protability = saved_model.predict(normalized_embeddings_results,batch_size=1)
# predicted class = column with the highest score (the first one on ties)
predicted_class = np.argmax(predicted_protability, axis=1)

predicted_class = assign_activity(predicted_class)  # class 0 -> 'Allergen', anything else -> 'Non-allergen'
print(predicted_class)
report = {"sequence": sequence_list, "activity": predicted_class}
report_df = pd.DataFrame(report)
report_df.to_excel("640_report.xlsx")


['Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen']


### 1280 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm
# Select the ESM model for the embeddings (other models are listed at https://github.com/facebookresearch/esm).
# NOTICE: if you choose another model, the classifier loaded later may not be compatible;
#         also revise the layer used for feature extraction in the esm_embeddings function.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()

# load the whole dataset; na_filter=False stops pandas from turning sequences such as "NA" into missing values
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)

# generate the peptide embeddings, one sequence per call
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) tuples; the sequence is used as its own name
    tuple_sequence = (seq, seq)
    peptide_sequence_list = [tuple_sequence]
    one_seq_embeddings = esm_embeddings_1280(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
# concatenate once at the end: growing a DataFrame with pd.concat inside the loop is quadratic
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
# keep a copy of the embeddings on disk
embeddings_results.to_csv('whole_sample_dataset_esm2_t33_650M_UR50D_unified_1280_dimension.csv')

In [None]:
import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles the file and can run arbitrary code; only load scaler files you trust.
scaler = joblib.load('best_model_grid_1280_server.joblib')
normalized_embeddings_results = scaler.transform(embeddings_results)

# load the saved best model
saved_model = load_model('best_model_grid_1280_server.h5')
# model outputs, one row per sequence (assumed to be a 2-D array of per-class scores)
predicted_protability = saved_model.predict(normalized_embeddings_results,batch_size=1)
# predicted class = column with the highest score (the first one on ties)
predicted_class = np.argmax(predicted_protability, axis=1)

predicted_class = assign_activity(predicted_class)  # class 0 -> 'Allergen', anything else -> 'Non-allergen'
print(predicted_class)
report = {"sequence": sequence_list, "activity": predicted_class}
report_df = pd.DataFrame(report)
report_df.to_excel("1280_report.xlsx")


['Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen', 'Non-allergen']


### 2560 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm
# Select the ESM model for the embeddings (other models are listed at https://github.com/facebookresearch/esm).
# NOTICE: if you choose another model, the classifier loaded later may not be compatible;
#         also revise the layer used for feature extraction in the esm_embeddings function.
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()


# load the whole dataset; na_filter=False stops pandas from turning sequences such as "NA" into missing values
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)
a=0  # number of sequences embedded so far, printed as a progress indicator
# generate the peptide embeddings, one sequence per call
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) tuples; the sequence is used as its own name
    tuple_sequence = (seq, seq)
    peptide_sequence_list = [tuple_sequence]
    one_seq_embeddings = esm_embeddings_2560(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
    a=a+1
    print(a)
# concatenate once at the end: growing a DataFrame with pd.concat inside the loop is quadratic
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
# keep a copy of the embeddings on disk
embeddings_results.to_csv('whole_sample_dataset_esm2_t36_3B_UR50D_unified_2560_dimension.csv')

In [None]:
import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles the file and can run arbitrary code; only load scaler files you trust.
scaler = joblib.load('best_model_grid_2560_server.joblib')
normalized_embeddings_results = scaler.transform(embeddings_results)

# load the saved best model
saved_model = load_model('best_model_grid_2560_server.h5')
# model outputs, one row per sequence (assumed to be a 2-D array of per-class scores)
predicted_protability = saved_model.predict(normalized_embeddings_results,batch_size=1)
# predicted class = column with the highest score (the first one on ties)
predicted_class = np.argmax(predicted_protability, axis=1)

predicted_class = assign_activity(predicted_class)  # class 0 -> 'Allergen', anything else -> 'Non-allergen'
print(predicted_class)
report = {"sequence": sequence_list, "activity": predicted_class}
report_df = pd.DataFrame(report)
report_df.to_excel("2560_report.xlsx")
