### Requirements for the following code


In [None]:
### packages required 
!pip install fair-esm 
!pip install torch
!pip install tensorflow
!pip install sklearn
!pip install biopython
!pip install h5py

### Peptide embeddings with different pretrained models
https://github.com/facebookresearch/esm

In [None]:
def esm_embeddings_320(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=6):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 6,
            the last layer of ``esm2_t6_8M_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


In [None]:
def esm_embeddings_480(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=12):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 12,
            the last layer of ``esm2_t12_35M_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


In [None]:
def esm_embeddings_640(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=30):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 30,
            the last layer of ``esm2_t30_150M_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


In [None]:
def esm_embeddings_1280(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=33):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 33,
            the last layer of ``esm2_t33_650M_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


In [None]:
def esm_embeddings_2560(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=36):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 36,
            the last layer of ``esm2_t36_3B_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


In [None]:
def esm_embeddings_5120(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=48):
    """Embed peptides with an ESM-2 model and mean-pool over the residues.

    NOTICE: ESM embedding is RAM hungry. If a sequence is very long, or too
    many sequences are converted in a single call, the operating system may
    kill the job.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to CUDA when
            available, otherwise to the CPU.
        esm2_alphabet: Alphabet paired with ``esm2``; provides the batch
            converter and the padding token index.
        peptide_sequence_list: List of ``(name, sequence)`` tuples, the input
            format of the ESM batch converter.
        repr_layer: Layer whose hidden states are exported. Defaults to 48,
            the last layer of ``esm2_t48_15B_UR50D``.

    Returns:
        pandas.DataFrame with one row per input sequence (input order) and
        one column per embedding dimension.
    """
    import torch
    import pandas as pd

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    # Tokenise the (name, sequence) pairs into one padded batch.
    batch_converter = esm2_alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Number of non-padding tokens per sequence. The padding index is read
    # from the alphabet passed in, not from a notebook-global `alphabet`.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue representations; inference only, so no gradients.
    with torch.no_grad():
        results = esm2(batch_tokens.to(device), repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence representation = mean over residue tokens. Token 0 is the
    # beginning-of-sequence token and the last counted token closes the
    # sequence, so both are left out of the slice.
    rows = [
        token_representations[i, 1 : tokens_len - 1].mean(0).tolist()
        for i, tokens_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(rows)


### Connect to Google Drive


In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.getcwd()
os.chdir("drive/MyDrive/universal_allergens")
!ls

Mounted at /content/drive
allergens_dataset.xlsx
best_model.h5
best_model_large.h5
whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv
whole_sample_dataset_esm2_t30_150M_UR50D_unified_640_dimension.csv
whole_sample_dataset_esm2_t33_650M_UR50D_unified_1280_dimension.csv
whole_sample_dataset_esm2_t36_3B_UR50D_unified_2560_dimension.csv
whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv


#### load packages

In [None]:
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv1D
from keras.layers import Dropout, AveragePooling1D, MaxPooling1D
from keras.models import Sequential,Model, load_model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint,LearningRateScheduler, EarlyStopping
import keras
from keras import backend as K
import tensorflow as tf
# Report whether TensorFlow sees a GPU; when it does, restrict TensorFlow to
# the first physical GPU.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("No GPU found")
else:
    print('GPU found')
    first_gpu = tf.config.list_physical_devices('GPU')[0]
    tf.config.experimental.set_visible_devices(first_gpu, 'GPU')

### 320 feature dimension embedding test

In [7]:
import numpy as np
import pandas as pd
import esm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Choose the ESM model used for the embeddings (other checkpoints are listed
# at https://github.com/facebookresearch/esm).
# NOTICE: with a different checkpoint the CNN below may no longer fit the
#         feature size, and the layer index inside the esm_embeddings
#         function has to be adapted as well.
# model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()

# Load the labelled dataset; na_filter=False keeps a literal "NA" sequence
# from being parsed as a missing value.
dataset = pd.read_excel('allergens_dataset.xlsx', na_filter=False)
sequence_list = dataset['sequence']

# One-off generation of the embedding CSV that is read back below (kept for
# reference; each sequence is passed to ESM as a [name, sequence] tuple):
# embeddings_results = pd.DataFrame()
# for seq in sequence_list:
#     peptide_sequence_list = [tuple([seq, seq])]
#     one_seq_embeddings = esm_embeddings_320(model, alphabet, peptide_sequence_list)
#     embeddings_results = pd.concat([embeddings_results, one_seq_embeddings])
# embeddings_results.to_csv('whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv')

# Targets as a numpy array for the CNN.
y = np.array(dataset['label'])

# Features: the cached peptide embeddings.
X_data_name = 'whole_sample_dataset_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)

# 8:2 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Scale every feature to the 0-1 range; the scaler is fitted on the training
# part only and then applied to both parts.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import itertools
import math

from sklearn.metrics import confusion_matrix, roc_auc_score

# Grid search over a small 1D-CNN (conv channels x dense units x kernel size)
# on the 320-dimensional embeddings. One value per configuration is appended
# to each *_collecton list, in the order the grid is visited.
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
CNN_channel = [16,32,64,128,256]
dense_node = [32,64,128,256]
kernel_size = [3,6,9,12]


def step_decay(epoch): # gradually decrease the learning rate
    """Return the learning rate for `epoch`: 0.1 * 0.6 ** floor((1 + epoch) / 3)."""
    initial_lrate = 0.1
    drop = 0.6
    epochs_drop = 3.0
    return initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))


# itertools.product varies the last list fastest, i.e. the same visiting order
# as three nested loops over CNN_channel, dense_node and kernel_size.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    inputShape = (320, 1) # input feature size
    input_layer = Input(inputShape)  # not called `input`, which would shadow the builtin
    x = Conv1D(n_channels, n_kernel, strides=1, name='layer_conv2', padding='same')(input_layer)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(2, name='MaxPool2', padding="same")(x)
    x = Dropout(0.15)(x)
    x = Flatten()(x)
    x = Dense(n_dense, activation='relu', name='fc1')(x)
    x = Dropout(0.15)(x)
    x = Dense(2, activation='softmax', name='fc2')(x)
    model = Model(inputs=input_layer, outputs=x, name='Predict')

    # SGD optimizer. `learning_rate` replaces the deprecated `lr` keyword and
    # the `decay=0.0` argument (a no-op) is dropped because current Keras
    # optimizers reject it.
    # NOTE(review): the LearningRateScheduler below starts from 0.1, so it is
    # the schedule, not this 0.01, that appears to drive training.
    momentum = 0.5
    sgd = SGD(learning_rate=0.01, momentum=momentum, nesterov=False)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    # Callbacks: learning-rate decay, early stopping, and a checkpoint that
    # keeps the weights with the best validation accuracy.
    lr = LearningRateScheduler(step_decay)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=40, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=False)
    callbacks_list = [lr, early_stop, mc]
    # NOTE(review): the test split doubles as validation data, so the
    # checkpoint is selected on the same samples it is scored on below and the
    # reported metrics are optimistic.
    model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=callbacks_list, batch_size=16, verbose=1)

    # Score the best checkpoint of this configuration on the test split.
    saved_model = load_model('best_model.h5')
    predicted_protability = saved_model.predict(X_test, batch_size=1)
    predicted_class = np.argmax(predicted_protability, axis=1)
    y_true = y_test

    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP. Unpacking it as TP, FP, FN, TN swapped TP and TN
    # and made Sn, Sp and BACC wrong. labels=[0, 1] guarantees a 2x2 matrix;
    # float counts avoid integer overflow in the MCC product.
    TN, FP, FN, TP = confusion_matrix(y_true, predicted_class, labels=[0, 1]).ravel().astype(float)
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    MCC = (TP*TN-FP*FN)/math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)),0.5)
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    AUC = roc_auc_score(y_test, predicted_protability[:,1])
    AUC_collecton.append(AUC)

print(ACC_collecton)
print(BACC_collecton)
print(Sn_collecton)
print(Sp_collecton)
print(MCC_collecton)
print(AUC_collecton)
# Print the configurations in the same order as the metric lists above.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    print(n_channels, n_dense, n_kernel)

In [None]:
import itertools

# Build one row per grid-search configuration: (channels, dense units, kernel
# size) followed by its metrics. itertools.product reproduces the visiting
# order of the nested loops that filled the *_collecton lists. Zipping the raw
# option lists instead would pair unrelated values and stop after four rows.
grid = list(itertools.product(CNN_channel, dense_node, kernel_size))
metric_lists = [ACC_collecton, BACC_collecton, Sn_collecton, Sp_collecton, MCC_collecton, AUC_collecton]
for metric_values in metric_lists:
    if len(metric_values) != len(grid):
        raise ValueError(
            "expected %d results per metric (one per configuration), got %d"
            % (len(grid), len(metric_values))
        )
combined_list = [combo + tuple(scores) for combo, scores in zip(grid, zip(*metric_lists))]

# create a DataFrame from the list of tuples
df = pd.DataFrame(combined_list, columns=['CNN_channels', 'dense_nodes', 'kernel_sizes', 'ACC_collection', 'BACC_collection', 'Sn_collection', 'Sp_collection', 'MCC_collection', 'AUC_collection'])

# export the DataFrame to an Excel file
df.to_excel('320 performance output.xlsx', index=False)

### 480 feature dimension embedding test

In [9]:
import numpy as np
import pandas as pd
import esm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Choose the ESM model used for the embeddings (other checkpoints are listed
# at https://github.com/facebookresearch/esm).
# NOTICE: with a different checkpoint the CNN below may no longer fit the
#         feature size, and the layer index inside the esm_embeddings
#         function has to be adapted as well.
# model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()

# Load the labelled dataset; na_filter=False keeps a literal "NA" sequence
# from being parsed as a missing value.
dataset = pd.read_excel('allergens_dataset.xlsx', na_filter=False)
sequence_list = dataset['sequence']

# One-off generation of the embedding CSV that is read back below (kept for
# reference; each sequence is passed to ESM as a [name, sequence] tuple):
# embeddings_results = pd.DataFrame()
# for seq in sequence_list:
#     peptide_sequence_list = [tuple([seq, seq])]
#     one_seq_embeddings = esm_embeddings_480(model, alphabet, peptide_sequence_list)
#     embeddings_results = pd.concat([embeddings_results, one_seq_embeddings])
# embeddings_results.to_csv('whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv')

# Targets as a numpy array for the CNN.
y = np.array(dataset['label'])

# Features: the cached peptide embeddings.
X_data_name = 'whole_sample_dataset_esm2_t12_35M_UR50D_unified_480_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)

# 8:2 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Scale every feature to the 0-1 range; the scaler is fitted on the training
# part only and then applied to both parts.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import itertools
import math

from sklearn.metrics import confusion_matrix, roc_auc_score

# Grid search over a small 1D-CNN (conv channels x dense units x kernel size)
# on the 480-dimensional embeddings. One value per configuration is appended
# to each *_collecton list, in the order the grid is visited.
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
CNN_channel = [16,32,64,128,256]
dense_node = [32,64,128,256]
kernel_size = [3,6,9,12]


def step_decay(epoch): # gradually decrease the learning rate
    """Return the learning rate for `epoch`: 0.1 * 0.6 ** floor((1 + epoch) / 3)."""
    initial_lrate = 0.1
    drop = 0.6
    epochs_drop = 3.0
    return initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))


# itertools.product varies the last list fastest, i.e. the same visiting order
# as three nested loops over CNN_channel, dense_node and kernel_size.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    inputShape = (480, 1) # input feature size
    input_layer = Input(inputShape)  # not called `input`, which would shadow the builtin
    x = Conv1D(n_channels, n_kernel, strides=1, name='layer_conv2', padding='same')(input_layer)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(2, name='MaxPool2', padding="same")(x)
    x = Dropout(0.15)(x)
    x = Flatten()(x)
    x = Dense(n_dense, activation='relu', name='fc1')(x)
    x = Dropout(0.15)(x)
    x = Dense(2, activation='softmax', name='fc2')(x)
    model = Model(inputs=input_layer, outputs=x, name='Predict')

    # SGD optimizer. `learning_rate` replaces the deprecated `lr` keyword and
    # the `decay=0.0` argument (a no-op) is dropped because current Keras
    # optimizers reject it.
    # NOTE(review): the LearningRateScheduler below starts from 0.1, so it is
    # the schedule, not this 0.01, that appears to drive training.
    momentum = 0.5
    sgd = SGD(learning_rate=0.01, momentum=momentum, nesterov=False)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    # Callbacks: learning-rate decay, early stopping, and a checkpoint that
    # keeps the weights with the best validation accuracy.
    lr = LearningRateScheduler(step_decay)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=40, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=False)
    callbacks_list = [lr, early_stop, mc]
    # NOTE(review): the test split doubles as validation data, so the
    # checkpoint is selected on the same samples it is scored on below and the
    # reported metrics are optimistic.
    model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=callbacks_list, batch_size=16, verbose=1)

    # Score the best checkpoint of this configuration on the test split.
    saved_model = load_model('best_model.h5')
    predicted_protability = saved_model.predict(X_test, batch_size=1)
    predicted_class = np.argmax(predicted_protability, axis=1)
    y_true = y_test

    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP. Unpacking it as TP, FP, FN, TN swapped TP and TN
    # and made Sn, Sp and BACC wrong. labels=[0, 1] guarantees a 2x2 matrix;
    # float counts avoid integer overflow in the MCC product.
    TN, FP, FN, TP = confusion_matrix(y_true, predicted_class, labels=[0, 1]).ravel().astype(float)
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    MCC = (TP*TN-FP*FN)/math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)),0.5)
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    AUC = roc_auc_score(y_test, predicted_protability[:,1])
    AUC_collecton.append(AUC)

print(ACC_collecton)
print(BACC_collecton)
print(Sn_collecton)
print(Sp_collecton)
print(MCC_collecton)
print(AUC_collecton)
# Print the configurations in the same order as the metric lists above.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    print(n_channels, n_dense, n_kernel)

In [None]:
import itertools

# Build one row per grid-search configuration: (channels, dense units, kernel
# size) followed by its metrics. itertools.product reproduces the visiting
# order of the nested loops that filled the *_collecton lists. Zipping the raw
# option lists instead would pair unrelated values and stop after four rows.
grid = list(itertools.product(CNN_channel, dense_node, kernel_size))
metric_lists = [ACC_collecton, BACC_collecton, Sn_collecton, Sp_collecton, MCC_collecton, AUC_collecton]
for metric_values in metric_lists:
    if len(metric_values) != len(grid):
        raise ValueError(
            "expected %d results per metric (one per configuration), got %d"
            % (len(grid), len(metric_values))
        )
combined_list = [combo + tuple(scores) for combo, scores in zip(grid, zip(*metric_lists))]

# create a DataFrame from the list of tuples
df = pd.DataFrame(combined_list, columns=['CNN_channels', 'dense_nodes', 'kernel_sizes', 'ACC_collection', 'BACC_collection', 'Sn_collection', 'Sp_collection', 'MCC_collection', 'AUC_collection'])

# export the DataFrame to an Excel file
df.to_excel('480 performance output.xlsx', index=False)

### 640 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Choose the ESM model used for the embeddings (other checkpoints are listed
# at https://github.com/facebookresearch/esm).
# NOTICE: with a different checkpoint the CNN below may no longer fit the
#         feature size, and the layer index inside the esm_embeddings
#         function has to be adapted as well.
# model, alphabet = esm.pretrained.esm2_t30_150M_UR50D()

# Load the labelled dataset; na_filter=False keeps a literal "NA" sequence
# from being parsed as a missing value.
dataset = pd.read_excel('allergens_dataset.xlsx', na_filter=False)
sequence_list = dataset['sequence']

# One-off generation of the embedding CSV that is read back below (kept for
# reference; each sequence is passed to ESM as a [name, sequence] tuple):
# embeddings_results = pd.DataFrame()
# for seq in sequence_list:
#     peptide_sequence_list = [tuple([seq, seq])]
#     one_seq_embeddings = esm_embeddings_640(model, alphabet, peptide_sequence_list)
#     embeddings_results = pd.concat([embeddings_results, one_seq_embeddings])
# embeddings_results.to_csv('whole_sample_dataset_esm2_t30_150M_UR50D_unified_640_dimension.csv')

# Targets as a numpy array for the CNN.
y = np.array(dataset['label'])

# Features: the cached peptide embeddings.
X_data_name = 'whole_sample_dataset_esm2_t30_150M_UR50D_unified_640_dimension.csv'
X_data = pd.read_csv(X_data_name, header=0, index_col=0, delimiter=',')
X = np.array(X_data)

# 8:2 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Scale every feature to the 0-1 range; the scaler is fitted on the training
# part only and then applied to both parts.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import itertools
import math

from sklearn.metrics import confusion_matrix, roc_auc_score

# Grid search over a small 1D-CNN (conv channels x dense units x kernel size)
# on the 640-dimensional embeddings. One value per configuration is appended
# to each *_collecton list, in the order the grid is visited.
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
CNN_channel = [16,32,64,128,256]
dense_node = [32,64,128,256]
kernel_size = [3,6,9,12]


def step_decay(epoch): # gradually decrease the learning rate
    """Return the learning rate for `epoch`: 0.1 * 0.6 ** floor((1 + epoch) / 3)."""
    initial_lrate = 0.1
    drop = 0.6
    epochs_drop = 3.0
    return initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))


# itertools.product varies the last list fastest, i.e. the same visiting order
# as three nested loops over CNN_channel, dense_node and kernel_size.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    inputShape = (640, 1) # input feature size
    input_layer = Input(inputShape)  # not called `input`, which would shadow the builtin
    x = Conv1D(n_channels, n_kernel, strides=1, name='layer_conv2', padding='same')(input_layer)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(2, name='MaxPool2', padding="same")(x)
    x = Dropout(0.15)(x)
    x = Flatten()(x)
    x = Dense(n_dense, activation='relu', name='fc1')(x)
    x = Dropout(0.15)(x)
    x = Dense(2, activation='softmax', name='fc2')(x)
    model = Model(inputs=input_layer, outputs=x, name='Predict')

    # SGD optimizer. `learning_rate` replaces the deprecated `lr` keyword and
    # the `decay=0.0` argument (a no-op) is dropped because current Keras
    # optimizers reject it.
    # NOTE(review): the LearningRateScheduler below starts from 0.1, so it is
    # the schedule, not this 0.01, that appears to drive training.
    momentum = 0.5
    sgd = SGD(learning_rate=0.01, momentum=momentum, nesterov=False)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    # Callbacks: learning-rate decay, early stopping, and a checkpoint that
    # keeps the weights with the best validation accuracy.
    lr = LearningRateScheduler(step_decay)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=40, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=False)
    callbacks_list = [lr, early_stop, mc]
    # NOTE(review): the test split doubles as validation data, so the
    # checkpoint is selected on the same samples it is scored on below and the
    # reported metrics are optimistic.
    model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=callbacks_list, batch_size=16, verbose=1)

    # Score the best checkpoint of this configuration on the test split.
    saved_model = load_model('best_model.h5')
    predicted_protability = saved_model.predict(X_test, batch_size=1)
    predicted_class = np.argmax(predicted_protability, axis=1)
    y_true = y_test

    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP. Unpacking it as TP, FP, FN, TN swapped TP and TN
    # and made Sn, Sp and BACC wrong. labels=[0, 1] guarantees a 2x2 matrix;
    # float counts avoid integer overflow in the MCC product.
    TN, FP, FN, TP = confusion_matrix(y_true, predicted_class, labels=[0, 1]).ravel().astype(float)
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    MCC = (TP*TN-FP*FN)/math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)),0.5)
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    AUC = roc_auc_score(y_test, predicted_protability[:,1])
    AUC_collecton.append(AUC)

print(ACC_collecton)
print(BACC_collecton)
print(Sn_collecton)
print(Sp_collecton)
print(MCC_collecton)
print(AUC_collecton)
# Print the configurations in the same order as the metric lists above.
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    print(n_channels, n_dense, n_kernel)

### 1280 feature dimension embedding test

In [None]:
import os

import numpy as np
import pandas as pd
import esm

# Prepare the 1280-dimension experiment: load the allergen dataset, obtain the
# ESM-2 (esm2_t33_650M_UR50D) embedding of every sequence, then build the scaled
# train/test arrays consumed by the CNN grid search in the next cell.
# NOTICE: you can select another model from https://github.com/facebookresearch/esm,
#         but the CNN architecture below might then not be compatible; besides, please
#         revise the corresponding parameters in the esm_embeddings function
#         (layers for feature extraction).

# whole dataset loading
# na_filter=False disables NA detection, so a sequence spelled "NA" stays a string
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)
sequence_list = dataset['sequence']

# The embeddings are cached on disk because computing them is expensive. They are
# generated only when the cache file is missing, so a fresh checkout can still run
# this cell while an existing cache is reused untouched.
X_data_name = 'whole_sample_dataset_esm2_t33_650M_UR50D_unified_1280_dimension.csv'
if not os.path.exists(X_data_name):
    # assumes esm_embeddings_1280 is defined in an earlier cell and returns a
    # DataFrame for a list of (name, sequence) tuples -- TODO confirm
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    per_sequence_embeddings = []
    for seq in sequence_list:
        # ESM expects (name, sequence) pairs; the sequence doubles as its own name.
        # One sequence per call keeps the memory footprint of each conversion small.
        peptide_sequence_list = [(seq, seq)]
        per_sequence_embeddings.append(esm_embeddings_1280(model, alphabet, peptide_sequence_list))
    # concatenate once at the end instead of growing a DataFrame inside the loop
    embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
    embeddings_results.to_csv(X_data_name)

# loading the y dataset for model development
y = dataset['label']
y = np.array(y) # transformed as np.array for CNN model

# read the peptide embeddings
X_data = pd.read_csv(X_data_name,header=0, index_col = 0,delimiter=',')
X = np.array(X_data)

# a cached CSV built from a different version of the dataset would silently misalign X and y
if len(X) != len(y):
    raise ValueError(
        "Embedding file '%s' has %d rows but the dataset has %d labels; "
        "delete the file to regenerate it." % (X_data_name, len(X), len(y))
    )

# split dataset as training and test dataset as ratio of 8:2
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=123)

# normalize the X data range; the scaler is fitted on the training split only
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train) # training features are mapped to the 0-1 range
X_test = scaler.transform(X_test)   # test features reuse the training min/max, so they may fall slightly outside 0-1

In [None]:
import itertools
import math

from sklearn.metrics import confusion_matrix, roc_auc_score

# Grid search over the 1D-CNN hyper-parameters (conv channels x dense nodes x kernel
# size) on the 1280-dimension embeddings. One model is trained per combination and
# its test-set metrics are appended to the *_collecton lists in iteration order.
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
CNN_channel = [16,32,64,128,256,512]
dense_node = [32,64,128,256,512]
kernel_size = [3,6,9,12]


def step_decay(epoch):
    """Return the learning rate for `epoch`: 0.1 * 0.6 ** floor((1 + epoch) / 3)."""
    initial_lrate = 0.1
    drop = 0.6
    epochs_drop = 3.0
    return initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))


# itertools.product iterates with kernel_size varying fastest, i.e. the same order as
# three nested loops over CNN_channel, dense_node, kernel_size
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    inputShape = (1280, 1) # input feature size
    cnn_input = Input(inputShape)
    x = Conv1D(n_channels, (n_kernel), strides=(1), name='layer_conv2', padding='same')(cnn_input)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D((2), name='MaxPool2', padding="same")(x)
    x = Dropout(0.15)(x)
    x = Flatten()(x)
    x = Dense(n_dense, activation='relu', name='fc1')(x)
    x = Dropout(0.15)(x)
    x = Dense(2, activation='softmax', name='fc2')(x)
    model = Model(inputs=cnn_input, outputs=x, name='Predict')

    # define SGD optimizer (`learning_rate` replaces the deprecated `lr`; the removed
    # `decay` argument was 0.0, i.e. no decay, so dropping it changes nothing)
    # NOTE(review): LearningRateScheduler(step_decay) below supplies the rate for every
    # epoch, so training presumably starts from 0.1 rather than this 0.01 -- confirm
    # which value is intended.
    momentum = 0.5
    sgd = SGD(learning_rate=0.01, momentum=momentum, nesterov=False)
    # compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    # learning rate decay, early stopping and best-model checkpoint
    lr = LearningRateScheduler(step_decay)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=40, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=False)
    callbacks_list = [lr, early_stop, mc]
    # NOTE(review): the test split is also used as validation data, so early stopping
    # and checkpoint selection see the same samples the metrics below are computed on;
    # the reported scores are therefore optimistic. Consider a separate validation split.
    model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=callbacks_list, batch_size=16, verbose=1)

    # reload the best checkpoint written by ModelCheckpoint
    saved_model = load_model('best_model.h5')
    predicted_protability = saved_model.predict(X_test, batch_size=1)
    # class with the highest softmax probability (first index on ties)
    predicted_class = np.argmax(predicted_protability, axis=1)
    y_true = y_test

    # For binary labels sklearn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP. Unpacking it as TP, FP, FN, TN swapped Sn and Sp.
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted
    # (labels are assumed to be 0/1, as the 2-unit softmax and column 1 below imply).
    TN, FP, FN, TP = confusion_matrix(y_true, predicted_class, labels=[0, 1]).ravel()
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    mcc_denominator = math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)), 0.5)
    # MCC is conventionally 0 when any marginal count is 0 (otherwise it would be NaN)
    MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    # AUC uses the predicted probability of class 1
    AUC = roc_auc_score(y_test, predicted_protability[:,1])
    AUC_collecton.append(AUC)

print(ACC_collecton)
print(BACC_collecton)
print(Sn_collecton)
print(Sp_collecton)
print(MCC_collecton)
print(AUC_collecton)
# print the hyper-parameter combinations in the same order as the metrics above
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    print(n_channels, n_dense, n_kernel)

In [None]:
# Combine the grid-search settings and metrics of the 1280-dimension run into one
# table (one row per trained model) and export it to Excel.
import itertools

# The grid-search cell defines CNN_channel / dense_node / kernel_size and the
# *_collecton lists. Expand the three grids into one (channels, nodes, kernel) tuple
# per trained model, in the order the models were trained (kernel size varies fastest).
grid_settings = list(itertools.product(CNN_channel, dense_node, kernel_size))

metric_columns = {
    'ACC_collection': ACC_collecton,
    'BACC_collection': BACC_collecton,
    'Sn_collection': Sn_collecton,
    'Sp_collection': Sp_collecton,
    'MCC_collection': MCC_collecton,
    'AUC_collection': AUC_collecton,
}
# zip() would silently truncate to the shortest list, so fail loudly when the
# grid search did not run to completion
for column_name, values in metric_columns.items():
    if len(values) != len(grid_settings):
        raise ValueError(
            "%s has %d values but the grid has %d combinations; rerun the grid search cell."
            % (column_name, len(values), len(grid_settings))
        )

# combine the lists into a list of tuples
combined_list = [
    settings + metrics
    for settings, metrics in zip(grid_settings, zip(*metric_columns.values()))
]

# create a DataFrame from the list of tuples
df = pd.DataFrame(combined_list, columns=['CNN_channels', 'dense_nodes', 'kernel_sizes'] + list(metric_columns))

# export the DataFrame to an Excel file
df.to_excel('1280 performance output.xlsx', index=False)

### 2560 feature dimension embedding test

In [None]:
import numpy as np
import pandas as pd
import esm

# Prepare the 2560-dimension experiment: embed every sequence of the allergen
# dataset with ESM-2 (esm2_t36_3B_UR50D), save the embeddings to CSV, then build
# the scaled train/test arrays consumed by the CNN grid search in the next cell.
# NOTICE: you can select another model from https://github.com/facebookresearch/esm,
#         but the CNN architecture below might then not be compatible; besides, please
#         revise the corresponding parameters in the esm_embeddings function
#         (layers for feature extraction).
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()


# whole dataset loading
# na_filter=False disables NA detection, so a sequence spelled "NA" stays a string
dataset = pd.read_excel('allergens_dataset.xlsx',na_filter = False)

# generate the peptide embeddings
sequence_list = dataset['sequence']
per_sequence_embeddings = []
for seq in sequence_list:
    # ESM expects (name, sequence) pairs; the sequence doubles as its own name.
    # One sequence per call keeps the memory footprint of each conversion small.
    peptide_sequence_list = [(seq, seq)]
    # employ ESM model for converting; the results are saved in csv format below
    one_seq_embeddings = esm_embeddings_2560(model, alphabet, peptide_sequence_list)
    per_sequence_embeddings.append(one_seq_embeddings)
# Concatenate once: growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows on every iteration (quadratic) and starts from an empty frame.
embeddings_results = pd.concat(per_sequence_embeddings) if per_sequence_embeddings else pd.DataFrame()
embeddings_results.to_csv('whole_sample_dataset_esm2_t36_3B_UR50D_unified_2560_dimension.csv')

# loading the y dataset for model development
y = dataset['label']
y = np.array(y) # transformed as np.array for CNN model

# read the peptide embeddings back from the CSV written above
X_data_name = 'whole_sample_dataset_esm2_t36_3B_UR50D_unified_2560_dimension.csv'
X_data = pd.read_csv(X_data_name,header=0, index_col = 0,delimiter=',')
X = np.array(X_data)

# split dataset as training and test dataset as ratio of 8:2
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=123)

# normalize the X data range; the scaler is fitted on the training split only
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train) # training features are mapped to the 0-1 range
X_test = scaler.transform(X_test)   # test features reuse the training min/max, so they may fall slightly outside 0-1

In [None]:
import itertools
import math

from sklearn.metrics import confusion_matrix, roc_auc_score

# Grid search over the 1D-CNN hyper-parameters (conv channels x dense nodes x kernel
# size) on the 2560-dimension embeddings. One model is trained per combination and
# its test-set metrics are appended to the *_collecton lists in iteration order.
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
CNN_channel = [16,32,64,128,256,512]
dense_node = [32,64,128,256,512]
kernel_size = [3,6,9,12]


def step_decay(epoch):
    """Return the learning rate for `epoch`: 0.1 * 0.6 ** floor((1 + epoch) / 3)."""
    initial_lrate = 0.1
    drop = 0.6
    epochs_drop = 3.0
    return initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))


# itertools.product iterates with kernel_size varying fastest, i.e. the same order as
# three nested loops over CNN_channel, dense_node, kernel_size
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    inputShape = (2560, 1) # input feature size
    cnn_input = Input(inputShape)
    x = Conv1D(n_channels, (n_kernel), strides=(1), name='layer_conv2', padding='same')(cnn_input)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D((2), name='MaxPool2', padding="same")(x)
    x = Dropout(0.15)(x)
    x = Flatten()(x)
    x = Dense(n_dense, activation='relu', name='fc1')(x)
    x = Dropout(0.15)(x)
    x = Dense(2, activation='softmax', name='fc2')(x)
    model = Model(inputs=cnn_input, outputs=x, name='Predict')

    # define SGD optimizer (`learning_rate` replaces the deprecated `lr`; the removed
    # `decay` argument was 0.0, i.e. no decay, so dropping it changes nothing)
    # NOTE(review): LearningRateScheduler(step_decay) below supplies the rate for every
    # epoch, so training presumably starts from 0.1 rather than this 0.01 -- confirm
    # which value is intended.
    momentum = 0.5
    sgd = SGD(learning_rate=0.01, momentum=momentum, nesterov=False)
    # compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    # learning rate decay, early stopping and best-model checkpoint
    lr = LearningRateScheduler(step_decay)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=40, verbose=1, restore_best_weights=True)
    mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=False)
    callbacks_list = [lr, early_stop, mc]
    # NOTE(review): the test split is also used as validation data, so early stopping
    # and checkpoint selection see the same samples the metrics below are computed on;
    # the reported scores are therefore optimistic. Consider a separate validation split.
    model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=callbacks_list, batch_size=16, verbose=1)

    # reload the best checkpoint written by ModelCheckpoint
    saved_model = load_model('best_model.h5')
    predicted_protability = saved_model.predict(X_test, batch_size=1)
    # class with the highest softmax probability (first index on ties)
    predicted_class = np.argmax(predicted_protability, axis=1)
    y_true = y_test

    # For binary labels sklearn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP. Unpacking it as TP, FP, FN, TN swapped Sn and Sp.
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted
    # (labels are assumed to be 0/1, as the 2-unit softmax and column 1 below imply).
    TN, FP, FN, TP = confusion_matrix(y_true, predicted_class, labels=[0, 1]).ravel()
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    mcc_denominator = math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)), 0.5)
    # MCC is conventionally 0 when any marginal count is 0 (otherwise it would be NaN)
    MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    # AUC uses the predicted probability of class 1
    AUC = roc_auc_score(y_test, predicted_protability[:,1])
    AUC_collecton.append(AUC)

print(ACC_collecton)
print(BACC_collecton)
print(Sn_collecton)
print(Sp_collecton)
print(MCC_collecton)
print(AUC_collecton)
# print the hyper-parameter combinations in the same order as the metrics above
for n_channels, n_dense, n_kernel in itertools.product(CNN_channel, dense_node, kernel_size):
    print(n_channels, n_dense, n_kernel)

In [None]:
# Combine the grid-search settings and metrics of the 2560-dimension run into one
# table (one row per trained model) and export it to Excel.
import itertools

# The grid-search cell defines CNN_channel / dense_node / kernel_size and the
# *_collecton lists. Expand the three grids into one (channels, nodes, kernel) tuple
# per trained model, in the order the models were trained (kernel size varies fastest).
grid_settings = list(itertools.product(CNN_channel, dense_node, kernel_size))

metric_columns = {
    'ACC_collection': ACC_collecton,
    'BACC_collection': BACC_collecton,
    'Sn_collection': Sn_collecton,
    'Sp_collection': Sp_collecton,
    'MCC_collection': MCC_collecton,
    'AUC_collection': AUC_collecton,
}
# zip() would silently truncate to the shortest list, so fail loudly when the
# grid search did not run to completion
for column_name, values in metric_columns.items():
    if len(values) != len(grid_settings):
        raise ValueError(
            "%s has %d values but the grid has %d combinations; rerun the grid search cell."
            % (column_name, len(values), len(grid_settings))
        )

# combine the lists into a list of tuples
combined_list = [
    settings + metrics
    for settings, metrics in zip(grid_settings, zip(*metric_columns.values()))
]

# create a DataFrame from the list of tuples
df = pd.DataFrame(combined_list, columns=['CNN_channels', 'dense_nodes', 'kernel_sizes'] + list(metric_columns))

# export the DataFrame to an Excel file
df.to_excel('2560 performance output.xlsx', index=False)

GPU found
