In [1]:
# Mount Google Drive at /content/drive (Colab-only; later cells chdir into this mount).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


#### ESM-2 (2560-d) and MolFormer embedding functions

In [None]:
!pip install fair-esm
!pip install rdkit

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting rdkit
  Downloading rdkit-2024.3.3-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.3


In [None]:
import pandas as pd
import numpy as np
# Read the train / validation / test tables; the first CSV row is the header.
ESP_train_df, ESP_val_df, ESP_test_df = (
    pd.read_csv(f'ESP_{split}_df.csv', header=0)
    for split in ('train', 'val', 'test')
)


In [None]:
def esm_embeddings_2560(esm2, esm2_alphabet, peptide_sequence_list, repr_layer=36):
    """Embed protein sequences with an ESM-2 model and mean-pool over residues.

    NOTICE: the forward pass is memory hungry. Very long sequences, or many
    sequences in one call, can get the job killed by the operating system.

    Args:
        esm2: ESM-2 model. It is put in eval mode and moved to the GPU when
            one is available, otherwise to the CPU.
        esm2_alphabet: alphabet belonging to ``esm2``; supplies the batch
            converter and ``padding_idx``.
        peptide_sequence_list: list of ``(label, sequence)`` tuples.
        repr_layer: layer whose hidden states are pooled. The default, 36,
            is the layer this notebook has always requested for the
            ``esm2_t36_3B_UR50D`` model it loads.

    Returns:
        CPU tensor with one row per input sequence: the mean of that
        sequence's token representations, excluding the first token and the
        last non-padding token of the row.
    """
    import torch
    import gc

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    esm2 = esm2.eval().to(device)

    batch_converter = esm2_alphabet.get_batch_converter()

    # Tokenise the whole list; shorter sequences are right-padded with padding_idx.
    _, _, batch_tokens = batch_converter(peptide_sequence_list)
    # Count of non-padding tokens per row, needed to pool each sequence over
    # its own positions only.
    batch_lens = (batch_tokens != esm2_alphabet.padding_idx).sum(1)

    batch_tokens = batch_tokens.to(device)

    # Forward pass on `device`; only the requested layer is kept and copied to the CPU.
    with torch.no_grad():
        results = esm2(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
    token_representations = results["representations"][repr_layer].cpu()
    del results, batch_tokens
    torch.cuda.empty_cache()
    gc.collect()

    # Per-sequence mean over positions 1 .. n_tokens-2. For a batch without
    # padding this equals the former `[:, 1:-1, :].mean(1)`; with padding it no
    # longer mixes padding / end-of-sequence positions into shorter sequences.
    pooled = [
        token_representations[row, 1:int(n_tokens) - 1].mean(0)
        for row, n_tokens in enumerate(batch_lens)
    ]
    return torch.stack(pooled, dim=0)


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from rdkit import Chem

# MoLFormer checkpoint and its tokenizer; both are fetched with remote code enabled.
model_smiles = AutoModel.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
    deterministic_eval=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
)

def MolFormer_embedding(model_smiles, tokenizer, SMILES_list):
    """Return the pooled model output for every SMILES string in ``SMILES_list``.

    The strings are tokenised together (padded to a common length) and passed
    through ``model_smiles`` with gradients disabled. Only ``pooler_output``
    is returned: one row per input string.

    NOTICE: padding follows the longest string in the list, so the per-token
    hidden states of a molecule depend on what else is in the list; the pooled
    output returned here is the per-molecule summary.
    """
    encoded = tokenizer(SMILES_list, padding=True, return_tensors="pt")
    with torch.no_grad():
        pooled = model_smiles(**encoded).pooler_output
    return pooled

### Select specific parts of the test dataset for the following prediction-performance evaluation

####  use the prediction model to predict the results in the test dataset (validate the model performance)

In [2]:
import os
# Work inside the Drive folder holding the esm2_2560 files; the relative
# file names used by the cells below are resolved against this directory.
# NOTE(review): absolute Colab path — adjust when running elsewhere.
os.chdir('/content/drive/MyDrive/EC_number_kroll/esm2_2560')

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn

import warnings
from tqdm import tqdm
import os
from pathlib import Path
# Define the device: CUDA first, then Apple MPS, otherwise CPU.
# `device` is a torch.device from the start; previously it was still a plain
# string when `device.index` was read, which silently resolved to `str.index`.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)
if device.type == 'cuda':
    # Query the properties of the CUDA device that is currently selected.
    cuda_index = torch.cuda.current_device()
    print(f"Device name: {torch.cuda.get_device_name(cuda_index)}")
    print(f"Device memory: {torch.cuda.get_device_properties(cuda_index).total_memory / 1024 ** 3} GB")
elif device.type == 'mps':
    print("Device name: <mps>")
else:
    print("NOTE: If you have a GPU, consider using it for training.")
    print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
    print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

Using device: cuda
Device name: Tesla T4
Device memory: 14.74810791015625 GB


In [5]:
# Saved embeddings and labels of the main test set.
ESP_test_df_enzy = torch.load('ESP_test_df_enzy_esm2_2560.pt')
ESP_test_df_smiles = torch.load('ESP_test_df_smiles_esm2_2560.pt')
y_test = torch.load('ESP_test_df_label_esm2_2560.pt')

# Additional test rows stored in the separate '>2800_<8000' files.
ESP_test_df_enzy_add = torch.load('ESP_test_df_enzy_>2800_<8000.pt')
ESP_test_df_smiles_add = torch.load('ESP_test_df_smiles_>2800_<8000.pt')
y_test_add = torch.load('ESP_test_df_label_>2800_<8000.pt')

# Append the additional rows below the main ones (row-wise concatenation).
ESP_test_df_enzy = torch.cat((ESP_test_df_enzy, ESP_test_df_enzy_add), dim=0)
ESP_test_df_smiles = torch.cat((ESP_test_df_smiles, ESP_test_df_smiles_add), dim=0)
y_test = torch.cat((y_test, y_test_add), dim=0)
print(ESP_test_df_enzy.shape, ESP_test_df_smiles.shape, y_test.shape)

# Fixed-order batches of 16 (enzyme, SMILES, label) rows for evaluation.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(test_tensor_dataset, shuffle=False, batch_size=batch_size)


torch.Size([13336, 2560]) torch.Size([13336, 768]) torch.Size([13336, 1])


In [6]:
import torch
import torch.nn as nn

class Contrastive_learning_layer(nn.Module):
    """Map enzyme (2560-d) and SMILES (768-d) embeddings into one 128-d space.

    Each input goes through: linear layer (same width) -> batch norm -> ReLU
    -> linear projection to 128 -> a batch norm shared by both inputs ->
    L2 normalisation along the feature dimension.
    """

    def __init__(self):
        super().__init__()
        # First stage keeps the input width; second stage projects to 128.
        self.enzy_refine_layer_1 = nn.Linear(2560, 2560)
        self.smiles_refine_layer_1 = nn.Linear(768, 768)
        self.enzy_refine_layer_2 = nn.Linear(2560, 128)
        self.smiles_refine_layer_2 = nn.Linear(768, 128)

        self.relu = nn.ReLU()
        # Separate norms after stage one; a single shared norm after stage two.
        self.batch_norm_enzy = nn.BatchNorm1d(2560)
        self.batch_norm_smiles = nn.BatchNorm1d(768)
        self.batch_norm_shared = nn.BatchNorm1d(128)

    def forward(self, enzy_embed, smiles_embed):
        """Return the ``(enzyme, smiles)`` pair of unit-length 128-d projections."""
        enzy_hidden = self.relu(self.batch_norm_enzy(self.enzy_refine_layer_1(enzy_embed)))
        smiles_hidden = self.relu(self.batch_norm_smiles(self.smiles_refine_layer_1(smiles_embed)))

        # The shared batch norm sees the enzyme batch first, then the SMILES batch.
        enzy_projected = self.batch_norm_shared(self.enzy_refine_layer_2(enzy_hidden))
        smiles_projected = self.batch_norm_shared(self.smiles_refine_layer_2(smiles_hidden))

        return (
            nn.functional.normalize(enzy_projected, dim=1),
            nn.functional.normalize(smiles_projected, dim=1),
        )


In [7]:
# Mean-squared-error loss; run_validation applies it to the cosine similarity
# of the two refined embeddings versus the label.
loss_fn = nn.MSELoss().to(device)

In [8]:
def run_validation(model, val_loader, loss_fn, device, threshold=0.5):
    """Evaluate ``model`` on ``val_loader`` and print classification metrics.

    For every batch the cosine similarity between the two embeddings returned
    by ``model`` is used as the score; a score above ``threshold`` is
    predicted as class 1, anything else as class 0.

    Args:
        model: module called as ``model(enzy_batch, smiles_batch)`` and
            returning the two refined embeddings.
        val_loader: iterable of ``(enzy, smiles, label)`` batches; labels are
            squeezed on dim 1 before use.
        loss_fn: callable applied to ``(cos_sim, labels)`` for each batch.
        device: device the batches are moved to.
        threshold: decision threshold on the cosine similarity (default 0.5,
            the value that used to be hard-coded).

    Returns:
        ``(loss, acc, bacc)``. ``loss`` is the mean of the per-batch losses,
        so a smaller final batch weighs as much as a full one. Metrics whose
        denominator is zero are reported as NaN.
    """
    def _ratio(numerator, denominator):
        # Plain float division; an undefined ratio becomes NaN instead of a warning.
        return float(numerator) / float(denominator) if denominator else float("nan")

    model.eval()
    loss_sum = 0.0
    num_batch = len(val_loader)
    total_y_true = []
    total_y_prob = []
    # Evaluation only: no autograd graph is needed, which saves memory.
    with torch.no_grad():
        for ESP_val_df_enzy, ESP_val_df_smiles, y_val in val_loader:
            ESP_val_df_enzy = ESP_val_df_enzy.to(device)
            ESP_val_df_smiles = ESP_val_df_smiles.to(device)
            y_val = y_val.squeeze(1).to(device)

            refined_enzy_embed, refined_smiles_embed = model(ESP_val_df_enzy, ESP_val_df_smiles)
            cos_sim = torch.nn.functional.cosine_similarity(refined_enzy_embed, refined_smiles_embed, dim=1)
            loss_sum += loss_fn(cos_sim, y_val).item()
            total_y_true.append(y_val.cpu().numpy())
            total_y_prob.append(cos_sim.cpu().numpy())

    # Average of the per-batch losses (not weighted by batch size).
    loss_sum = loss_sum / num_batch

    arrange_y_true = np.concatenate(total_y_true, axis=0)
    arrange_y_prob = np.concatenate(total_y_prob, axis=0)
    arrange_y_pred = (arrange_y_prob > threshold).astype(np.float32)

    # Fixing the label set keeps the matrix 2x2 even when a subset contains
    # only one class, so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(arrange_y_true, arrange_y_pred, labels=[0.0, 1.0]).ravel()
    acc = _ratio(tp + tn, tp + tn + fp + fn)
    specificity = _ratio(tn, tn + fp)
    sensitivity = _ratio(tp, tp + fn)
    recall = sensitivity
    precision = _ratio(tp, tp + fp)
    bacc = (sensitivity + specificity) / 2
    # Multiply as floats: the four-factor product can overflow int64 on large sets.
    mcc_denominator = np.sqrt(float(tp + fp) * float(tp + fn) * float(tn + fp) * float(tn + fn))
    MCC = _ratio(float(tp) * float(tn) - float(fp) * float(fn), mcc_denominator)
    # ROC-AUC is undefined when only one class is present in the labels.
    if np.unique(arrange_y_true).size > 1:
        AUC = roc_auc_score(arrange_y_true, arrange_y_prob)
    else:
        AUC = float("nan")
    f1 = _ratio(2 * precision * recall, precision + recall)
    print("loss_sum= ",loss_sum, "ACC= ",acc, "bacc= ",bacc, "precision= ",precision,"specificity= ",specificity, "sensitivity= ",sensitivity, "recall= ",recall, "MCC= ",MCC, "AUC= ",AUC, "f1= ",f1)
    return loss_sum, acc, bacc   # , precision, sensitivity, recall, MCC, AUC, f1


In [11]:
import os
# Switch to the esm2_2560 Drive folder, where the checkpoint loaded in the
# next cell is looked up by its relative file name.
# NOTE(review): absolute Colab path — adjust when running elsewhere.
os.chdir('/content/drive/MyDrive/EC_number_kroll/esm2_2560')

In [12]:
import torch
# Specify the file path where the entire model is saved
load_path = 'best_model_esm2_2560_add_>2800_<8000_in_valid2_ACC=0.9357.pt'
# Load the entire model (a pickled module, not just a state_dict).
# - map_location=device lets a checkpoint saved on a GPU load on whatever
#   device this session selected.
# - weights_only=False is needed for fully pickled models on PyTorch versions
#   whose default is weights_only=True.
# SECURITY: unpickling executes arbitrary code; only load checkpoints you trust.
model_test = torch.load(load_path, map_location=device, weights_only=False)
run_validation(model_test,test_loader,loss_fn, device)

loss_sum=  0.06158351360281988 ACC=  0.9356628674265147 bacc=  0.9051909339672698 precision=  0.9084204056545789 specificity=  0.969653767820774 sensitivity=  0.8407281001137656 recall=  0.8407281001137656 MCC=  0.8313574088968375 AUC=  0.9593826968481587 f1=  0.8732644017725258


(0.06158351360281988, 0.9356628674265147, 0.9051909339672698)

In [None]:
## Sanity check passed: the reloaded checkpoint reproduces the accuracy in its file name (ACC = 0.9357), so this is the model used for the evaluations below.

### Model performance on unseen and low-occurrence molecules


#### Embed the smaller subsets split off from the test dataset (skip this section if the embeddings have already been saved)

In [None]:
import os
# Switch to the analyze_final_model Drive folder; the df_*.csv files read below
# and the .pt files written by the embedding cells use paths relative to it.
# NOTE(review): absolute Colab path — adjust when running elsewhere.
os.chdir('/content/drive/MyDrive/EC_number_kroll/analyze_final_model')

In [None]:
# Read the twelve subset tables df_0.csv ... df_11.csv (first row is the header).
(
    df_0, df_1, df_2, df_3, df_4, df_5,
    df_6, df_7, df_8, df_9, df_10, df_11,
) = (pd.read_csv(f'df_{k}.csv', header=0) for k in range(12))

In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_0['Protein sequence'], df_0['SMILES'], df_0['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_0.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_0.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_0.pt')


In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_1['Protein sequence'], df_1['SMILES'], df_1['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )
    # Progress trace: the pair just embedded and its row index.
    print(seq_enzy, seq_smiles)
    print(i)

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_1.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_1.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_1.pt')


MPSEAESSANSAPATPPPPPNFWGTMPEEEYYTSQGVRNSKSYFETPNGKLFTQSFLPLDGEIKGTVYMSHGYGSDTSWMFQKICMSFSSWGYAVFAADLLGHGRSDGIRCYMGDMEKVAATSLAFFKHVRCSDPYKDLPAFLFGESMGGLVTLLMYFQSEPETWTGLMFSAPLFVIPEDMKPSKAHLFAYGLLFGLADTWAAMPDNKMVGKAIKDPEKLKIIASNPQRYTGKPRVGTMRELLRKTQYVQENFGKVTIPVFTAHGTADGVTCPTSSKLLYEKASSADKTLKIYEGMYHSLIQGEPDENAEIVLKDMREWIDEKVKKYGSKTA CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCSC(=O)/C=C/c1ccc(O)c(O)c1
0
MDENEFDNQRENKAVARVIISFLKYEEYALKEIYNLRVKKWASISDRQKDMVPNYTKYLANLKAAIIENGKFFRSVAEYALQSISFEPGEIVQPNDLDMSKTCSLLTQVYREWSAEAISERNCLNSRLVPFLKTLSPPKADILIPGCGTGRLLVDLSRMGYNCEGNEFSYHMLLVSQYMLNAGLLQNQIIIYPFIHCFSHWKKIEDQLSPIKVPDIEAWSSNKGMGSMSICAGSFVDCYGRNQGTKISSHYTFSRRMQLSRAKAENSKDVVVTNFFIDTGSNILDYLDTIGHVLKPGGIWCNFGPLLYHFENDHGVETTYEVNPYSGFQDKINDYTPLMGLELSSDDIISIATNHLDFELIRRESGILCGYGRYAGPESCAMPGYMCHYWILKSNPTNES CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCSC(=O)/C=C/c1ccc(O)c

In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_2['Protein sequence'], df_2['SMILES'], df_2['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_2.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_2.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_2.pt')


In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_3['Protein sequence'], df_3['SMILES'], df_3['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Progress trace: length of the protein about to be embedded.
    print(len(seq_enzy))
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_3.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_3.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_3.pt')


288
213
290
313
1488
238
325
600
1274
507
508
501
298
365
448
374
429
316
374
311
1697
188
453
321
324
764
502
444
733
438
937
395
631
382
1359
374
317
316
161
161
161
161
317
334
77
368
420
167
272
322
317
692
317
157
319
247
570
329
901
1697
318
374
637
297
659
689
255
187
413
320
361
530
351
330
985
533
525
240
118
324
250
935
132
310
260
502
465
595
439
237
240
329
382
435
192
453
320
338
306
330
202
446
337
480
974
265
403
479
1433
1146
323
342
650
887
887
1102
581
244
276
354
388
669
308
323
564
722
507
753
411
288
325
743
420
1274
348
311
323
197
651
339
285
265
257
265
208
861
524
1085
361
654
267
401
201
626
427
842
357
359
361
296
989
471
510
440
344
415
1054
614
362
437
186
1144
441
1405
426
799
332
418
548
136
120
626
427
852
174
575
443
414
604
588
332
332
523
575
463
450
375
354
342
930
159
367
434
170
463
217
495
419
224
190
121
140
147
492
282
523
500
367
452
439
513
129
575
278
281
489
725
140
147
728
275
1150
688
185
564
443
288
489
476
689
359
323
317
316
161
161
161

In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_4['Protein sequence'], df_4['SMILES'], df_4['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Progress trace: length of the protein about to be embedded.
    print(len(seq_enzy))
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_4.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_4.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_4.pt')


683
684
626
393
535
453
194
480
315
452
565
516
516
275
325
318
319
512
319
316
1450
318
479
317
318
1697
318
346
530
523
10
350
354
460
409
659
495
493
519
285
346
454
307
484
484
476
1465
136
459
317
902
413
556
575
247
510
406
362
224
111
412
136
331
1514
1531
565
665
458
409
412
297
500
2324
344
207
243
578
615
231
413
232
229
297
152
390
335
563
266
346
295
424
292
439
799
265
552
610
617
626
684
393
683
525
412
569
535
365
507
402
1697
500
531
389
389
1135
586
240
611
309
302
360
434
613
1102
736
445
371
362
698
329
332
666
2345
202
456
543
374
460
413
413
379
282
355
346
479
449
528
232
229
390
533
334
332
254
337
372
317
231
333
317
520
453
412
398
489
703
757
414
344
421
525
412
415
317
238
237
524
533
461
438
814
329
403
447
210
212
227
241
311
361
307
459
333
152
482
502
313
300
417
520
291
635
518
267
250
512
225
190
459
250
302
190
313
325
275
410
193
148
432
623
646
706
414
995
891
909
909
1064
192
476
390
354
361
359
184
493
440
252
344
974
415
520
646
676
504
198
479
44

In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_5['Protein sequence'], df_5['SMILES'], df_5['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Progress trace: length of the protein about to be embedded.
    print(len(seq_enzy))
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_5.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_5.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_5.pt')


356
244
244
244
244
244
335
187
364
530
227
256
461
554
584
575
497
244
256
256
278
237
249
387
662
430
616
432
431
543
336
332
610
398
546
933
304
516
835
929
546
767
475
488
652
310
534
139
580
1432
577
562
672
554
383
406
339
406
457
510
556
863
575
267
224
111
503
594
224
316
226
1153
213
742
241
447
378
18
495
409
412
835
243
573
147
340
2324
1150
543
445
1764
304
291
1162
452
288
439
217
299
186
292
186
260
333
356
484
288
277
346
402
1697
189
500
497
637
374
152
279
277
565
396
416
599
398
542
381
2346
382
430
296
372
659
283
332
312
660
532
108
145
298
249
330
651
521
533
796
325
591
590
325
325
610
288
250
300
959
494
258
198
266
313
252
266
332
272
8
406
725
332
412
512
198
413
458
1040
517
332
402
1424
2129
1008
1053
1191
1008
572
1514
418
195
430
403
1500
615
323
402
299
646
442
1622
845
1262
659
409
550
725
406
247
569
267
510
655
224
299
111
145
145
260
431
467
433
455
444
1897
429
321
740
935
946
272
195
197
688
322
414
430
463
573
421
336
393
683
263
312
626
337
317
317

In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_6['Protein sequence'], df_6['SMILES'], df_6['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Progress trace: length of the protein about to be embedded.
    print(len(seq_enzy))
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_6.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_6.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_6.pt')


402
374
374
374
311
374
283
374
277
767
293
471
319
612
501
189
615
426
508
467
508
501
199
503
500
503
318
295
153
318
496
741
346
565
279
416
386
1005
390
599
398
277
542
1442
356
358
296
440
510
690
418
385
1010
513
404
283
454
561
476
543
285
199
238
208
371
350
213
337
297
152
321
498
335
244
317
501
393
2225
182
333
129
743
479
249
292
302
829
195
902
495
449
341
379
479
694
413
346
365
195
64
335
332
403
271
351
942
499
499
324
444
505
795
332
499
552
448
152
210
1502
723
1178
412
246
881
302
229
709
283
282
564
438
887
412
487
390
422
707
298
552
379
426
244
339
395
261
444
460
198
477
455
263
459
346
603
296
190
935
392
170
753
166
233
814
476
527
335
985
293
288
531
202
361
471
531
265
210
403
422
535
626
510
386
1813
299
220
481
149
147
314
505
1098
278
333
258
457
333
727
401
586
439
611
427
468
461
314
333
354
1144
310
198
434
386
355
334
413
222
258
266
266
610
349
617
590
273
619
1358
354
325
224
386
552
244
222
184
346
707
533
452
325
77
210
795
338
420
332
310
233
342


In [None]:
import numpy as np
import pandas as pd
import esm
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# One tensor per retained row: protein embedding, SMILES embedding, label.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

rows = zip(df_7['Protein sequence'], df_7['SMILES'], df_7['output'])
for i, (seq_enzy, seq_smiles, output_value) in enumerate(rows):
    # Rows whose protein has 5500 or more residues are left out entirely.
    if len(seq_enzy) >= 5500:
        continue
    # Protein: a single ('protein', sequence) pair embedded with ESM-2.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )
    # Substrate: canonicalise the SMILES string, then embed it with MolFormer.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )
    # Label kept as a 1x1 float32 tensor so the rows concatenate along dim 0.
    embeddings_results_label.append(
        torch.tensor(output_value, dtype=torch.float32).reshape(1, 1)
    )

# Concatenate row-wise and save each result as a .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_7.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_7.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_7.pt')


In [None]:
import numpy as np
import pandas as pd
import esm
# Load the pretrained ESM-2 model (esm2_t36_3B_UR50D) together with its alphabet.
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# Per-row results for subset df_8; each list is concatenated along dim 0 and saved below.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

for seq_enzy, seq_smiles, label_value in zip(df_8['Protein sequence'], df_8['SMILES'], df_8['output']):
    # Rows whose protein sequence has 5500 or more characters are skipped entirely:
    # no enzyme embedding, SMILES embedding or label is recorded for them.
    if len(seq_enzy) >= 5500:
        continue

    # Protein embedding: the helper receives a list of (name, sequence) tuples,
    # here a single-item list so one sequence is converted per call.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )

    # SMILES embedding: canonicalise the SMILES string, then embed it with MolFormer_embedding.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )

    # Label kept as a float32 tensor of shape (1, 1) so that labels stack row-wise.
    embeddings_results_label.append(
        torch.tensor(label_value, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    )

# Stack the per-row tensors and write each result to its own .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_8.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_8.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_8.pt')


In [None]:
import numpy as np
import pandas as pd
import esm
# Load the pretrained ESM-2 model (esm2_t36_3B_UR50D) together with its alphabet.
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# Per-row results for subset df_9; each list is concatenated along dim 0 and saved below.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

for seq_enzy, seq_smiles, label_value in zip(df_9['Protein sequence'], df_9['SMILES'], df_9['output']):
    # Rows whose protein sequence has 5500 or more characters are skipped entirely:
    # no enzyme embedding, SMILES embedding or label is recorded for them.
    if len(seq_enzy) >= 5500:
        continue

    # Protein embedding: the helper receives a list of (name, sequence) tuples,
    # here a single-item list so one sequence is converted per call.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )

    # SMILES embedding: canonicalise the SMILES string, then embed it with MolFormer_embedding.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )

    # Label kept as a float32 tensor of shape (1, 1) so that labels stack row-wise.
    embeddings_results_label.append(
        torch.tensor(label_value, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    )

# Stack the per-row tensors and write each result to its own .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_9.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_9.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_9.pt')


In [None]:
import numpy as np
import pandas as pd
import esm
# Load the pretrained ESM-2 model (esm2_t36_3B_UR50D) together with its alphabet.
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# Per-row results for subset df_10; each list is concatenated along dim 0 and saved below.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

for seq_enzy, seq_smiles, label_value in zip(df_10['Protein sequence'], df_10['SMILES'], df_10['output']):
    # Rows whose protein sequence has 5500 or more characters are skipped entirely:
    # no enzyme embedding, SMILES embedding or label is recorded for them.
    if len(seq_enzy) >= 5500:
        continue

    # Protein embedding: the helper receives a list of (name, sequence) tuples,
    # here a single-item list so one sequence is converted per call.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )

    # SMILES embedding: canonicalise the SMILES string, then embed it with MolFormer_embedding.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )

    # Label kept as a float32 tensor of shape (1, 1) so that labels stack row-wise.
    embeddings_results_label.append(
        torch.tensor(label_value, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    )

# Stack the per-row tensors and write each result to its own .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_10.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_10.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_10.pt')




In [None]:
import numpy as np
import pandas as pd
import esm
# Load the pretrained ESM-2 model (esm2_t36_3B_UR50D) together with its alphabet.
model, alphabet = esm.pretrained.esm2_t36_3B_UR50D()

# Per-row results for subset df_11; each list is concatenated along dim 0 and saved below.
embeddings_results_enzy = []
embeddings_results_smiles = []
embeddings_results_label = []

for seq_enzy, seq_smiles, label_value in zip(df_11['Protein sequence'], df_11['SMILES'], df_11['output']):
    # Rows whose protein sequence has 5500 or more characters are skipped entirely:
    # no enzyme embedding, SMILES embedding or label is recorded for them.
    if len(seq_enzy) >= 5500:
        continue

    # Protein embedding: the helper receives a list of (name, sequence) tuples,
    # here a single-item list so one sequence is converted per call.
    embeddings_results_enzy.append(
        esm_embeddings_2560(model, alphabet, [('protein', seq_enzy)])
    )

    # SMILES embedding: canonicalise the SMILES string, then embed it with MolFormer_embedding.
    embeddings_results_smiles.append(
        MolFormer_embedding(model_smiles, tokenizer, [Chem.CanonSmiles(seq_smiles)])
    )

    # Label kept as a float32 tensor of shape (1, 1) so that labels stack row-wise.
    embeddings_results_label.append(
        torch.tensor(label_value, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    )

# Stack the per-row tensors and write each result to its own .pt file.
embeddings_results_enzy_torch = torch.cat(embeddings_results_enzy, dim=0)
torch.save(embeddings_results_enzy_torch, 'ESP_test_df_enzy_df_11.pt')

embeddings_results_smiles_torch = torch.cat(embeddings_results_smiles, dim=0)
torch.save(embeddings_results_smiles_torch, 'ESP_test_df_smiles_df_11.pt')

embeddings_results_label_torch = torch.cat(embeddings_results_label, dim=0)
torch.save(embeddings_results_label_torch, 'ESP_test_df_label_df_11.pt')




#### Use the prediction model to predict the results on the subsets of the test dataset

In [18]:
import os
# Make the analysis folder on the mounted Google Drive the working directory, so that
# the relative file names used by the following cells (e.g. the saved .pt tensors)
# resolve against it.
os.chdir('/content/drive/MyDrive/EC_number_kroll/analyze_final_model')

In [19]:
def run_validation(model, val_loader,loss_fn, device):
    """Evaluate an enzyme/SMILES embedding model and print binary-classification metrics.

    For every batch, the enzyme and SMILES embeddings are passed through ``model``;
    the cosine similarity between the two refined embeddings is the prediction
    score, and a score above 0.5 counts as a positive prediction.

    Args:
        model: module called as ``model(enzyme_batch, smiles_batch)`` that returns a
            pair of refined embeddings; it is switched to eval mode here.
        val_loader: sized iterable yielding ``(enzyme_embeddings, smiles_embeddings,
            labels)`` batches. Labels are squeezed on dim 1 before use and are
            expected to be 0/1.
        loss_fn: callable invoked as ``loss_fn(cos_sim, labels)``.
        device: device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy_in_percent, MCC, AUC)``. Metrics that are undefined for
        the given data (zero denominator, or AUC with a single class present in
        the labels) are reported as NaN.
    """
    def _ratio(numerator, denominator):
        # A metric with a zero denominator is undefined; report NaN rather than
        # raising ZeroDivisionError on the float counts used below.
        return numerator / denominator if denominator else float("nan")

    model.eval()
    loss_sum = 0
    num_batch = len(val_loader)
    total_y_true=[]
    total_y_pred=[]
    total_y_prob=[]
    # Evaluation only: no_grad keeps autograd from recording the forward pass,
    # which saves memory and time.
    with torch.no_grad():
        for ESP_val_df_enzy,ESP_val_df_smiles, y_val in val_loader:

            ESP_val_df_enzy = ESP_val_df_enzy.to(device)
            ESP_val_df_smiles = ESP_val_df_smiles.to(device)
            y_val = y_val.squeeze(1).to(device)

            refined_enzy_embed, refined_smiles_embed = model(ESP_val_df_enzy,ESP_val_df_smiles)
            cos_sim = torch.nn.functional.cosine_similarity(refined_enzy_embed, refined_smiles_embed, dim=1)
            loss = loss_fn(cos_sim, y_val).detach().cpu().numpy()
            loss_sum = loss_sum + loss # accumulate the per-batch loss
            y_pred = (cos_sim > 0.5).float().cpu().numpy() # score > 0.5 -> label 1, otherwise 0
            total_y_true.append(y_val.cpu().numpy())
            total_y_pred.append(y_pred)
            total_y_prob.append(cos_sim.detach().cpu().numpy())

    # Mean of the per-batch losses. NOTE: this is only an approximation of the
    # per-sample mean when the last batch is smaller than the others.
    loss_sum = loss_sum/num_batch

    arrange_y_true = np.concatenate(total_y_true, axis=0)
    arrange_y_pred = np.concatenate(total_y_pred, axis=0)
    arrange_y_prob = np.concatenate(total_y_prob, axis=0)
    # labels=[0, 1] forces a 2x2 matrix even when one class never occurs, so the
    # four-way unpacking cannot fail. Counts are converted to float so the product
    # in the MCC denominator cannot overflow a fixed-width integer.
    tn,fp,fn,tp = (float(count) for count in confusion_matrix(arrange_y_true, arrange_y_pred, labels=[0, 1]).ravel())
    acc = _ratio(tp+tn, tp+tn+fp+fn)
    specificity = _ratio(tn, tn+fp)
    sensitivity = _ratio(tp, tp+fn)
    recall = _ratio(tp, tp+fn)
    precision = _ratio(tp, tp+fp)
    bacc = (sensitivity + specificity)/2
    MCC = _ratio(tp*tn-fp*fn, np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
    # ROC AUC is undefined when the labels contain a single class.
    if np.unique(arrange_y_true).size < 2:
        AUC = float("nan")
    else:
        AUC = roc_auc_score(arrange_y_true, arrange_y_prob)
    f1 = _ratio(2*precision*recall, precision+recall)
    print("loss_sum= ",loss_sum, "ACC= ",acc, "bacc= ",bacc, "precision= ",precision,"specificity= ",specificity, "sensitivity= ",sensitivity, "recall= ",recall, "MCC= ",MCC, "AUC= ",AUC, "f1= ",f1)
    return acc*100, MCC, AUC   # , precision, sensitivity, recall, MCC, AUC, f1


In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_0.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_0.pt'),
    torch.load('ESP_test_df_smiles_df_0.pt'),
    torch.load('ESP_test_df_label_df_0.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 1
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.16968611028716735 ACC=  0.7671875 bacc=  0.552734375 precision=  0.352112676056338 specificity=  0.91015625 sensitivity=  0.1953125 recall=  0.1953125 MCC=  0.13433168320946834 AUC=  0.6637115478515625 f1=  0.25125628140703515


(76.71875, 0.13433168320946834, 0.6637115478515625)

In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_1.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_1.pt'),
    torch.load('ESP_test_df_smiles_df_1.pt'),
    torch.load('ESP_test_df_label_df_1.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.09542646846966818 ACC=  0.8795180722891566 bacc=  0.8005050505050505 precision=  0.723404255319149 specificity=  0.9343434343434344 sensitivity=  0.6666666666666666 recall=  0.6666666666666666 MCC=  0.6198332086575372 AUC=  0.887378688849277 f1=  0.6938775510204082


(87.95180722891565, 0.6198332086575372, 0.887378688849277)

In [22]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_2.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_2.pt'),
    torch.load('ESP_test_df_smiles_df_2.pt'),
    torch.load('ESP_test_df_label_df_2.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.10054705288537123 ACC=  0.888631090487239 bacc=  0.811029651352232 precision=  0.7625 specificity=  0.9442815249266863 sensitivity=  0.6777777777777778 recall=  0.6777777777777778 MCC=  0.6503268449542855 AUC=  0.8737699576409254 f1=  0.7176470588235294


(88.8631090487239, 0.6503268449542855, 0.8737699576409254)

In [23]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_3.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_3.pt'),
    torch.load('ESP_test_df_smiles_df_3.pt'),
    torch.load('ESP_test_df_label_df_3.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.05888644019393502 ACC=  0.9385665529010239 bacc=  0.8988357655696111 precision=  0.8608695652173913 specificity=  0.9657387580299786 sensitivity=  0.8319327731092437 recall=  0.8319327731092437 MCC=  0.80797261191328 AUC=  0.9583790689723427 f1=  0.8461538461538463


(93.85665529010238, 0.80797261191328, 0.9583790689723427)

In [24]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_4.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_4.pt'),
    torch.load('ESP_test_df_smiles_df_4.pt'),
    torch.load('ESP_test_df_label_df_4.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.06915706328370354 ACC=  0.9202279202279202 bacc=  0.89408199467823 precision=  0.7848101265822784 specificity=  0.9388489208633094 sensitivity=  0.8493150684931506 recall=  0.8493150684931506 MCC=  0.7659535242252689 AUC=  0.9397358825268552 f1=  0.8157894736842105


(92.02279202279202, 0.7659535242252689, 0.9397358825268552)

In [25]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_5.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_5.pt'),
    torch.load('ESP_test_df_smiles_df_5.pt'),
    torch.load('ESP_test_df_label_df_5.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.06997901140354 ACC=  0.9202898550724637 bacc=  0.8456846068042387 precision=  0.8873239436619719 specificity=  0.9754601226993865 sensitivity=  0.7159090909090909 recall=  0.7159090909090909 MCC=  0.7503846447294928 AUC=  0.9431818181818182 f1=  0.7924528301886792


(92.02898550724638, 0.7503846447294928, 0.9431818181818182)

In [26]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_6.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_6.pt'),
    torch.load('ESP_test_df_smiles_df_6.pt'),
    torch.load('ESP_test_df_label_df_6.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.06429762191449602 ACC=  0.9244604316546763 bacc=  0.8938918757467145 precision=  0.8253968253968254 specificity=  0.9490740740740741 sensitivity=  0.8387096774193549 recall=  0.8387096774193549 MCC=  0.7833218386214272 AUC=  0.9579599761051374 f1=  0.832


(92.44604316546763, 0.7833218386214272, 0.9579599761051374)

In [27]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_7.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_7.pt'),
    torch.load('ESP_test_df_smiles_df_7.pt'),
    torch.load('ESP_test_df_label_df_7.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.05940863313153386 ACC=  0.9488817891373802 bacc=  0.9271330462111512 precision=  0.8873239436619719 specificity=  0.9669421487603306 sensitivity=  0.8873239436619719 recall=  0.8873239436619719 MCC=  0.8542660924223024 AUC=  0.9624898149225933 f1=  0.8873239436619719


(94.88817891373802, 0.8542660924223024, 0.9624898149225933)

In [28]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_8.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_8.pt'),
    torch.load('ESP_test_df_smiles_df_8.pt'),
    torch.load('ESP_test_df_label_df_8.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.07634337725383895 ACC=  0.9144144144144144 bacc=  0.8618851049191606 precision=  0.8478260869565217 specificity=  0.9590643274853801 sensitivity=  0.7647058823529411 recall=  0.7647058823529411 MCC=  0.7511879232324704 AUC=  0.9432404540763673 f1=  0.8041237113402062


(91.44144144144144, 0.7511879232324704, 0.9432404540763673)

In [29]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_9.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_9.pt'),
    torch.load('ESP_test_df_smiles_df_9.pt'),
    torch.load('ESP_test_df_label_df_9.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.058197140134871005 ACC=  0.9400630914826499 bacc=  0.9093447505584512 precision=  0.9054054054054054 specificity=  0.9705882352941176 sensitivity=  0.8481012658227848 recall=  0.8481012658227848 MCC=  0.8371480087138901 AUC=  0.970428677800234 f1=  0.8758169934640523


(94.00630914826499, 0.8371480087138901, 0.970428677800234)

In [30]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_10.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_10.pt'),
    torch.load('ESP_test_df_smiles_df_10.pt'),
    torch.load('ESP_test_df_label_df_10.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.055127155780792234 ACC=  0.9554140127388535 bacc=  0.9347633790358248 precision=  0.918918918918919 specificity=  0.9747899159663865 sensitivity=  0.8947368421052632 recall=  0.8947368421052632 MCC=  0.8775194375221654 AUC=  0.9736842105263158 f1=  0.9066666666666667


(95.54140127388536, 0.8775194375221654, 0.9736842105263158)

In [31]:
import torch
from torch.utils.data import TensorDataset, DataLoader
# Load the saved enzyme embeddings, SMILES embeddings and labels for test subset df_11.
ESP_test_df_enzy, ESP_test_df_smiles, y_test = (
    torch.load('ESP_test_df_enzy_df_11.pt'),
    torch.load('ESP_test_df_smiles_df_11.pt'),
    torch.load('ESP_test_df_label_df_11.pt'),
)

# Wrap the three tensors in a dataset and iterate over it in fixed order.
batch_size = 16
test_tensor_dataset = TensorDataset(ESP_test_df_enzy, ESP_test_df_smiles, y_test)
test_loader = DataLoader(
    dataset=test_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Print the metrics; the returned tuple is displayed as the cell output.
run_validation(model_test, test_loader, loss_fn, device)

loss_sum=  0.05040595540021722 ACC=  0.9530193780982424 bacc=  0.9340073392159671 precision=  0.948749487494875 specificity=  0.9800637958532695 sensitivity=  0.8879508825786646 recall=  0.8879508825786646 MCC=  0.8855241111224615 AUC=  0.9732650453315316 f1=  0.9173439048562934


(95.30193780982424, 0.8855241111224615, 0.9732650453315316)