# This notebook assumes that you have extracted the embeddings (using the procedure mentioned in the 0_ESM_Embeddings_Extractor.ipynb notebook) and have stored them in a zipped format

## Since we used Google Colab, we copy the embeddings from Google Drive before training the model; a similar procedure can be used to run the notebook locally.

## An example is shown below for CDR3alpha, CDR3beta and peptide:

In [None]:
# 1_origVDJDB_NoMHC/train_AB: copy the label .csv files and the zipped embeddings
# from the mounted Google Drive folder into /content.
!cp -r /content/drive/MyDrive/TCR-pMHC-results/vdjdb/1_origVDJDB_NoMHC/train_AB/*.csv '/content/'
!cp -r /content/drive/MyDrive/TCR-pMHC-results/vdjdb/1_origVDJDB_NoMHC/train_AB/*.zip '/content'

## An example is shown below for CDR3alpha, CDR3beta, peptide and MHC:

In [None]:
# 3_origVDJDB_HMHC/train_AB: copy the label .csv files and the zipped embeddings
# (this set also includes MHC) from the mounted Google Drive folder into /content.
!cp -r /content/drive/MyDrive/TCR-pMHC-results/vdjdb/3_origVDJDB_HMHC/train_AB/*.csv '/content/'
!cp -r /content/drive/MyDrive/TCR-pMHC-results/vdjdb/3_origVDJDB_HMHC/train_AB/*.zip '/content'

In [1]:
###### pmtnet data External Validation

# !cp -r /content/drive/MyDrive/TCR-pMHC-results/pmtnet_exp/set1/*.csv '/content/'
# !cp -r /content/drive/MyDrive/TCR-pMHC-results/pmtnet_exp/set1/*.zip '/content'

## Unzip the embeddings to folder for developing the train and test set

In [None]:
# NOTE: replace every <path_to_...zip> placeholder with the real archive path, and run
# only ONE of the two sections below (comment out the other): both sections unpack into
# the same target folders (train_cdr3a, train_cdr3b, train_peptide, test_*).
########################################################################## AB and A
#### unzip train (CDR3a, CDR3b, peptide)
!unzip -q <path_to_train_cdr3a.zip>     -d  train_cdr3a
!unzip -q <path_to_train_cdr3b.zip>     -d  train_cdr3b
!unzip -q <path_to_train_peptide.zip>   -d  train_peptide

#### unzip test (CDR3a, CDR3b, peptide)
!unzip -q <path_to_test_cdr3a.zip>     -d  test_cdr3a
!unzip -q <path_to_test_cdr3b.zip>     -d  test_cdr3b
!unzip -q <path_to_test_peptide.zip>   -d  test_peptide

########################################################################## AB and A wMHC
#### unzip train (CDR3a, CDR3b, peptide, MHC)
!unzip -q <path_to_train_cdr3a.zip>     -d  train_cdr3a
!unzip -q <path_to_train_cdr3b.zip>     -d  train_cdr3b
!unzip -q <path_to_train_peptide.zip>   -d  train_peptide
!unzip -q <path_to_train_mhc.zip>       -d  train_mhc

#### unzip test (CDR3a, CDR3b, peptide, MHC)
!unzip -q <path_to_test_cdr3a.zip>     -d  test_cdr3a
!unzip -q <path_to_test_cdr3b.zip>     -d  test_cdr3b
!unzip -q <path_to_test_peptide.zip>   -d  test_peptide
!unzip -q <path_to_test_mhc.zip>       -d  test_mhc

In [None]:
import sys
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from natsort import natsorted
import matplotlib.pyplot as plt
plt.style.use('seaborn')
%matplotlib inline
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Layer,Input, Dense, Dropout, Activation, Concatenate, Flatten, BatchNormalization
from tensorflow.keras.regularizers import l2,l1
from tensorflow.keras.optimizers import SGD,Adam,RMSprop
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.image import array_to_img, img_to_array, load_img
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from tensorflow.keras.models import load_model
import tensorflow.keras.metrics
####import tensorflow_addons as tfa not required
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
tf.random.set_seed(1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical, plot_model
import sklearn
import os
from natsort import natsorted
from sklearn.metrics import *
import torch
from tqdm import tqdm

# From now on comment out code for subset2 if running for subset 1 and vice versa

## For MCPAS subset-1 which doesn't have MHC information
### Model CDR3a + CDR3b + peptide
### Model CDR3a + peptide
### Model CDR3b + peptide

In [None]:
# Directories holding one embedding file per sample (subset 1: CDR3a, CDR3b, peptide).
# Replace each <'...'> placeholder with a real directory string before running; the
# placeholders are not valid Python. Downstream cells build file paths from these
# directories, so giving each path with a trailing '/' is always safe.
### train files
path_train_cdr3a = <'path_to_train_cdr3a_embeddings'>  
path_train_cdr3b = <'path_to_train_cdr3b_embeddings'>
path_train_pepti = <'path_to_train_peptide_embeddings'> 

### test files
path_test_cdr3a = <'path_to_test_cdr3a_embeddings'>  
path_test_cdr3b = <'path_to_test_cdr3b_embeddings'>  
path_test_pepti = <'path_to_test_peptide_embeddings'>  


# Raw (unsorted) file listings of the training directories.
trainmat_cdr3a = os.listdir(path_train_cdr3a) 
trainmat_cdr3b = os.listdir(path_train_cdr3b) 
trainmat_pepti = os.listdir(path_train_pepti)


# Raw (unsorted) file listings of the test directories.
testmat_cdr3a = os.listdir(path_test_cdr3a) 
testmat_cdr3b = os.listdir(path_test_cdr3b) 
testmat_pepti = os.listdir(path_test_pepti)

## For MCPAS subset-2 which includes MHC information
### Model CDR3a + CDR3b + MHC + peptide
### Model CDR3a + MHC +peptide
### Model CDR3b + MHC + peptide
### Model CDR3a + CDR3b + peptide
### Model CDR3a + peptide
### Model CDR3b + peptide

In [None]:
# Directories holding one embedding file per sample (subset 2: CDR3a, CDR3b, peptide, MHC).
# Replace each <'...'> placeholder with a real directory string before running; the
# placeholders are not valid Python. Downstream cells build file paths from these
# directories, so giving each path with a trailing '/' is always safe.
### train files
path_train_cdr3a = <'path_to_train_cdr3a_embeddings'>  
path_train_cdr3b = <'path_to_train_cdr3b_embeddings'>
path_train_pepti = <'path_to_train_peptide_embeddings'> 
path_train_mhc   = <'path_to_train_mhc_embeddings'> 

### test files
path_test_cdr3a = <'path_to_test_cdr3a_embeddings'>  
path_test_cdr3b = <'path_to_test_cdr3b_embeddings'>  
path_test_pepti = <'path_to_test_peptide_embeddings'>
path_test_mhc   = <'path_to_test_mhc_embeddings'> 


# Raw (unsorted) file listings of the training directories.
trainmat_cdr3a = os.listdir(path_train_cdr3a) 
trainmat_cdr3b = os.listdir(path_train_cdr3b) 
trainmat_pepti = os.listdir(path_train_pepti)
trainmat_mhc   = os.listdir(path_train_mhc)

# Raw (unsorted) file listings of the test directories.
testmat_cdr3a = os.listdir(path_test_cdr3a) 
testmat_cdr3b = os.listdir(path_test_cdr3b) 
testmat_pepti = os.listdir(path_test_pepti)
testmat_mhc   = os.listdir(path_test_mhc)

## natsort is used to order the pairs as they appear in the .csv; this would be helpful later to map the pairs with their respective labels

In [None]:
### AB-pep, A-pep, B-pep
# Natural-sort each directory listing; per the note above, this is what lines the
# files up with the row order of the label .csv.

### train
train_nmat_cdr3a, train_nmat_cdr3b, train_nmat_pepti = (
    natsorted(listing)
    for listing in (trainmat_cdr3a, trainmat_cdr3b, trainmat_pepti)
)

### test
test_nmat_cdr3a, test_nmat_cdr3b, test_nmat_pepti = (
    natsorted(listing)
    for listing in (testmat_cdr3a, testmat_cdr3b, testmat_pepti)
)

In [None]:
### ABH-pep, BH-pep, AH-pep,AB-pep, A-pep, B-pep 
# Natural-sort each directory listing; per the note above, this is what lines the
# files up with the row order of the label .csv.

### train
train_nmat_cdr3a, train_nmat_cdr3b, train_nmat_pepti, train_nmat_mhc = (
    natsorted(listing)
    for listing in (trainmat_cdr3a, trainmat_cdr3b, trainmat_pepti, trainmat_mhc)
)

### test
test_nmat_cdr3a, test_nmat_cdr3b, test_nmat_pepti, test_nmat_mhc = (
    natsorted(listing)
    for listing in (testmat_cdr3a, testmat_cdr3b, testmat_pepti, testmat_mhc)
)

## the following step extracts the embeddings and stores in a numpy matrix for MCPAS subset 1


In [None]:
# Sample counts come from the sorted file listings (one embedding file per sample).
train_samples = len(train_nmat_pepti)
test_samples = len(test_nmat_pepti)

# Every sequence type must provide exactly one file per sample, otherwise rows of the
# matrices below would no longer describe the same CDR3a/CDR3b/peptide triple.
assert len(train_nmat_cdr3a) == len(train_nmat_cdr3b) == train_samples, \
    "train embedding folders contain different numbers of files"
assert len(test_nmat_cdr3a) == len(test_nmat_cdr3b) == test_samples, \
    "test embedding folders contain different numbers of files"

# One 1280-dimensional row per sample and sequence type.
train_matmat_cdr3a = np.zeros((train_samples,1280))
train_matmat_cdr3b = np.zeros((train_samples,1280))
train_matmat_pepti = np.zeros((train_samples,1280))

test_matmat_cdr3a = np.zeros((test_samples,1280))
test_matmat_cdr3b = np.zeros((test_samples,1280))
test_matmat_pepti = np.zeros((test_samples,1280))

### load train samples
# Each file is a torch-saved dict; row i receives its 'mean_representations' entry for key 33.
# os.path.join makes the directory paths work with or without a trailing separator.
for i in tqdm(range(train_samples)):
    train_matmat_cdr3a[i] = torch.load(os.path.join(path_train_cdr3a, train_nmat_cdr3a[i]))['mean_representations'][33]
    train_matmat_cdr3b[i] = torch.load(os.path.join(path_train_cdr3b, train_nmat_cdr3b[i]))['mean_representations'][33]
    train_matmat_pepti[i] = torch.load(os.path.join(path_train_pepti, train_nmat_pepti[i]))['mean_representations'][33]

### load test samples
# This loop is a sibling of the train loop (it was previously mis-indented inside it).
for j in tqdm(range(test_samples)):
    test_matmat_cdr3a[j] = torch.load(os.path.join(path_test_cdr3a, test_nmat_cdr3a[j]))['mean_representations'][33]
    test_matmat_cdr3b[j] = torch.load(os.path.join(path_test_cdr3b, test_nmat_cdr3b[j]))['mean_representations'][33]
    test_matmat_pepti[j] = torch.load(os.path.join(path_test_pepti, test_nmat_pepti[j]))['mean_representations'][33]



## the following step extracts the embeddings and stores in a numpy matrix for MCPAS subset 2

In [None]:
# Sample counts come from the sorted file listings (one embedding file per sample).
train_samples = len(train_nmat_pepti)
test_samples = len(test_nmat_pepti)

# Every sequence type must provide exactly one file per sample, otherwise rows of the
# matrices below would no longer describe the same CDR3a/CDR3b/peptide/MHC tuple.
assert len(train_nmat_cdr3a) == len(train_nmat_cdr3b) == len(train_nmat_mhc) == train_samples, \
    "train embedding folders contain different numbers of files"
assert len(test_nmat_cdr3a) == len(test_nmat_cdr3b) == len(test_nmat_mhc) == test_samples, \
    "test embedding folders contain different numbers of files"

# One 1280-dimensional row per sample and sequence type.
train_matmat_cdr3a = np.zeros((train_samples,1280))
train_matmat_cdr3b = np.zeros((train_samples,1280))
train_matmat_pepti = np.zeros((train_samples,1280))
train_matmat_mhc   = np.zeros((train_samples,1280))

test_matmat_cdr3a = np.zeros((test_samples,1280))
test_matmat_cdr3b = np.zeros((test_samples,1280))
test_matmat_pepti = np.zeros((test_samples,1280))
test_matmat_mhc   = np.zeros((test_samples,1280))

### load train samples
# Each file is a torch-saved dict; row i receives its 'mean_representations' entry for key 33.
# os.path.join makes the directory paths work with or without a trailing separator.
for i in tqdm(range(train_samples)):
    train_matmat_cdr3a[i] = torch.load(os.path.join(path_train_cdr3a, train_nmat_cdr3a[i]))['mean_representations'][33]
    train_matmat_cdr3b[i] = torch.load(os.path.join(path_train_cdr3b, train_nmat_cdr3b[i]))['mean_representations'][33]
    train_matmat_pepti[i] = torch.load(os.path.join(path_train_pepti, train_nmat_pepti[i]))['mean_representations'][33]
    train_matmat_mhc[i]   = torch.load(os.path.join(path_train_mhc, train_nmat_mhc[i]))['mean_representations'][33]

### load test samples
for j in tqdm(range(test_samples)):
    test_matmat_cdr3a[j] = torch.load(os.path.join(path_test_cdr3a, test_nmat_cdr3a[j]))['mean_representations'][33]
    test_matmat_cdr3b[j] = torch.load(os.path.join(path_test_cdr3b, test_nmat_cdr3b[j]))['mean_representations'][33]
    test_matmat_pepti[j] = torch.load(os.path.join(path_test_pepti, test_nmat_pepti[j]))['mean_representations'][33]
    test_matmat_mhc[j]   = torch.load(os.path.join(path_test_mhc, test_nmat_mhc[j]))['mean_representations'][33]


## load *.csv files for labels

### for mcpas subset 1

In [None]:
# Subset 1 (no MHC): train/test tables; the 'sign' column is read as the label further down.
# Row order is expected to match the natsorted embedding files (see the natsort note above).
df_train = pd.read_csv('/content/1_origMCPAS_noMHC_train_AB.csv')
df_test  = pd.read_csv('/content/4_origMCPAS_noMHC_test_AB.csv')

### for mcpas subset 2

In [None]:
# Subset 2 (with MHC): train/test tables; the 'sign' column is read as the label further down.
# Row order is expected to match the natsorted embedding files (see the natsort note above).
df_train = pd.read_csv('/content/1_origMCPAS_HMHC_train_AB.csv')
df_test  = pd.read_csv('/content/4_origMCPAS_HMHC_test_AB.csv')

In [None]:
############ load labels
# Binary labels as column vectors of shape (n_samples, 1), taken from the 'sign' column.
y_train = df_train['sign'].values.reshape(-1,1)
y_test  = df_test['sign'].values.reshape(-1,1)

# Labels are paired with embeddings purely by position, so a row-count mismatch means
# the .csv and the embedding folders do not describe the same samples.
if len(y_train) != len(train_matmat_pepti) or len(y_test) != len(test_matmat_pepti):
    raise ValueError(
        f"label/embedding count mismatch: train {len(y_train)} vs {len(train_matmat_pepti)}, "
        f"test {len(y_test)} vs {len(test_matmat_pepti)}"
    )


In [None]:
### model
def clear_sess():
    """Release the previous model/history and reset the Keras backend state.

    Removes the notebook-level names ``model`` and ``history`` (if they exist),
    clears the Keras session and runs the garbage collector.

    Returns:
        None
    """
    import gc
    from tensorflow.keras import backend as K

    # `del model` inside a function refers to a *local* name, so it always raised
    # UnboundLocalError (silently swallowed) and never released anything. Dropping the
    # names from the module globals is what actually lets the old objects be collected.
    for stale_name in ('model', 'history'):
        globals().pop(stale_name, None)

    K.clear_session()
    gc.collect()

    return None

def keras_mcc(y_true, y_pred):
    """Matthews correlation coefficient built from Keras backend ops.

    Each confusion-matrix cell is the sum of the rounded, [0, 1]-clipped product of
    the labels and predictions (or their complements).

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of model outputs.

    Returns:
        Tensor holding (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn) + epsilon).
    """
    def _rounded_count(product):
        # Clip to [0, 1], round to a hard 0/1 decision, then count.
        return K.sum(K.round(K.clip(product, 0, 1)))

    tp = _rounded_count(y_true * y_pred)
    tn = _rounded_count((1 - y_true) * (1 - y_pred))
    fp = _rounded_count((1 - y_true) * y_pred)
    fn = _rounded_count(y_true * (1 - y_pred))

    numerator = tp * tn - fp * fn
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    # epsilon keeps the square root away from zero when a marginal is empty.
    return numerator / K.sqrt(denominator + K.epsilon())

## MLP for CDR3A, CDR3B and peptide

In [None]:
# Three-branch MLP. Each branch maps a 1280-d embedding through
# Dense(128) -> BatchNorm -> Dropout(0.5); the branches are concatenated and passed
# through Dense(512) -> Dropout(0.5) -> Dense(256) -> sigmoid.
# The matching fit cell feeds [CDR3a, CDR3b, peptide] in this input order.
clear_sess()
acti  = 'swish'

# branch 1 (input 'i_1')
input_1 = Input(shape = (1280,), name='i_1')
dense1_1 = Dense(128, activation = acti)(input_1)
bn1_1 = BatchNormalization()(dense1_1)
drop_1 = Dropout(0.5)(bn1_1)

# branch 2 (input 'i_2')
input_2 = Input(shape = (1280,), name='i_2')
dense2_1 = Dense(128, activation = acti)(input_2)
bn2_1 = BatchNormalization()(dense2_1)
drop_2 = Dropout(0.5)(bn2_1)

# branch 3 (input 'i_3')
input_3 = Input(shape = (1280,), name='i_3')
dense3_1 = Dense(128, activation = acti)(input_3)
bn3_1 = BatchNormalization()(dense3_1)
drop_3 = Dropout(0.5)(bn3_1)

# concatenate the three branch outputs and apply the shared head
concat   = Concatenate()([drop_1,drop_2,drop_3])
fc_1   = Dense(512, activation = acti)(concat)
drop_4 = Dropout(0.5)(fc_1)
fc_2   = Dense(256, activation = acti)(drop_4)
#classification output- TCR-Peptide Binding yes/no
output  = Dense(1, activation = 'sigmoid')(fc_2)
 
# create model with three inputs
model = Model(inputs=[input_1,input_2, input_3], outputs=output)

### CDR3A, peptide and MHC and CDR3B, peptide and MHC

In [None]:
# Three-branch MLP for one CDR3 chain + peptide + MHC. The matching fit cells feed
# [CDR3a or CDR3b, peptide, MHC] in this input order. The third branch uses 64 units
# instead of 128.
clear_sess()

acti  = 'gelu'

# branch 1 (input 'i_1')
input_1 = Input(shape = (1280,), name='i_1')
dense1_1 = Dense(128, activation = acti)(input_1)
bn1_1 = BatchNormalization()(dense1_1)
drop_1 = Dropout(0.5)(bn1_1)

# branch 2 (input 'i_2')
input_2 = Input(shape = (1280,), name='i_2')
dense2_1 = Dense(128, activation = acti)(input_2)
bn2_1 = BatchNormalization()(dense2_1)
drop_2 = Dropout(0.5)(bn2_1)

# branch 3 (input 'i_3'), narrower: Dense(64)
input_3 = Input(shape = (1280,), name='i_3')
dense3_1 = Dense(64, activation = acti)(input_3)
bn3_1 = BatchNormalization()(dense3_1)
drop_3 = Dropout(0.5)(bn3_1)

# concatenate the three branch outputs and apply the shared head
##concat   = Concatenate()([dense1_1, dense2_1])
##concat   = Concatenate()([bn1_1, bn2_1])
concat   = Concatenate()([drop_1,drop_2,drop_3])
fc_1   = Dense(512, activation = acti)(concat)
drop_4 = Dropout(0.5)(fc_1)
fc_2   = Dense(256, activation = acti)(drop_4)
#classification output- TCR-Peptide Binding yes/no
output  = Dense(1, activation = 'sigmoid')(fc_2)
 
# create model with three inputs
model = Model(inputs=[input_1,input_2, input_3], outputs=output)

### CDR3A peptide and CDR3B peptide 

In [None]:
# Two-branch MLP for one CDR3 chain + peptide. The matching fit cells feed
# [CDR3a or CDR3b, peptide] in this input order.
clear_sess()

acti  = 'swish'

# branch 1 (input 'i_1')


input_1 = Input(shape = (1280,), name='i_1')
dense1_1 = Dense(128, activation = acti)(input_1)
bn1_1 = BatchNormalization()(dense1_1)
drop_1 = Dropout(0.5)(bn1_1)

# branch 2 (input 'i_2')
input_2 = Input(shape = (1280,), name='i_2')
dense2_1 = Dense(128, activation = acti)(input_2)
bn2_1 = BatchNormalization()(dense2_1)
drop_2 = Dropout(0.5)(bn2_1)


# concatenate the two branch outputs and apply the shared head
concat   = Concatenate()([drop_1,drop_2])
fc_1   = Dense(512, activation = acti)(concat)
drop_4 = Dropout(0.5)(fc_1)
fc_2   = Dense(256, activation = acti)(drop_4)
#classification output- TCR-Peptide Binding yes/no
output  = Dense(1, activation = 'sigmoid')(fc_2)
 
# create model with two inputs
model = Model(inputs=[input_1,input_2], outputs=output)

### CDR3A CDR3B peptide MHC 

In [None]:
# Four-branch MLP. The matching fit cell feeds [CDR3a, CDR3b, peptide, MHC] in this
# input order. The fourth branch uses 64 units instead of 128.
clear_sess()

acti  = 'gelu'

# branch 1 (input 'i_1')
input_1 = Input(shape = (1280,), name='i_1')
dense1_1 = Dense(128, activation = acti)(input_1)
bn1_1 = BatchNormalization()(dense1_1)
drop_1 = Dropout(0.5)(bn1_1)

# branch 2 (input 'i_2')
input_2 = Input(shape = (1280,), name='i_2')
dense2_1 = Dense(128, activation = acti)(input_2)
bn2_1 = BatchNormalization()(dense2_1)
drop_2 = Dropout(0.5)(bn2_1)


# branch 3 (input 'i_3')
input_3 = Input(shape = (1280,), name='i_3')
dense3_1 = Dense(128, activation = acti)(input_3)
bn3_1 = BatchNormalization()(dense3_1)
drop_3 = Dropout(0.5)(bn3_1)

# branch 4 (input 'i_4'), narrower: Dense(64)
input_4 = Input(shape = (1280,), name='i_4')
dense4_1 = Dense(64, activation = acti)(input_4)
bn4_1 = BatchNormalization()(dense4_1)
drop_4 = Dropout(0.5)(bn4_1)


# concatenate the four branch outputs and apply the shared head
##concat   = Concatenate()([dense1_1, dense2_1])
##concat   = Concatenate()([bn1_1, bn2_1])
concat   = Concatenate()([drop_1,drop_2, drop_3, drop_4])
fc_1   = Dense(512, activation = acti)(concat)
drop_5 = Dropout(0.5)(fc_1)
fc_2   = Dense(256, activation = acti)(drop_5)
#classification output- TCR-Peptide Binding yes/no
output  = Dense(1, activation = 'sigmoid')(fc_2)
 
# create model with four inputs
model = Model(inputs=[input_1,input_2, input_3, input_4], outputs=output)

In [None]:
# Metrics tracked during training: ROC-AUC, PR-AUC and the custom MCC metric.
metrics_c = [tensorflow.keras.metrics.AUC(name="auc_roc",curve="ROC"),tensorflow.keras.metrics.AUC(name="auc_pr",curve="PR"),keras_mcc]
model.compile(loss='binary_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.009),   metrics=metrics_c)
# MCC is a score to maximise. With the default mode='auto', Keras only picks 'max' when
# the monitored name contains 'acc', so 'val_keras_mcc' would be treated as a quantity to
# minimise and the learning rate would be reduced at the wrong times. Set mode explicitly.
reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,patience=50, min_lr=0.005, verbose=0)

In [None]:
# Checkpoint 1: keep the full model from the epoch with the highest validation PR-AUC.
checkpoint_filepath_1 = 'weights-improvement-val-auc-pr.hdf5'
model_checkpoint_callback_1 = ModelCheckpoint(
    filepath=checkpoint_filepath_1,
    monitor='val_auc_pr',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

# Checkpoint 2: keep the full model from the epoch with the highest validation MCC.
checkpoint_filepath_2 = 'weights-improvement-val-keras-mcc.hdf5'
model_checkpoint_callback_2 = ModelCheckpoint(
    filepath=checkpoint_filepath_2,
    monitor='val_keras_mcc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

# Note: run only the fit cell that matches the model you built above; comment out the others.
## For example, when training the model for CDR3A, CDR3B, peptide and MHC, comment out the other five fit cells.

### fit the keras model for CDR3A, CDR3B , peptide, MHC

In [None]:
# Inputs in the model's input order: CDR3a, CDR3b, peptide, MHC.
fit_inputs = [train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti, train_matmat_mhc]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=512,
    epochs=600,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

### fit the keras model for CDR3B , peptide, MHC

In [None]:
# Inputs in the model's input order: CDR3b, peptide, MHC.
fit_inputs = [train_matmat_cdr3b, train_matmat_pepti, train_matmat_mhc]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=512,
    epochs=600,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

### fit the keras model for CDR3A, peptide, MHC

In [None]:
# Inputs in the model's input order: CDR3a, peptide, MHC.
fit_inputs = [train_matmat_cdr3a, train_matmat_pepti, train_matmat_mhc]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=512,
    epochs=600,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

### fit the keras model for CDR3A, CDR3B and peptide

In [None]:
# Inputs in the model's input order: CDR3a, CDR3b, peptide.
fit_inputs = [train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=1024,
    epochs=500,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

### fit the keras model for CDR3B and peptide

In [None]:
# Inputs in the model's input order: CDR3b, peptide.
fit_inputs = [train_matmat_cdr3b, train_matmat_pepti]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=1024,
    epochs=1000,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

### fit the keras model for CDR3A and peptide

In [None]:
# Inputs in the model's input order: CDR3a, peptide.
fit_inputs = [train_matmat_cdr3a, train_matmat_pepti]
fit_callbacks = [model_checkpoint_callback_1, model_checkpoint_callback_2, reduce_lr]
history = model.fit(
    fit_inputs,
    y_train,
    batch_size=1024,
    epochs=500,
    validation_split=0.1,
    callbacks=fit_callbacks,
)

## once the best model is trained, we can test it over the evaluation dataset

In [None]:
# Load the checkpoint with the best validation MCC.
model_loaded = '/content/weights-improvement-val-keras-mcc.hdf5'
# custom_objects maps the *saved name* of a custom object to its implementation; the
# metric function is called 'keras_mcc' ('metrics_c' is only the name of the metrics list).
model = tensorflow.keras.models.load_model(model_loaded,compile=False, custom_objects={'keras_mcc': keras_mcc})

#pick the model you trained: uncomment exactly ONE of the predict lines below
y_pred = None

#y_pred = model.predict([test_matmat_cdr3a, test_matmat_cdr3b, test_matmat_pepti, test_matmat_mhc])
#y_pred = model.predict([test_matmat_cdr3b, test_matmat_pepti, test_matmat_mhc])
#y_pred = model.predict([test_matmat_cdr3a, test_matmat_pepti, test_matmat_mhc])
#y_pred = model.predict([test_matmat_cdr3a, test_matmat_cdr3b, test_matmat_pepti])
# y_pred = model.predict([test_matmat_cdr3b, test_matmat_pepti])
#y_pred = model.predict([ test_matmat_cdr3a, test_matmat_pepti])

# Fail loudly instead of scoring an undefined or stale `y_pred` left over from an earlier run.
if y_pred is None:
    raise RuntimeError("Uncomment the model.predict(...) line that matches the trained model before running this cell.")

y_act = y_test.flatten()
y_pred= y_pred.flatten()
# Hard class decision at a 0.5 threshold.
y_pred_c=np.where(y_pred>0.5,1,0)


# ROC-AUC, PR-AUC (average precision), MCC, F1
print(roc_auc_score(y_act, y_pred),average_precision_score(y_act, y_pred), matthews_corrcoef(y_act,y_pred_c),f1_score(y_act,y_pred_c))

In [None]:
def predict_mode(mode, test_matmat_cdr3a, test_matmat_cdr3b, test_matmat_pepti, test_matmat_mhc, model_loaded, model):
    """Run ``model.predict`` on the inputs that belong to the given mode.

    Args:
        mode: one of 'abh', 'bh', 'ah', 'ab', 'b', 'a'
            (a = CDR3a, b = CDR3b, h = MHC; the peptide is always included).
        test_matmat_cdr3a: CDR3a test embeddings (only used by modes containing 'a').
        test_matmat_cdr3b: CDR3b test embeddings (only used by modes containing 'b').
        test_matmat_pepti: peptide test embeddings.
        test_matmat_mhc: MHC test embeddings (only used by modes containing 'h').
        model_loaded: unused; kept so existing calls keep working.
        model: object exposing ``predict(list_of_inputs)``.

    Returns:
        Whatever ``model.predict`` returns for the selected input list.

    Raises:
        ValueError: if ``mode`` is not one of the supported values (previously this
            surfaced as an UnboundLocalError on ``y_pred``).
    """
    # Input order per mode mirrors the order used when fitting the corresponding model.
    inputs_by_mode = {
        'abh': [test_matmat_cdr3a, test_matmat_cdr3b, test_matmat_pepti, test_matmat_mhc],
        'bh':  [test_matmat_cdr3b, test_matmat_pepti, test_matmat_mhc],
        'ah':  [test_matmat_cdr3a, test_matmat_pepti, test_matmat_mhc],
        'ab':  [test_matmat_cdr3a, test_matmat_cdr3b, test_matmat_pepti],
        'b':   [test_matmat_cdr3b, test_matmat_pepti],
        'a':   [test_matmat_cdr3a, test_matmat_pepti],
    }

    if mode not in inputs_by_mode:
        raise ValueError(f"unknown mode {mode!r}; expected one of {sorted(inputs_by_mode)}")

    return model.predict(inputs_by_mode[mode])

### set correct mode to predict correctly

In [None]:
# Uncomment exactly ONE line so that `mode` matches the model you trained.
#mode='ah'
#mode='bh'
#mode='a'
#mode='b'
#mode='abh'
#mode='ab'

In [None]:
# Mode 'b' only uses the CDR3b and peptide matrices (see predict_mode), so the CDR3a and
# MHC matrices are overwritten with the scalar 0.
# NOTE(review): presumably this frees memory and/or makes the names exist for the
# predict_mode call below — confirm. It is destructive: reload the embeddings before
# switching to a mode that needs CDR3a or MHC.
if mode == 'b':
    train_matmat_cdr3a = test_matmat_cdr3a = 0
    train_matmat_mhc = test_matmat_mhc = 0

In [None]:
#### Evaluate the checkpoint with the best validation PR-AUC on the test set.
model_loaded = '/content/weights-improvement-val-auc-pr.hdf5'
model = tensorflow.keras.models.load_model(model_loaded, compile=False)

# Scores for the inputs selected by `mode`, flattened to 1-D.
y_pred = predict_mode(
    mode,
    test_matmat_cdr3a,
    test_matmat_cdr3b,
    test_matmat_pepti,
    test_matmat_mhc,
    model_loaded,
    model,
).flatten()
y_act = y_test.flatten()

# Hard class decision at a 0.5 threshold.
y_pred_c = np.where(y_pred > 0.5, 1, 0)

# ROC-AUC, PR-AUC (average precision), MCC, Cohen's kappa
test_scores = (
    roc_auc_score(y_act, y_pred),
    average_precision_score(y_act, y_pred),
    matthews_corrcoef(y_act, y_pred_c),
    cohen_kappa_score(y_act, y_pred_c),
)
print(*test_scores)

## for peptide and MHC specific analysis

In [None]:
# ##### peptide wise test metrics
peplist = ['GILGFVFTL','LLWNGPMAV','NLVPMVATV','GLCTLVAML','CINGVCWTV','PKYVKQNTLKLAT','RPRGEVRFL','TAFTIPSI','ELAGIGILTV','KLVALGINAV']

for pep in peplist:

  # Boolean row mask for this peptide, computed once and reused for labels and scores.
  pep_mask = (df_test['peptide'] == pep).to_numpy()

  y_test_pep  = y_test[pep_mask]
  y_pred_pep  = y_pred[pep_mask]

  y_act_pep = y_test_pep.flatten()
  y_pred_pep= y_pred_pep.flatten()

  # roc_auc_score raises when a group is empty or contains a single class; report and
  # move on so that one such peptide does not abort the metrics for the others.
  if np.unique(y_act_pep).size < 2:
    print(pep, f'skipped: {y_act_pep.size} test samples, fewer than two classes present')
    continue

  # Hard class decision at a 0.5 threshold.
  y_pred_c_pep=np.where(y_pred_pep>0.5,1,0)

  # peptide, MCC, ROC-AUC, PR-AUC (average precision), F1
  print(pep, matthews_corrcoef(y_act_pep,y_pred_c_pep), roc_auc_score(y_act_pep, y_pred_pep), average_precision_score(y_act_pep, y_pred_pep),f1_score(y_act_pep,y_pred_c_pep))

In [None]:
##### mhc wise test metrics
mhclist = ["HLA-A*02:01:98","HLA-DRA*01:01:02","HLA-B*07:02","HLA-B*35:08:01","HLA-B*51:01","HLA-A*11:01","HLA-B*08:01:29"]

for mhc in mhclist:

  # Boolean row mask for this MHC allele, computed once and reused for labels and scores.
  mhc_mask = (df_test['mhc_short'] == mhc).to_numpy()

  # (variable names keep the '_pep' suffix of the peptide-wise cell)
  y_test_pep  = y_test[mhc_mask]
  y_pred_pep  = y_pred[mhc_mask]

  y_act_pep = y_test_pep.flatten()
  y_pred_pep= y_pred_pep.flatten()

  # roc_auc_score raises when a group is empty or contains a single class; report and
  # move on so that one such allele does not abort the metrics for the others.
  if np.unique(y_act_pep).size < 2:
    print(mhc, f'skipped: {y_act_pep.size} test samples, fewer than two classes present')
    continue

  # Hard class decision at a 0.5 threshold.
  y_pred_c_pep=np.where(y_pred_pep>0.5,1,0)

  # allele, MCC, ROC-AUC, PR-AUC (average precision), F1
  print(mhc, matthews_corrcoef(y_act_pep,y_pred_c_pep), roc_auc_score(y_act_pep, y_pred_pep), average_precision_score(y_act_pep, y_pred_pep),f1_score(y_act_pep,y_pred_c_pep))

### once we get the test evaluation, we can do 10 fold CV 

In [None]:
#### make_model for 10 fold CV

def _embedding_branch(index, units, acti):
    """Build one branch: Input(1280, name='i_<index>') -> Dense(units) -> BatchNorm -> Dropout(0.5).

    Returns:
        (input tensor, branch output tensor)
    """
    branch_input = Input(shape=(1280,), name=f'i_{index}')
    hidden = Dense(units, activation=acti)(branch_input)
    hidden = BatchNormalization()(hidden)
    return branch_input, Dropout(0.5)(hidden)


def make_data(orig_matmat_cdr3a, orig_matmat_cdr3b, orig_matmat_pepti, orig_matmat_mhc, orig_y_train, train_ind, test_ind, mode, acti='gelu'):
    """Split the data for one CV fold and build a fresh, uncompiled model for `mode`.

    The matrices passed as arguments are used (the previous version ignored them and
    read the notebook globals of the same content instead).

    Args:
        orig_matmat_cdr3a: CDR3a embeddings, indexable by row (used when mode contains 'a').
        orig_matmat_cdr3b: CDR3b embeddings (used when mode contains 'b').
        orig_matmat_pepti: peptide embeddings (always used).
        orig_matmat_mhc: MHC embeddings (used when mode contains 'h'); not touched
            otherwise, so any placeholder may be passed for non-MHC modes.
        orig_y_train: labels, indexable by row.
        train_ind: row indices of the training part of the fold.
        test_ind: row indices of the held-out part of the fold.
        mode: one of 'ab', 'a', 'b', 'abh', 'bh', 'ah'.
        acti: activation used by every hidden Dense layer (default 'gelu', as before).

    Returns:
        (model, train_input, test_input, train_output, test_output) where the two
        ``*_input`` values are lists of arrays in the model's input order.

    Raises:
        ValueError: if `mode` is not supported (previously an UnboundLocalError).
    """
    # Per mode: (input-name index, Dense units, embedding matrix) for each branch, in
    # input order. Indices and widths reproduce the original hand-written branches:
    # the MHC branch is always 'i_4' with 64 units, every other branch has 128 units.
    branch_specs = {
        'ab':  [(1, 128, orig_matmat_cdr3a), (2, 128, orig_matmat_cdr3b), (3, 128, orig_matmat_pepti)],
        'a':   [(1, 128, orig_matmat_cdr3a), (2, 128, orig_matmat_pepti)],
        'b':   [(1, 128, orig_matmat_cdr3b), (2, 128, orig_matmat_pepti)],
        'abh': [(1, 128, orig_matmat_cdr3a), (2, 128, orig_matmat_cdr3b), (3, 128, orig_matmat_pepti), (4, 64, orig_matmat_mhc)],
        'bh':  [(2, 128, orig_matmat_cdr3b), (3, 128, orig_matmat_pepti), (4, 64, orig_matmat_mhc)],
        'ah':  [(1, 128, orig_matmat_cdr3a), (3, 128, orig_matmat_pepti), (4, 64, orig_matmat_mhc)],
    }
    if mode not in branch_specs:
        raise ValueError(f"unknown mode {mode!r}; expected one of {sorted(branch_specs)}")
    specs = branch_specs[mode]

    # Data for this fold: training rows and held-out (internal evaluation) rows.
    train_input = [matrix[train_ind] for _, _, matrix in specs]
    test_input = [matrix[test_ind] for _, _, matrix in specs]
    train_output = orig_y_train[train_ind]
    test_output = orig_y_train[test_ind]

    # Model: one branch per input, built in input order, then the shared head.
    model_inputs = []
    branch_outputs = []
    for index, units, _ in specs:
        branch_input, branch_output = _embedding_branch(index, units, acti)
        model_inputs.append(branch_input)
        branch_outputs.append(branch_output)

    concat = Concatenate()(branch_outputs)
    fc_1 = Dense(512, activation=acti)(concat)
    fc_1_drop = Dropout(0.5)(fc_1)
    fc_2 = Dense(256, activation=acti)(fc_1_drop)
    # classification output - TCR-peptide binding yes/no
    output = Dense(1, activation='sigmoid')(fc_2)

    model = Model(inputs=model_inputs, outputs=output)

    return model, train_input, test_input, train_output, test_output

## CDR3A+CDR3B+MHC+peptide

In [None]:
#### 10 Fold CV

###set mode
mode='abh'

from sklearn.model_selection import KFold
# Define the K-fold Cross Validator.
# A fixed random_state makes the shuffled fold assignment reproducible across runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

# K-fold Cross Validation model evaluation
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()
    model, train_input, test_input, train_output, test_output = make_data(train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti, train_matmat_mhc, y_train, train_ind, test_ind, mode)
    metrics_c = [tensorflow.keras.metrics.AUC(name="auc_roc",curve="ROC"),tensorflow.keras.metrics.AUC(name="auc_pr",curve="PR"),keras_mcc]
    model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.008), metrics=metrics_c)
    # ROC-AUC is a score to maximise. With the default mode='auto', Keras only picks
    # 'max' for monitor names containing 'acc', so it must be set explicitly here.
    reduce_lr = ReduceLROnPlateau(monitor='val_auc_roc', mode='max', factor=0.95,patience=50, min_lr=0.005, verbose=0)
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    #model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,save_weights_only=False,monitor='val_auc_pr',mode='max',save_best_only=True)
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,save_weights_only=False,monitor='val_keras_mcc',mode='max',save_best_only=True)

    # NOTE(review): the held-out fold is used both to select the checkpoint
    # (monitor='val_keras_mcc') and to compute the scores printed below, so the
    # per-fold numbers are presumably optimistic.
    history = model.fit(train_input, train_output,epochs=600,batch_size=512, verbose=0, validation_data=(test_input,test_output),
                  class_weight={0: 1.0, 1: 5.0},
                  callbacks=[model_checkpoint_callback,reduce_lr])

    # Reload the best checkpoint of this fold before scoring.
    model_loaded = f'weights-improvement-{fold_no}.hdf5'
    model = tensorflow.keras.models.load_model(model_loaded,compile=False)

    y_pred = model.predict(test_input, verbose=0)
    y_act = test_output.flatten()
    y_pred= y_pred.flatten()
    # Hard class decision at a 0.5 threshold.
    y_pred_c=np.where(y_pred>0.5,1,0)
    # ROC-AUC, PR-AUC (average precision), MCC, F1
    print(roc_auc_score(y_act, y_pred),average_precision_score(y_act, y_pred),matthews_corrcoef(y_act,y_pred_c),f1_score(y_act,y_pred_c))
    fold_no = fold_no + 1

## CDR3B+MHC+peptide

In [None]:
#### 10 Fold CV  (CDR3B + MHC + peptide); change n_splits for a different number of folds
#
# For every fold a fresh model is built by make_data(), trained, and the
# checkpoint with the best validation MCC is reloaded and scored.
# NOTE(review): the held-out fold is used both as validation_data (and thus
# for checkpoint selection) and for the reported scores, so the printed
# numbers may be optimistic - confirm this is intended.

###set mode
mode = 'bh'

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator; a fixed random_state makes the shuffled
# folds identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Fold counter, also used to name the per-fold checkpoint file.
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()

    model, train_input, test_input, train_output, test_output = make_data(
        train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti,
        train_matmat_mhc, y_train, train_ind, test_ind, mode)

    metrics_c = [
        tensorflow.keras.metrics.AUC(name="auc_roc", curve="ROC"),
        tensorflow.keras.metrics.AUC(name="auc_pr", curve="PR"),
        keras_mcc,
    ]
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=0.008),
                  metrics=metrics_c)

    # mode='max' is required: the default mode='auto' only recognises monitor
    # names containing "acc" as "higher is better", so the MCC monitor would
    # otherwise be treated as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,
                                  patience=50, min_lr=0.003, verbose=0)

    # Keep only the epoch with the highest validation MCC (full model, not just weights).
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                                save_weights_only=False,
                                                monitor='val_keras_mcc',
                                                mode='max',
                                                save_best_only=True)

    history = model.fit(train_input, train_output,
                        epochs=500, batch_size=1024, verbose=0,
                        validation_data=(test_input, test_output),
                        callbacks=[model_checkpoint_callback, reduce_lr])

    # Reload the best checkpoint of this fold; compile=False because the model
    # is only used for prediction below.
    model = tensorflow.keras.models.load_model(checkpoint_filepath, compile=False)

    y_pred = model.predict(test_input, verbose=0).flatten()
    y_act = test_output.flatten()
    y_pred_c = np.where(y_pred > 0.5, 1, 0)  # threshold the sigmoid output at 0.5

    # Printed per fold: ROC-AUC, average precision, MCC, F1
    print(roc_auc_score(y_act, y_pred),
          average_precision_score(y_act, y_pred),
          matthews_corrcoef(y_act, y_pred_c),
          f1_score(y_act, y_pred_c))
    fold_no = fold_no + 1

## CDR3A+MHC+peptide

In [None]:
#### 10 Fold CV  (CDR3A + MHC + peptide); change n_splits for a different number of folds
#
# For every fold a fresh model is built by make_data(), trained, and the
# checkpoint with the best validation MCC is reloaded and scored.
# NOTE(review): the held-out fold is used both as validation_data (and thus
# for checkpoint selection) and for the reported scores, so the printed
# numbers may be optimistic - confirm this is intended.

###set mode
mode = 'ah'

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator; a fixed random_state makes the shuffled
# folds identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Fold counter, also used to name the per-fold checkpoint file.
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()
    model, train_input, test_input, train_output, test_output = make_data(
        train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti,
        train_matmat_mhc, y_train, train_ind, test_ind, mode)

    metrics_c = [
        tensorflow.keras.metrics.AUC(name="auc_roc", curve="ROC"),
        tensorflow.keras.metrics.AUC(name="auc_pr", curve="PR"),
        keras_mcc,
    ]
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=0.009),
                  metrics=metrics_c)

    # mode='max' is required: the default mode='auto' only recognises monitor
    # names containing "acc" as "higher is better", so the MCC monitor would
    # otherwise be treated as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,
                                  patience=50, min_lr=0.003, verbose=0)

    # Keep only the epoch with the highest validation MCC (full model, not just weights).
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                                save_weights_only=False,
                                                monitor='val_keras_mcc',
                                                mode='max',
                                                save_best_only=True)

    history = model.fit(train_input, train_output,
                        epochs=500, batch_size=1024, verbose=0,
                        validation_data=(test_input, test_output),
                        callbacks=[model_checkpoint_callback, reduce_lr])

    # Reload the best checkpoint of this fold; compile=False because the model
    # is only used for prediction below.
    model = tensorflow.keras.models.load_model(checkpoint_filepath, compile=False)

    y_pred = model.predict(test_input, verbose=0).flatten()
    y_act = test_output.flatten()
    y_pred_c = np.where(y_pred > 0.5, 1, 0)  # threshold the sigmoid output at 0.5

    # Printed per fold: ROC-AUC, average precision, MCC, F1
    print(roc_auc_score(y_act, y_pred),
          average_precision_score(y_act, y_pred),
          matthews_corrcoef(y_act, y_pred_c),
          f1_score(y_act, y_pred_c))
    fold_no = fold_no + 1

## CDR3A+CDR3B+peptide

In [None]:
#### 10 Fold CV  (CDR3A + CDR3B + peptide); change n_splits for a different number of folds
#
# For every fold a fresh model is built by make_data(), trained, and the
# checkpoint with the best validation MCC is reloaded and scored.
# NOTE(review): the held-out fold is used both as validation_data (and thus
# for checkpoint selection) and for the reported score, so the printed
# number may be optimistic - confirm this is intended.

###set mode
mode = 'ab'

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator; a fixed random_state makes the shuffled
# folds identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Fold counter, also used to name the per-fold checkpoint file.
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()

    model, train_input, test_input, train_output, test_output = make_data(
        train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti,
        train_matmat_mhc, y_train, train_ind, test_ind, mode)

    metrics_c = [
        tensorflow.keras.metrics.AUC(name="auc_roc", curve="ROC"),
        tensorflow.keras.metrics.AUC(name="auc_pr", curve="PR"),
        keras_mcc,
    ]
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=0.009),
                  metrics=metrics_c)

    # mode='max' is required: the default mode='auto' only recognises monitor
    # names containing "acc" as "higher is better", so the MCC monitor would
    # otherwise be treated as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,
                                  patience=50, min_lr=0.003, verbose=0)

    # Keep only the epoch with the highest validation MCC (full model, not just weights).
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                                save_weights_only=False,
                                                monitor='val_keras_mcc',
                                                mode='max',
                                                save_best_only=True)

    history = model.fit(train_input, train_output,
                        epochs=500, batch_size=1024, verbose=0,
                        validation_data=(test_input, test_output),
                        callbacks=[model_checkpoint_callback, reduce_lr])

    # Reload the best checkpoint of this fold; compile=False because the model
    # is only used for prediction below.
    model = tensorflow.keras.models.load_model(checkpoint_filepath, compile=False)

    y_pred = model.predict(test_input, verbose=0).flatten()
    y_act = test_output.flatten()
    y_pred_c = np.where(y_pred > 0.5, 1, 0)  # threshold the sigmoid output at 0.5

    # Only the MCC is reported for this mode.
    print(matthews_corrcoef(y_act, y_pred_c))
    fold_no = fold_no + 1

## CDR3B+peptide

In [None]:
# For the CDR3B+peptide run (mode 'b') the CDR3A and MHC arrays are replaced
# by the scalar placeholder 0.
# NOTE(review): `mode` still holds the value from the previously executed cell
# here ('b' is only assigned in the next cell), so on a top-to-bottom run this
# guard does not fire; and once the arrays are overwritten they have to be
# reloaded before running any mode that uses CDR3A or MHC. Confirm the
# intended execution order.
if mode == 'b':
    train_matmat_cdr3a = 0
    test_matmat_cdr3a = 0

    train_matmat_mhc = 0
    test_matmat_mhc = 0

In [None]:
#### 10 Fold CV  (CDR3B + peptide)
#
# For every fold a fresh model is built by make_data(), trained, and the
# checkpoint with the best validation MCC is reloaded and scored.
# NOTE(review): the held-out fold is used both as validation_data (and thus
# for checkpoint selection) and for the reported scores, so the printed
# numbers may be optimistic - confirm this is intended.

###set mode
mode = 'b'

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator; the fixed random_state makes the shuffled
# folds identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Fold counter, also used to name the per-fold checkpoint file.
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()

    model, train_input, test_input, train_output, test_output = make_data(
        train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti,
        train_matmat_mhc, y_train, train_ind, test_ind, mode)

    metrics_c = [
        tensorflow.keras.metrics.AUC(name="auc_roc", curve="ROC"),
        tensorflow.keras.metrics.AUC(name="auc_pr", curve="PR"),
        keras_mcc,
    ]
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=0.009),
                  metrics=metrics_c)

    # mode='max' is required: the default mode='auto' only recognises monitor
    # names containing "acc" as "higher is better", so the MCC monitor would
    # otherwise be treated as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,
                                  patience=50, min_lr=0.003, verbose=0)

    # Keep only the epoch with the highest validation MCC (full model, not just weights).
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                                save_weights_only=False,
                                                monitor='val_keras_mcc',
                                                mode='max',
                                                save_best_only=True)

    history = model.fit(train_input, train_output,
                        epochs=1000, batch_size=1024, verbose=0,
                        validation_data=(test_input, test_output),
                        callbacks=[model_checkpoint_callback, reduce_lr])

    # Reload the best checkpoint of this fold; compile=False because the model
    # is only used for prediction below.
    model = tensorflow.keras.models.load_model(checkpoint_filepath, compile=False)

    y_pred = model.predict(test_input, verbose=0).flatten()
    y_act = test_output.flatten()
    y_pred_c = np.where(y_pred > 0.5, 1, 0)  # threshold the sigmoid output at 0.5

    # Printed per fold: ROC-AUC, average precision, MCC, F1
    print(roc_auc_score(y_act, y_pred),
          average_precision_score(y_act, y_pred),
          matthews_corrcoef(y_act, y_pred_c),
          f1_score(y_act, y_pred_c))
    fold_no = fold_no + 1

## CDR3A+peptide

In [None]:
#### 10 Fold CV  (CDR3A + peptide)
#
# For every fold a fresh model is built by make_data(), trained, and the
# checkpoint with the best validation MCC is reloaded and scored.
# NOTE(review): the held-out fold is used both as validation_data (and thus
# for checkpoint selection) and for the reported scores, so the printed
# numbers may be optimistic - confirm this is intended.

###set mode
mode = 'a'

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator; a fixed random_state makes the shuffled
# folds identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Fold counter, also used to name the per-fold checkpoint file.
fold_no = 1

for train_ind, test_ind in kfold.split(train_matmat_pepti):
    clear_sess()

    model, train_input, test_input, train_output, test_output = make_data(
        train_matmat_cdr3a, train_matmat_cdr3b, train_matmat_pepti,
        train_matmat_mhc, y_train, train_ind, test_ind, mode)

    metrics_c = [
        tensorflow.keras.metrics.AUC(name="auc_roc", curve="ROC"),
        tensorflow.keras.metrics.AUC(name="auc_pr", curve="PR"),
        keras_mcc,
    ]
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=0.008),
                  metrics=metrics_c)

    # mode='max' is required: the default mode='auto' only recognises monitor
    # names containing "acc" as "higher is better", so the MCC monitor would
    # otherwise be treated as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_keras_mcc', mode='max', factor=0.95,
                                  patience=50, min_lr=0.005, verbose=0)

    # Keep only the epoch with the highest validation MCC (full model, not just weights).
    checkpoint_filepath = f'weights-improvement-{fold_no}.hdf5'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                                save_weights_only=False,
                                                monitor='val_keras_mcc',
                                                mode='max',
                                                save_best_only=True)

    history = model.fit(train_input, train_output,
                        epochs=500, batch_size=1024, verbose=0,
                        validation_data=(test_input, test_output),
                        callbacks=[model_checkpoint_callback, reduce_lr])

    # Reload the best checkpoint of this fold; compile=False because the model
    # is only used for prediction below.
    model = tensorflow.keras.models.load_model(checkpoint_filepath, compile=False)

    y_pred = model.predict(test_input, verbose=0).flatten()
    y_act = test_output.flatten()
    y_pred_c = np.where(y_pred > 0.5, 1, 0)  # threshold the sigmoid output at 0.5

    # Printed per fold: ROC-AUC, average precision, MCC, F1
    print(roc_auc_score(y_act, y_pred),
          average_precision_score(y_act, y_pred),
          matthews_corrcoef(y_act, y_pred_c),
          f1_score(y_act, y_pred_c))
    fold_no = fold_no + 1