In [1]:
#-------- Import Libraries --------#
import torch
import os
import sys
import random
import pickle
import mlflow
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import date
from sklearn.metrics import matthews_corrcoef
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F  # All functions that don't have any parameters
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

In [2]:
#-------- Import Modules from project--------#
import encoding as enc
from model import Net, Net_thesis, Net_project, Net_project_simple_CNN_RNN, Net_project_transformer_CNN_RNN
import functions as func

In [3]:
#-------- Set Device --------#

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPUs available. Using CPU instead.')
    device = torch.device('cpu')

No GPUs available. Using CPU instead.


In [4]:
#-------- Seeds --------#

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

torch.use_deterministic_algorithms(True)

In [5]:
#-------- Directories --------#

DATADIR = '/data/'
TRAINDIR = '../data/train'
VALIDATIONDIR = '../data/validation'
MATRICES = '/data/Matrices'


In [6]:
#-------- Unzip Train --------#

try:
    if len(os.listdir(TRAINDIR)) != 0:
        print("{} already unzipped.".format(TRAINDIR))
except:
    !unzip ../data/train.zip -d ../data/train

    
#-------- Unzip Validation --------#

try:
    if len(os.listdir(VALIDATIONDIR)) != 0:
        print("{} already unzipped.".format(VALIDATIONDIR))
except:
    !unzip ../data/validation.zip -d ../data/validation
    
print('Train directory:\n\n', '\n'.join(str(p) for p in os.listdir(TRAINDIR)), '\n\n')
print('Validation directory:\n\n','\n'.join(str(p) for p in os.listdir(VALIDATIONDIR)))

../data/train already unzipped.
../data/validation already unzipped.
Train directory:

 P2_labels.npz
P3_input.npz
P4_input.npz
P2_input.npz
__MACOSX
P1_input.npz
P3_labels.npz
P4_labels.npz
P1_labels.npz 


Validation directory:

 P5_input.npz
P5_labels.npz
__MACOSX


In [7]:
#-------- Import Dataset --------#

data_list = []
target_list = []

import glob
for fp in glob.glob("../data/train/*input.npz"):
    data = np.load(fp)["arr_0"]
    targets = np.load(fp.replace("input", "labels"))["arr_0"]
    data_list.append(data[0:2])
    target_list.append(targets[0:2])
    
for fp in glob.glob("../data/validation/*input.npz"):
    data = np.load(fp)["arr_0"]
    targets = np.load(fp.replace("input", "labels"))["arr_0"]
    data_list.append(data[0:2])
    target_list.append(targets[0:2])
    
data_partitions = len(data_list)

print("Number of files:", data_partitions)

for i in range(data_partitions):
    print("Size of file", i, len(data_list[i]))

Number of files: 5
Size of file 0 2
Size of file 1 2
Size of file 2 2
Size of file 3 2
Size of file 4 2


In [8]:
cross_validation = True

#-------- Hyperparameters to fine tune -------#
embedding = "esm-1b" #esm-1b
merge = False
numHN=64
numFilter=100
dropOutRate=0.1
keep_energy=True

In [9]:
for i in range(5):
    print("File", i)
    seq = func.extract_sequences(data_list[i])

    seq['PEP_len'] = seq['peptide'].str.len()
    seq['TCR_len'] = seq['TCR'].str.len()
    seq['MHC_len'] = seq['MHC'].str.len()

    leng_TCR = list()
    leng_MHC = list()
    leng_PEP = list()

    for i in range(len(seq)):

        if seq['PEP_len'][i] not in leng_PEP:
            leng_PEP.append(seq['PEP_len'][i])

        if seq['MHC_len'][i] not in leng_MHC:
            leng_MHC.append(seq['MHC_len'][i]) 

        if seq['TCR_len'][i] not in leng_TCR:
            leng_TCR.append(seq['TCR_len'][i])

    print(sorted(leng_TCR))
    print(leng_MHC)
    print(leng_PEP)
    print("\n")

File 0
[211, 214]
[179]
[9]


File 1
[217]
[179]
[9]


File 2
[219, 220]
[179]
[9]


File 3
[211]
[179]
[9]


File 4
[213, 215]
[179]
[9]




In [10]:
data_list_aa = list()

for i in range(5):
    data_list_aa.append(func.extract_aa_and_energy_terms(data_list[i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['aa'][i] = return_aa(list(df.iloc[i,0:20]))


In [11]:
# Structure of 1 complex:
pd.DataFrame(data_list_aa[0][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-1.782,0.138,2.034,-0.177,0.0,0.0,0.845,-4.7732,-0.130164,-1.04623,...,-47.814,1.699,-571.57,229.289,-359.096,2.145,727.371,-40.566,0.554,G
1,-2.758,0.143,2.069,-0.791,0.088,-0.405,-2.431,-4.7732,-0.130164,-1.04623,...,-47.814,1.699,-571.57,229.289,-359.096,2.145,727.371,-40.566,0.554,S
2,-7.25,0.533,5.178,-3.381,1.675,-0.279,-5.73,-4.7732,-0.130164,-1.04623,...,-47.814,1.699,-571.57,229.289,-359.096,2.145,727.371,-40.566,0.554,H
3,-4.617,0.141,4.256,-2.504,0.133,-0.516,-4.379,-4.7732,-0.130164,-1.04623,...,-47.814,1.699,-571.57,229.289,-359.096,2.145,727.371,-40.566,0.554,S
4,-9.749,1.539,3.135,-2.211,1.483,0.093,-4.812,-4.7732,-0.130164,-1.04623,...,-47.814,1.699,-571.57,229.289,-359.096,2.145,727.371,-40.566,0.554,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,X
416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,X
417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,X
418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,X


In [12]:
#embedding of data
data_list_enc_mhc = list()
data_list_enc_pep = list() 
data_list_enc_tcr = list()

            
#create directory to fetch/store embedded
embedding_dir= '../data/embeddedFiles/'
try:
    os.mkdir(embedding_dir)
except:
    pass

#try to fecth if already exist
try:
    with open(embedding_dir+'dataset-{}'.format(embedding)) as f:
        data_list_enc =  pickle.load(f)


#if no prior file, then embbed:
except:
    data_list_enc = []
    if embedding == "Baseline":
        data_list_enc = data_list
    
    elif embedding == "esm-1b":
        count = 0
        
        for dataset in data_list:
            print("\nWorking on file", count)
            count += 1
            x_enc = func.extract_sequences(dataset, merge=merge)
            print("Sequences are extracted")
            
            if merge:
                print("Merge true")
                #print(x_enc['all'].tolist())
                x_enc = enc.esm_1b(x_enc['all'].tolist(), pooling=False)
                x_enc = np.array(x_enc[0])
                print(x_enc.shape)
                data_list_enc.append(x_enc)
                
            else:
                print("Merge false")
                mhc_enc = enc.esm_1b(x_enc['MHC'].tolist(), pooling=False)
                pep_enc = enc.esm_1b(x_enc['peptide'].tolist(), pooling=False)
                tcr_enc = enc.esm_1b(x_enc['TCR'].tolist(), pooling=False)
                
                mhc_enc = [x.tolist() for x in mhc_enc]
                pep_enc = [x.tolist() for x in pep_enc]
                tcr_enc = [x.tolist() for x in tcr_enc]
                
                #print("results are stacking")
                #x_enc = np.vstack((mhc_enc[0],pep_enc[0],tcr_enc[0]))
                #print("before extending:", x_enc.shape)
                
                # x_enc should be enc + energy terms
                data_list_enc_mhc.append(mhc_enc) 
                data_list_enc_pep.append(pep_enc) 
                data_list_enc_tcr.append(tcr_enc) 
            print("ESM_1B is done\n")
  
        #save
        outfile = open(embedding_dir + 'dataset-{}-file{}'.format(embedding, count),'wb')
        pickle.dump(data_list_enc, outfile)
        outfile.close()
        
    elif embedding == "esm_ASM":
        for dataset in data_list:
            x_enc = func.extract_sequences(dataset, merge=merge).values.tolist()
            print(data_list_enc)
            x_enc = [enc.esm_ASM(seq, pooling=False) for seq in x_enc]
            data_list_enc.append(x_enc)
            
        #save
        outfile = open(embedding_dir + 'dataset-{}-file{}'.format(embedding, count),'wb')
        pickle.dump(data_list_enc, outfile)
        outfile.close()
        
    else:         
        for dataset in data_list:
            x_enc = func.extract_sequences(dataset, merge=merge).values.tolist()
            print(data_list_enc)
            x_enc = [enc.encodePeptidesCNN(seq, scheme=embedding) for seq in x_enc]
            data_list_enc.append(x_enc)
        print(data_list_enc)
        
        #save
        outfile = open(embedding_dir + 'dataset-{}-file{}'.format(embedding, count),'wb')
        pickle.dump(data_list_enc, outfile)
        outfile.close()
            


Working on file 0
Sequences are extracted
Merge false
	esm:
	model and alphabet are ready
179
179
	data is ready - model starts running
	model is done
179
		Flag 2
179
	esm:
	model and alphabet are ready
9
9
	data is ready - model starts running
	model is done
9
		Flag 2
9
	esm:
	model and alphabet are ready
214
211
	data is ready - model starts running
	model is done
214
		Flag 2
211
ESM_1B is done


Working on file 1
Sequences are extracted
Merge false
	esm:
	model and alphabet are ready
179
179
	data is ready - model starts running
	model is done
179
		Flag 2
179
	esm:
	model and alphabet are ready
9
9
	data is ready - model starts running
	model is done
9
		Flag 2
9
	esm:
	model and alphabet are ready
217
217
	data is ready - model starts running
	model is done
217
		Flag 2
217
ESM_1B is done


Working on file 2
Sequences are extracted
Merge false
	esm:
	model and alphabet are ready
179
179
	data is ready - model starts running
	model is done
179
		Flag 2
179
	esm:
	model and alph

In [26]:
pd.DataFrame(data_list_enc_tcr[0][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,-0.054443,0.137631,-0.195034,-0.072012,-0.206849,-0.374784,-0.163538,0.006105,-0.128767,0.215657,...,0.541550,-0.202693,0.328685,-0.397300,-0.426061,-0.187441,-0.423419,-0.063479,-0.390429,-0.227817
1,-0.138125,-0.136872,-0.374252,0.200674,-0.357856,-0.527312,-0.141446,0.156920,0.158504,-0.294526,...,0.240763,-0.482353,-0.217940,0.207436,-0.388233,0.309229,-0.049273,0.279448,-0.517590,0.145132
2,-0.512320,0.349702,-0.144986,0.048609,-0.305839,-0.312475,0.363981,-0.257945,0.303378,-0.047669,...,0.168561,-0.435102,0.037227,-0.070618,-1.328073,-0.094030,0.078935,0.497553,-0.309430,0.055513
3,-0.057762,-0.018830,-0.092917,-0.052258,-0.035687,-0.288753,-0.106249,-0.070236,-0.031196,-0.577864,...,0.043803,-0.014047,-0.419157,0.177469,-0.237929,-0.046196,-0.283648,0.121468,-0.381629,0.441846
4,0.049319,0.430291,-0.604225,0.168374,0.357174,-0.338217,0.036753,0.381295,-0.103713,-0.112071,...,0.501387,0.191378,0.366893,-0.074698,-0.865642,-0.133250,-0.208617,0.305176,-0.460901,0.217016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.433964,0.555939,-0.279253,-0.516747,-0.548022,-0.306758,-0.444220,0.070340,-0.136611,0.089117,...,0.175210,0.202791,-0.183425,-0.445449,-0.264967,-0.151364,0.277345,-0.252773,-0.410160,0.226790
207,0.265232,0.191367,-0.633639,-0.131879,-0.174869,-0.209964,0.032052,0.227209,0.008555,-0.034126,...,0.215912,0.132191,0.436630,0.059855,-1.230389,0.010071,-0.056032,0.544022,-0.734884,0.226314
208,0.363033,0.356251,-0.494937,-0.350075,-0.502074,-0.010059,-0.338410,-0.098375,-0.350276,-0.249320,...,0.343773,-0.278054,-0.438971,-0.125448,-0.383149,0.094297,-0.267519,0.268513,-0.314609,0.341245
209,0.530993,0.272302,-0.133036,-0.107412,0.066908,-0.407384,-0.073210,0.203524,-0.123781,-0.032344,...,0.363367,-0.101728,0.019988,-0.516566,-0.560254,0.058668,0.025192,0.317295,-0.226773,-0.204475


In [30]:
print(len(data_list))
print(i)
print(cmplx)
data_list[i][cmplx]

5
213
1


IndexError: list index out of range

In [35]:
for t in range(len(data_list)):
    print(t)
    for cmplx in range(len(data_list[t])):
        print(cmplx)
        print(".")

        df = pd.DataFrame( data_list[t][cmplx] )
        new_df = pd.DataFrame( func.extract_aa_and_energy_terms(data_list[t])[cmplx] )

        df_emb_mhc = pd.DataFrame(data_list_enc_mhc[t][cmplx])
        df_emb_pep = pd.DataFrame(data_list_enc_pep[t][cmplx])
        df_emb_tcr = pd.DataFrame(data_list_enc_tcr[t][cmplx])
        print("len mhc, pep, tcr:\n ", len(df_emb_mhc),"\n", len(df_emb_pep),"\n", len(df_emb_tcr), "--")

        df_emb_mhc['aa'] = '-'
        df_emb_pep['aa'] = '-'    
        df_emb_tcr['aa'] = '-'

        for mhc in range(len(df_emb_mhc)):
            df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))

        for pep in range(len(df_emb_pep)):
            df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))

        print()
        pad_index_list = new_df[new_df.iloc[:,34]=='X'].index.tolist()
        print(pad_index_list)

        for pad in range(len(pad_index_list)):
            if pad_index_list[pad+1]-pad_index_list[pad] > 100:
                tcr_start = pad_index_list[pad] + 1
                tcr_end = pad_index_list[pad+1]
                break
        print("start and end:",tcr_start, tcr_end)

        for tcr in range(len(df_emb_tcr)):
            df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
 
        mhc = pd.concat([df.iloc[:179,20:], df_emb_mhc], axis=1)
        pep = pd.concat([df.iloc[179:188,20:], df_emb_pep], axis=1)
        tcr = pd.concat([(df.iloc[tcr_start:tcr_end,20:]).reset_index(), df_emb_tcr.reset_index()], axis=1)
        
        print("len1_", len(df.iloc[tcr_start:tcr_end,20:]))
        print("len2_", len(df_emb_tcr))
        print("len tcr:\n" , (new_df.iloc[tcr_start:tcr_end,34].tolist()), "\n", (df_emb_tcr['aa'].tolist()) )
        print(tcr)
        print("-----")
        print("-----")
        print("-----")
        print("-----")





0
0
.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['aa'][i] = return_aa(list(df.iloc[i,0:20]))


len mhc, pep, tcr:
  179 
 9 
 214 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 413, 414, 415, 416, 417, 418, 419]
start and end: 199 413
len1_ 214
len2_ 214
len tcr:
 ['E', 'Q', 'S', 'P', 'Q', 'F', 'L', 'S', 'I', 'Q', 'E', 'G', 'E', 'N', 'L', 'T', 'V', 'Y', 'C', 'N', 'S', 'S', 'S', 'V', 'F', 'S', 'S', 'L', 'Q', 'W', 'Y', 'R', 'Q', 'E', 'P', 'G', 'E', 'G', 'P', 'V', 'L', 'L', 'V', 'T', 'V', 'V', 'T', 'G', 'G', 'E', 'V', 'K', 'K', 'L', 'K', 'R', 'L', 'T', 'F', 'Q', 'F', 'G', 'D', 'A', 'R', 'K', 'D', 'S', 'S', 'L', 'H', 'I', 'T', 'A', 'A', 'Q', 'P', 'G', 'D', 'T', 'G', 'L', 'Y', 'L', 'C', 'A', 'G', 'A', 'P', 'G', 'G', 'G', 'S', 'Q', 'G', 'N', 'L', 'I', 'F', 'G', 'K', 'G', 'T', 'K', 'L', 'S', 'V', 'K', 'P', 'G', 'I', 'T', 'Q', 'S', 'P', 'K', 'Y', 'L', 'F', 'R', 'K', 'E', 'G', 'Q', 'N', 'V', 'T', 'L', 'S', 'C', 'E', 'Q', 'N', 'L', 'N', 'H', 'D', 'A', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'D', 'P', 'G', 'Q', 'G', 'L', 'R', 'L', 'I', 'Y', 'Y', 'S', 'Q', 'I', 'V', 'N', 'D', 'F', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 211 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 411, 412, 413, 414, 415, 416, 417, 418, 419]
start and end: 200 411
len1_ 211
len2_ 211
len tcr:
 ['E', 'Q', 'S', 'P', 'Q', 'F', 'L', 'S', 'I', 'Q', 'E', 'G', 'E', 'N', 'L', 'T', 'V', 'Y', 'C', 'N', 'S', 'S', 'S', 'V', 'F', 'S', 'S', 'L', 'Q', 'W', 'Y', 'R', 'Q', 'E', 'P', 'G', 'E', 'G', 'P', 'V', 'L', 'L', 'V', 'T', 'V', 'V', 'T', 'G', 'G', 'E', 'V', 'K', 'K', 'L', 'K', 'R', 'L', 'T', 'F', 'Q', 'F', 'G', 'D', 'A', 'R', 'K', 'D', 'S', 'S', 'L', 'H', 'I', 'T', 'A', 'A', 'Q', 'P', 'G', 'D', 'T', 'G', 'L', 'Y', 'L', 'C', 'A', 'G', 'A', 'G', 'S', 'Q', 'G', 'N', 'L', 'I', 'F', 'G', 'K', 'G', 'T', 'K', 'L', 'S', 'V', 'K', 'P', 'G', 'V', 'S', 'Q', 'N', 'P', 'R', 'H', 'K', 'I', 'T', 'K', 'R', 'G', 'Q', 'N', 'V', 'T', 'F', 'R', 'C', 'D', 'P', 'I', 'S', 'E', 'H', 'N', 'R', 'L', 'Y', 'W', 'Y', 'R', 'Q', 'T', 'L', 'G', 'Q', 'G', 'P', 'E', 'F', 'L', 'T', 'Y', 'F', 'Q', 'N', 'E', 'A', 'Q', 'L', 'E', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 217 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 414, 415, 416, 417, 418, 419]
start and end: 197 414
len1_ 217
len2_ 217
len tcr:
 ['V', 'E', 'Q', 'H', 'P', 'S', 'T', 'L', 'S', 'V', 'Q', 'E', 'G', 'D', 'S', 'A', 'V', 'I', 'K', 'C', 'T', 'Y', 'S', 'D', 'S', 'A', 'S', 'N', 'Y', 'F', 'P', 'W', 'Y', 'K', 'Q', 'E', 'L', 'G', 'K', 'G', 'P', 'Q', 'L', 'I', 'I', 'D', 'I', 'R', 'S', 'N', 'V', 'G', 'E', 'K', 'K', 'D', 'Q', 'R', 'I', 'A', 'V', 'T', 'L', 'N', 'K', 'T', 'A', 'K', 'H', 'F', 'S', 'L', 'H', 'I', 'T', 'E', 'T', 'Q', 'P', 'E', 'D', 'S', 'A', 'V', 'Y', 'F', 'C', 'A', 'A', 'S', 'R', 'K', 'P', 'D', 'K', 'I', 'I', 'F', 'G', 'K', 'G', 'T', 'R', 'L', 'H', 'I', 'L', 'P', 'A', 'G', 'V', 'A', 'Q', 'S', 'P', 'R', 'Y', 'K', 'I', 'I', 'E', 'K', 'R', 'Q', 'S', 'V', 'A', 'F', 'W', 'C', 'N', 'P', 'I', 'S', 'G', 'H', 'A', 'T', 'L', 'Y', 'W', 'Y', 'Q', 'Q', 'I', 'L', 'G', 'Q', 'G', 'P', 'K', 'L', 'L', 'I', 'Q', 'F', 'Q', 'N', 'N', 'G', 'V', 'V', 'D', 'D', 'S', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 217 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 414, 415, 416, 417, 418, 419]
start and end: 197 414
len1_ 217
len2_ 217
len tcr:
 ['T', 'Q', 'T', 'Q', 'P', 'G', 'M', 'F', 'V', 'Q', 'E', 'K', 'E', 'A', 'V', 'T', 'L', 'D', 'C', 'T', 'Y', 'D', 'T', 'S', 'D', 'P', 'S', 'Y', 'G', 'L', 'F', 'W', 'Y', 'K', 'Q', 'P', 'S', 'S', 'G', 'E', 'M', 'I', 'F', 'L', 'I', 'Y', 'Q', 'G', 'S', 'Y', 'D', 'Q', 'Q', 'N', 'A', 'T', 'E', 'G', 'R', 'Y', 'S', 'L', 'N', 'F', 'Q', 'K', 'A', 'R', 'K', 'S', 'A', 'N', 'L', 'V', 'I', 'S', 'A', 'S', 'Q', 'L', 'G', 'D', 'S', 'A', 'M', 'Y', 'F', 'C', 'A', 'M', 'R', 'D', 'S', 'Q', 'G', 'G', 'S', 'E', 'K', 'L', 'V', 'F', 'G', 'K', 'G', 'T', 'K', 'L', 'T', 'V', 'N', 'P', 'A', 'G', 'V', 'T', 'Q', 'T', 'P', 'K', 'F', 'R', 'V', 'L', 'K', 'T', 'G', 'Q', 'S', 'M', 'T', 'L', 'L', 'C', 'A', 'Q', 'D', 'M', 'N', 'H', 'E', 'Y', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'D', 'P', 'G', 'M', 'G', 'L', 'R', 'L', 'I', 'H', 'Y', 'S', 'V', 'G', 'E', 'G', 'T', 'T', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 219 --

[188, 189, 190, 191, 192, 193, 194, 195, 415, 416, 417, 418, 419]
start and end: 196 415
len1_ 219
len2_ 219
len tcr:
 ['Q', 'S', 'V', 'T', 'Q', 'L', 'G', 'S', 'H', 'V', 'S', 'V', 'S', 'E', 'G', 'A', 'L', 'V', 'L', 'L', 'R', 'C', 'N', 'Y', 'S', 'S', 'S', 'V', 'P', 'P', 'Y', 'L', 'F', 'W', 'Y', 'V', 'Q', 'Y', 'P', 'N', 'Q', 'G', 'L', 'Q', 'L', 'L', 'L', 'K', 'Y', 'T', 'S', 'A', 'A', 'T', 'L', 'V', 'K', 'G', 'I', 'N', 'G', 'F', 'E', 'A', 'E', 'F', 'K', 'K', 'S', 'E', 'T', 'S', 'F', 'H', 'L', 'T', 'K', 'P', 'S', 'A', 'H', 'M', 'S', 'D', 'A', 'A', 'E', 'Y', 'F', 'C', 'A', 'V', 'R', 'P', 'N', 'A', 'R', 'L', 'M', 'F', 'G', 'D', 'G', 'T', 'Q', 'L', 'V', 'V', 'K', 'P', 'A', 'G', 'V', 'A', 'Q', 'S', 'P', 'R', 'Y', 'K', 'I', 'I', 'E', 'K', 'R', 'Q', 'S', 'V', 'A', 'F', 'W', 'C', 'N', 'P', 'I', 'S', 'G', 'H', 'A', 'T', 'L', 'Y', 'W', 'Y', 'Q', 'Q', 'I', 'L', 'G', 'Q', 'G', 'P', 'K', 'L', 'L', 'I', 'Q', 'F', 'Q', 'N', 'N', 'G', 'V', 'V', 'D', 'D', 'S', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 220 --

[188, 189, 190, 191, 192, 193, 194, 195, 416, 417, 418, 419]
start and end: 196 416
len1_ 220
len2_ 220
len tcr:
 ['V', 'E', 'Q', 'S', 'P', 'Q', 'S', 'L', 'H', 'V', 'Q', 'E', 'G', 'D', 'S', 'T', 'N', 'F', 'T', 'C', 'S', 'F', 'P', 'S', 'S', 'N', 'F', 'Y', 'A', 'L', 'H', 'W', 'Y', 'R', 'W', 'E', 'T', 'A', 'K', 'S', 'P', 'E', 'A', 'L', 'F', 'V', 'M', 'T', 'L', 'N', 'G', 'D', 'E', 'K', 'K', 'K', 'G', 'R', 'I', 'S', 'A', 'T', 'L', 'N', 'T', 'K', 'E', 'G', 'Y', 'S', 'Y', 'L', 'Y', 'I', 'K', 'G', 'S', 'Q', 'P', 'E', 'D', 'S', 'A', 'T', 'Y', 'L', 'C', 'A', 'F', 'E', 'L', 'I', 'L', 'G', 'A', 'Q', 'K', 'L', 'V', 'F', 'G', 'Q', 'G', 'T', 'R', 'L', 'T', 'I', 'N', 'P', 'A', 'G', 'V', 'T', 'Q', 'T', 'P', 'K', 'F', 'R', 'V', 'L', 'K', 'T', 'G', 'Q', 'S', 'M', 'T', 'L', 'L', 'C', 'A', 'Q', 'D', 'M', 'N', 'H', 'E', 'Y', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'D', 'P', 'G', 'M', 'G', 'L', 'R', 'L', 'I', 'H', 'Y', 'S', 'V', 'G', 'E', 'G', 'T', 'T', 'A', 'K', 'G', 'E', 'V', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 211 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 411, 412, 413, 414, 415, 416, 417, 418, 419]
start and end: 200 411
len1_ 211
len2_ 211
len tcr:
 ['E', 'Q', 'S', 'P', 'Q', 'F', 'L', 'S', 'I', 'Q', 'E', 'G', 'E', 'N', 'L', 'T', 'V', 'Y', 'C', 'N', 'S', 'S', 'S', 'V', 'F', 'S', 'S', 'L', 'Q', 'W', 'Y', 'R', 'Q', 'E', 'P', 'G', 'E', 'G', 'P', 'V', 'L', 'L', 'V', 'T', 'V', 'V', 'T', 'G', 'G', 'E', 'V', 'K', 'K', 'L', 'K', 'R', 'L', 'T', 'F', 'Q', 'F', 'G', 'D', 'A', 'R', 'K', 'D', 'S', 'S', 'L', 'H', 'I', 'T', 'A', 'A', 'Q', 'P', 'G', 'D', 'T', 'G', 'L', 'Y', 'L', 'C', 'A', 'G', 'A', 'W', 'N', 'T', 'G', 'K', 'L', 'I', 'F', 'G', 'Q', 'G', 'T', 'T', 'L', 'Q', 'V', 'K', 'P', 'G', 'I', 'T', 'Q', 'S', 'P', 'K', 'Y', 'L', 'F', 'R', 'K', 'E', 'G', 'Q', 'N', 'V', 'T', 'L', 'S', 'C', 'E', 'Q', 'N', 'L', 'N', 'H', 'D', 'A', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'D', 'P', 'G', 'Q', 'G', 'L', 'R', 'L', 'I', 'Y', 'Y', 'S', 'Q', 'I', 'V', 'N', 'D', 'F', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 211 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 411, 412, 413, 414, 415, 416, 417, 418, 419]
start and end: 200 411
len1_ 211
len2_ 211
len tcr:
 ['E', 'Q', 'S', 'P', 'Q', 'F', 'L', 'S', 'I', 'Q', 'E', 'G', 'E', 'N', 'L', 'T', 'V', 'Y', 'C', 'N', 'S', 'S', 'S', 'V', 'F', 'S', 'S', 'L', 'Q', 'W', 'Y', 'R', 'Q', 'E', 'P', 'G', 'E', 'G', 'P', 'V', 'L', 'L', 'V', 'T', 'V', 'V', 'T', 'G', 'G', 'E', 'V', 'K', 'K', 'L', 'K', 'R', 'L', 'T', 'F', 'Q', 'F', 'G', 'D', 'A', 'R', 'K', 'D', 'S', 'S', 'L', 'H', 'I', 'T', 'A', 'A', 'Q', 'P', 'G', 'D', 'T', 'G', 'L', 'Y', 'L', 'C', 'A', 'G', 'A', 'G', 'S', 'Q', 'G', 'N', 'L', 'I', 'F', 'G', 'K', 'G', 'T', 'K', 'L', 'S', 'V', 'K', 'P', 'G', 'I', 'T', 'Q', 'S', 'P', 'K', 'Y', 'L', 'F', 'R', 'K', 'E', 'G', 'Q', 'N', 'V', 'T', 'L', 'S', 'C', 'E', 'Q', 'N', 'L', 'N', 'H', 'D', 'A', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'V', 'P', 'G', 'Q', 'G', 'L', 'R', 'L', 'I', 'Y', 'Y', 'S', 'H', 'I', 'V', 'N', 'D', 'F', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 213 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 412, 413, 414, 415, 416, 417, 418, 419]
start and end: 199 412
len1_ 213
len2_ 213
len tcr:
 ['E', 'Q', 'S', 'P', 'Q', 'F', 'L', 'S', 'I', 'Q', 'E', 'G', 'E', 'N', 'L', 'T', 'V', 'Y', 'C', 'N', 'S', 'S', 'S', 'V', 'F', 'S', 'S', 'L', 'Q', 'W', 'Y', 'R', 'Q', 'E', 'P', 'G', 'E', 'G', 'P', 'V', 'L', 'L', 'V', 'T', 'V', 'V', 'T', 'G', 'G', 'E', 'V', 'K', 'K', 'L', 'K', 'R', 'L', 'T', 'F', 'Q', 'F', 'G', 'D', 'A', 'R', 'K', 'D', 'S', 'S', 'L', 'H', 'I', 'T', 'A', 'A', 'Q', 'P', 'G', 'D', 'T', 'G', 'L', 'Y', 'L', 'C', 'A', 'G', 'A', 'S', 'N', 'T', 'G', 'K', 'L', 'I', 'F', 'G', 'Q', 'G', 'T', 'T', 'L', 'Q', 'V', 'K', 'P', 'G', 'V', 'S', 'Q', 'N', 'P', 'R', 'H', 'K', 'I', 'T', 'K', 'R', 'G', 'Q', 'N', 'V', 'T', 'F', 'R', 'C', 'D', 'P', 'I', 'S', 'E', 'H', 'N', 'R', 'L', 'Y', 'W', 'Y', 'R', 'Q', 'T', 'L', 'G', 'Q', 'G', 'P', 'E', 'F', 'L', 'T', 'Y', 'F', 'Q', 'N', 'E', 'A', 'Q', 'L', 'E', 'K', 'S', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

len mhc, pep, tcr:
  179 
 9 
 215 --

[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 413, 414, 415, 416, 417, 418, 419]
start and end: 198 413
len1_ 215
len2_ 215
len tcr:
 ['V', 'E', 'Q', 'D', 'P', 'G', 'P', 'L', 'S', 'V', 'P', 'E', 'G', 'A', 'I', 'V', 'S', 'L', 'N', 'C', 'T', 'Y', 'S', 'N', 'S', 'A', 'F', 'Q', 'Y', 'F', 'M', 'W', 'Y', 'R', 'Q', 'Y', 'S', 'R', 'K', 'G', 'P', 'E', 'L', 'L', 'M', 'Y', 'T', 'Y', 'S', 'S', 'G', 'N', 'K', 'E', 'D', 'G', 'R', 'F', 'T', 'A', 'Q', 'V', 'D', 'K', 'S', 'S', 'K', 'Y', 'I', 'S', 'L', 'F', 'I', 'R', 'D', 'S', 'Q', 'P', 'S', 'D', 'S', 'A', 'T', 'Y', 'L', 'C', 'A', 'M', 'G', 'G', 'G', 'G', 'G', 'S', 'Q', 'G', 'N', 'L', 'I', 'F', 'G', 'K', 'G', 'T', 'K', 'L', 'S', 'V', 'K', 'P', 'G', 'I', 'T', 'Q', 'S', 'P', 'K', 'Y', 'L', 'F', 'R', 'K', 'E', 'G', 'Q', 'N', 'V', 'T', 'L', 'S', 'C', 'E', 'Q', 'N', 'L', 'N', 'H', 'D', 'A', 'M', 'Y', 'W', 'Y', 'R', 'Q', 'D', 'P', 'G', 'Q', 'G', 'L', 'R', 'L', 'I', 'Y', 'Y', 'S', 'Q', 'I', 'V', 'N', 'D', 'F', 'Q', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_mhc['aa'][mhc] = func.return_aa(list(df.iloc[i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_pep['aa'][pep] = func.return_aa(list(df.iloc[179+i,0:20]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb_tcr['aa'][tcr] = func.return_aa(list(df.iloc[tcr_start+tcr, 0:20]))


Unnamed: 0,index,20,21,22,23,24,25,26,27,28,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,aa
0,198,-4.695,0.632,0.346,-1.274,0.050,0.000,-2.236,-4.98454,-0.81213,...,-0.261842,0.318959,-0.302438,-0.178821,-0.106577,-0.293007,-0.012541,-0.477113,-0.253735,V
1,199,-3.769,0.152,4.433,-2.937,3.312,0.084,-2.402,-4.98454,-0.81213,...,-0.448362,-0.041220,-0.372262,-0.949059,-0.213172,0.071800,-0.024457,-0.432776,-0.095488,E
2,200,-8.540,0.630,6.725,-1.489,3.055,0.017,-0.984,-4.98454,-0.81213,...,-0.474454,-0.111477,0.194302,-0.275686,-0.038940,-0.050820,0.339075,-0.214406,0.378418,Q
3,201,-2.044,3.280,2.470,0.489,1.763,-0.120,9.697,-4.98454,-0.81213,...,-0.375556,0.068368,-0.016245,-0.748762,-0.120805,0.174512,0.594407,-0.164669,0.188548,D
4,202,-4.787,3.479,3.043,-0.813,1.203,-0.724,6.332,-4.98454,-0.81213,...,-0.175857,-0.013255,0.108006,-0.560611,-0.038123,-0.126484,0.420752,-0.299313,0.181997,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,408,-8.046,0.887,0.266,-0.799,1.518,-0.270,-5.403,-4.98454,-0.81213,...,0.119816,0.074561,-0.218537,-0.979636,0.007139,0.154814,0.040478,-0.454242,0.259208,F
211,409,-3.354,0.651,2.399,-0.706,0.000,-0.452,-1.777,-4.98454,-0.81213,...,-0.106908,-0.103697,-0.497400,-0.655102,-0.215537,0.253768,0.182911,-0.626912,0.472217,G
212,410,-3.382,0.491,2.842,-0.262,0.320,-0.387,-2.719,-4.98454,-0.81213,...,-0.085758,0.209268,-0.173697,-1.047406,0.097230,-0.053223,0.502321,-0.634978,0.180022,P
213,411,-3.481,0.474,3.606,-1.902,0.000,-1.465,-2.393,-4.98454,-0.81213,...,-0.334805,0.034396,-0.172064,-0.708118,0.023359,-0.092624,0.209804,-0.716793,-0.059019,G


[188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 413, 414, 415, 416, 417, 418, 419]
413 198


In [44]:
pd.DataFrame([x.tolist() for x in mhc_enc][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,0.439624,-0.339024,0.246856,-0.106012,-0.090285,0.136046,-0.592692,0.335869,-0.021328,-0.036366,...,0.202188,-0.333468,0.138033,-0.150008,-0.062714,0.090927,-0.037180,-0.182691,-0.504514,-0.504901
1,0.235954,-0.140196,0.363386,0.095183,-0.537544,0.080510,-0.266401,0.289420,0.139821,-0.380183,...,0.129283,-0.311503,0.248841,-0.145652,-1.022484,0.204703,0.293085,-0.313953,-0.657916,-0.038972
2,0.392389,-0.016479,-0.279777,0.090132,0.054507,-0.051145,-0.546321,0.313616,-0.294827,-0.520055,...,-0.177830,-0.566007,-0.711802,0.005774,0.083998,-0.129809,-0.120198,0.102715,-0.685578,-0.413422
3,0.135565,0.039273,-0.064170,-0.328225,-0.507823,0.171422,-0.949577,0.758239,-0.142359,-0.565847,...,0.549012,-0.254741,-0.032258,-0.315260,-0.383384,0.023485,-0.264611,0.135441,-0.425064,-0.720705
4,0.400735,-0.865931,-0.087207,0.079493,-0.528614,-0.178162,-0.345221,0.319983,-0.643379,-0.831840,...,0.029289,-0.412285,-0.554560,-0.260757,0.078211,-0.215760,-0.109324,0.236160,-0.410305,-0.142305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,0.229744,-0.157452,-0.331253,-0.556625,-0.231989,-0.134140,-0.539770,0.066874,-0.176931,-0.455542,...,0.515300,-0.620805,-0.144598,-0.164273,0.150280,0.126577,-0.016019,0.237807,-0.102915,-0.500890
175,0.221406,-0.284619,-0.360284,0.058842,0.212572,-0.281089,-0.573750,0.496373,-0.107100,-0.355994,...,-0.176609,-0.267852,0.033872,0.092982,-0.668183,0.083169,-0.379386,0.292646,-0.369259,0.104223
176,0.101892,0.061703,-0.072025,-0.116071,0.137245,-0.140815,-0.455053,0.067431,0.032182,-0.396088,...,0.143088,-0.159454,0.323822,0.268432,-1.340114,-0.002521,0.187493,-0.089299,-0.220591,-0.065007
177,0.124529,0.024780,-0.000870,0.258847,0.299120,-0.348629,-0.489133,0.346041,0.002367,-0.334790,...,0.070216,-0.043551,0.433854,-0.215745,-0.319264,0.194516,-0.266034,0.093439,0.131698,-0.039140


In [42]:
len()

2

In [None]:
#fixed hyperparameters
features = list(range(54))  #to be redefined
residues = list(range(416))  #to be redefined
num_classes = 1
learning_rate = 0.001
n_features = len(features)
input_size = len(residues)
bat_size = 128
epochs = 100

In [None]:
#data load

X_train = np.concatenate(data_list_enc[ :-1])
y_train = np.concatenate(target_list[:-1])
nsamples, nx, ny = X_train.shape
print("Training set shape:", nsamples,nx,ny)

X_valid = np.concatenate(data_list_enc[-1: ])
y_valid = np.concatenate(target_list[-1: ])
nsamples, nx, ny = X_valid.shape
print("Validation set shape:", nsamples,nx,ny)

# Dataloader
train_ds = []
for i in range(len(X_train)):
    train_ds.append([np.transpose(X_train[i][:,features]), y_train[i]])
val_ds = []
for i in range(len(X_valid)):
    val_ds.append([np.transpose(X_valid[i][:,features]), y_valid[i]])
train_ldr = torch.utils.data.DataLoader(train_ds,batch_size=bat_size, shuffle=True)
val_ldr = torch.utils.data.DataLoader(val_ds,batch_size=bat_size, shuffle=True)


In [None]:
###############################
###    CNN+RNN (thesis)     ###
###############################

if cross_validation == False:
    
    #-------- Initials --------#

    # All features
    features = list(range(54))
    residues = list(range(416))
    input_size = len(residues)
    num_classes = 1
    learning_rate = 0.001
    bat_size = 128
    epochs = 100
    n_features = len(features)
    criterion = nn.BCEWithLogitsLoss()
    

    #-------- Train --------#
    nsamples, nx, ny = X_train.shape
    print("Training set shape:", nsamples,nx,ny)

    nsamples, nx, ny = X_valid.shape
    print("Validation set shape:", nsamples,nx,ny)

    # Dataloader
    train_ds = []
    for i in range(len(X_train)):
        train_ds.append([np.transpose(X_train[i][:,features]), y_train[i]])
    val_ds = []
    for i in range(len(X_valid)):
        val_ds.append([np.transpose(X_valid[i][:,features]), y_valid[i]])
    train_ldr = torch.utils.data.DataLoader(train_ds,batch_size=bat_size, shuffle=True)
    val_ldr = torch.utils.data.DataLoader(val_ds,batch_size=bat_size, shuffle=True)

    # Initialize network
    net = Net_project_simple_CNN_RNN(num_classes=num_classes, 
             n_features=n_features, 
             numHN=numHN, 
             numFilter=numFilter,
             dropOutRate=dropOutRate).to(device)
    
    optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                           weight_decay=0.0005,
                           amsgrad=True,)
    
    train_acc, train_losses, train_auc, valid_acc, valid_losses, valid_auc, val_preds, val_targs = func.train_project(net, optimizer, train_ldr, val_ldr, [], X_valid, epochs, criterion)


    #-------- Performance --------#
    epoch = np.arange(1,len(train_losses)+1)
    plt.figure()
    plt.plot(epoch, train_losses, 'r', epoch, valid_losses, 'b')
    plt.legend(['Train Loss','Validation Loss'])
    plt.xlabel('Epoch'), plt.ylabel('Loss')

    epoch = np.arange(1,len(train_auc)+1)
    plt.figure()
    plt.plot(epoch, train_auc, 'r', epoch, valid_auc, 'b')
    plt.legend(['Train AUC','Validation AUC'])
    plt.xlabel('Epoch'), plt.ylabel('AUC')

    epoch = np.arange(1,len(train_acc)+1)
    plt.figure()
    plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')
    plt.legend(['Train Accuracy','Validation Accuracy'])
    plt.xlabel('Epoch'), plt.ylabel('Acc')
    plt.show()

    
    #-------- Save results --------#

    results = pd.DataFrame(list(zip( (int(x) for x in val_targs), (int(x) for x in val_preds))),columns =['target', 'pred'])
    print(results)

    #results.to_csv('results/df_targets_preds_th.csv'.format(str(date.today())), index=False)
    
    
    #-------- Performance Evaluation --------#
    # The results change every time we train, we should check why (maybe we missed something or did wrong with the seeds?)

    print("AUC: ", roc_auc_score(results['target'], results['pred']))
    print("MCC: ", matthews_corrcoef(results['target'], results['pred']))

    confusion_matrix = pd.crosstab(results['target'], results['pred'], rownames=['Actual'], colnames=['Predicted'])
    sn.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='g')
    plt.show()
    
    # Plot roc curve

    fpr, tpr, thres = roc_curve(results['target'], results['pred'])
    print('AUC: {:.3f}'.format(roc_auc_score(results['target'], results['pred'])))

    print( len([i for i, (a, b) in enumerate(zip(results['pred'], results['target'])) if a != b]))
    
    plt.figure(figsize=(8,6))

    # roc curve
    plt.plot(fpr, tpr, "b", label='ROC Curve')
    plt.plot([0,1],[0,1], "k--", label='Random Guess')
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="best")
    plt.title("ROC curve")

    plt.show()
    
    AUC = roc_auc_score(results['target'], results['pred'])
    MCC = matthews_corrcoef(results['target'], results['pred'])
    print("AUC: ", AUC)
    print("MCC: ", MCC)


In [None]:
#storing values
with mlflow.start_run():
    mlflow.log_param('embedding', embedding) 
    mlflow.log_param('Hidden Neurons', numHN)
    mlflow.log_param('filters CNN', numFilter)
    mlflow.log_param('Dropout rate', dropOutRate)
    mlflow.log_metric('AUC', AUC)
    mlflow.log_metric('MCC', MCC)
    #ADD ARTIFACTS (PLOTS)