In [1]:
#-------- Import Libraries --------#
import torch
#import esm
import os
import time
import sys
import random
import pickle
import gc
#import mlflow
import collections
import numpy as np
import pandas as pd
#import torch.nn as nn
#import seaborn as sn
#import matplotlib.pyplot as plt
from datetime import date
#from sklearn.metrics import matthews_corrcoef
#import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
#import torch.nn.functional as F  # All functions that don't have any parameters
#from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

In [2]:
!pip install utils

Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1


In [3]:
import DGraphDTAscripts as sp
import DGraphDTAdata_process as dp

In [19]:
def return_aa(one_hot):
    """
    Map a one-hot encoded residue to its one-letter amino-acid code.

    Parameters
    ----------
    one_hot : list
        Object exposing ``.index`` (e.g. a plain list). The position of the
        first element equal to 1 selects a letter from "ACDEFGHIKLMNPQRSTVWY".

    Returns
    -------
    str
        The selected letter, or 'X' when none can be determined: no element
        equals 1, the position falls outside the 20 known letters, or the
        lookup fails for any other ordinary reason.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    try:
        index = one_hot.index(1)
    except Exception:
        # Best-effort fallback is kept, but KeyboardInterrupt / SystemExit
        # are no longer swallowed as they were with a bare `except:`.
        return 'X'
    # Positions beyond the alphabet are treated as unknown.
    if isinstance(index, int) and 0 <= index < len(alphabet):
        return alphabet[index]
    return 'X'

def reverseOneHot(encoding):
    """
    Converts one-hot encoded array back to string sequence.

    Each row ``encoding[i]`` is turned into a list with ``.tolist()`` and
    decoded with ``return_aa``; rows that decode to 'X' are left out of the
    result.

    Parameters
    ----------
    encoding : indexable of rows
        Must support ``len()`` and integer indexing, and every row must
        provide ``.tolist()`` (presumably a 2-D numpy array -- confirm with
        callers).

    Returns
    -------
    str
        Concatenation of the decoded letters, in row order.
    """
    # Decode every row exactly once, then drop the 'X' placeholders.
    decoded = (return_aa(encoding[i].tolist()) for i in range(len(encoding)))
    return ''.join(aa for aa in decoded if aa != 'X')

def extract_sequences(dataset_X, merge=False, mhc_end=179, pep_end=192, n_aa=20):
    """
    Return DataFrame with MHC, peptide and TCR a/b sequences from
    one-hot encoded complex sequences in dataset X.

    Parameters
    ----------
    dataset_X : iterable of 2-D arrays
        One array per complex; rows are sliced by position and the first
        ``n_aa`` columns are decoded with ``reverseOneHot``.
    merge : bool, default False
        If True, decode all rows of each complex into a single "all" column.
        Otherwise return separate "MHC", "peptide" and "TCR" columns.
    mhc_end : int, default 179
        Row index where the MHC segment ends and the peptide segment starts.
    pep_end : int, default 192
        Row index where the peptide segment ends and the TCR segment starts.
        TODO: the original code questioned whether this should be 190 or 192.
    n_aa : int, default 20
        Number of leading columns holding the one-hot amino-acid encoding.

    Returns
    -------
    pandas.DataFrame
        One row per complex, with column "all" (merge=True) or columns
        "MHC", "peptide", "TCR" (merge=False).
    """
    # Only decode what is actually returned.
    if merge:
        all_sequences = [reverseOneHot(arr[:, 0:n_aa]) for arr in dataset_X]
        return pd.DataFrame({"all": all_sequences})

    mhc_sequences = []
    pep_sequences = []
    tcr_sequences = []
    for arr in dataset_X:
        mhc_sequences.append(reverseOneHot(arr[0:mhc_end, 0:n_aa]))
        pep_sequences.append(reverseOneHot(arr[mhc_end:pep_end, 0:n_aa]))
        tcr_sequences.append(reverseOneHot(arr[pep_end:, 0:n_aa]))

    return pd.DataFrame({"MHC": mhc_sequences,
                         "peptide": pep_sequences,
                         "TCR": tcr_sequences})

In [20]:
#-------- Directories --------#

# Relative root of the data tree; the trailing slash lets the sub-directory
# names below be appended directly.
DATADIR = '02456_TCR-pMHC/data/'

# Sub-directories, all derived from DATADIR.
TRAINDIR = DATADIR + 'train'
VALIDATIONDIR = DATADIR + 'validation'
MATRICES = DATADIR + 'Matrices'

In [21]:
#-------- Unzip Train --------#

try:
    if len(os.listdir(TRAINDIR)) != 0:
        print("{} already unzipped.".format(TRAINDIR))
except:
    !unzip ../data/train.zip -d ../data/train

    
#-------- Unzip Validation --------#

try:
    if len(os.listdir(VALIDATIONDIR)) != 0:
        print("{} already unzipped.".format(VALIDATIONDIR))
except:
    !unzip ../data/validation.zip -d ../data/validation
    
print('Train directory:\n\n', '\n'.join(str(p) for p in os.listdir(TRAINDIR)), '\n\n')
print('Validation directory:\n\n','\n'.join(str(p) for p in os.listdir(VALIDATIONDIR)))

02456_TCR-pMHC/data/train already unzipped.
02456_TCR-pMHC/data/validation already unzipped.
Train directory:

 P1_input.npz
P1_labels.npz
P2_input.npz
P2_labels.npz
P3_input.npz
P3_labels.npz
P4_input.npz
P4_labels.npz
__MACOSX 


Validation directory:

 P4_input.npz
P4_labels.npz
__MACOSX


In [22]:
#-------- Import Dataset --------#

import glob

# One entry per "*input.npz" file: training files first, validation files last.
data_list = []
target_list = []

for data_dir in (TRAINDIR, VALIDATIONDIR):
    # glob returns matches in arbitrary order; sort so the partition order is
    # reproducible (a later cell relies on the validation file being last).
    for fp in sorted(glob.glob(data_dir + "/*input.npz")):
        # Swap only the filename suffix, so an "input" elsewhere in the path
        # cannot be rewritten by accident.
        labels_fp = fp[:-len("input.npz")] + "labels.npz"

        # Context managers close the .npz archives once the arrays are read.
        with np.load(fp) as inputs_npz:
            data = inputs_npz["arr_0"]
        with np.load(labels_fp) as labels_npz:
            targets = labels_npz["arr_0"]

        data_list.append(data)
        target_list.append(targets)

data_partitions = len(data_list)

print("Number of files:", data_partitions)

for i in range(data_partitions):
    print("Size of file", i, len(data_list[i]))

Number of files: 5
Size of file 0 1526
Size of file 1 1168
Size of file 2 1480
Size of file 3 1532
Size of file 4 1532


In [23]:
#-------- Encode ALL --------#

# Number of files processed by the loop below.
count=0

# NOTE(review): `batch` is only printed in this cell -- presumably it is
# consumed further down the notebook; confirm.
batch = 5 #### DEBUGGED

print("batch:", batch)

# Sequence DataFrames for every processed file, in data_list order, so the
# results of earlier files are not lost when x_enc is reassigned.
x_enc_per_file = []

for dataset in data_list[:-1]: ## every file except the last one
    count += 1

    x_enc = extract_sequences(dataset, merge=False)
    x_enc_per_file.append(x_enc)
    print("Sequences are extracted")
    
# x_enc still refers to the last processed file, as before.
print(x_enc)

batch: 5
Sequences are extracted
Sequences are extracted
Sequences are extracted
Sequences are extracted
                                                    MHC    peptide  \
0     GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GLCTLVAML   
1     GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GILGFVFTL   
2     GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GLCTLVAML   
3     GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GILGFVFTL   
4     GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GILGFVFTL   
...                                                 ...        ...   
1527  GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  NLVPMVATV   
1528  GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  NLVPMVATV   
1529  GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GLCTLVAML   
1530  GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GLCTLVAML   
1531  GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...  GILGFVFTL   

                                                    TC