In [1]:
import pandas as pd 
import os 
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import json

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides the warning only; it does not make chained assignments safe.
pd.options.mode.chained_assignment = None

In [2]:
# Load data/data_for_CV.csv; index_col=None keeps pandas' default positional index
# rather than promoting one of the file's columns to the index.
data = pd.read_csv('data/data_for_CV.csv', index_col=None)

In [3]:
# Read the Excel workbook into `test`; the raw sheet layout is tidied in the next cell.
test = pd.read_excel('data/Identified_Datasheet.xlsx')

In [4]:
# Keep only the two spreadsheet columns that are used, give them the names the rest
# of the notebook expects (`seq`, `label`), and skip the first four rows.
test = (
    test[['Unnamed: 5', 'Total : 181']]
    .rename(columns={'Unnamed: 5': 'seq', 'Total : 181': 'label'})
    .iloc[4:]
)
# Labels are handled as text from here on (they are split on commas in a later cell).
test = test.assign(label=test['label'].apply(str))

In [5]:
# `data` labels are text wrapped in one leading and one trailing character
# (e.g. "[9, 10]"): drop those two characters and parse the ", "-separated integers.
data['label'] = data['label'].apply(lambda text: list(map(int, text[1:-1].split(', '))))
# `test` labels carry no wrapping characters and are split on plain commas.
test['label'] = test['label'].apply(lambda text: list(map(int, text.split(','))))

In [6]:
# Preview the first five rows of `data` after label parsing.
data.head()

Unnamed: 0,seq,label
0,MKHPYGYRWHWLYALVVTLMTALATFSAHAAVTAGPGAWSSQQTWA...,"[9, 10]"
1,MKHPYGYRWHWLYALVVTLMTALATLSAHAAVTAGPGAWSSQQTWA...,"[9, 10]"
2,MKHPYGYRWHWLYALVVTLMTALATFSAHAAVTAGPGAWSSQQTWA...,"[9, 10]"
3,MARRLWRRIAGWLAACVAILCTFPLHAATAGPGAWSSQQTWAADAV...,"[9, 10]"
4,MQLLSRRQWRWLVGFFAALATTSALAAVTPGPGTWSAQQTWAADSV...,"[9, 10]"


In [7]:
def get_new_labels(y):
    """Map every multilabel entry to an integer class id.

    Each element of ``y`` is rendered with ``str`` and the resulting strings are
    label-encoded, so entries with the same string form share one class id.

    Args:
        y: iterable of label collections (e.g. lists of ints).

    Returns:
        Array of integer class ids, one per element of ``y``.
    """
    label_keys = list(map(str, y))
    return LabelEncoder().fit_transform(label_keys)

In [8]:
# Assign every row of `data` to one of five stratified cross-validation folds.
# Stratification uses one class per distinct label combination (get_new_labels).
# A fixed random_state makes the shuffled split reproducible across kernel restarts.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Collect the fold ids positionally in a plain array and attach them in one step.
# The former `data['fold'].iloc[test_index] = i` was a chained assignment: it writes
# through an intermediate Series and does not update `data` when pandas
# Copy-on-Write is active.
fold_ids = np.zeros(len(data), dtype=np.int64)
for i, (train_index, test_index) in enumerate(kf.split(data.seq, get_new_labels(data.label))):
    fold_ids[test_index] = i
data['fold'] = fold_ids



In [9]:
class SequenceEncoding:
    """Encode amino-acid sequences with a lookup table loaded from a JSON file.

    The table for ``encoding_type`` is read from ``<encoding_dir>/<encoding_type>.json``.
    For every type except ``"ProtVec"`` the table is indexed by single residue
    letters; for ``"ProtVec"`` it is indexed by 3-character substrings and must
    provide an ``"<unk>"`` entry for substrings it does not contain.
    """

    encoding_types = ['One_hot_6_bit', 'Binary_5_bit', 'Hydrophobicity_matrix',
                      'Meiler_parameters', 'Acthely_factors', 'PAM250', 'BLOSUM62', 'Miyazawa_energies',
                      'Micheletti_potentials', 'AESNN3', 'ANN4D', 'ProtVec']
    residue_types = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','X']

    def __init__(self, encoding_type="One_hot_6_bit", encoding_dir="../../nasdatafolder/encodings"):
        """Load the lookup table for ``encoding_type``.

        Args:
            encoding_type: one of ``SequenceEncoding.encoding_types``. The previous
                default ``"One_hot"`` was not a member of that list, so calling the
                constructor without arguments always raised.
            encoding_dir: directory holding the ``<encoding_type>.json`` files.

        Raises:
            ValueError: if ``encoding_type`` is not a known encoding.
            OSError: if the JSON file cannot be opened.
        """
        if encoding_type not in SequenceEncoding.encoding_types:
            # ValueError is still caught by callers handling the generic Exception.
            raise ValueError("Encoding type \'%s\' not found" % encoding_type)
        self.encoding_type = encoding_type
        with open(os.path.join(encoding_dir, "%s.json" % self.encoding_type), 'r') as load_f:
            self.encoding = json.load(load_f)

    def get_ProtVec_encoding(self, ProtVec, seq):
        """Look up every overlapping 3-character window of ``seq`` in ``ProtVec``.

        Windows missing from ``ProtVec`` use its ``"<unk>"`` entry. A sequence
        shorter than three characters yields an empty list.

        Returns:
            List with ``len(seq) - 2`` table entries (empty if that is not positive).
        """
        windows = (seq[i:i + 3] for i in range(len(seq) - 2))
        # Look "<unk>" up lazily so a table without it still works when every window is known.
        return [ProtVec[w] if w in ProtVec else ProtVec["<unk>"] for w in windows]

    def get_encoding(self, seq):
        """Encode ``seq`` and return the result as a NumPy array.

        For ``"ProtVec"`` the sequence is encoded window-wise; otherwise each
        character is looked up individually, with any character outside
        ``residue_types`` treated as ``"X"``.
        """
        if self.encoding_type == "ProtVec":
            encoding_data = self.get_ProtVec_encoding(self.encoding, seq)
        else:
            known = SequenceEncoding.residue_types
            encoding_data = [self.encoding[res if res in known else "X"] for res in seq]

        return np.array(encoding_data)

In [10]:
# Number of rows every encoded sequence is zero-padded to.
MAX_ENCODED_LEN = 1000

# NOTE(review): the original loop listed all twelve encoding names but ended in an
# unconditional `break`, so only the first one ('One_hot_6_bit') was ever exported.
# That behaviour is kept, but made explicit here: assign
# SequenceEncoding.encoding_types to export every encoding.
ENCODINGS_TO_EXPORT = SequenceEncoding.encoding_types[:1]

# The label encoders do not depend on the sequence encoding, so fit them once.
label_encoder_13 = OneHotEncoder().fit(np.arange(13).reshape(-1, 1))
label_encoder_14 = OneHotEncoder().fit(np.arange(14).reshape(-1, 1))


def _pad_to_max_len(matrix):
    """Zero-pad a 2-D array along axis 0 up to MAX_ENCODED_LEN rows.

    np.pad rejects negative pad widths, so an input with more than
    MAX_ENCODED_LEN rows raises ValueError.
    """
    return np.pad(matrix, [(0, MAX_ENCODED_LEN - len(matrix)), (0, 0)], 'constant', constant_values=0)


def _first_label_one_hot(fitted_encoder, labels):
    """One-hot encode ``labels`` and return the row belonging to the first label.

    NOTE(review): only row 0 is kept, so a multi-label entry such as [9, 10] is
    encoded as label 9 alone. Confirm this is intended; a multi-hot vector would
    need e.g. ``.toarray().max(axis=0)``.
    """
    return fitted_encoder.transform(np.array(labels).reshape(-1, 1)).toarray()[0]


for encoding in ENCODINGS_TO_EXPORT:
    enc = SequenceEncoding(encoding)

    # Encode and pad the sequences of both frames.
    new_data = data.copy()
    new_test = test.copy()
    new_data.seq = new_data.seq.apply(lambda s: _pad_to_max_len(enc.get_encoding(s)))
    new_test.seq = new_test.seq.apply(lambda s: _pad_to_max_len(enc.get_encoding(s)))

    # Binary and combined variants start from the encoded frame with raw labels.
    new_binary = new_data.copy()
    new_combined = new_data.copy()

    # Binary target: 0 when label 13 is present, 1 otherwise.
    new_binary['label'] = new_binary.label.apply(lambda x: 0 if 13 in x else 1)

    # The 13-class variant keeps only rows without label 13.
    new_data = new_data[new_data.label.apply(lambda x: 13 not in x)].copy()
    new_data.label = new_data.label.apply(lambda x: _first_label_one_hot(label_encoder_13, x))

    # The combined and test variants use the 14-class encoder (label 13 included).
    new_combined.label = new_combined.label.apply(lambda x: _first_label_one_hot(label_encoder_14, x))
    new_test.label = new_test.label.apply(lambda x: _first_label_one_hot(label_encoder_14, x))

    exports = (
        ('labels', new_data),
        ('binary', new_binary),
        ('combined', new_combined),
        ('test', new_test),
    )
    for suffix, frame in exports:
        np.save(f'./data/{encoding}_{suffix}.npy', frame.to_numpy())

In [11]:
transformer_data = data.copy()
transformer_test = test.copy()

# NOTE(security): pickle.load can execute arbitrary code while loading; only use a
# tokenizer file that comes from a trusted source.
with open('../tokenizer.pickle', 'rb') as t:
    tokenizer = pickle.load(t)

# Every exported id list has exactly this many entries.
TRANSFORMER_SEQ_LEN = 1000
# Id placed in front of every sequence.
# NOTE(review): presumably a start/[CLS]-style marker; confirm against the tokenizer vocabulary.
PREFIX_TOKEN_ID = 22


def _to_fixed_length_ids(token_ids):
    """Prefix ``token_ids`` with PREFIX_TOKEN_ID and zero-pad to TRANSFORMER_SEQ_LEN.

    Inputs with more than TRANSFORMER_SEQ_LEN - 1 ids are truncated. The original
    padding left them untouched, so a 1000-token input produced a 1001-long row
    and the exported array became ragged.
    """
    body = token_ids[:TRANSFORMER_SEQ_LEN - 1]
    return [PREFIX_TOKEN_ID] + body + [0] * (TRANSFORMER_SEQ_LEN - 1 - len(body))


transformer_data.seq = tokenizer.texts_to_sequences(transformer_data.seq)
transformer_data.seq = transformer_data.seq.apply(_to_fixed_length_ids)
transformer_test.seq = tokenizer.texts_to_sequences(transformer_test.seq)
transformer_test.seq = transformer_test.seq.apply(_to_fixed_length_ids)

# Binary and combined variants start from the tokenised frame with raw labels.
transformer_binary = transformer_data.copy()
transformer_combined = transformer_data.copy()

# Binary target: 0 when label 13 is present, 1 otherwise.
transformer_binary['label'] = transformer_binary.label.apply(lambda x: 0 if 13 in x else 1)

# The 13-class variant keeps only rows without label 13.
transformer_data = transformer_data[transformer_data.label.apply(lambda x: 13 not in x)].copy()

# NOTE(review): `.toarray()[0]` keeps only the row of the first label, so a multi-label
# entry such as [9, 10] is encoded as label 9 alone. Confirm this is intended.
encoder = OneHotEncoder()
encoder.fit(np.arange(13).reshape(-1, 1))
transformer_data.label = transformer_data.label.apply(
    lambda x: encoder.transform(np.array(x).reshape(-1, 1)).toarray()[0])

# The combined and test variants use a 14-class encoder (label 13 included).
encoder = OneHotEncoder()
encoder.fit(np.arange(14).reshape(-1, 1))
transformer_combined.label = transformer_combined.label.apply(
    lambda x: encoder.transform(np.array(x).reshape(-1, 1)).toarray()[0])
transformer_test.label = transformer_test.label.apply(
    lambda x: encoder.transform(np.array(x).reshape(-1, 1)).toarray()[0])

# Convert each frame to a NumPy array and write it out.
transformer_binary = transformer_binary.to_numpy()
np.save('./data/transformer_binary.npy', transformer_binary)

transformer_data = transformer_data.to_numpy()
np.save('./data/transformer_labels.npy', transformer_data)

transformer_combined = transformer_combined.to_numpy()
np.save('./data/transformer_combined.npy', transformer_combined)

transformer_test = transformer_test.to_numpy()
np.save('./data/transformer_test.npy', transformer_test)

In [12]:
# Convert the sequences of both frames to id lists with the tokenizer, then right-pad
# each list with zeros up to 1000 entries (lists that are already longer are left as is).
for frame in (data, test):
    frame['seq'] = tokenizer.texts_to_sequences(frame['seq'])
    frame['seq'] = frame['seq'].apply(lambda ids: ids + [0] * (1000 - len(ids)))

In [13]:
# One-hot encoder over the ids 1..21. With handle_unknown='ignore' any other id
# (including the 0 used for padding) becomes an all-zero row.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(np.arange(1, 22).reshape(-1, 1))


def _one_hot_ids(ids):
    """Return a dense array with one one-hot row per id in ``ids``."""
    column = np.array(ids).reshape(-1, 1)
    return encoder.transform(column).toarray()


data['seq'] = data['seq'].apply(_one_hot_ids)
test['seq'] = test['seq'].apply(_one_hot_ids)

In [14]:
# Copy `data` twice so the binary and combined variants can have their label column
# replaced without changing the label column of `data`.
binary = data.copy()
combined = data.copy()

In [15]:
# Binary target: 0 when label 13 is among the row's labels, 1 otherwise.
binary['label'] = binary['label'].apply(lambda labels: int(13 not in labels))

In [16]:
# Preview the first five rows of `binary` to check the 0/1 labels.
binary.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,1


In [17]:
# Helper column: 1 for rows whose labels do not include 13, 0 for rows that do.
data['positive'] = data['label'].apply(lambda labels: int(13 not in labels))

In [18]:
# Keep only the rows flagged positive, then discard the helper column.
is_positive = data['positive'] == 1
data = data.loc[is_positive].drop(columns='positive')

In [19]:
# Preview the first five rows of `data` after removing rows that contain label 13.
data.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",1


In [20]:
# One-hot encoder over the 13 label ids 0..12 (rows containing label 13 were removed above).
encoder = OneHotEncoder()
encoder.fit(np.arange(13).reshape(-1, 1))


def _encode_first_label_13(labels):
    """One-hot encode ``labels`` and return the row of the first label.

    NOTE(review): only row 0 is kept, so a multi-label entry such as [9, 10] is
    encoded as label 9 alone. Confirm this is intended.
    """
    one_hot_rows = encoder.transform(np.array(labels).reshape(-1, 1)).toarray()
    return one_hot_rows[0]


data['label'] = data['label'].apply(_encode_first_label_13)

In [21]:
# Preview the first five rows of `data` now that the labels are one-hot vectors.
data.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [22]:
# Show `binary` again: it is a separate copy, so its 0/1 labels are unchanged by the encoding of `data`.
binary.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,1


In [23]:
# Show the first five rows of `data` once more (same frame as displayed two cells above).
data.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [24]:
# Preview `combined`, which at this point still holds the raw integer label lists.
combined.head()

Unnamed: 0,seq,label,fold
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",4
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",0
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",3
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",2
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[9, 10]",1


In [25]:
# Convert the binary-label frame to a NumPy array and write it to disk.
binary = binary.to_numpy()
np.save('./data/one_hot_binary.npy', binary)

In [26]:
# Convert the 13-class frame to a NumPy array and write it to disk.
data = data.to_numpy()
np.save('./data/one_hot_labels.npy', data)

In [27]:
# One-hot encoder over the 14 label ids 0..13, used for the combined and test frames.
encoder = OneHotEncoder()
encoder.fit(np.arange(14).reshape(-1, 1))


def _encode_first_label_14(labels):
    """One-hot encode ``labels`` and return the row of the first label.

    NOTE(review): only row 0 is kept, so a multi-label entry such as [9, 10] is
    encoded as label 9 alone. Confirm this is intended.
    """
    one_hot_rows = encoder.transform(np.array(labels).reshape(-1, 1)).toarray()
    return one_hot_rows[0]


combined['label'] = combined['label'].apply(_encode_first_label_14)
test['label'] = test['label'].apply(_encode_first_label_14)

In [28]:
# Convert the 14-class combined frame to a NumPy array and write it to disk.
combined = combined.to_numpy()
np.save('./data/one_hot_combined.npy', combined)

In [29]:
# Convert the encoded test frame to a NumPy array and write it to disk.
test = test.to_numpy()
np.save('./data/one_hot_test.npy', test)