# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [98]:
import numpy as np
import Bio as bio
from Bio import SeqIO
import random
import scipy
import sklearn
from sklearn import cross_validation
import os
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

In [22]:
np.random.seed(1337)


In [23]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [24]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [25]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [26]:

positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm


In [27]:
print("positives",  len(positive))
print("negavtive",  len(negative))
x = np.array(positive + negative)
y = np.array([1]*len(positive) + [0]*len(negative))

assert x.shape == y.shape, "There should be one lable for every datapoint"
print("x shape",  x.shape)
print("x shape",  y.shape)

positives 1320
negavtive 1334
x shape (2654,)
x shape (2654,)


In [28]:
X_train_seq, X_test_seq, Y_train, Y_test = cross_validation.train_test_split(x, y, random_state=7, train_size=0.9)

def vector_to_onehot(vector, vocabulary_size):
    matrix = np.zeros((vector.shape[0], vocabulary_size))
    for i, j in enumerate(vector):
        matrix[i, j] = 1
    return matrix 
    
    

In [69]:
    
X_train = [[ord(c)-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_train_seq ]
Z_train = [[ord(c.upper())-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_train_seq ]
X_test = [[ord(c)-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_test_seq ]
Z_test = [[ord(c.upper())-65 for c in str(seq.seq).split('#')[1].strip()] for seq in X_test_seq ]

max_len = [len(s) for s in X_train][-20]
print("Padding sequences to length {}".format(max_len))
X_train_padded = sequence.pad_sequences(X_train, maxlen=max_len)
Z_train_padded = sequence.pad_sequences(X_train, maxlen=max_len)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_len)
Z_test_padded = sequence.pad_sequences(Z_test, maxlen=max_len)

print("X_train_padded shape", X_train_padded.shape)
print("Z_train_padded shape", Z_train_padded.shape)
print("X_test_padded shape", X_test_padded.shape)
print("Z_test_padded shape", Z_test_padded.shape)



Padding sequences to length 143
X_train_padded shape (2388, 143)
Z_train_padded shape (2388, 143)
X_test_padded shape (266, 143)
Z_test_padded shape (266, 143)


In [70]:
X_train_onehot = np.array([ vector_to_onehot(seq, 27) for seq in X_train_padded])
Z_train_onehot = np.array([ vector_to_onehot(seq, 27) for seq in Z_train_padded])
X_test_onehot = np.array([ vector_to_onehot(seq, 27) for seq in X_test_padded])
Z_test_onehot = np.array([ vector_to_onehot(seq, 27) for seq in Z_test_padded])

In [71]:
np.swapaxes(X_train_onehot, 0, 1)
np.swapaxes(Z_train_onehot, 0, 1)
np.swapaxes(X_test_onehot, 0, 1)
np.swapaxes(Z_test_onehot , 0, 1)
print(X_train_onehot.shape)
print(Z_train_onehot.shape)
print(X_test_onehot.shape)
print(Z_test_onehot.shape)

(2388, 143, 27)
(2388, 143, 27)
(266, 143, 27)
(266, 143, 27)


In [102]:
# fix random seed for reproducibility
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_onehot.shape[1], X_train_onehot.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Z_train_onehot.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

model.fit(X_train_onehot, Z_train_onehot, nb_epoch=20, batch_size=128, callbacks=callbacks_list)

model.fit(X, y, nb_epoch=20, batch_size=128, callbacks=callbacks_list)

# Final evaluation of the model
scores = model.evaluate(X_test_onehot, Z_test_onehot, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_37 (LSTM)                   (None, 50)            15600       lstm_input_35[0][0]              
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 50)            0           lstm_37[0][0]                    
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 143)           7293        dropout_4[0][0]                  
Total params: 22893
____________________________________________________________________________________________________
None


Exception: Error when checking model target: expected dense_4 to have 2 dimensions, but got array with shape (2388, 143, 27)

In [75]:
from datetime import datetime
model_json = model.to_json()
with open("../results/seq2seq/{}.json".format(datetime.today().strftime("%d_%m_%Y_%I:%M%p")),  "w+") as json_file:
    json_file.write(model_json)
model.save_weights("../results/seq/{}.h5".format(datetime.today().strftime("%d_%m_%Y_%I:%M%p")))
print("Saved model to disk")

Saved model to disk


In [76]:
pred = model.predict(X_test_onehot, verbose=0)

In [77]:
for i, m in enumerate(np.argmax(pred[5, :,:], axis=1)):
    print("--")
    print(m)
    print(np.argmax(Z_test_onehot[5, i,:]))
    print("--")

--
4
14
--
--
10
14
--
--
19
14
--
--
17
14
--
--
8
14
--
--
6
14
--
--
13
14
--
--
6
14
--
--
21
14
--
--
24
14
--
--
16
14
--
--
5
14
--
--
10
14
--
--
8
14
--
--
6
14
--
--
4
14
--
--
21
14
--
--
21
14
--
--
3
14
--
--
21
14
--
--
8
14
--
--
11
14
--
--
16
14
--
--
13
14
--
--
0
14
--
--
13
14
--
--
12
14
--
--
12
14
--
--
10
14
--
--
4
14
--
--
13
14
--
--
11
14
--
--
18
14
--
--
4
14
--
--
19
14
--
--
7
14
--
--
15
14
--
--
22
14
--
--
7
14
--
--
11
14
--
--
7
14
--
--
6
14
--
--
7
14
--
--
3
14
--
--
5
14
--
--
22
14
--
--
21
14
--
--
11
14
--
--
6
14
--
--
24
14
--
--
6
14
--
--
3
14
--
--
6
14
--
--
10
14
--
--
5
14
--
--
18
14
--
--
0
14
--
--
4
14
--
--
4
14
--
--
4
14
--
--
18
14
--
--
18
14
--
--
11
14
--
--
13
14
--
--
11
14
--
--
10
14
--
--
13
14
--
--
15
14
--
--
15
14
--
--
11
14
--
--
17
14
--
--
13
14
--
--
19
14
--
--
21
14
--
--
21
14
--
--
8
14
--
--
5
14
--
--
15
14
--
--
24
14
--
--
6
14
--
--
22
14
--
--
19
14
--
--
0
14
--
--
8
14
--
--
17
14
--
--
5
14
--
--
