# Project

Before running this notebook, make sure that all prerequisites are installed by running:

$ pip install -r requirements.txt



## Import modules

In [3]:
import os
import random

import Bio as bio
import numpy as np
import scipy
import sklearn
from Bio import SeqIO
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn import cross_validation

Using Theano backend.


In [4]:
# Fix the random seeds up front so that repeated runs draw the same numbers.
# Both NumPy's global generator and the stdlib `random` module are seeded,
# since both modules are imported by this notebook.
SEED = 1337
np.random.seed(SEED)
random.seed(SEED)


In [5]:
# All training data lives under one root; each class (positive / negative)
# has a "tm" and a "non_tm" sub-directory.
TRAINING_DATA_DIR = "../data/training_data"

POSITIVE_TM_PATH = TRAINING_DATA_DIR + "/positive_examples/tm"
POSITIVE_NON_TM_PATH = TRAINING_DATA_DIR + "/positive_examples/non_tm"
NEGATIVE_TM_PATH = TRAINING_DATA_DIR + "/negative_examples/tm"
NEGATIVE_NON_TM_PATH = TRAINING_DATA_DIR + "/negative_examples/non_tm"

In [6]:
def load_fasta_records(directory):
    """Parse every file in ``directory`` as FASTA and return all records.

    Parameters
    ----------
    directory : str
        Path of a directory whose entries are all FASTA files.

    Returns
    -------
    list
        The records yielded by ``SeqIO.parse`` for each file, concatenated
        in file-name order.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and therefore the seeded train/test split) stable across runs
    # and machines.
    for file_name in sorted(os.listdir(directory)):
        records.extend(SeqIO.parse(os.path.join(directory, file_name), "fasta"))
    return records


positive_tm = load_fasta_records(POSITIVE_TM_PATH)
positive_non_tm = load_fasta_records(POSITIVE_NON_TM_PATH)
negative_tm = load_fasta_records(NEGATIVE_TM_PATH)
negative_non_tm = load_fasta_records(NEGATIVE_NON_TM_PATH)

In [7]:
# Report how many records were loaded from each of the four directories.
for label, records in [
    ("positives, tm:", positive_tm),
    ("positives, non_tm:", positive_non_tm),
    ("negavtive, tm:", negative_tm),
    ("negavtive, non_tm:", negative_non_tm),
]:
    print(label, len(records))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [8]:

# Merge the "tm" and "non_tm" records of each class into one list per class,
# keeping the "tm" records first.
positive = list(positive_tm)
positive.extend(positive_non_tm)

negative = list(negative_tm)
negative.extend(negative_non_tm)


In [9]:
print("positives",  len(positive))
print("negavtive",  len(negative))

# x holds the records (positives first, then negatives); y holds the matching
# labels: 1 for every positive record, 0 for every negative record.
x = np.array(positive + negative)
y = np.array([1]*len(positive) + [0]*len(negative))

assert x.shape == y.shape, "There should be one lable for every datapoint"
print("x shape",  x.shape)
# This line reports y, so it is labelled "y shape" (it previously said "x shape").
print("y shape",  y.shape)

positives 1320
negavtive 1334
x shape (2654,)
x shape (2654,)


In [63]:
# Keep 90% of the records for training and hold out the rest for testing;
# the fixed random_state makes the shuffle repeatable.
X_train_seq, X_test_seq, Y_train, Y_test = cross_validation.train_test_split(
    x,
    y,
    train_size=0.9,
    random_state=7,
)

def vector_to_onehot(vector, vocabulary_size):
    """Convert a 1-D vector of integer class indices into a one-hot matrix.

    Parameters
    ----------
    vector : numpy.ndarray
        1-D integer array; every entry must be a valid column index for a
        matrix with ``vocabulary_size`` columns.
    vocabulary_size : int
        Number of columns (classes) of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(vector.shape[0], vocabulary_size)`` that is
        zero everywhere except for a 1 in each row ``i`` at column
        ``vector[i]``.

    Raises
    ------
    IndexError
        If an entry of ``vector`` is out of range for ``vocabulary_size``.
    """
    length = vector.shape[0]
    matrix = np.zeros((length, vocabulary_size))
    # A single fancy-indexed assignment sets one cell per row and replaces the
    # per-element Python loop, which is slow for long padded sequences.
    matrix[np.arange(length), vector] = 1
    return matrix
    
    

In [69]:
    
def encode_field(records, field_index, uppercase=False):
    """Encode one '#'-separated field of each record as a list of integers.

    The string form of ``record.seq`` is split on '#', the field at
    ``field_index`` is stripped of surrounding whitespace, and every character
    is mapped to ``ord(character) - 65`` (so 'A' becomes 0).

    Parameters
    ----------
    records : iterable
        Objects exposing a ``seq`` attribute (here: the parsed FASTA records).
    field_index : int
        Which '#'-separated field to encode.
    uppercase : bool
        If true, each character is upper-cased before it is encoded.

    Returns
    -------
    list of list of int
        One integer list per record, in input order.
    """
    encoded = []
    for record in records:
        text = str(record.seq).split('#')[field_index].strip()
        encoded.append([ord(c.upper() if uppercase else c) - 65 for c in text])
    return encoded


# X is built from field 0 and Z from field 1 of every record.
# Z_train previously read field 0 (a copy-paste slip); it now reads field 1,
# exactly like Z_test.
X_train = encode_field(X_train_seq, 0)
Z_train = encode_field(X_train_seq, 1, uppercase=True)
X_test = encode_field(X_test_seq, 0)
Z_test = encode_field(X_test_seq, 1, uppercase=True)

# The padding length is taken from the training inputs only.
# NOTE(review): pad_sequences pads with 0 by default, which is also the code
# produced for 'A' (ord('A') - 65 == 0), and it truncates anything longer than
# maxlen -- confirm both are acceptable for this data.
max_len = max(len(s) for s in X_train)
print("Padding sequences to length {}".format(max_len))
X_train_padded = sequence.pad_sequences(X_train, maxlen=max_len)
# Z_train_padded previously padded X_train again; it must pad Z_train.
Z_train_padded = sequence.pad_sequences(Z_train, maxlen=max_len)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_len)
Z_test_padded = sequence.pad_sequences(Z_test, maxlen=max_len)

print("X_train_padded shape", X_train_padded.shape)
print("Z_train_padded shape", Z_train_padded.shape)
print("X_test_padded shape", X_test_padded.shape)
print("Z_test_padded shape", Z_test_padded.shape)

Padding sequences to length 4563
X_train_padded shape (2388, 4563)
Z_train_padded shape (2388, 4563)
X_test_padded shape (266, 4563)
Z_test_padded shape (266, 4563)


In [73]:
# Number of columns of every one-hot row.
VOCABULARY_SIZE = 27


def onehot_batch(padded):
    """One-hot encode every row of ``padded`` and stack the results into one array."""
    return np.array([vector_to_onehot(row, VOCABULARY_SIZE) for row in padded])


X_train_onehot = onehot_batch(X_train_padded)
Z_train_onehot = onehot_batch(Z_train_padded)
X_test_onehot = onehot_batch(X_test_padded)
Z_test_onehot = onehot_batch(Z_test_padded)

In [75]:
# Show the shape of each one-hot tensor, in the order train X, train Z, test X, test Z.
for onehot in (X_train_onehot, Z_train_onehot, X_test_onehot, Z_test_onehot):
    print(onehot.shape)

(2388, 4563, 27)
(2388, 4563, 27)
(266, 4563, 27)
(266, 4563, 27)


In [10]:
# fix random seed for reproducibility
model = Sequential()
model.add(LSTM(50, input_shape = (4563, 27)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_padded, Y_train, nb_epoch=5, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test_padded, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 4563, 27)      729         embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            15600       embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             51          lstm_1[0][0]                     
Total params: 16380
____________________________________________________________________________________________________
None


KeyboardInterrupt: 

In [None]:
from datetime import datetime

RESULTS_DIR = "../results"
# Make sure the target directory exists before writing into it.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Compute the timestamp once so that the architecture (.json) and the weights
# (.h5) always share the same base name, even if the minute changes between
# the two writes.
# NOTE(review): the ':' in the timestamp is not a valid file-name character on
# Windows -- change the format if the notebook must run there.
timestamp = datetime.today().strftime("%d_%m_%Y_%I:%M%p")
base_path = "{}/{}".format(RESULTS_DIR, timestamp)

with open(base_path + ".json", "w") as json_file:
    json_file.write(model.to_json())
model.save_weights(base_path + ".h5")
print("Saved model to disk")

In [11]:
# fix random seed for reproducibility
model = Sequential()
model.add(Embedding(27, 27, input_length=max_len))
model.add(LSTM(50))
model.add(Dense(max_len, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_padded, Z_train, nb_epoch=5, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test_padded, Z_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 4563, 27)      729         embedding_input_2[0][0]          
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 50)            15600       embedding_2[0][0]                
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 4563)          232713      lstm_2[0][0]                     
Total params: 249042
____________________________________________________________________________________________________
None


Exception: Error when checking model target: expected dense_2 to have shape (None, 4563) but got array with shape (2388, 1)