# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [26]:
import numpy as np
import Bio as bio
from Bio import SeqIO
import random
import scipy
import sklearn
from sklearn import cross_validation
from sklearn.preprocessing import OneHotEncoder
import os
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

In [27]:
np.random.seed(1337)


In [28]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [29]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [30]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [31]:

positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm


In [32]:
print("positives",  len(positive))
print("negavtive",  len(negative))
x = np.array(positive + negative)
y = np.array([1]*len(positive) + [0]*len(negative))

assert x.shape == y.shape, "There should be one lable for every datapoint"
print("x shape",  x.shape)
print("x shape",  y.shape)

positives 1320
negavtive 1334
x shape (2654,)
x shape (2654,)


In [33]:
X_train_seq, X_test_seq, Y_train, Y_test = cross_validation.train_test_split(x, y, random_state=7, train_size=0.9)

def int_to_onehot(integer, length):
    ret_v = q
    
    
X_train = [[ord(c)-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_train_seq ]
Z_train = [[ord(c)-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_train_seq ]
X_test = [[ord(c)-65 for c in str(seq.seq).split('#')[0].strip()] for seq in X_test_seq ]
Z_test = [[ord(c)-65 for c in str(seq.seq).split('#')[1].strip()] for seq in X_test_seq ]

max_len = max([len(s) for s in X_train])
top_ten = sorted([len(s) for s in X_train])[-100:]
print("Padding sequences to length {}".format(max_len))
X_train_padded = sequence.pad_sequences(X_train, maxlen=max_len)
Z_train_padded = sequence.pad_sequences(X_train, maxlen=max_len)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_len)
Z_test_padded = sequence.pad_sequences(Z_test, maxlen=max_len)


Padding sequences to length 4563


In [34]:
print(top_ten)

[820, 821, 822, 825, 830, 835, 844, 844, 844, 855, 857, 862, 872, 879, 880, 885, 886, 894, 894, 901, 907, 907, 908, 910, 917, 918, 919, 921, 929, 930, 947, 967, 969, 985, 988, 995, 999, 1001, 1010, 1015, 1015, 1019, 1023, 1023, 1028, 1034, 1039, 1040, 1045, 1049, 1053, 1065, 1070, 1071, 1072, 1077, 1086, 1098, 1101, 1116, 1119, 1164, 1167, 1210, 1231, 1245, 1247, 1257, 1315, 1367, 1401, 1426, 1448, 1473, 1474, 1476, 1524, 1531, 1562, 1581, 1608, 1663, 1669, 1670, 1782, 1807, 1820, 1826, 1853, 1886, 2148, 2179, 2201, 2303, 2332, 2386, 2750, 2813, 3084, 4563]


In [41]:
# fix random seed for reproducibility
model = Sequential()
model.add(Embedding(27, 27, input_length=max_len))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_padded, Y_train, nb_epoch=5, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test_padded, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 4563, 27)      729         embedding_input_2[0][0]          
____________________________________________________________________________________________________
lstm_8 (LSTM)                    (None, 50)            15600       embedding_2[0][0]                
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             51          lstm_8[0][0]                     
Total params: 16380
____________________________________________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 58.27%


In [10]:
from datetime import datetime
model_json = model.to_json()
c_time = datetime.today().strftime("%d_%m_%Y_%I:%M%p")
with open("../results/{}.json".format(c_time,  "w+") as json_file:
    json_file.write(model_json)
model.save_weights("../results/{}.h5".format(c_time))
print("Saved model to disk, timestamped with {}".format(c_time))

Saved model to disk


FileNotFoundError: [Errno 2] No such file or directory: 'model.json'