# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [1]:
import numpy as np
import Bio as bio
from Bio import SeqIO
import random
import scipy
from hmmlearn import hmm
import sklearn
from sklearn import cross_validation
import os
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

Using Theano backend.


In [2]:
np.random.seed(1337)


In [3]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [4]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [5]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [6]:
positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm

In [7]:
def get_dataset(dataset):
    x = np.array([[c for c in str(seq.seq).split('#')[0].strip()] for seq in dataset ])
    symbols = { c for seq in x for c in seq}
    symbols_map = { s : i for i, s in enumerate(symbols)}
    #inv_symbols_map = {v: k for k, v in symbols_map.items()}
    x = np.array(list(map(lambda i : list(map(lambda j: symbols_map[j], i)), x)))

    y = [[c for c in str(seq.seq).split('#')[1].strip()] for seq in dataset ]
    states = { c for seq in y for c in seq}
    state_map = { s : i for i, s in enumerate(states)}
    #inv_state_map = {v: k for k, v in state_map.items()}
    y = np.array(list(map(lambda i : list(map(lambda j: state_map[j], i)), y)))

    n_symbols = len(symbols)
    n_states = len(states)
    X_train_seq, X_test_seq, Y_train, Y_test = cross_validation.train_test_split(x, y, random_state=7, train_size=0.6)

    assert len(x) == len(y), "x:{}, y:{}".format(len(x), len(y))
    return X_train_seq, X_test_seq, Y_train, Y_test, n_states, n_symbols


In [8]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    # Estimate initial matrix
    Pi = np.zeros(n_states)
    for state_seq in Y_train:
        Pi[state_seq[0]] += 1
    Pi = Pi/sum(Pi)

    # Estimate transition matrix
    A = np.zeros((n_states, n_states))
    for state_seq in Y_train:
        for i in range(len(state_seq)-1):
            A[state_seq[i], state_seq[i+1]] += 1

    # Normalize transition matrix
    for row in range(n_states):
        A[row] = A[row]/sum(A[row])

    # Estimate emission matrix
    B = np.zeros((n_states, n_symbols))
    for i, seq in enumerate(X_train_seq):
        for j in range(len(seq)):
            B[Y_train[i][j], seq[j]] += 1

    # Normalize emission matrix
    for row in range(n_states):
        B[row] = B[row]/sum(B[row])

    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = Pi
    model.transmat_ = A
    model.emissionprob_ = B
    return model

X_train_seq_p, X_test_seq_p, Y_train_p, Y_test_p, n_states_p, n_symbols_p = get_dataset(positive) 
positive_model = hmm_model(X_train_seq_p, Y_train_p, n_states_p, n_symbols_p)

X_train_seq_n, X_test_seq_n, Y_train_n, Y_test_n, n_states_n, n_symbols_n = get_dataset(negative) 
negative_model = hmm_model(X_train_seq_n, Y_train_n, n_states_n, n_symbols_n)

hit = 0
tot = 0
for seq_n in X_test_seq_n:
    p_score = positive_model.score(np.asmatrix(seq_p[0]).T)
    n_score = negative_model.score(np.asmatrix(seq_p[0]).T)
    if p_score < n_score:
        hit += 1
    tot += 1
print(hit, tot)
print(float(hit)/tot)

118 534
0.2209737827715356


In [9]:
print("s:", s)
print("s2:", s2)

NameError: name 's' is not defined

In [10]:
s = positive_model.score(np.asmatrix(X_seq_n[0]).T)
s2 = negative_model.score(np.asmatrix(X_seq_n[0]).T)
print("s:", s)
print("s2:", s2)

NameError: name 'X_seq_n' is not defined