# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [1]:
import numpy as np
import Bio as bio
from Bio import SeqIO
import random
import scipy
from hmmlearn import hmm
import sklearn
from sklearn import cross_validation
import os
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [2]:
np.random.seed(1337)


In [3]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [4]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [5]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [64]:
positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm

In [82]:
def get_dataset(dataset, seq_state):
    x = np.array([[c for c in str(seq.seq).split('#')[0].strip()] for seq in dataset ])
    symbols = { c for seq in x for c in seq}
    symbols_map = { s : i for i, s in enumerate(symbols)}
    x = np.array(list(map(lambda i : list(map(lambda j: symbols_map[j], i)), x)))

    y = [[c for c in str(seq.seq).split('#')[1].strip()] for seq in dataset ]
    states = { c for seq in y for c in seq}
    state_map = { s : i for i, s in enumerate(states)}
    y = np.array(list(map(lambda i : list(map(lambda j: state_map[j], i)), y)))
    
    n_symbols = len(symbols)
    n_states = len(states)

    z = np.array([y, seq_state]).T
    X_train_seq, X_test_seq, Z_train, Z_test = cross_validation.train_test_split(x, z, random_state=7, train_size=0.6)

    assert len(x) == len(y), "x:{}, y:{}".format(len(x), len(y))
    return X_train_seq, X_test_seq, Z_train, Z_test, n_states, n_symbols, state_map


In [86]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    # Estimate initial matrix
    Pi = np.zeros(n_states)
    for state_seq in Y_train:
        Pi[state_seq[0]] += 1
    Pi = Pi/sum(Pi)

    # Estimate transition matrix
    A = np.zeros((n_states, n_states))
    for state_seq in Y_train:
        for i in range(len(state_seq)-1):
            A[state_seq[i], state_seq[i+1]] += 1

    # Normalize transition matrix
    for row in range(n_states):
        A[row] = A[row]/sum(A[row])

    # Estimate emission matrix
    B = np.zeros((n_states, n_symbols))
    for i, seq in enumerate(X_train_seq):
        for j in range(len(seq)):
            B[Y_train[i][j], seq[j]] += 1

    # Normalize emission matrix
    for row in range(n_states):
        B[row] = B[row]/sum(B[row])
    
    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = Pi
    model.transmat_ = A
    model.emissionprob_ = B
    return model

dataset = np.array(positive + negative)
seq_state = np.array([1]*len(positive) + [0]*len(negative))

X_train_seq, X_test_seq, Z_train, Z_test, n_states, n_symbols, state_map = get_dataset(dataset, seq_state) 
model = hmm_model(X_train_seq, Z_train.T[0], n_states, n_symbols)

hit = 0
tot = 0
for i, seq in enumerate(X_test_seq):
    if Z_test[i][1] == 1:
        if state_map['C'] in positive_negative_model.predict(np.asmatrix(seq).T):
            hit += 1
    else:
        if state_map['C'] not in positive_negative_model.predict(np.asmatrix(seq).T):
            hit += 1
    tot += 1
print(float(hit)/tot)

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  n_samples, n_components, np.log(self.startprob_),
  np.log(self.transmat_), framelogprob)


0.9227871939736346


In [53]:
def get_dataset(dataset):
    x = np.array([[c for c in str(seq.seq).split('#')[0].strip()] for seq in dataset ])
    symbols = { c for seq in x for c in seq}
    symbols_map = { s : i for i, s in enumerate(symbols)}
    x = np.array(list(map(lambda i : list(map(lambda j: symbols_map[j], i)), x)))

    y = [[c for c in str(seq.seq).split('#')[1].strip()] for seq in dataset ]
    states = { c for seq in y for c in seq}
    state_map = { s : i for i, s in enumerate(states)}
    y = np.array(list(map(lambda i : list(map(lambda j: state_map[j], i)), y)))

    n_symbols = len(symbols)
    n_states = len(states)
    X_train_seq, X_test_seq, Y_train, Y_test = cross_validation.train_test_split(x, y, random_state=7, train_size=0.6)

    assert len(x) == len(y), "x:{}, y:{}".format(len(x), len(y))
    return X_train_seq, X_test_seq, Y_train, Y_test, n_states, n_symbols, state_map


In [63]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    # Estimate initial matrix
    Pi = np.zeros(n_states)
    for state_seq in Y_train:
        Pi[state_seq[0]] += 1
    Pi = Pi/sum(Pi)

    # Estimate transition matrix
    A = np.zeros((n_states, n_states))
    for state_seq in Y_train:
        for i in range(len(state_seq)-1):
            A[state_seq[i], state_seq[i+1]] += 1

    # Normalize transition matrix
    for row in range(n_states):
        A[row] = A[row]/sum(A[row])

    # Estimate emission matrix
    B = np.zeros((n_states, n_symbols))
    for i, seq in enumerate(X_train_seq):
        for j in range(len(seq)):
            B[Y_train[i][j], seq[j]] += 1

    # Normalize emission matrix
    for row in range(n_states):
        B[row] = B[row]/sum(B[row])
    
    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = Pi
    model.transmat_ = A
    model.emissionprob_ = B
    return model

def accuracy(dataset, model_1, model_2):
    hit = 0
    tot = 0
    for seq in dataset:
        score_1 = model_1.score(np.asmatrix(seq[0]).T)
        score_2 = model_2.score(np.asmatrix(seq[0]).T)
        if score_1 > score_2:
            hit += 1
        tot += 1
    return float(hit)/tot

X_train_seq_p, X_test_seq_p, Y_train_p, Y_test_p, n_states_p, n_symbols_p, state_map_p = get_dataset(positive) 
positive_model = hmm_model(X_train_seq_p, Y_train_p, n_states_p, n_symbols_p)

X_train_seq_n, X_test_seq_n, Y_train_n, Y_test_n, n_states_n, n_symbols_n, state_map_n = get_dataset(negative) 
negative_model = hmm_model(X_train_seq_n, Y_train_n, n_states_n, n_symbols_n)

print("True positive:", accuracy(X_train_seq_p, positive_model, negative_model))
print("True negative:", accuracy(X_train_seq_n, negative_model, positive_model))

X_train_seq_pn, X_test_seq_pn, Y_train_pn, Y_test_pn, n_states_pn, n_symbols_pn, state_map_pn = get_dataset(positive + negative) 
positive_negative_model = hmm_model(X_train_seq_pn, Y_train_pn, n_states_pn, n_symbols_pn)
print(state_map_pn)


hit = 0
tot = 0
for seq_p in X_test_seq_p:
    if state_map_pn['C'] in positive_negative_model.predict(np.asmatrix(seq_p).T):
        hit += 1
    tot += 1

for seq_n in X_test_seq_n:
    if state_map_pn['C'] not in positive_negative_model.predict(np.asmatrix(seq_n).T):
        hit += 1
    tot += 1
    
print(float(hit)/tot)

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


True positive: 1.0
True negative: 0.25125
{'n': 0, 'o': 3, 'h': 4, 'C': 5, 'O': 6, 'i': 1, 'c': 7, 'M': 2}


  n_samples, n_components, np.log(self.startprob_),
  np.log(self.transmat_), framelogprob)


0.9322033898305084
