# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [1]:
import os
import sklearn
import Bio as bio
import numpy as np
from Bio import SeqIO
from hmmlearn import hmm
from sklearn import cross_validation

In [2]:
SEED = 1337
np.random.seed(SEED)


In [3]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [4]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [5]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [6]:
positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm

In [30]:
def encode_data(x, symbols_map=False):
    
    symbols = { c for seq in x for c in seq}
    symbols.update('*')
    if not symbols_map:
        symbols_map = { s : i for i, s in enumerate(symbols)}
    x = np.array(list(map(lambda i : list(map(lambda j: symbols_map[j], i)), x)))
    n_symbols = len(symbols)

    return x, n_symbols, symbols_map

def train_and_test_encode(dataset, labels):
    x = [[c for c in str(seq.seq).split('#')[0].strip()] for seq in dataset]
    x_e, n_symbols, symbols_map = encode_data(x)
    z = [[c for c in str(seq.seq).split('#')[1].strip()] for seq in dataset]
    z_e, n_states, states_map = encode_data(z)
    
    X_train, X_test, Z_train, Z_test, train_labels, test_labels = \
        cross_validation.train_test_split(x_e, z_e, labels, random_state=SEED, train_size=.9)
    return X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map
    
    

In [31]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    
    ε = 1e-10
    
    # Estimate initial matrix
    Pi = np.zeros(n_states) + ε
    for state_seq in Y_train:
        Pi[state_seq[0]] += 1
    Pi = Pi/sum(Pi)

    # Estimate transition matrix
    A = np.zeros((n_states, n_states)) + ε
    for state_seq in Y_train:
        for i in range(len(state_seq)-1):
            A[state_seq[i], state_seq[i+1]] += 1

    # Normalize transition matrix
    for row in range(n_states):
        A[row] = A[row]/sum(A[row])

    # Estimate emission matrix
    B = np.zeros((n_states, n_symbols)) + ε
    for i, seq in enumerate(X_train_seq):
        for j in range(len(seq)):
            B[Y_train[i][j], seq[j]] += 1

    # Normalize emission matrix
    for row in range(n_states):
        B[row] = B[row]/sum(B[row])
    
    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = Pi
    model.transmat_ = A
    model.emissionprob_ = B
    
    return model

## Train model

In [33]:
dataset = np.array(positive + negative)
labels = np.array([1]*len(positive) + [0]*len(negative))

X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map = train_and_test_encode(dataset, labels)
model = hmm_model(X_train, Z_train, n_states, n_symbols)
symbols_map

{'A': 9,
 'C': 11,
 'D': 7,
 'E': 1,
 'F': 19,
 'G': 14,
 'H': 3,
 'I': 20,
 'K': 2,
 'L': 0,
 'M': 4,
 'N': 6,
 'P': 17,
 'Q': 8,
 'R': 13,
 'S': 12,
 'T': 15,
 'V': 10,
 'W': 16,
 'X': 18,
 'Y': 5}

## Evaluate model

In [20]:
def evaluate(model, test_set, test_labels, states_map):
    hit = 0
    tot = 0
    for i, seq in enumerate(test_set):
        if test_labels[i] == 1:
            if states_map['C'] in model.predict(np.asmatrix(seq).T):
                hit += 1
        else:
            if states_map['C'] not in model.predict(np.asmatrix(seq).T):
                hit += 1
        tot += 1
    print("Accuracy on test data: {:.4}%".format(float(hit)*100/tot))
evaluate(model, X_test, test_labels, states_map)

Accuracy on test data: 92.11%


## Import proteomes

In [24]:
HUMAN_PROTEOMS_PATH = "../data/proteomes/Homo_sapiens.GRCh38.pep.all.fa"
human_data = [seq for seq in SeqIO.parse(HUMAN_PROTEOMS_PATH, "fasta") if not "*" in str(seq.seq)]

In [None]:
human_data = [seq for seq in SeqIO.parse(HUMAN_PROTEOMS_PATH, "fasta") if not "*" in str(seq.seq)]

In [34]:
x, n_symbols, _ = encode_data(human_data)


In [35]:
symbols_map

{'A': 14,
 'C': 20,
 'D': 7,
 'E': 12,
 'F': 5,
 'G': 16,
 'H': 1,
 'I': 11,
 'K': 0,
 'L': 17,
 'M': 6,
 'N': 18,
 'P': 4,
 'Q': 19,
 'R': 3,
 'S': 15,
 'T': 9,
 'U': 13,
 'V': 8,
 'W': 21,
 'X': 10,
 'Y': 2}

In [26]:
def predict(model, test_set, states_map):
    res = []
    for seq in test_set:
        if states_map['C'] in model.predict(np.asmatrix(seq).T):
            res.append(1)
        else:
            res.append(0)
    return res

In [27]:
prediction = predict(model, x, states_map)

IndexError: index 21 is out of bounds for axis 1 with size 21