# Project

Before running this notebook, make sure that all prerequisites are installed by running:

$ pip install -r requirements.txt



## Import modules

In [17]:
import os
import sklearn
import Bio as bio
import numpy as np
from Bio import SeqIO
from hmmlearn import hmm
from sklearn import cross_validation

In [18]:
# Fixed seed: applied to NumPy's global RNG here and passed as `random_state`
# to the train/test split further down.
SEED = 1337
np.random.seed(SEED)


In [19]:
# Directories whose files are all parsed as FASTA by the loading cell below,
# one directory per (positive/negative, tm/non_tm) group.
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [20]:
def load_fasta_dir(directory):
    """Parse every file in ``directory`` as FASTA and return all records in one list.

    File names are visited in sorted order. ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the record order (and with it
    the seeded train/test split performed later) depend on the file system.
    """
    return [record
            for filename in sorted(os.listdir(directory))
            for record in SeqIO.parse(os.path.join(directory, filename), "fasta")]


positive_tm = load_fasta_dir(POSITIVE_TM_PATH)
positive_non_tm = load_fasta_dir(POSITIVE_NON_TM_PATH)
negative_tm = load_fasta_dir(NEGATIVE_TM_PATH)
negative_non_tm = load_fasta_dir(NEGATIVE_NON_TM_PATH)

In [21]:
# Report how many records were loaded for each of the four groups.
group_sizes = (
    ("positives, tm:", positive_tm),
    ("positives, non_tm:", positive_non_tm),
    ("negavtive, tm:", negative_tm),
    ("negavtive, non_tm:", negative_non_tm),
)
for caption, group in group_sizes:
    print(caption, len(group))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [22]:
# Merge the tm and non_tm lists into a single list per class.
positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm

In [23]:
def encode_data(x, symbols_map=False):
    """Encode sequences of symbols as sequences of integer indices.

    Parameters
    ----------
    x : iterable of iterables
        Each item yields the symbols (e.g. single characters) of one sequence.
        It is read in a single pass.
    symbols_map : dict, optional
        Existing ``symbol -> index`` mapping to reuse. When falsy (the
        default) a new mapping is built from the symbols found in ``x`` plus
        the wildcard ``'*'``.

    Returns
    -------
    encoded_data : numpy.ndarray
        A regular 2-D array when all sequences have the same length, otherwise
        a 1-D object array holding one list of indices per sequence.
    n_symbols : int
        Number of entries in the returned ``symbols_map``.
    symbols_map : dict
        The mapping that was used for the encoding.

    Raises
    ------
    KeyError
        If a symbol is missing from ``symbols_map`` and the mapping has no
        ``'*'`` entry to fall back on.
    """
    sequences = [list(seq) for seq in x]

    if not symbols_map:
        symbols = {c for seq in sequences for c in seq}
        symbols.add('*')
        # Sorted so the indices do not depend on set iteration order, which
        # varies between interpreter runs for strings.
        symbols_map = {s: i for i, s in enumerate(sorted(symbols))}

    encoded = []
    for seq in sequences:
        seq_encoded = []
        for c in seq:
            try:
                i = symbols_map[c]
            except KeyError:
                # Symbols unknown to the mapping share the wildcard index.
                i = symbols_map['*']
            seq_encoded.append(i)
        encoded.append(seq_encoded)

    if len({len(seq) for seq in encoded}) <= 1:
        encoded_data = np.array(encoded)
    else:
        # Ragged input: fill an object array element by element, because
        # np.array() refuses ragged nested lists on recent NumPy versions.
        encoded_data = np.empty(len(encoded), dtype=object)
        for k, seq_encoded in enumerate(encoded):
            encoded_data[k] = seq_encoded

    # Report the size of the mapping actually used, so the count stays
    # consistent with the indices even when a mapping was passed in.
    n_symbols = len(symbols_map)

    return encoded_data, n_symbols, symbols_map

def train_and_test_encode(dataset, labels):
    """Encode the records and split them 90/10 into train and test sets.

    Each record's sequence string is expected to look like
    ``<observations>#<states>``: the part before the first ``'#'`` is encoded
    as the observation sequence and the part after it as the state sequence.

    Parameters
    ----------
    dataset : iterable
        Records exposing a ``seq`` attribute whose ``str()`` has the form above.
    labels : array-like
        One label per record; split alongside the encoded sequences.

    Returns
    -------
    tuple
        ``(X_train, X_test, Z_train, Z_test, train_labels, test_labels,
        n_symbols, n_states, symbols_map, states_map)``.

    Raises
    ------
    IndexError
        If a record's sequence string contains no ``'#'``.
    """
    try:
        # sklearn.cross_validation was deprecated in scikit-learn 0.18 and
        # removed in 0.20; model_selection is its replacement.
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Older scikit-learn: use the module imported at the top of the notebook.
        train_test_split = cross_validation.train_test_split

    x, z = [], []
    for record in dataset:
        # Split once per record instead of once per half.
        parts = str(record.seq).split('#')
        x.append(list(parts[0].strip()))
        z.append(list(parts[1].strip()))

    x_e, n_symbols, symbols_map = encode_data(x)
    z_e, n_states, states_map = encode_data(z)

    X_train, X_test, Z_train, Z_test, train_labels, test_labels = \
        train_test_split(x_e, z_e, labels, random_state=SEED, train_size=.9)
    return X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map
    
    

In [24]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    """Build a ``MultinomialHMM`` whose parameters are counted from labelled data.

    Parameters
    ----------
    X_train_seq : sequence of integer sequences
        Encoded observation sequences.
    Y_train : sequence of integer sequences
        Encoded state sequences, indexed in parallel with ``X_train_seq``.
    n_states : int
        Number of hidden states (rows of every estimated matrix).
    n_symbols : int
        Number of observable symbols (columns of the emission matrix).

    Returns
    -------
    hmm.MultinomialHMM
        Model with ``startprob_``, ``transmat_`` and ``emissionprob_`` set to
        the normalised counts; no fitting is run.
    """
    # Every count starts from this tiny value instead of zero.
    pseudocount = 1e-10

    # Start probabilities: count the first state of each sequence.
    start = np.full(n_states, pseudocount)
    for states in Y_train:
        start[states[0]] += 1
    start /= sum(start)

    # Transition probabilities: count consecutive state pairs.
    transitions = np.full((n_states, n_states), pseudocount)
    for states in Y_train:
        for src, dst in zip(states[:-1], states[1:]):
            transitions[src, dst] += 1
    for row in transitions:
        # Each row is a view, so this normalises the matrix in place.
        row /= sum(row)

    # Emission probabilities: count (state, symbol) pairs position by position.
    emissions = np.full((n_states, n_symbols), pseudocount)
    for i, observations in enumerate(X_train_seq):
        for j, symbol in enumerate(observations):
            emissions[Y_train[i][j], symbol] += 1
    for row in emissions:
        row /= sum(row)

    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = start
    model.transmat_ = transitions
    model.emissionprob_ = emissions

    return model

## Train model

In [25]:
# Gather every record into one 1-D object array. It is filled element by
# element because np.array() would treat each record as a nested sequence of
# letters: that raises ValueError on NumPy >= 1.24 when record lengths differ
# and yields a 2-D array of letters when they happen to match.
all_records = positive_tm + positive_non_tm + negative_tm + negative_non_tm
dataset = np.empty(len(all_records), dtype=object)
for record_index, record in enumerate(all_records):
    dataset[record_index] = record

# One [class, tm] pair per record, in the same order as `dataset`:
# class is 1 for positive examples, tm is 1 for the tm groups.
labels = np.array([[1,1]]*len(positive_tm) + [[1,0]]*len(positive_non_tm)+[[0,1]]*len(negative_tm) + [[0,0]]*len(negative_non_tm))

X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map = train_and_test_encode(dataset, labels)
model = hmm_model(X_train, Z_train, n_states, n_symbols)

## Evaluate model

In [26]:
def predict(model, test_set, states_map):
    """Flag each sequence whose predicted state path contains state ``'C'``.

    Parameters
    ----------
    model : object
        Anything with a ``predict(observations)`` method returning the state
        indices for a column of observations.
    test_set : iterable of integer sequences
        Encoded observation sequences.
    states_map : dict
        ``state symbol -> index`` mapping; must contain ``'C'``.

    Returns
    -------
    numpy.ndarray
        One entry per sequence: 1 if ``states_map['C']`` occurs in the
        predicted path, 0 otherwise.
    """
    flags = [
        # The model is handed the observations as a single column.
        1 if states_map['C'] in model.predict(np.asmatrix(seq).T) else 0
        for seq in test_set
    ]
    return np.array(flags)

def evaluate(model, test_set, test_labels, states_map):
    """Print a confusion summary for ``model`` on one labelled test set.

    Predictions come from ``predict``; ``test_labels`` holds the expected 0/1
    value for each sequence in ``test_set``. Nothing is returned; positive
    count, true positives, true negatives, precision, recall and accuracy are
    printed between two separator lines.
    """
    predicted = predict(model, test_set, states_map)
    matches = test_labels == predicted
    hit_positive = (test_labels == 1) & (predicted == 1)
    hit_negative = (test_labels == 0) & (predicted == 0)

    n_expected_positive = test_labels.sum()
    n_hit_positive = hit_positive.sum()

    print("-------------------")
    print("All positive", n_expected_positive)
    print("True positive", n_hit_positive)
    print("True negative", hit_negative.sum())
    # Precision: true positives over everything predicted positive.
    print("Precission", n_hit_positive / (predicted == 1).sum())
    # Recall: true positives over everything labelled positive.
    print("Recal", n_hit_positive / n_expected_positive)
    print("Accuracy on test data: {:.4}%".format(matches.mean()*100))
    print("-------------------")
          
# Score the held-out split three ways: everything, then the rows whose second
# label column (tm flag) is 0, then those where it is 1. The first label
# column is the class being predicted.
evaluation_subsets = (
    ("Evaluated on full data set:", slice(None)),
    ("Evaluated on non-tm:", test_labels[:, 1] == 0),
    ("Evaluated on tm:", test_labels[:, 1] == 1),
)
for heading, rows in evaluation_subsets:
    print(heading)
    evaluate(model, X_test[rows], test_labels[rows][:, 0], states_map)

Evaluated on full data set:
-------------------
All positive 127
True positive 117
True negative 128
Precission 0.9140625
Recal 0.92125984252
Accuracy on test data: 92.11%
-------------------
Evaluated on non-tm:
-------------------
All positive 120
True positive 110
True negative 106
Precission 0.932203389831
Recal 0.916666666667
Accuracy on test data: 92.31%
-------------------
Evaluated on tm:
-------------------
All positive 7
True positive 7
True negative 22
Precission 0.7
Recal 1.0
Accuracy on test data: 90.62%
-------------------


## Import proteomes

In [27]:
# FASTA files read by the next two cells: the full proteomes, and the
# *_signal files whose records `stats` treats as the true positives.
HUMAN_PROTEOMS_PATH = "../data/proteomes/human.fa"
MOUSE_PROTEOMS_PATH = "../data/proteomes/mouse.fa"
HUMAN_PROTEOMS_SIGNAL_PATH = "../data/proteomes/human_signal.fa"
MOUSE_PROTEOMS_SIGNAL_PATH = "../data/proteomes/mouse_signal.fa"

In [28]:
# Read each full proteome into a list of records.
human_data = list(SeqIO.parse(HUMAN_PROTEOMS_PATH, "fasta"))
mouse_data = list(SeqIO.parse(MOUSE_PROTEOMS_PATH, "fasta"))

In [29]:
# Read the reference "signal" subsets into lists of records.
human_data_signal = list(SeqIO.parse(HUMAN_PROTEOMS_SIGNAL_PATH, "fasta"))
mouse_data_signal = list(SeqIO.parse(MOUSE_PROTEOMS_SIGNAL_PATH, "fasta"))

In [33]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def stats(model, symbols_map, states_map, data, data_signal):
    """Print prediction statistics for ``model`` on one proteome.

    Records are identified by their ``description``. A record counts as an
    actual positive when its description also occurs in ``data_signal``, and
    as a predicted positive when ``predict`` returns 1 for it.

    Parameters
    ----------
    model : object
        Model passed through to ``predict``.
    symbols_map : dict
        Observation ``symbol -> index`` mapping reused to encode ``data``.
    states_map : dict
        State ``symbol -> index`` mapping passed through to ``predict``.
    data : list
        All records to classify; each is iterated symbol by symbol.
    data_signal : iterable
        Records regarded as the true positives.

    Prints the share of predicted positives, accuracy, precision and recall.
    A ratio with an empty denominator is reported as ``nan``.
    """
    x, _, _ = encode_data(data, symbols_map)
    prediction = predict(model, x, states_map)

    all_names = {seq.description for seq in data}
    true_names = {seq.description for seq in data_signal}
    negative_names = all_names - true_names
    # Pair records with their predictions directly; converting the record
    # list to a NumPy array would make NumPy unpack each record's letters.
    positive_names = {seq.description
                      for seq, flag in zip(data, prediction) if flag == 1}
    predicted_negative_names = all_names - positive_names

    true_positives = true_names & positive_names
    true_negatives = negative_names & predicted_negative_names

    accuracy = _safe_ratio(len(true_positives | true_negatives), len(all_names))
    precision = _safe_ratio(len(true_positives), len(positive_names))
    recall = _safe_ratio(len(true_positives), len(true_names))

    print("Predicted signal peptides: {:.4}%".format(np.array(prediction).mean()*100))
    print("Accuracy: {:.4}".format(accuracy))
    print('Precision: {:.4}'.format(precision))
    print('Recall: {:.4}'.format(recall))

In [31]:
# Score the model on the human proteome, using the human_signal records as truth.
print('HUMAN')
stats(model, symbols_map, states_map, human_data, human_data_signal)

HUMAN
Predicted signal peptides: 7.966%
Precision: 0.6178
Recall: 0.8217


In [32]:
# Score the model on the mouse proteome, using the mouse_signal records as truth.
print('MOUSE')
stats(model, symbols_map, states_map, mouse_data, mouse_data_signal)

MOUSE
Predicted signal peptides: 8.049%
Precision: 0.625
Recall: 0.8053
