# Project

Before running this, make sure that all prerequisites are installed. This is done by running 

$ pip install -r requirements.txt



## Import modules

In [1]:
import os
import sklearn
import Bio as bio
import numpy as np
from Bio import SeqIO
from hmmlearn import hmm
from sklearn import cross_validation

In [2]:
SEED = 1337
np.random.seed(SEED)


In [3]:
POSITIVE_TM_PATH = "../data/training_data/positive_examples/tm"
POSITIVE_NON_TM_PATH = "../data/training_data/positive_examples/non_tm"
NEGATIVE_TM_PATH = "../data/training_data/negative_examples/tm"
NEGATIVE_NON_TM_PATH = "../data/training_data/negative_examples/non_tm"

In [4]:
positive_tm = [seq for path in os.listdir(POSITIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_TM_PATH, path), "fasta")]

positive_non_tm = [seq for path in os.listdir(POSITIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(POSITIVE_NON_TM_PATH, path), "fasta")]

negative_tm = [seq for path in os.listdir(NEGATIVE_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_TM_PATH, path), "fasta")]
               
negative_non_tm = [seq for path in os.listdir(NEGATIVE_NON_TM_PATH) 
                       for seq in SeqIO.parse(os.path.join(NEGATIVE_NON_TM_PATH, path), "fasta")]

In [5]:
print("positives, tm:",  len(positive_tm))
print("positives, non_tm:",  len(positive_non_tm))
print("negavtive, tm:",  len(negative_tm))
print("negavtive, non_tm:", len( negative_non_tm))

positives, tm: 45
positives, non_tm: 1275
negavtive, tm: 247
negavtive, non_tm: 1087


In [6]:
positive = positive_tm + positive_non_tm
negative = negative_tm + negative_non_tm

In [46]:
def encode_data(x, symbols_map=False):
    
    symbols = { c for seq in x for c in seq}
    symbols.update('*')
    if not symbols_map:
        symbols_map = { s : i for i, s in enumerate(symbols)}
    encoded_data = []
    for seq in x:
        seq_encoded = []
        for c in seq:
            try:
                i = symbols_map[c]
            except KeyError:
                i = symbols_map['*']
            seq_encoded.append(i) 
        encoded_data.append(seq_encoded)
    encoded_data = np.array(encoded_data) 
            
    n_symbols = len(symbols)

    return encoded_data, n_symbols, symbols_map

def train_and_test_encode(dataset, labels):
    x = [[c for c in str(seq.seq).split('#')[0].strip()] for seq in dataset]
    x_e, n_symbols, symbols_map = encode_data(x)
    z = [[c for c in str(seq.seq).split('#')[1].strip()] for seq in dataset]
    z_e, n_states, states_map = encode_data(z)
    
    X_train, X_test, Z_train, Z_test, train_labels, test_labels = \
        cross_validation.train_test_split(x_e, z_e, labels, random_state=SEED, train_size=.9)
    return X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map
    
    

In [47]:
def hmm_model(X_train_seq, Y_train, n_states, n_symbols):
    
    ε = 1e-10
    
    # Estimate initial matrix
    Pi = np.zeros(n_states) + ε
    for state_seq in Y_train:
        Pi[state_seq[0]] += 1
    Pi = Pi/sum(Pi)

    # Estimate transition matrix
    A = np.zeros((n_states, n_states)) + ε
    for state_seq in Y_train:
        for i in range(len(state_seq)-1):
            A[state_seq[i], state_seq[i+1]] += 1

    # Normalize transition matrix
    for row in range(n_states):
        A[row] = A[row]/sum(A[row])

    # Estimate emission matrix
    B = np.zeros((n_states, n_symbols)) + ε
    for i, seq in enumerate(X_train_seq):
        for j in range(len(seq)):
            B[Y_train[i][j], seq[j]] += 1

    # Normalize emission matrix
    for row in range(n_states):
        B[row] = B[row]/sum(B[row])
    
    model = hmm.MultinomialHMM(n_components=n_states)
    model.startprob_ = Pi
    model.transmat_ = A
    model.emissionprob_ = B
    
    return model

## Train model

In [48]:
dataset = np.array(positive + negative)
labels = np.array([1]*len(positive) + [0]*len(negative))

X_train, X_test, Z_train, Z_test, train_labels, test_labels, n_symbols, n_states, symbols_map, states_map = train_and_test_encode(dataset, labels)
model = hmm_model(X_train, Z_train, n_states, n_symbols)

n_symbols

22

## Evaluate model

In [49]:
def evaluate(model, test_set, test_labels, states_map):
    hit = 0
    tot = 0
    for i, seq in enumerate(test_set):
        if test_labels[i] == 1:
            if states_map['C'] in model.predict(np.asmatrix(seq).T):
                hit += 1
        else:
            if states_map['C'] not in model.predict(np.asmatrix(seq).T):
                hit += 1
        tot += 1
    print("Accuracy on test data: {:.4}%".format(float(hit)*100/tot))
evaluate(model, X_test, test_labels, states_map)

Accuracy on test data: 92.11%


## Import proteomes

In [71]:
HUMAN_PROTEOMS_PATH = "../data/proteomes/human.fa"

In [72]:
human_data = [seq for seq in SeqIO.parse(HUMAN_PROTEOMS_PATH, "fasta")]

In [73]:
x, n_symbols, _ = encode_data(human_data, symbols_map)


In [74]:
n_symbols

33

In [75]:
def predict(model, test_set, states_map):
    res = []
    for seq in test_set:
        if states_map['C'] in model.predict(np.asmatrix(seq).T):
            res.append(1)
        else:
            res.append(0)
    return np.array(res)

In [76]:
prediction = predict(model, x, states_map)

In [77]:
np.array(prediction).mean()

0.079660443942221743

In [78]:
HUMAN_PROTEOMS_SIGNAL_PATH = "../data/proteomes/human_signal.fa"
human_data_signal = [seq for seq in SeqIO.parse(HUMAN_PROTEOMS_SIGNAL_PATH, "fasta")]


In [86]:
signal_names = {seq.description for seq in human_data_signal}
predicted_names = {seq.description for seq in np.array(human_data)[prediction == 1]}

In [89]:
print('In predicted in true:' ,len(signal_names & predicted_names))
print('Signal peptides not in predicted', len(signal_names - predicted_names))

In predicted in true: 10626
Signal peptides not in predicted 2305


In [64]:
MOUSE_PROTEOMS_PATH = "../data/proteomes/Mus_musculus.GRCm38.pep.all.fa"

In [65]:
mouse_data = [seq for seq in SeqIO.parse(MOUSE_PROTEOMS_PATH, "fasta")]

In [66]:
mouse_x, _, _ = encode_data(mouse_data, symbols_map)
mouse_prediction = predict(model, mouse_x, states_map)
np.array(mouse_prediction).mean()

0.17854817708333334

In [69]:
np.array(prediction).sum()

18667

In [70]:
len(prediction)

102821