In [1]:
import pandas as pd
import sklearn 
import scipy
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, train_test_split, cross_val_score, StratifiedKFold, LabelKFold, ShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from mhcflurry.amino_acid import common_amino_acids
from mhcflurry import dataset
from mhcflurry.dataset import Dataset
import matplotlib.pyplot as plt 
% matplotlib inline
import numpy as np
import math 
from mhcflurry import peptide_encoding, amino_acid
import statsmodels.api as sm
from keras import models, layers, optimizers
from keras.optimizers import Adam 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, Convolution1D, AveragePooling1D, Activation, Flatten
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine import topology
import seaborn as sns
from keras.layers.core import Lambda
from keras import backend as K

Using Theano backend.


In [2]:
ds = Dataset.from_csv("bdata.2009.mhci.public.1.txt")
ds_h = ds.slice(ds.alleles == 'HLA-A0201')

In [3]:
df = pd.read_table("bdata.2009.mhci.public.1.txt")

df['log_meas']=1-np.log(df['meas'])/math.log(50000)
df['peptide_length'] = df['sequence'].str.len()


max_len=df['sequence'].str.len().max()
n_peptides = df['sequence'].count()

def amino_acid_hotshot_encoding(s):
    return common_amino_acids.hotshot_encoding([s],len(s)).flatten().astype(int)
df['hotshot_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_hotshot_encoding(seq))

def amino_acid_index_encoding(s, maxlen):
    a = 1+common_amino_acids.index_encoding([s],len(s)).flatten()
    return np.concatenate([a, np.zeros(maxlen-len(a),dtype=int)])
df['index_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_index_encoding(seq, max_len))

def measured_affinity_less_than(Y,k):
    IC50 = 50000**(1-Y)
    return (IC50 < k).astype(int) 

def affinity_label(Y):
    return measured_affinity_less_than(Y,50) + measured_affinity_less_than(Y,500) + measured_affinity_less_than(Y,5000) + measured_affinity_less_than(Y,50000)

df['affinity_label'] = affinity_label(df['log_meas'])
df_h = df[df['mhc']=='HLA-A-0201'][['hotshot_encoded_peptides','index_encoded_peptides','log_meas','peptide_length']]
X = np.array(list(df_h['index_encoded_peptides']))
y = np.array(list(df_h['log_meas']))
y[y<0]=0

def first_and_last_three(Y):
    k = np.count_nonzero(Y)
    return np.concatenate([Y[:3],Y[-3+k:k]])
def first_and_last_four(Y):
    k = np.count_nonzero(Y)
    return np.concatenate([Y[:4],Y[-4+k:k]])
def first_and_last_two(Y):
    k = np.count_nonzero(Y)
    return np.concatenate([Y[:2],Y[-2+k:k]])
X_44 = np.apply_along_axis(first_and_last_four,1,X)
X_33 = np.apply_along_axis(first_and_last_three,1,X)
X_22 = np.apply_along_axis(first_and_last_two,1,X)

In [4]:
def regroup_together(affinities, weights , original_indices):
    affinities = affinities.ravel()
    weights = weights.ravel()
    
    assert affinities.shape == weights.shape, "%s should be %s" % (affinities.shape, weights.shape)
    assert affinities.shape == original_indices.shape
    assert len(affinities) == len(affinities.ravel())
    
    weighted_affinities = (affinities * weights)
    index_set = set(original_indices)
    n_indices = len(index_set)
    result_order = {original_index: i for (i, original_index) in enumerate(sorted(index_set))}
    result = np.zeros(n_indices)
    for i, x in enumerate(weighted_affinities):
        result_idx = result_order[original_indices[i]]
        result[result_idx] += x
    return result

def slicing(dataset, index, i):
    return dataset.slice(index).kmer_index_encoding()[i]

def label_transform(array):
    result = 1-np.log(array)/math.log(50000)
    result[result<0]=0
    return result

def index_to_hotshot_encoding(index_encoded_nine_mer):
    result = np.zeros((9,21))
    for position, amino_acid in enumerate(index_encoded_nine_mer):
        result[position][amino_acid]= 1
    return result.flatten()

def real_labels(dataset,index):
    
    y = label_transform(slicing(dataset,index,1))
    weights = slicing(dataset,index,2)
    original_indices = slicing(dataset,index,3)
    
    return regroup_together(y, weights , original_indices)

def fit(model,dataset,index, neural_network = False, hotshot = False): # to be left out or modified 
    
    X = slicing(dataset,index,0)
    
    if (hotshot == True):
        X = np.apply_along_axis(index_to_hotshot_encoding, 1, X)
        
    y = label_transform(slicing(dataset,index,1))
    weights = slicing(dataset,index,2)
    
    if (neural_network == True):
        model.fit(X, y, sample_weight = weights, batch_size = 16, nb_epoch = 1)
    else: 
        model.fit(X, y, sample_weight = weights)
        
def predict(model, dataset, index, hotshot = False):
    
    X = slicing(dataset,index,0)
    
    if (hotshot == True):
        X = np.apply_along_axis(index_to_hotshot_encoding, 1, X)
        
    weights = slicing(dataset,index,2)
    original_indices = slicing(dataset,index,3)
    
    return regroup_together(model.predict(X), weights , original_indices)

def AUC(model, dataset, index, hotshot = False):
        
    real_affinity = measured_affinity_less_than(real_labels(dataset,index),500)
    predicted_affinity = predict(model, dataset, index, hotshot = hotshot)
    
    return roc_auc_score(real_affinity, predicted_affinity)

def AUC_simple(model, features, labels, index):
    real_affinity = measured_affinity_less_than(labels[index],500)
    predicted_affinity = model.predict(features[index])
    
    return roc_auc_score(real_affinity, predicted_affinity)

In [5]:
def split_by_length(X,index,length=9):
    length_idx = np.array([i for i in index if (np.count_nonzero(X[i])==length)])
    non_length_idx = np.array([i for i in index if (np.count_nonzero(X[i])!=length)])
    return index, length_idx, non_length_idx

In [None]:
def random_dropout_prediction_by_lentgh(model,array,length):
    array_of_lengths = np.apply_along_axis(np.count_nonzero,1,array)
    bool_array = (array_of_lengths == length)
    result = np.zeros(len(array))
    for i in range(length):
        result[bool_array] = result[bool_array] + model.predict(array[bool_array])
    print(result.shape, model.predict(array[bool_array]).shape)
    return result/length

def random_dropout_array_prediction(model,array):
    array_of_lengths = np.apply_along_axis(np.count_nonzero,1,array)
    result = np.zeros(len(array))
    for length in np.unique(array_of_lengths):
        result = result + random_dropout_prediction_by_lentgh(model,array,length)
    return result

def AUC_random_dropout(model, features, labels, index):
    real_affinity = measured_affinity_less_than(labels[index],500)
    predicted_affinity = random_dropout_array_prediction(model, features[index])
    
    return roc_auc_score(real_affinity, predicted_affinity)

In [None]:
folds = 3
batch_size_nn = 16
batch_size_lstm = 16
hidden = 50
dropout_probability = 0.25

n_epochs = 40
epoch = 0



lstm_aucs = np.zeros((6, folds,n_epochs))



for i, (train_idx, test_idx) in enumerate(KFold(len(df_h),folds, shuffle=True)):
    
    list_index = split_by_length(X,train_idx,length=9)+split_by_length(X,test_idx,length=9)
    
    
    # lstm 
    sequence = Input( shape= (26, ),dtype='int32')
    embedded = Embedding(input_dim = 21, output_dim= 32, mask_zero = True)(sequence)
    forwards = LSTM(hidden)(embedded)
    backwards = LSTM(hidden, go_backwards=True)(embedded)

    merged = merge([forwards, backwards], mode = 'concat', concat_axis=-1)
    after_dp = Lambda(lambda x: K.dropout(x, level=0.5))(merged)
    output = Dense(1, activation = 'sigmoid')(after_dp)
    lstm = Model(input = sequence, output = output)
    adam = Adam(lr = 0.01)
    lstm.compile(optimizer = adam, loss='mean_squared_error')
    

      
    
    for epoch in range(n_epochs):
        
        
        #lstm 
        adam.lr.set_value(0.01*(1+epoch)**(-2))
        lstm.fit(X[train_idx],y[train_idx], batch_size = 16, nb_epoch = 1)
        for k, index in enumerate(list_index):
            lstm_aucs[k][i][epoch] = AUC_random_dropout(lstm, X, y, index)
            

        
        print("allmers:", lstm_aucs[3][i][epoch], i, epoch)  

    

Epoch 1/1




(6376,) (81, 1)
(6376,) (4647, 1)
(6376,) (1568, 1)
(6376,) (61, 1)
(6376,) (5, 1)
(6376,) (4, 1)
(6376,) (6, 1)
(6376,) (4, 1)
(4647,) (4647, 1)
(1729,) (81, 1)
(1729,) (1568, 1)
(1729,) (61, 1)
(1729,) (5, 1)
(1729,) (4, 1)
(1729,) (6, 1)
(1729,) (4, 1)
(3189,) (44, 1)
(3189,) (2314, 1)
(3189,) (788, 1)
(3189,) (31, 1)
(3189,) (3, 1)
(3189,) (2, 1)
(3189,) (4, 1)
(3189,) (3, 1)
(2314,) (2314, 1)
(875,) (44, 1)
(875,) (788, 1)
(875,) (31, 1)
(875,) (3, 1)
(875,) (2, 1)
(875,) (4, 1)
(875,) (3, 1)
allmers: 0.475603899631 0 0
Epoch 1/1
(6376,) (81, 1)
(6376,) (4647, 1)
(6376,) (1568, 1)
(6376,) (61, 1)
(6376,) (5, 1)
(6376,) (4, 1)
(6376,) (6, 1)
(6376,) (4, 1)
(4647,) (4647, 1)
(1729,) (81, 1)
(1729,) (1568, 1)
(1729,) (61, 1)
(1729,) (5, 1)
(1729,) (4, 1)
(1729,) (6, 1)
(1729,) (4, 1)
(3189,) (44, 1)
(3189,) (2314, 1)
(3189,) (788, 1)
(3189,) (31, 1)
(3189,) (3, 1)
(3189,) (2, 1)
(3189,) (4, 1)
(3189,) (3, 1)
(2314,) (2314, 1)
(875,) (44, 1)
(875,) (788, 1)
(875,) (31, 1)
(875,) (3, 1

In [77]:
a=X[1]

In [78]:
a.shape

(26,)

In [76]:
np.transpose(a[:,None])[0,:].shape

(26,)

In [11]:
a=X[0:20]

In [17]:
X[20]

array([ 1,  5,  4,  5,  8, 12, 16, 10, 10,  9,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0])

In [21]:
(np.apply_along_axis(np.count_nonzero,1,X) == 10).sum()

2356

In [27]:
for i in np.unique(np.apply_along_axis(np.count_nonzero,1,X)):
    print(i)

8
9
10
11
12
13
14
15
