In [2]:
import pandas as pd
import sklearn 
import scipy
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, train_test_split, cross_val_score, StratifiedKFold, LabelKFold, ShuffleSplit
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from mhcflurry import peptide_encoding, amino_acid
from mhcflurry.amino_acid import common_amino_acids
from mhcflurry import dataset
from mhcflurry.dataset import Dataset
import matplotlib.pyplot as plt 
% matplotlib inline
import numpy as np
import math 
import statsmodels.api as sm
from keras import models, layers, optimizers
from keras.optimizers import Adam, SGD
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, Convolution1D, AveragePooling1D, Activation, Flatten
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine import topology
import seaborn as sns
from keras import backend as K
from keras.layers import LSTM

In [3]:
class AttentionLSTM(LSTM):
    def __init__(self, output_dim, attention_vec, **kwargs):
        self.attention_vec = attention_vec
        super(AttentionLSTM, self).__init__(output_dim, **kwargs)

    def build(self, input_shape):
        super(AttentionLSTM, self).build(input_shape)

        assert hasattr(self.attention_vec, '_keras_shape')
        attention_dim = self.attention_vec._keras_shape[1]

        self.U_a = self.inner_init((self.output_dim, self.output_dim),
                                   name='{}_U_a'.format(self.name))
        self.b_a = K.zeros((self.output_dim,), name='{}_b_a'.format(self.name))

        self.U_m = self.inner_init((attention_dim, self.output_dim),
                                   name='{}_U_m'.format(self.name))
        self.b_m = K.zeros((self.output_dim,), name='{}_b_m'.format(self.name))

        self.U_s = self.inner_init((self.output_dim, self.output_dim),
                                   name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((self.output_dim,), name='{}_b_s'.format(self.name))

        self.trainable_weights += [self.U_a, self.U_m, self.U_s,
                                   self.b_a, self.b_m, self.b_s]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def step(self, x, states):
        h, [h, c] = super(AttentionLSTM, self).step(x, states)
        attention = states[4]

        m = K.tanh(K.dot(h, self.U_a) + attention + self.b_a)
        s = K.exp(K.dot(m, self.U_s) + self.b_s)
        h = h * s

        return h, [h, c]

    def get_constants(self, x):
        constants = super(AttentionLSTM, self).get_constants(x)
        constants.append(K.dot(self.attention_vec, self.U_m) + self.b_m)
        return constants

In [4]:
df = pd.read_table("bdata.2009.mhci.public.1.txt")

df['log_meas']=1-np.log(df['meas'])/math.log(50000)
df['peptide_length'] = df['sequence'].str.len()


max_len=df['sequence'].str.len().max()
n_peptides = df['sequence'].count()

def amino_acid_hotshot_encoding(s):
    return common_amino_acids.hotshot_encoding([s],len(s)).flatten().astype(int)
df['hotshot_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_hotshot_encoding(seq))

def amino_acid_index_encoding(s, maxlen):
    a = 1+common_amino_acids.index_encoding([s],len(s)).flatten()
    return np.concatenate([a, np.zeros(maxlen-len(a),dtype=int)])
df['index_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_index_encoding(seq, max_len))

def measured_affinity_less_than(Y,k):
    IC50 = 50000**(1-Y)
    return (IC50 < k).astype(int) 

def affinity_label(Y):
    return measured_affinity_less_than(Y,50) + measured_affinity_less_than(Y,500) + measured_affinity_less_than(Y,5000) + measured_affinity_less_than(Y,50000)

df['affinity_label'] = affinity_label(df['log_meas'])
df_h = df[df['mhc']=='HLA-A-0201'][['hotshot_encoded_peptides','index_encoded_peptides','log_meas','peptide_length']]
X = np.array(list(df_h['index_encoded_peptides']))
y = np.array(list(df_h['log_meas']))
y[y<0]=0





In [8]:
folds =3 
n_epochs = 5

train_lstm_aucs = np.zeros((folds,n_epochs))
test_lstm_aucs = np.zeros((folds,n_epochs))
nine_train_lstm_aucs = np.zeros((folds,n_epochs))
nine_test_lstm_aucs = np.zeros((folds,n_epochs))
non_nine_train_lstm_aucs = np.zeros((folds,n_epochs))
non_nine_test_lstm_aucs = np.zeros((folds,n_epochs))

train_attentional_lstm_aucs = np.zeros((folds,n_epochs))
test_attentional_lstm_aucs = np.zeros((folds,n_epochs))
nine_train_attentional_lstm_aucs = np.zeros((folds,n_epochs))
nine_test_attentional_lstm_aucs = np.zeros((folds,n_epochs))
non_nine_train_attentional_lstm_aucs = np.zeros((folds,n_epochs))
non_nine_test_attentional_lstm_aucs = np.zeros((folds,n_epochs))

In [22]:
batch_size_nn = 16
batch_size_lstm = 16
hidden = 50
dropout_probability = 0.25

for i, (train_idx, test_idx) in enumerate(KFold(len(df_h),folds, shuffle=True)):
    
    # normal LSTM
    
    sequence = Input( shape= (26, ),dtype='int32')
    embedded = Embedding(input_dim = 21, input_length = 26, output_dim= 32, mask_zero = True)(sequence)
    forwards = LSTM(hidden)(embedded)
    backwards = LSTM(hidden, go_backwards=True)(embedded)

    merged = merge([forwards, backwards], mode = 'concat', concat_axis=-1)
    after_dp = Dropout(dropout_probability)(merged)
    output = Dense(1, activation = 'sigmoid')(after_dp)
    lstm = Model(input = sequence, output = output)
    adam = Adam(lr = 0.01)
    lstm.compile(optimizer = adam , loss='mean_squared_error')
    
    # attentional LSTM
    
    sequence = Input( shape= (26, ),dtype='int32')
    embedded = Embedding(input_dim = 21, input_length = 26, output_dim= 32, mask_zero = True)(sequence)
    forwards = AttentionLSTM(hidden, Input(shape=(32,), ) )(embedded)
    backwards = AttentionLSTM(hidden,Input(shape=(32,), ),go_backwards=True)(embedded)

    merged = merge([forwards, backwards], mode = 'concat', concat_axis=-1)
    after_dp = Dropout(dropout_probability)(merged)
    output = Dense(1, activation = 'sigmoid')(after_dp)
    attentional_lstm = Model(input = sequence, output = output)
    attentional_adam = Adam(lr = 0.01)
    attentional_lstm.compile(optimizer = adam , loss='mean_squared_error')
    
    # index sets
    
    nine_train_idx = np.array([i for i in train_idx if (np.count_nonzero(X[i])==9)])
    non_nine_train_idx = np.array([i for i in train_idx if (np.count_nonzero(X[i])!=9)])
    nine_test_idx = np.array([i for i in test_idx if (np.count_nonzero(X[i])==9)])
    non_nine_test_idx = np.array([i for i in test_idx if (np.count_nonzero(X[i])!=9)])
    
    for epoch in range(n_epochs):
        
        adam.lr.set_value(0.01*(epoch+1)**(-2))
        lstm.fit(X[train_idx],y[train_idx], batch_size = batch_size_lstm, nb_epoch=1)
    

        train_lstm_auc = roc_auc_score(measured_affinity_less_than(y[train_idx],500),lstm.predict(X[train_idx]))
        test_lstm_auc = roc_auc_score(measured_affinity_less_than(y[test_idx],500),lstm.predict(X[test_idx]))
        nine_train_lstm_auc = roc_auc_score(measured_affinity_less_than(y[nine_train_idx],500),lstm.predict(X[nine_train_idx]))
        nine_test_lstm_auc = roc_auc_score(measured_affinity_less_than(y[nine_test_idx],500),lstm.predict(X[nine_test_idx]))
        non_nine_train_lstm_auc = roc_auc_score(measured_affinity_less_than(y[non_nine_train_idx],500),lstm.predict(X[non_nine_train_idx]))
        non_nine_test_lstm_auc = roc_auc_score(measured_affinity_less_than(y[non_nine_test_idx],500),lstm.predict(X[non_nine_test_idx]))
        
        train_lstm_aucs[i][epoch]=train_lstm_auc
        test_lstm_aucs[i][epoch]=test_lstm_auc
        nine_train_lstm_aucs[i][epoch]=nine_train_lstm_auc
        nine_test_lstm_aucs[i][epoch]=nine_test_lstm_auc
        non_nine_train_lstm_aucs[i][epoch]=non_nine_train_lstm_auc
        non_nine_test_lstm_aucs[i][epoch]=non_nine_test_lstm_auc
        
        
        attentional_adam.lr.set_value(0.01*(epoch+1)**(-2))
        attentional_lstm.fit(X[train_idx],y[train_idx], batch_size = batch_size_lstm, nb_epoch=1)
    

        train_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[train_idx],500),attentional_lstm.predict(X[train_idx]))
        test_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[test_idx],500),attentional_lstm.predict(X[test_idx]))
        nine_train_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[nine_train_idx],500),attentional_lstm.predict(X[nine_train_idx]))
        nine_test_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[nine_test_idx],500),attentional_lstm.predict(X[nine_test_idx]))
        non_nine_train_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[non_nine_train_idx],500),attentional_lstm.predict(X[non_nine_train_idx]))
        non_nine_test_attentional_lstm_auc = roc_auc_score(measured_affinity_less_than(y[non_nine_test_idx],500),attentional_lstm.predict(X[non_nine_test_idx]))
        
        train_attentional_lstm_aucs[i][epoch]=train_attentional_lstm_auc
        test_attentional_lstm_aucs[i][epoch]=test_attentional_lstm_auc
        nine_train_attentional_lstm_aucs[i][epoch]=nine_train_attentional_lstm_auc
        nine_test_attentional_lstm_aucs[i][epoch]=nine_test_attentional_lstm_auc
        non_nine_train_attentional_lstm_aucs[i][epoch]=non_nine_train_attentional_lstm_auc
        non_nine_test_attentional_lstm_aucs[i][epoch]=non_nine_test_attentional_lstm_auc

Epoch 1/1


MissingInputError: A variable that is an input to the graph was neither provided as an input to the function nor given a value. A chain of variables leading from this input to an output is [input_36, dot.0, Elemwise{add,no_inplace}.0, Elemwise{add,no_inplace}.0, Elemwise{add,no_inplace}.0, Elemwise{tanh,no_inplace}.0, dot.0, Elemwise{add,no_inplace}.0, Elemwise{exp,no_inplace}.0, Elemwise{mul,no_inplace}.0, Elemwise{mul,no_inplace}.0, DimShuffle{x,0,1}.0, Rebroadcast{0}.0, Shape.0, Subtensor{int64}.0, Elemwise{add,no_inplace}.0, AllocEmpty{dtype='float32'}.0, IncSubtensor{Set;:int64:}.0, for{cpu,scan_fn}.0, Subtensor{int64::}.0, DimShuffle{0,1,2}.0, Subtensor{int64}.0, Join.0, Elemwise{mul,no_inplace}.0, Elemwise{true_div,no_inplace}.0, Elemwise{switch,no_inplace}.0, dot.0, Elemwise{add,no_inplace}.0, sigmoid.0, Elemwise{sub,no_inplace}.0, Elemwise{sqr,no_inplace}.0, Sum{axis=[1], acc_dtype=float64}.0, mean, mean, Elemwise{mul,no_inplace}.0, Elemwise{true_div,no_inplace}.0, Sum{acc_dtype=float64}.0, mean, Elemwise{mul,no_inplace}.0]. This chain may not be unique
Backtrace when the variable is created:
  File "/Users/giancarlokerg/anaconda/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/giancarlokerg/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/giancarlokerg/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/giancarlokerg/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-22-7f57e95e8c6b>", line 27, in <module>
    backwards = AttentionLSTM(hidden,Input(shape=(32,), ),go_backwards=True)(embedded)
  File "/Users/giancarlokerg/keras/keras/engine/topology.py", line 1091, in Input
    input_tensor=tensor)
  File "/Users/giancarlokerg/keras/keras/engine/topology.py", line 1010, in __init__
    name=self.name)
  File "/Users/giancarlokerg/keras/keras/backend/theano_backend.py", line 84, in placeholder
    x = T.TensorType(dtype, broadcast)(name)
