In [1]:
import pandas as pd
import sklearn 
from sklearn import linear_model as lm
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
import math 
from mhcflurry import peptide_encoding, amino_acid
import statsmodels.api as sm
from keras import models, layers, optimizers

Using Theano backend.


In [2]:
# Load the tab-separated binding-affinity table into a DataFrame.
df = pd.read_csv("bdata.2009.mhci.public.1.txt", sep="\t")

In [6]:
# Row count for every distinct value of the 'mhc' column; display the 11 largest.
rows_per_allele = df.groupby('mhc').size()
rows_per_allele.nlargest(11)

mhc
HLA-A-0201    9565
HLA-A-0301    6141
HLA-A-0203    5542
HLA-A-1101    5399
HLA-A-0206    4827
HLA-A-3101    4796
HLA-A-6802    4768
HLA-A-0202    3919
HLA-A-0101    3725
HLA-B-0702    3412
H-2-Kb        3407
dtype: int64

In [4]:
# Keep only the HLA-A-0201 rows and the two columns used downstream.
# A single .loc selection followed by .copy() gives df_h its own data, so the
# column assignment below neither raises SettingWithCopyWarning nor risks
# writing into a temporary slice of df.
df_h = df.loc[df['mhc'] == 'HLA-A-0201', ['sequence', 'meas']].copy()

# Log-scale transform of 'meas': a value of 50000 maps to 0 and a value of 1
# maps to 1. Measurements above 50000 produce negative values.
df_h['log_meas'] = 1 - np.log(df_h['meas']) / math.log(50000)

In [15]:
# Length of the longest peptide string and the number of non-null peptides.
peptide_sequences = df_h['sequence']
max_len = peptide_sequences.str.len().max()
n_peptides = peptide_sequences.count()
from mhcflurry.amino_acid import common_amino_acids
def amino_acid_encoding(s, maxlen):
    """Encode peptide ``s`` as integer indices, zero-padded on the right to ``maxlen``.

    The indices returned by ``common_amino_acids.index_encoding`` are shifted
    up by one before padding, so the padding value 0 is distinct from any
    residue index (presumably the raw indices are 0-based -- TODO confirm
    against mhcflurry).

    Parameters
    ----------
    s : str
        Peptide sequence.
    maxlen : int
        Length of the returned array; must be at least the encoded length of
        ``s``, otherwise ``np.zeros`` raises ``ValueError`` for the negative
        padding size.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``maxlen``.
    """
    residue_indices = common_amino_acids.index_encoding([s], len(s)).flatten()
    shifted = 1 + residue_indices
    padding = np.zeros(maxlen - len(shifted), dtype=int)
    return np.concatenate([shifted, padding])
# One fixed-length integer vector per peptide, padded to max_len.
df_h['encoded_peptides'] = df_h['sequence'].apply(amino_acid_encoding, args=(max_len,))
df_h['peptide_length'] = df_h['sequence'].str.len()
# NOTE(review): this per-length count is evaluated but not displayed, because
# only the last expression of a cell is rendered.
df_h.groupby('peptide_length').count()
# Preview the first ten rows.
df_h.head(10)

Unnamed: 0,sequence,meas,log_meas,encoded_peptides,peptide_length
11816,AAAKTPVIVV,44318.996782,0.011147,"[1, 1, 1, 9, 17, 13, 18, 8, 18, 18, 0, 0, 0, 0...",10
11817,AAASSTHRKV,69444.444444,-0.030361,"[1, 1, 1, 16, 16, 17, 7, 15, 9, 18, 0, 0, 0, 0...",10
11818,AACIVGCENV,28175.0,0.053013,"[1, 1, 2, 8, 18, 6, 2, 4, 12, 18, 0, 0, 0, 0, 0]",10
11819,AADLTQIFEV,16.529414,0.740739,"[1, 1, 3, 10, 17, 14, 8, 5, 4, 18, 0, 0, 0, 0, 0]",10
11820,AAERGPGQML,2277.020021,0.28551,"[1, 1, 4, 15, 6, 13, 6, 14, 11, 10, 0, 0, 0, 0...",10
11821,AAGIGILTVI,5555.0,0.203084,"[1, 1, 6, 8, 6, 8, 10, 17, 18, 8, 0, 0, 0, 0, 0]",10
11822,AAGLQDCTML,50000.0,0.0,"[1, 1, 6, 10, 14, 3, 2, 17, 11, 10, 0, 0, 0, 0...",10
11823,AAITDAAVAV,5310.920858,0.207237,"[1, 1, 8, 17, 3, 1, 1, 18, 1, 18, 0, 0, 0, 0, 0]",10
11824,AAITLVVISV,368.627244,0.453798,"[1, 1, 8, 17, 10, 18, 18, 8, 16, 18, 0, 0, 0, ...",10
11825,AANPHATFGV,2950.0,0.261578,"[1, 1, 12, 13, 7, 1, 17, 5, 6, 18, 0, 0, 0, 0, 0]",10


In [None]:
# Subset restricted to peptides whose length is exactly 9.
df_h_9 = df_h.loc[df_h['peptide_length'].eq(9)]

In [6]:
# Number of rows (encoded peptides) in the selected frame.
len(df_h.index)

5399

In [7]:
def measured_affinity_less_than_500(Y):
    """Return whether the IC50 implied by log-scaled value(s) ``Y`` is below 500.

    ``Y`` is mapped back to the original scale with ``50000 ** (1 - Y)``,
    the inverse of ``1 - log(meas) / log(50000)``, and compared to 500.

    Parameters
    ----------
    Y : float or numpy.ndarray
        Log-scaled affinity value(s).

    Returns
    -------
    bool or numpy.ndarray of bool
        Elementwise result of ``IC50 < 500``, with the same shape as ``Y``.
    """
    max_ic50 = 50000
    binder_cutoff = 500
    recovered_ic50 = max_ic50 ** (1 - Y)
    return recovered_ic50 < binder_cutoff

In [8]:
# Positional 80/20 split: the first four fifths of the rows are used for
# training and everything after them for testing.
# Integer division keeps `t` (and therefore the index arrays) integral; true
# division produced float positions, which .iloc only accepted with a
# deprecation warning. The test range starts right where training stops and
# runs to the last row, so no row is skipped or dropped.
# NOTE(review): rows are taken in their existing order, with no shuffling --
# confirm that the frame order is not correlated with the target.
n_samples = len(df_h['encoded_peptides'])
t = n_samples // 5
train = np.arange(0, 4 * t)
test = np.arange(4 * t, n_samples)


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, Convolution1D, AveragePooling1D, Activation, Flatten
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine import topology 

# Materialise the selected rows as dense arrays: each X_* row is one encoded
# peptide, and each Y_* is a single-column array of log_meas targets.
X_train = pd.DataFrame(list(df_h['encoded_peptides'].iloc[train])).values
Y_train = pd.DataFrame(list(df_h['log_meas'].iloc[train])).values
X_test = pd.DataFrame(list(df_h['encoded_peptides'].iloc[test])).values
Y_test = pd.DataFrame(list(df_h['log_meas'].iloc[test])).values

# Prepare the model.
# The input tensor is bound to `peptide_input` so that it does not rebind the
# `sequence` module imported from keras.preprocessing in this cell.
peptide_input = Input(shape=(max_len,), dtype='int32')
# Index 0 is treated as padding and masked (mask_zero=True). The vocabulary
# size of 21 is presumably 20 amino acids plus the padding index -- TODO confirm.
embedded = Embedding(21, 100, mask_zero=True)(peptide_input)
# A dropout rate of 0 is a pass-through; kept so the rate can be tuned in one place.
before_dp = Dropout(0)(embedded)

# Bidirectional encoding: one LSTM reads the peptide left-to-right, the other
# right-to-left, and their final states are concatenated.
forwards = LSTM(80)(before_dp)
backwards = LSTM(80, go_backwards=True)(before_dp)
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)

# Regularise the merged representation, then regress to a single output unit.
pre_after_dp = Dropout(0.5)(merged)
output = Dense(1, activation='hard_sigmoid')(pre_after_dp)
model = Model(input=peptide_input, output=output)

# Compile the model.
model.compile(optimizer='Adam', loss='mean_squared_error')

  result = getitem(key)


In [10]:
batch_size = 16
n_epochs = 100

# Binary binder labels (IC50 < 500) do not change between epochs, so compute
# them once instead of on every evaluation.
train_is_binder = measured_affinity_less_than_500(Y_train)
test_is_binder = measured_affinity_less_than_500(Y_test)

# Ceiling division: includes the final, possibly shorter, batch so that the
# trailing len(X_train) % batch_size samples are trained on as well.
n_batches = -(-len(X_train) // batch_size)

for epoch in range(n_epochs):
    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        stop = start + batch_size  # slicing clips `stop` at the end of the array
        model.train_on_batch(X_train[start:stop], Y_train[start:stop])
    # Evaluate after every epoch: ROC AUC of the predictions against the
    # binder / non-binder labels, on both the training and the test split.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    print(epoch, batch_idx, roc_auc_score(train_is_binder, train_pred),
          roc_auc_score(test_is_binder, test_pred))

0 269 0.878934574411 0.885728117593
1 269 0.901851727316 0.903789934878
2 269 0.915685783908 0.916560123814
3 269 0.922904197539 0.923427464878
4 269 0.926244901915 0.928279837606
5 269 0.932778955738 0.933726607302
6 269 0.937501483439 0.940343479204
7 269 0.944248905222 0.945012672992
8 269 0.94490557417 0.945519218835
9 269 0.946188748808 0.947726711975
10 269 0.948240097551 0.948822047268
11 269 0.946884976127 0.949633268286
12 269 0.948239603071 0.948810832231
13 269 0.951902831786 0.95055290133
14 269 0.953023446246 0.950373460736
15 269 0.951698858939 0.950541686293
16 269 0.955331182677 0.950433274267
17 269 0.958090131769 0.947438859356
18 269 0.960574397427 0.950231403599
19 269 0.96286606327 0.949083731467
20 269 0.965092952281 0.952407120801
21 269 0.966290334703 0.948038863842
22 269 0.966680108271 0.947939797681
23 269 0.969518421344 0.945965951147
24 269 0.970595274357 0.9445921091
25 269 0.976191300323 0.946169690988
26 269 0.975839972546 0.945687444392


KeyboardInterrupt: 

In [11]:
from IPython.display import SVG
# `plot` must be imported explicitly; it was previously called without being
# imported. Both helpers need the optional `pydot` package.
from keras.utils.visualize_util import model_to_dot, plot

# Write a diagram of the model architecture to model.png.
plot(model, to_file='model.png')

ImportError: No module named 'pydot'

In [None]:
# Render the model graph inline as SVG via Graphviz's `dot` program.
model_graph = model_to_dot(model)
SVG(model_graph.create(prog='dot', format='svg'))