In [25]:
import pandas as pd
import sklearn 
from sklearn import linear_model as lm
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
import math 
from mhcflurry import peptide_encoding, amino_acid
import statsmodels.api as sm
from keras import models, layers, optimizers

In [26]:
df = pd.read_table("bdata.2009.mhci.public.1.txt")
df_h=df[df['mhc']=='HLA-A-0201'][['sequence','meas']]
df_h['log_meas']=1-np.log(df_h['meas'])/math.log(50000)

In [27]:
max_len=df_h['sequence'].str.len().max()
n_peptides = df_h['sequence'].count()
from mhcflurry.amino_acid import common_amino_acids
df_h['encoded_peptides'] = list(common_amino_acids.index_encoding(df_h['sequence'],max_len))
df_h.head(10)

Unnamed: 0,sequence,meas,log_meas,encoded_peptides
11816,AAAKTPVIVV,44318.996782,0.011147,"[0, 0, 0, 8, 16, 12, 17, 7, 17, 17, 0, 0, 0, 0..."
11817,AAASSTHRKV,69444.444444,-0.030361,"[0, 0, 0, 15, 15, 16, 6, 14, 8, 17, 0, 0, 0, 0..."
11818,AACIVGCENV,28175.0,0.053013,"[0, 0, 1, 7, 17, 5, 1, 3, 11, 17, 0, 0, 0, 0, 0]"
11819,AADLTQIFEV,16.529414,0.740739,"[0, 0, 2, 9, 16, 13, 7, 4, 3, 17, 0, 0, 0, 0, 0]"
11820,AAERGPGQML,2277.020021,0.28551,"[0, 0, 3, 14, 5, 12, 5, 13, 10, 9, 0, 0, 0, 0, 0]"
11821,AAGIGILTVI,5555.0,0.203084,"[0, 0, 5, 7, 5, 7, 9, 16, 17, 7, 0, 0, 0, 0, 0]"
11822,AAGLQDCTML,50000.0,0.0,"[0, 0, 5, 9, 13, 2, 1, 16, 10, 9, 0, 0, 0, 0, 0]"
11823,AAITDAAVAV,5310.920858,0.207237,"[0, 0, 7, 16, 2, 0, 0, 17, 0, 17, 0, 0, 0, 0, 0]"
11824,AAITLVVISV,368.627244,0.453798,"[0, 0, 7, 16, 9, 17, 17, 7, 15, 17, 0, 0, 0, 0..."
11825,AANPHATFGV,2950.0,0.261578,"[0, 0, 11, 12, 6, 0, 16, 4, 5, 17, 0, 0, 0, 0, 0]"


In [28]:
len(df_h['encoded_peptides'])

9565

In [19]:
1-math.log(500)/math.log(50000)

0.4256251898085073

In [29]:
def measured_affinity_less_than_500(Y):
    IC50 = 50000**(1-Y)
    return IC50 < 500

In [38]:
arr = map(measured_affinity_less_than_500, pd.DataFrame(list(df_h['encoded_peptides'].iloc[test])).values)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine import topology
folds = 5
n_epochs = 100 
train_aucs = np.zeros((folds,n_epochs))
test_aucs = np.zeros((folds,n_epochs))

for i, (train, test) in enumerate(KFold(len(df_h['encoded_peptides']),n_folds=folds, shuffle = True, random_state =0)):
    # Read in the data
    X_train = pd.DataFrame(list(df_h['encoded_peptides'].iloc[train])).values
    Y_train = pd.DataFrame(list(df_h['log_meas'].iloc[train])).values
    X_test = pd.DataFrame(list(df_h['encoded_peptides'].iloc[test])).values
    Y_test = pd.DataFrame(list(df_h['log_meas'].iloc[test])).values

    #Prepare the model 
    sequence = Input( shape= (max_len,), dtype='int32')
    embedded = Embedding(21, 100)(sequence)

    forwards = LSTM(64)(embedded)
    backwards = LSTM(64, go_backwards=True)(embedded)

    merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
    after_dp = Dropout(0.5)(merged)
    output = Dense(1, activation='sigmoid')(after_dp)
    model = Model(input=sequence, output=output)

    #Compile the model
    model.compile(optimizer = 'adam', loss='mean_squared_error')
    batch_size = 16
    epoch = 0
    for epoch in range(n_epochs):
        
        for batch_idx in range(len(X_train) // batch_size):   
            model.train_on_batch(X_train[batch_idx * batch_size:(batch_idx + 1) * batch_size], Y_train[batch_idx * batch_size:(batch_idx + 1) * batch_size])
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        train_auc[i][epoch] = roc_auc_score(measured_affinity_less_than_500(Y_train),train_pred)
        test_auc[i][epoch] = roc_auc_score(measured_affinity_less_than_500(Y_test),test_pred)
train_aucs_mean=np.mean(train_aucs, axis=0)
test_aucs_mean = np.mean(test_aucs, axis=0)
plt.plot(np.arange(0,n_epochs-1,1), train_aucs_mean[0:n_epochs-1], 'r.', np.arange(0,n_epochs-1,1), test_aucs_mean[0:n_epochs-1],'b.')
plt.show()

In [43]:
import dis

In [45]:
dis.dis(measured_affinity_less_than_500)

  2           0 LOAD_CONST               1 (50000)
              3 LOAD_CONST               2 (1)
              6 LOAD_FAST                0 (Y)
              9 BINARY_SUBTRACT
             10 BINARY_POWER
             11 STORE_FAST               1 (IC50)

  3          14 LOAD_FAST                1 (IC50)
             17 LOAD_CONST               3 (500)
             20 COMPARE_OP               0 (<)
             23 RETURN_VALUE


In [67]:
class A(object):
    def __init__(self):
        self.x = 1
    def __lt__(self, other):
        return other is 3

In [68]:
A()

<__main__.A at 0x12e666d68>

In [69]:
A().__dict__

{'x': 1}

In [13]:
import matplotlib.pyplot as plt
t = np.arange(0, 5 ,0.5)
plt.plot(t, t, 'b.',t,t**2,'r.', t,t**3, 'g.')
plt.axis([0, 6, 0, 20])
plt.show()