In [1]:
import pandas as pd
import sklearn 
import scipy
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, train_test_split, cross_val_score, StratifiedKFold, LabelKFold, ShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from mhcflurry.amino_acid import common_amino_acids
from mhcflurry import dataset
from mhcflurry.dataset import Dataset
import matplotlib.pyplot as plt 
% matplotlib inline
import numpy as np
import math 
from mhcflurry import peptide_encoding, amino_acid
import statsmodels.api as sm
from keras import models, layers, optimizers
from keras.models import Sequential
from keras.utils.layer_utils import layer_from_config
from keras.layers import Dense, Dropout, TimeDistributed, Embedding, LSTM, Input, merge, Convolution1D, AveragePooling1D, Activation, Flatten
from keras.preprocessing import sequence
from keras.models import Model
from keras.engine import topology
import seaborn as sns

Using Theano backend.


In [2]:
def regroup_together(affinities, weights , original_indices):
    affinities = affinities.ravel()
    weights = weights.ravel()
    assert affinities.shape == weights.shape, "%s should be %s" % (affinities.shape, weights.shape)
    assert affinities.shape == original_indices.shape
    assert len(affinities) == len(affinities.ravel())
    weighted_affinities = (affinities * weights)
    index_set = set(original_indices)
    n_indices = len(index_set)
    result_order = {original_index: i for (i, original_index) in enumerate(sorted(index_set))}
    result = np.zeros(n_indices)
    for i, x in enumerate(weighted_affinities):
        result_idx = result_order[original_indices[i]]
        result[result_idx] += x
    return result
def slicing(dataset, index, i):
    return dataset.slice(index).kmer_index_encoding()[i]

def label_transform(array):
    result = 1-np.log(array)/math.log(50000)
    result[result<0]=0
    return result

In [3]:
df = pd.read_table("bdata.2009.mhci.public.1.txt")

df['log_meas']=1-np.log(df['meas'])/math.log(50000)
df['peptide_length'] = df['sequence'].str.len()


max_len=df['sequence'].str.len().max()
n_peptides = df['sequence'].count()

def amino_acid_hotshot_encoding(s):
    return common_amino_acids.hotshot_encoding([s],len(s)).flatten().astype(int)
df['hotshot_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_hotshot_encoding(seq))

def amino_acid_index_encoding(s, maxlen):
    a = 1+common_amino_acids.index_encoding([s],len(s)).flatten()
    return np.concatenate([a, np.zeros(maxlen-len(a),dtype=int)])
df['index_encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_index_encoding(seq, max_len))

def measured_affinity_less_than(Y,k):
    IC50 = 50000**(1-Y)
    return (IC50 < k).astype(int) 

def affinity_label(Y):
    return measured_affinity_less_than(Y,50) + measured_affinity_less_than(Y,500) + measured_affinity_less_than(Y,5000) + measured_affinity_less_than(Y,50000)

df['affinity_label'] = affinity_label(df['log_meas'])
df_h = df[df['mhc']=='HLA-A-0201'][['hotshot_encoded_peptides','index_encoded_peptides','log_meas','peptide_length']]
X = np.array(list(df_h['index_encoded_peptides']))
y = np.array(list(df_h['log_meas']))
y[y<0]=0

In [4]:
ds = Dataset.from_csv("bdata.2009.mhci.public.1.txt")
ds_h = ds.slice(ds.alleles == 'HLA-A0201')

In [5]:
np.count_nonzero(X[:,1] == 10 )

2805

In [6]:
y_soft = y
y_soft[(X[:,1] == 10) & (X[:,7] == 10)] = 1

In [7]:
y.shape

(9565,)

In [8]:
y_hard = np.zeros((9565,))
y_hard[(X[:,1] == 10) & (X[:,7] == 10)] = 1

In [9]:
index_soft = ds_h.to_dataframe().peptide.apply(lambda seq: (amino_acid_index_encoding(seq, max_len)[1] == 10) & (amino_acid_index_encoding(seq, max_len)[7] == 10))

In [10]:
type(index_soft)

pandas.core.series.Series

In [11]:
ds_h.to_dataframe().loc[index_soft,'affinity'] = 1
ds_h.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,species,allele,peptide_length,cv,peptide,inequality,affinity,sample_weight
allele,peptide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HLA-A0201,AAAKTPVIVV,human,HLA-A0201,10,TBD,AAAKTPVIVV,=,44318.996782,1.0
HLA-A0201,AAASSTHRKV,human,HLA-A0201,10,TBD,AAASSTHRKV,>,69444.444444,1.0
HLA-A0201,AACIVGCENV,human,HLA-A0201,10,TBD,AACIVGCENV,>,28175.000000,1.0
HLA-A0201,AADLTQIFEV,human,HLA-A0201,10,TBD,AADLTQIFEV,=,16.529414,1.0
HLA-A0201,AAERGPGQML,human,HLA-A0201,10,TBD,AAERGPGQML,=,2277.020021,1.0
HLA-A0201,AAGIGILTVI,human,HLA-A0201,10,TBD,AAGIGILTVI,=,5555.000000,1.0
HLA-A0201,AAGLQDCTML,human,HLA-A0201,10,TBD,AAGLQDCTML,>,50000.000000,1.0
HLA-A0201,AAITDAAVAV,human,HLA-A0201,10,TBD,AAITDAAVAV,=,5310.920858,1.0
HLA-A0201,AAITLVVISV,human,HLA-A0201,10,TBD,AAITLVVISV,=,368.627244,1.0
HLA-A0201,AANPHATFGV,human,HLA-A0201,10,TBD,AANPHATFGV,=,2950.000000,1.0


In [13]:
folds = 3
batch_size_nn = 16
batch_size_lstm = 16
hidden = 50
dropout_probability = 0.25

n_epochs = 40
epoch = 0

train_nn_aucs = np.zeros((folds,n_epochs))
test_nn_aucs = np.zeros((folds,n_epochs))

train_lstm_aucs = np.zeros((folds,n_epochs))
test_lstm_aucs = np.zeros((folds,n_epochs))

for i, (train_idx, test_idx) in enumerate(KFold(len(df_h),folds, shuffle=True)):

    
    nn = Sequential()
    nn.add(Embedding(input_dim = 21, input_length =9, output_dim= 32))
    nn.add(Flatten())
    nn.add(Dense(10, init='glorot_uniform', activation='sigmoid'))
    nn.add(Dense(1, init='glorot_uniform', activation='sigmoid'))
    
    nn.compile(optimizer = 'adam', loss='mean_squared_error')
    
    
    
    sequence = Input( shape= (26, ),dtype='int32')
    embedded = Embedding(input_dim = 21, input_length = 26, output_dim= 32, mask_zero = True)(sequence)
    forwards = LSTM(hidden)(embedded)
    backwards = LSTM(hidden, go_backwards=True)(embedded)

    merged = merge([forwards, backwards], mode = 'concat', concat_axis=-1)
    after_dp = Dropout(dropout_probability)(merged)
    output = TimeDistributed(Dense(1, activation = 'sigmoid')(after_dp))
    lstm = Model(input = sequence, output = output)
    
    lstm.compile(optimizer = 'adam', loss='mean_squared_error')
    
    X_train = slicing(ds_h,train_idx,0)
    y_train = label_transform(slicing(ds_h,train_idx,1))
    
    X_test = slicing(ds_h,test_idx,0)
    y_test = label_transform(slicing(ds_h,test_idx,1))
    
    weights_train = slicing(ds_h,train_idx,2)
    weights_test = slicing(ds_h,test_idx,2)
    
    original_indices_train = slicing(ds_h,train_idx,3)
    original_indices_test = slicing(ds_h,test_idx,3)
    
    train_real_labels = regroup_together(y_train, weights_train , original_indices_train)
    test_real_labels = regroup_together(y_test, weights_test , original_indices_test)
    
    
    for epoch in range(n_epochs):
        # nn
        nn.fit(X_train,y_train, sample_weight = weights_train, batch_size = batch_size_nn, nb_epoch=1)
        
        train_pred_grouped = regroup_together(nn.predict(X_train), weights_train, original_indices_train)
        test_pred_grouped = regroup_together(nn.predict(X_test), weights_test, original_indices_test)
        
        train_nn_auc = roc_auc_score(measured_affinity_less_than(train_real_labels,500), train_pred_grouped)     
        test_nn_auc = roc_auc_score(measured_affinity_less_than(test_real_labels,500), test_pred_grouped)
        
        train_nn_aucs[i][epoch]=train_nn_auc
        test_nn_aucs[i][epoch]=test_nn_auc
        print("NN: ", train_nn_auc, test_nn_auc, epoch)
        #lstm 
        lstm.fit(X[train_idx],y_soft[train_idx], batch_size = batch_size_lstm, nb_epoch=1)
    
        
        train_lstm_auc = roc_auc_score(measured_affinity_less_than(y_soft[train_idx],500),lstm.predict(X[train_idx]))
        test_lstm_auc = roc_auc_score(measured_affinity_less_than(y_soft[test_idx],500),lstm.predict(X[test_idx]))
        
        train_lstm_aucs[i][epoch]=train_lstm_auc
        test_lstm_aucs[i][epoch]=test_lstm_auc
        print("LSTM: ", train_lstm_auc, test_lstm_auc, epoch)
train_nn_aucs_mean = np.mean(train_nn_aucs, axis=0)
test_nn_aucs_mean = np.mean(test_nn_aucs, axis=0)
train_lstm_aucs_mean=np.mean(train_lstm_aucs, axis=0)
test_lstm_aucs_mean = np.mean(test_lstm_aucs, axis=0)

AttributeError: 'TensorVariable' object has no attribute 'uses_learning_phase'

In [19]:
def count_leucine(x):
    count = 0 
    for i in range(len(x)):
        count = count + (x[i]==10).astype(int)
    return count
def more_than_three_leucines(x):
    return (count_leucine(x) >= 3)

In [25]:
df['more than 3 leucines'] = df.index_encoded_peptides.apply(lambda x: (count_leucine(x) >= 3))
df['more than 3 leucines']

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
137624     True
137625    False
137626    False
137627    False
137628    False
137629    False
137630    False
137631    False
137632    False
137633    False
137634    False
137635    False
137636    False
137637    False
137638    False
137639    False
137640    False
137641    False
137642    False
137643    False
137644    False
137645    False
137646    False
137647    False
137648    False
137649    False
137650     True
137651    False
137652    False
137653    False
Name: more than 3 leucin

In [39]:
leucine_index = np.array([i for i in range(len(df['more than 3 leucines'])) if (df['more than 3 leucines'][i] == True)])

In [40]:
y[leucine_index] 

IndexError: index 9680 is out of bounds for axis 1 with size 9565