In [1]:
import pandas as pd
import sklearn 
import scipy
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold, train_test_split, cross_val_score, StratifiedKFold, LabelKFold, ShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from mhcflurry.amino_acid import common_amino_acids
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
import matplotlib.pyplot as plt 
% matplotlib inline
import numpy as np
import math 
from mhcflurry import peptide_encoding, amino_acid
import statsmodels.api as sm

Using Theano backend.


### preparing the data

In [2]:
df = pd.read_table("bdata.2009.mhci.public.1.txt")
df['log_meas']=1-np.log(df['meas'])/math.log(50000)
df['peptide_length'] = df['sequence'].str.len()

In [38]:
max_len=df['sequence'].str.len().max()
n_peptides = df['sequence'].count()
def amino_acid_encoding(s):
    return common_amino_acids.hotshot_encoding([s],len(s)).flatten().astype(int)
df['encoded_peptides'] = df.sequence.apply(lambda seq: amino_acid_encoding(seq))

In [39]:
def measured_affinity_less_than(Y,k):
    IC50 = 50000**(1-Y)
    return (IC50 < k).astype(int) 
def affinity_label(Y):
    return measured_affinity_less_than(Y,50) + measured_affinity_less_than(Y,500) + measured_affinity_less_than(Y,5000) + measured_affinity_less_than(Y,50000)

In [40]:
df['affinity_label'] = affinity_label(df['log_meas'])
df.head(10)

Unnamed: 0,species,mhc,peptide_length,cv,sequence,inequality,meas,log_meas,encoded_peptides,affinity_label
0,,ELA-A1,12,TBD,GSQKLTTGNCNW,=,605.0,0.408007,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
1,,ELA-A1,12,TBD,HVKDETNTTEYW,=,880.0,0.373377,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
2,,ELA-A1,12,TBD,LVEDVTNTAEYW,=,170.0,0.525332,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3
3,,ELA-A1,12,TBD,RVEDKTNTAEYW,=,70.0,0.60734,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",3
4,,ELA-A1,12,TBD,RVEDVKNTAEYW,=,65.0,0.614189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",3
5,,ELA-A1,12,TBD,RVEDVTLTAEYW,=,150.0,0.5369,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",3
6,,ELA-A1,12,TBD,RVEDVTNKAEYW,=,80.0,0.594998,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",3
7,,ELA-A1,12,TBD,RVEDVTNTAELW,=,25.0,0.702501,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",4
8,,ELA-A1,12,TBD,RVEDVTNTAEYL,=,97.0,0.57719,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",3
9,,ELA-A1,12,TBD,RVEDVTNTAEYW,=,39.0,0.661401,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",4


In [41]:
df.groupby('mhc').size().nlargest(11)

mhc
HLA-A-0201    9565
HLA-A-0301    6141
HLA-A-0203    5542
HLA-A-1101    5399
HLA-A-0206    4827
HLA-A-3101    4796
HLA-A-6802    4768
HLA-A-0202    3919
HLA-A-0101    3725
HLA-B-0702    3412
H-2-Kb        3407
dtype: int64

### only retaining 9 mers

In [42]:
df_9 = df[df['peptide_length']==9]
df_9 = df_9.reindex(np.random.permutation(df_9.index))
df_9.groupby('mhc').size().nlargest(11)

mhc
HLA-A-0201    6961
HLA-A-0301    4601
HLA-A-0203    3937
HLA-A-1101    3862
HLA-A-3101    3309
HLA-A-0206    3223
HLA-A-6802    3188
HLA-A-0101    3169
HLA-B-1501    3142
HLA-B-0702    2974
HLA-B-5801    2444
dtype: int64

# AUC scores

In [43]:
def auc_scorer(estimator, X, y):
    return sklearn.metrics.roc_auc_score(measured_affinity_less_than(y,500),estimator.predict(X))

## HLA-A-0201

In [50]:
df_h = df_9[df_9['mhc']=='HLA-A-0201'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [51]:
param_grid = {'n_estimators': [330, 331,332], 'max_depth': [1,2,3], 'learning_rate': [0.327, 0.328, 0.329]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer)
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.95366
Best cross-validation score: 0.94863
Best parameters:  {'n_estimators': 330, 'max_depth': 2, 'learning_rate': 0.327}




## HLA-A-0301

In [12]:
df_h = df_9[df_9['mhc']=='HLA-A-0301'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [20]:
param_grid = {'n_estimators': [323,324,325], 'max_depth': [1,2,3], 'learning_rate': [0.326, 0.327, 0.328]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer)
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.93059
Best cross-validation score: 0.91897
Best parameters:  {'n_estimators': 324, 'max_depth': 2, 'learning_rate': 0.327}




## HLA-A-0203

In [48]:
df_h = df_9[df_9['mhc']=='HLA-A-0203'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [49]:
param_grid = {'n_estimators': [333,334,335], 'max_depth': [1,2,3], 'learning_rate': [0.327, 0.328, 0.329]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer) 
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.94103
Best cross-validation score: 0.93422
Best parameters:  {'n_estimators': 334, 'max_depth': 1, 'learning_rate': 0.329}




## HLA-A-1101

In [36]:
df_h = df_9[df_9['mhc']=='HLA-A-1101'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [37]:
param_grid = {'n_estimators': [332,333,334], 'max_depth': [1,2,3], 'learning_rate': [0.328, 0.329,0.33]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer) 
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.92655
Best cross-validation score: 0.93749
Best parameters:  {'n_estimators': 333, 'max_depth': 2, 'learning_rate': 0.329}




## HLA-A-3101

In [46]:
df_h = df_9[df_9['mhc']=='HLA-A-3101'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [47]:
param_grid = {'n_estimators': [332,333,334,335], 'max_depth': [1,2,3], 'learning_rate': [0.324, 0.325, 0.326,0.327]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer) 
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.93218
Best cross-validation score: 0.93146
Best parameters:  {'n_estimators': 334, 'max_depth': 2, 'learning_rate': 0.326}




## HLA-A-0206

In [40]:
df_h = df_9[df_9['mhc']=='HLA-A-0206'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [43]:
param_grid = {'n_estimators': [333,334,335], 'max_depth': [1,2,3], 'learning_rate': [0.328, 0.329,0.33]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer) 
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.89687
Best cross-validation score: 0.89808
Best parameters:  {'n_estimators': 334, 'max_depth': 2, 'learning_rate': 0.329}




## HLA-A-6802

In [21]:
df_h = df_9[df_9['mhc']=='HLA-A-6802'][['encoded_peptides','log_meas']]
X = pd.DataFrame(list(df_h['encoded_peptides'])).values
y = pd.DataFrame(list(df_h['log_meas'])).values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [24]:
param_grid = {'n_estimators': [333,334,335], 'max_depth': [1,2,3], 'learning_rate': [0.327, 0.328,0.329]}
grid_search = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=5, scoring = auc_scorer) 
grid_search.fit(X_train,y_train[:,0])
print("test set score: %.5f" % grid_search.score(X_test,y_test[:,0]))
print("Best cross-validation score: %.5f" %grid_search.best_score_)
# the latter attributes the mean cross-validation score
print("Best parameters: ", grid_search.best_params_)

test set score: 0.91572
Best cross-validation score: 0.91443
Best parameters:  {'n_estimators': 334, 'max_depth': 2, 'learning_rate': 0.328}




In [36]:
pd.DataFrame({'n_estimators': [331,324,334,333,334,334,334], 'max_depth': [2,2,2,2,2,2,2], 'learning_rate': [0.328,0.327,0.328,0.329, 0.326, 0.329,0.328], 'best cv': [0.95,0.91897,0.93288,0.93749,0.93146,0.89808,0.91443], 'test': [0.95,0.93059,0.94190,0.92655,0.93218,0.89687,0.91572], '9mers': [6961,4601,3937,3862,3309,3223,3188]}, index = ['HLA-A-0201','HLA-A-0301','HLA-A-0203','HLA-A-1101','HLA-A-3101','HLA-A-0206','HLA-A-6802'])

Unnamed: 0,9mers,best cv,learning_rate,max_depth,n_estimators,test
HLA-A-0201,6961,0.95,0.328,2,331,0.95
HLA-A-0301,4601,0.91897,0.327,2,324,0.93059
HLA-A-0203,3937,0.93288,0.328,2,334,0.9419
HLA-A-1101,3862,0.93749,0.329,2,333,0.92655
HLA-A-3101,3309,0.93146,0.326,2,334,0.93218
HLA-A-0206,3223,0.89808,0.329,2,334,0.89687
HLA-A-6802,3188,0.91443,0.328,2,334,0.91572


# F1- score