In [25]:
%matplotlib inline 

import matplotlib.pyplot as plt 
import pandas 
from sklearn import preprocessing, svm, metrics, pipeline, cross_validation, decomposition, grid_search

## SVM for expression using enzyme design features 

In [26]:
# This cell is commented out. When enabled it creates the feature_sets/
# directory and then, for every entry of feature_set, writes
# feature_sets/<feature>.csv containing, for each 'name' group, the mean of
# the 10 rows with the lowest value of that feature.
# A later cell reads feature_sets/total_score.csv, so that file must already
# exist on disk for the rest of the notebook to run.
# NOTE(review): the input path below is an absolute, machine-specific path.

# # features 

# ! mkdir feature_sets

# feature_set = [ str( i ) for i in [ 
#     u'total_score', u'fa_rep', u'hbond_sc', u'tot_pstat_pm', u'tot_nlpstat_pm', u'tot_burunsat_pm', u'tot_hbond_pm', u'tot_NLconts_pm', u'tot_nlsurfaceE_pm', 
#     u'SR_1_total_score', u'SR_1_fa_rep', u'SR_1_hbond_sc', u'SR_1_hbond_pm', u'SR_1_burunsat_pm', u'SR_1_pstat_pm', u'SR_1_nlpstat_pm', 
#     u'SR_2_total_score', u'SR_2_fa_rep', u'SR_2_hbond_sc', u'SR_2_hbond_pm', u'SR_2_burunsat_pm', u'SR_2_pstat_pm', u'SR_2_nlpstat_pm', 
#     u'SR_3_total_score', u'SR_3_fa_rep', u'SR_3_hbond_sc', u'SR_3_hbond_pm', u'SR_3_burunsat_pm', u'SR_3_pstat_pm', u'SR_3_nlpstat_pm', 
#     u'SR_4_total_score', u'SR_4_fa_rep', u'SR_4_hbond_sc', u'SR_4_hbond_pm', u'SR_4_burunsat_pm', u'SR_4_pstat_pm', u'SR_4_nlpstat_pm', 
#     u'SR_5_total_score', u'SR_5_fa_rep', u'SR_5_hbond_sc', u'SR_5_interf_E_1_2', u'SR_5_dsasa_1_2', u'SR_5_hbond_pm', u'SR_5_burunsat_pm'   
# ] ]


# for eature in feature_set:
#     f = pandas.read_csv( '/Users/alex/Documents/bagel-benchmark/data_sets/calculated/enzyme_design_talaris_2014.csv', usecols=feature_set+['name'] )
#     f.groupby( 'name' ).apply( lambda x: x.sort_values( eature ).head( 10 ).mean() ).to_csv( 'feature_sets/{}.csv'.format( eature ) )

In [27]:
# Experimental labels: only the 'expression' column of ../data_set.csv,
# indexed by the 'mutant' column.
E = pandas.read_csv(
    '../data_set.csv',
    usecols=[ 'mutant', 'expression' ],
    index_col='mutant',
)

# Pre-computed features from feature_sets/total_score.csv, indexed by 'name'.
F = pandas.read_csv(
    'feature_sets/total_score.csv',
    index_col='name',
)

# Attach the label to each feature row by index (left join on the feature
# index), then keep only the rows in which no column is missing.
J = F.join( E, how='left' ).dropna( axis=0, how='any' )

In [28]:
# machine learning setup 

# X holds every column of the joined table except the label;
# y is the 'expression' label column.
X = J.drop( 'expression', axis=1 ) 
y = J[ 'expression' ] 

# Single-argument print( ... ) produces the same output under Python 2's print
# statement and is also valid Python 3, unlike the bare `print x` form.
print( X.shape )
print( y.shape )

(107, 44)
(107,)


In [None]:
# Three chained stages: standardise the features, project them with PCA,
# then classify with a support vector classifier.
pln = pipeline.Pipeline( steps=[
    ( 'scaler', preprocessing.StandardScaler() ),
    ( 'pca', decomposition.PCA() ),
    ( 'svm', svm.SVC() ),
] )

# Hyper-parameters to search, addressed as <step name>__<parameter>.
param_grid = dict(
    pca__n_components=[ 2, 4, 8 ],
    pca__whiten=[ True, False ],
    svm__kernel=[ 'rbf', 'linear' ],
    svm__class_weight=[ 'balanced', None ],
    svm__C=[ 0.003, 0.03, 0.3, 3, 30, 300, 3000 ],
    svm__gamma=[ 0.003, 0.03, 0.3, 3, 30, 300, 3000 ],
)

# 6-fold stratified cross-validation built from the labels in y.
skf = cross_validation.StratifiedKFold( y, n_folds=6 )

# Exhaustive search over param_grid, scoring each candidate on the folds above.
gs = grid_search.GridSearchCV( estimator=pln, param_grid=param_grid, cv=skf )
gs.fit( X, y )

In [None]:
# Pipeline configuration selected by the grid search.
print( gs.best_estimator_ )

# Mean cross-validated score of the selected configuration. This is the number
# to quote for model quality on the training set.
print( 'best cross-validation score={}'.format( gs.best_score_ ) )

# NOTE: the two outputs below are computed on X, y -- the same rows the search
# was fit on -- so the score is a training-set score and will be optimistic.
print( gs.best_estimator_.predict( X ) )
print( gs.best_estimator_.score( X, y ) )

In [None]:
# now for the blind data 

# Read only the index and label columns, mirroring how the training labels are
# loaded (usecols=['mutant','expression']). Without this, any other column in
# blind_data.csv would end up in X_test (giving it a different set of columns
# than the training X) and would also take part in dropna() below.
E = pandas.read_csv( '../blind_data.csv', index_col='name', usecols=[ 'name', 'expression' ] ) 
# Same feature file as used for training, indexed by 'name'.
F = pandas.read_csv( 'feature_sets/total_score.csv', index_col='name' )

# Attach the blind labels to the feature rows by index; drop incomplete rows.
blind = F.join( E ).dropna()

# Features (everything except the label) and the label itself.
X_test = blind.drop( 'expression', axis=1 )
y_test = blind.expression

In [None]:
# Predicted labels for the blind set from the fitted grid search; as the last
# expression in the cell, the resulting array is displayed as the cell output.
gs.predict( X_test ) 

In [None]:
# metrics.roc_curve returns ( fpr, tpr, thresholds ) in that order. The names
# here now match that order; previously they were swapped (and then passed to
# plt.plot in swapped order, so the figure itself is unchanged).
fpr, tpr, thresh = metrics.roc_curve( y_test, gs.decision_function( X_test ) )

# ROC curve: false positive rate on x, true positive rate on y.
plt.figure( figsize=( 4,4 ) ) 
plt.plot( fpr, tpr, label='blind test set', c='orange' ) 
plt.legend(loc='lower right')
plt.xlabel( 'False positive rate' )
plt.ylabel( 'True positive rate' ) 

# Score of the refit best estimator on the blind set. Single-argument
# print( ... ) behaves identically on Python 2 and is valid Python 3.
print( 'score={}'.format( gs.score( X_test, y_test ) ) )

-----------

## Predicting Tm 

In [25]:
# Full experimental table, indexed by 'mutant'.
df = pandas.read_csv( '../data_set.csv', index_col='mutant' ) 
# Calculated features, indexed by 'name'.
# NOTE(review): absolute, machine-specific path -- this only runs on the
# original author's machine; consider a path relative to the notebook.
features = pandas.read_csv( '/Users/alex/Documents/bagel-benchmark/data_sets/calculated/enzyme_design_talaris_2014.csv', index_col='name' )

# NOTE: this rebinds J, which the expression-SVM section above also uses for
# its joined table; re-running those earlier cells after this one without
# re-running their loader would pick up this frame instead.
J = df.join( features )
# Copy the index into a column so it can serve as the groupby key.
J[ 'mutant_name' ] = J.index 
# For each mutant: take the 10 rows with the lowest total_score and average
# them. numeric_only=True makes the exclusion of non-numeric columns (such as
# the mutant_name key itself) explicit instead of relying on pandas silently
# dropping them, which newer pandas versions no longer do.
G = J.groupby( 'mutant_name' ).apply( lambda x: x.sort_values( 'total_score' ).head( 10 ).mean( numeric_only=True ) ) 

# Column names that carry no 'SR_<n>_' prefix.
_overall_columns = [
    u'total_score', u'fa_rep', u'hbond_sc',
    u'tot_pstat_pm', u'tot_nlpstat_pm', u'tot_burunsat_pm',
    u'tot_hbond_pm', u'tot_NLconts_pm', u'tot_nlsurfaceE_pm',
]

# Suffixes shared by the SR_1 .. SR_4 column groups, in their listed order.
_sr_suffixes = [
    u'total_score', u'fa_rep', u'hbond_sc', u'hbond_pm',
    u'burunsat_pm', u'pstat_pm', u'nlpstat_pm',
]

# SR_5 has its own set of suffixes.
_sr5_suffixes = [
    u'total_score', u'fa_rep', u'hbond_sc', u'interf_E_1_2',
    u'dsasa_1_2', u'hbond_pm', u'burunsat_pm',
]

# Final column list: 'tm' first, then the unprefixed columns, then the
# SR_1 .. SR_4 groups, then the SR_5 group.
feature_set = [ u'tm' ] + list( _overall_columns )
for _sr_index in range( 1, 5 ):
    feature_set.extend( u'SR_{}_{}'.format( _sr_index, _suffix ) for _suffix in _sr_suffixes )
feature_set.extend( u'SR_5_{}'.format( _suffix ) for _suffix in _sr5_suffixes )