### Predicting expression from calculated structural features 

Use of 50 Rosetta features (enzyme design) to predict soluble expression of BglB. 

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA
from sklearn.svm import SVC 
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc, precision_recall_curve, precision_score
from sklearn.pipeline import Pipeline
from numpy import log 
import pandas

Read in our features and experimental data. 

In [3]:
# Experimental expression data; the first CSV column becomes the index.
df = pandas.read_csv( 'data_sets/experimental/expr_144_mutants.csv', index_col=0 )

# Calculated features (enzyme design); rows are grouped by their `name` column below.
f = pandas.read_csv( 'data_sets/calculated/enzyme_design_talaris_2014.csv' )

# Collapse to one row per `name`: average the 10 rows with the lowest total_score.
# numeric_only=True makes the column selection explicit -- older pandas silently
# dropped non-numeric columns from the mean, newer pandas raises on them instead.
f = f.groupby( 'name' ).apply(
    lambda x: x.sort_values( by='total_score' ).head( 10 ).mean( numeric_only=True )
)

# Join the averaged features onto the expression data by index and discard
# any row that ends up with a missing value.
df = df.join( f ).dropna()

df.shape

(133, 61)

We will use 44 features and 133 protein models (each the average of the 10 lowest-scoring models by total_score) to train an SVM classifier.

In [4]:
# Columns of `df` used as classifier inputs, grouped by name prefix.
features = [
    # columns without an SR_ prefix
    u'total_score', u'fa_rep', u'hbond_sc',
    u'tot_pstat_pm', u'tot_nlpstat_pm', u'tot_burunsat_pm',
    u'tot_hbond_pm', u'tot_NLconts_pm', u'tot_nlsurfaceE_pm',

    # SR_1
    u'SR_1_total_score', u'SR_1_fa_rep', u'SR_1_hbond_sc', u'SR_1_hbond_pm',
    u'SR_1_burunsat_pm', u'SR_1_pstat_pm', u'SR_1_nlpstat_pm',

    # SR_2
    u'SR_2_total_score', u'SR_2_fa_rep', u'SR_2_hbond_sc', u'SR_2_hbond_pm',
    u'SR_2_burunsat_pm', u'SR_2_pstat_pm', u'SR_2_nlpstat_pm',

    # SR_3
    u'SR_3_total_score', u'SR_3_fa_rep', u'SR_3_hbond_sc', u'SR_3_hbond_pm',
    u'SR_3_burunsat_pm', u'SR_3_pstat_pm', u'SR_3_nlpstat_pm',

    # SR_4
    u'SR_4_total_score', u'SR_4_fa_rep', u'SR_4_hbond_sc', u'SR_4_hbond_pm',
    u'SR_4_burunsat_pm', u'SR_4_pstat_pm', u'SR_4_nlpstat_pm',

    # SR_5 (carries two interface columns; no pstat/nlpstat columns are listed)
    u'SR_5_total_score', u'SR_5_fa_rep', u'SR_5_hbond_sc',
    u'SR_5_interf_E_1_2', u'SR_5_dsasa_1_2',
    u'SR_5_hbond_pm', u'SR_5_burunsat_pm',
]

# Design matrix and target column.
X = df[ features ]
y = df[ 'expression' ]

( X.shape, y.shape )

((133, 44), (133,))

In [11]:
# production way

# Scale -> PCA -> SVM, searched as one pipeline so every step is tuned together.
pln = Pipeline([
    ( 'scaler', StandardScaler() ),
    ( 'pca', PCA() ),
    ( 'svc', SVC() ),
])

# Candidate values, reused for both C and (rbf only) gamma.
L = [ .03, .3, 3, 30, 300 ]

# Settings searched for every kernel.
common_grid = {
    'svc__C': L,
    'svc__probability': [ True ],

    'pca__n_components': [ 2, 4, 6, 10 ],
    'pca__whiten': [ True ],
}

# One sub-grid per kernel. The linear kernel ignores gamma, so crossing it with
# every gamma value only repeats identical fits; gamma is searched for rbf alone.
# This cuts the search from 200 to 120 candidates (x10 folds each).
param_grid = [
    dict( common_grid, svc__kernel=[ 'linear' ] ),
    dict( common_grid, svc__kernel=[ 'rbf' ], svc__gamma=L ),
]

gs = GridSearchCV( pln, param_grid, cv=10 )

Now to train the classifier and extract some info that we want 

In [12]:
# gs.fit( X, y ) 
# probs = gs.decision_function( X ) 

In [13]:
gs.fit( X, y )

# Decision-function value of the refit best model for every row of X.
# The ROC and precision-recall cells below read `probs`; without this assignment
# they fail with a NameError on a fresh kernel.
# NOTE: despite the name these are signed decision scores, not probabilities, and
# they are computed on the same rows the model was fit to, so curves drawn from
# them are in-sample (optimistic).
probs = gs.decision_function( X )

KeyboardInterrupt: 

In [None]:
# Mean cross-validated score of the winning parameter combination, and the combination itself.
print(gs.best_score_)
print(gs.best_params_)

# GridSearchCV refits the best pipeline on all of (X, y) by default, so
# best_estimator_ is already trained; fitting it again is unnecessary.
clf = gs.best_estimator_

# Scored on the data the model was fit to, so this is training accuracy,
# not a held-out test score (gs.best_score_ above is the cross-validated figure).
print('Training accuracy: %.3f' % clf.score(X, y))

#### ROC curve 

ROC curve for evaluating classifier performance 

In [None]:
# False/true positive rates over all score thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve( y, probs )
roc_auc = auc(fpr, tpr)

# One "name=value" line per tuned parameter, used as the figure title.
title_text = ''.join( '{}={}\n'.format( name, value ) for name, value in gs.best_params_.items() )

fig, ax = plt.subplots( figsize=(4,4) )

# Classifier curve plus the diagonal expected from random guessing.
ax.plot(fpr, tpr, label='score = %.2f' % (roc_auc))
ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='luck')

# Small margin around the unit square so the curve does not sit on the frame.
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(title_text)
ax.legend(loc="lower right")

plt.show()

#### Precision-recall curve 

Xiaokang: PRC may be less optimistic than ROC

In [None]:
# Precision and recall over all score thresholds. The thresholds get a real name:
# assigning to `__` would overwrite IPython's output-history variable.
precision, recall, pr_thresholds = precision_recall_curve( y, probs )

# Area under the precision-recall curve, shown in the legend like the ROC score.
pr_auc = auc( recall, precision )

plt.figure( figsize=(4,4) )
# The label gives plt.legend() something to show; without it the legend is empty
# and matplotlib warns that no labelled artists were found.
plt.plot( recall, precision, label='score = %.2f' % (pr_auc) )
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="lower left")
plt.show()

#### Random things I find on the Internet 

[This first one](http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html) had this cool figure 

![](http://scikit-learn.org/stable/_images/plot_sparse_recovery_003.png)

from the scikit-learn docs; it would be interesting to see whether it is useful here.

In [None]:
# Author: Alexandre Gramfort and Gael Varoquaux
# License: BSD 3 clause

import warnings

import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg

from sklearn.linear_model import RandomizedLasso, lasso_stability_path, LassoLarsCV
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.utils.extmath import pinvh
from sklearn.utils import ConvergenceWarning


# Disabled call: would compute the stability path on this notebook's X and y.
#alpha_grid, scores_path = lasso_stability_path( X, y )

# IPython help lookup (shows the docstring); interactive leftover, not valid plain Python.
lasso_stability_path?

# NOTE(review): the commented-out plot below uses `coef` and `mi`, which are not
# defined anywhere in the visible notebook -- presumably they come from the
# scikit-learn example's setup code. Confirm and define them before re-enabling.

# plt.figure()

# # We plot the path as a function of alpha/alpha_max to the power 1/3: the
# # power 1/3 scales the path less brutally than the log, and enables to
# # see the progression along the path

# hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
# hb = plt.plot(alpha_grid[1:] ** .333, scores_path[coef == 0].T[1:], 'k')

# ymin, ymax = plt.ylim()

# plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
# plt.ylabel('Stability score: proportion of times selected')
# plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
# plt.axis('tight')
# plt.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'), loc='best')


In [None]:
# IPython line magic: lists the names currently defined in the interactive namespace.
%who 