In [31]:
%matplotlib inline

from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from scipy.stats import pearsonr
import matplotlib.pyplot as plt 
import pandas
from numpy import log10, inf, linspace, arctan, nan

In [32]:
# clean up experimental data 
# and establish our three targets 

df = pandas.read_csv( '/Users/alex/Documents/bagel-thermal/data_sets/experimental_data/thermo_paper_data_set.csv', index_col=0 )   

df[ 'kcat' ] = log10( df.kcat / 880 ) 
df[ 'km' ] = log10( df.km / 5 )
df[ 'kcatkm' ] = log10( df.kcatkm / 174000 )
df[ 'tm' ] = df.tm - 39.8 

targets = [
    ( 'kcat', df [ [ 'kcat' ] ] ), 
    ( 'km', df[ [ 'km' ] ] ), 
    ( 'kcatkm', df[ [ 'kcatkm' ] ] ),
    ( 'tm', df[ [ 'tm' ] ] ),
]

In [39]:
# read in enzyme design features from PLOS paper 

features = [ 
    u'total_score', u'fa_rep', u'hbond_sc', u'tot_pstat_pm',
       u'tot_nlpstat_pm', u'tot_burunsat_pm', u'tot_hbond_pm',
       u'tot_NLconts_pm', u'tot_nlsurfaceE_pm', u'tot_total_charge',
       u'tot_total_pos_charges', u'tot_total_neg_charges', 
       u'SR_1_total_score', u'SR_1_fa_rep', u'SR_1_hbond_sc',
       u'SR_1_hbond_pm', u'SR_1_burunsat_pm',
       u'SR_1_pstat_pm', u'SR_1_nlpstat_pm', u'SR_2_total_score',
       u'SR_2_fa_rep', u'SR_2_hbond_sc',  u'SR_2_hbond_pm',
       u'SR_2_burunsat_pm', u'SR_2_pstat_pm', u'SR_2_nlpstat_pm',
       u'SR_3_total_score', u'SR_3_fa_rep', u'SR_3_hbond_sc', 
       u'SR_3_hbond_pm', u'SR_3_burunsat_pm', u'SR_3_pstat_pm',
       u'SR_3_nlpstat_pm', u'SR_4_total_score', u'SR_4_fa_rep',
       u'SR_4_hbond_sc', u'SR_4_hbond_pm',
       u'SR_4_burunsat_pm', u'SR_4_pstat_pm', u'SR_4_nlpstat_pm',
       u'SR_5_total_score', u'SR_5_fa_rep', u'SR_5_hbond_sc',
       u'SR_5_interf_E_1_2', u'SR_5_dsasa_1_2', u'SR_5_hbond_pm',
       u'SR_5_burunsat_pm' 
] 

from sklearn import preprocessing, cross_validation, linear_model

f = pandas.read_csv( '../data_sets/rosetta/enzyme_design_talaris_2013.csv', index_col=0 )
f = f.groupby( 'name' ).apply( lambda x: x.sort_values( 'total_score' ).head( 10 ).mean() ) 
f = f[ features ]

plt.figure( figsize=( 12, 4 ) ) 

for i, ( target_name, target ) in enumerate( targets ):
    J = target.join( f ).dropna().replace( [ -inf, inf ], -5 )  
    y = J[ target_name ]
    X = J.drop( target_name, axis=1 ) 

    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform( X ) 
    
    print X.shape
    print y.shape 
    
    L = [ .03, .3, .5, .75, .9, .95, .99 ]
    skf = cross_validation.StratifiedKFold?
    en = linear_model.ElasticNetCV( l1_ratio=L )
    preds = cross_validation.cross_val_predict( en, X, y, cv=skf ) 
    
    plt.subplot( 1, 3, i+1 ) 
    plt.scatter( y, preds, c='k' )
    plt.xlabel( 'Actual' ) 
    plt.ylabel( 'Predicted' ) 

(67, 47)
(67,)


ValueError: Found array with 0 sample(s) (shape=(0, 47)) while a minimum of 1 is required.

<matplotlib.figure.Figure at 0x1172920d0>

In [None]:
    skf = cross_validation.StratifiedKFold

In [None]:


# wt = df.loc[ 'BglB' ]
# df.drop( [ 'BglB' ], inplace=True )
# df = log( df / wt )


# clean up enzyme design features

f = pandas.read_csv( 'data_sets/scorefile.csv' ) 
f = f.groupby( 'name' ).apply( lambda x: x.mean() ) 

features = [ u'total_score', u'fa_rep', u'hbond_sc', u'tot_pstat_pm',
       u'tot_nlpstat_pm', u'tot_burunsat_pm', u'tot_hbond_pm',
       u'tot_NLconts_pm', u'tot_nlsurfaceE_pm', u'tot_total_charge',
       u'tot_total_pos_charges', u'tot_total_neg_charges', u'tot_seq_recovery',
       u'SR_1_total_score', u'SR_1_fa_rep', u'SR_1_hbond_sc',
       u'SR_1_hbond_pm', u'SR_1_burunsat_pm',
       u'SR_1_pstat_pm', u'SR_1_nlpstat_pm', u'SR_2_total_score',
       u'SR_2_fa_rep', u'SR_2_hbond_sc', u'SR_2_all_cst', u'SR_2_hbond_pm',
       u'SR_2_burunsat_pm', u'SR_2_pstat_pm', u'SR_2_nlpstat_pm',
       u'SR_3_total_score', u'SR_3_fa_rep', u'SR_3_hbond_sc', 
       u'SR_3_hbond_pm', u'SR_3_burunsat_pm', u'SR_3_pstat_pm',
       u'SR_3_nlpstat_pm', u'SR_4_total_score', u'SR_4_fa_rep',
       u'SR_4_hbond_sc', u'SR_4_hbond_pm',
       u'SR_4_burunsat_pm', u'SR_4_pstat_pm', u'SR_4_nlpstat_pm',
       u'SR_5_total_score', u'SR_5_fa_rep', u'SR_5_hbond_sc', u'SR_5_all_cst',
       u'SR_5_interf_E_1_2', u'SR_5_dsasa_1_2', u'SR_5_hbond_pm',
       u'SR_5_burunsat_pm' ] 

plt.figure( figsize=( 9, 3 ) ) 

for i, ( name, target ) in enumerate( targets ):
    
    # read in data 
    J = f.join( target ).replace( [ -inf, inf ], nan ).dropna()
    X = J[ features ] 
    y = J[ name ] 
    print len( y ) 

    # init ML models
    
    L = [ .5, .75, .9, .95 ] 
    net = ElasticNetCV( l1_ratio=L )
    bag = BaggingRegressor( net, n_estimators=100 ) 
    pln = Pipeline([
        ( 'scaler', StandardScaler() ), 
        ( 'bag', bag ) , 
    ])
    
    # cross-validate the bag 
    preds = cross_val_predict( pln, X, y, cv=10 ) 
    R = pearsonr( y, preds )  

    # plot actual versus predicted for training set 
    plt.subplot( 1, 3, i + 1 ) 
    plt.scatter( y, preds, marker='.' ) 
    plt.xlabel( 'Actual' ) 
    plt.title( '{0} PCC={1:.2f}'.format( name, R[0] ) )