# Using features from the FeaturesReporter 

In [None]:
%%timeit -r1 -n1

import sqlite3
import pandas
from sklearn.linear_model import ElasticNetCV
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import BaggingRegressor
from scipy.stats import pearsonr
from matplotlib import use; use('Agg')
import matplotlib.pyplot as plt 
%matplotlib inline

# this is the new benchmark script 

def low_10( df ):
    """Return the ten rows of *df* with the smallest ``total_score``.

    The frame is sorted ascending by its ``total_score`` column and the
    first ten rows are returned; fewer rows come back when *df* has fewer
    than ten.
    """
    ranked = df.sort_values( by='total_score' )
    return ranked.head( 10 )

for feature_db in [ 'features.db3' ]:
    
    print '------------------------------------------------------------------------'
    print 'Results for features in "{}"'.format( feature_db )
    print '------------------------------------------------------------------------'
    
    con = sqlite3.connect( feature_db )
    for i in [ 'interfaces', 'interface_sides' ]:
        f = pandas.read_sql_query( 'select * from {}'.format( i ), con, index_col='struct_id' )
        names = pandas.read_sql_query( 'select * from structures', con, index_col='struct_id' )
        feats = f.join( names ).dropna()
        #print f.head()
        #print names.head()
        #print feats.head()

        feats['shlag'] = feats.tag.str[:-10]
        feats.set_index( 'shlag', inplace=True )

        train_set = pandas.read_csv( '../data/train_set.csv' )
        train_set.set_index( 'mutant', inplace=True )
        fig, ax = plt.subplots( ncols=3, nrows=1, figsize=(10,3) )
        constants = [ 'kcat', '1/km', 'kcat/km' ]
        for const_index, constant in enumerate( constants ): 

            J = feats.join( train_set ).dropna()
            y = J[ constant ]
            X = J.ix[:,'dSASA':'batch_id'] 

            net = ElasticNetCV( normalize=True, selection='random' )

            params_grid = {
                'cv': [ 10 ], 
                'l1_ratio': [ 0.001, 0.01, 0.1, 0.5, 0.9 ], 
            }

            print 'Training on constant "{}" ...'.format( constant ) , 
            grid = GridSearchCV( net, params_grid )
            bag = BaggingRegressor( base_estimator=grid, n_estimators=1000, bootstrap_features=True )
            bag.fit( X, y )

            print 'done'
            print 'Calculating predictions for "{}" ...'.format( constant ),
            preds = bag.predict( X )

            pcc = pearsonr( preds, y )
            score = bag.score( X, y )
            #params = bag.get_params()

            print 'done', 
            #print '------------------------------------------------------------------------'
            print '\tPCC: {:.2f}, model score: {:.3f}'.format( pcc[0], score )
            ax[ const_index ].scatter( preds, y, alpha=0.3, marker='.', color='magenta' )
            ax[ const_index ].set_xlabel( 'Predicted {}'.format( constant ) )
            ax[ const_index ].set_ylabel( 'Actual' )
            plt.tight_layout()

        fig.suptitle( feature_db )
        fig.tight_layout()
        fig.show()

------------------------------------------------------------------------
Results for features in "features.db3"
------------------------------------------------------------------------
Training on constant "kcat" ...