In [None]:
import pandas
from sklearn.linear_model import ElasticNetCV
import matplotlib.pyplot as plt
% matplotlib inline

# data
# Read the table and keep only rows that have no missing value in any column.
# Loading and filtering in one assignment keeps this cell idempotent on re-run.
# Disabled alternative (sentinel imputation instead of dropping rows):
#   df = df.fillna( { 'kcat': -5, 'km': -5, 'eff': -5 } )
df = pandas.read_csv( 'data.csv' ).dropna()

# Columns excluded from the feature matrix: the 'index' column, the three
# columns used as regression targets in this notebook (kcat, km, eff) and the
# four SR_*_all_cst columns.
drop = [ 'index', 'kcat', 'km', 'eff', 'SR_1_all_cst', 
         'SR_2_all_cst', 'SR_3_all_cst', 'SR_4_all_cst', ]

# `.values` replaces `as_matrix()` (removed in pandas 1.0) and returns the same
# ndarray; `axis` is passed by keyword because the positional form was removed
# in pandas 2.0.
y = df.kcat.values
X = df.drop( drop, axis=1 ).values

In [None]:
# elastic net
# Keyword arguments for ElasticNetCV. `params` is also read by the results
# cell (for the plot title), so the dict name and the 'cv' key must stay.
params = {
    # An integer `cv` requests that many folds.
    # NOTE(review): 779 presumably equals the number of rows left after
    # dropna(), i.e. leave-one-out -- confirm against len( X ).
    'cv': 779,
    # 'normalize': False was dropped: the argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError), and
    # False was its default, so the fitted model is unchanged.
    # Candidate L1/L2 mixing ratios searched by cross-validation.
    'l1_ratio': [.1, .5, 0.75, 0.9, 0.95, 0.99 ],
    'n_jobs': 4, 
}

net = ElasticNetCV( **params )
net.fit( X, y )

In [None]:
# results
# Predict all rows in one call. The previous per-row form passed 1-D samples
# to predict(), which current scikit-learn rejects (it expects a 2-D array).
preds = net.predict( X )

# Measured vs. predicted kcat. Note the score in the title is computed on the
# same rows the model was fitted on.
plt.scatter( y, preds, color='g', alpha=0.19 )
plt.title( 'Elastic net with {}-fold cross validation\nmodel score (R$^2$) = {:0.2f}'.format( params['cv'], net.score( X, y) ) )
plt.xlabel( 'Measured $k_{cat}$' )
plt.ylabel( 'Predicted $k_{cat}$' )

# Feature names must come from exactly the columns that built X. The previous
# slice df.columns[1:-3] removed only four of the eight dropped columns, so
# names could be misaligned with coef_ and zip() silently hid the length
# mismatch. Building from a dict raises if the lengths ever disagree.
feature_names = df.drop( drop, axis=1 ).columns
weights = pandas.DataFrame(
    { 'feature': feature_names, 'weight': net.coef_ },
    columns=['feature', 'weight'],
)

# The 11 smallest (most negative) weights. sort_values replaces
# DataFrame.sort, which was removed in pandas 0.20.
weights.sort_values( 'weight' )[0:11]

In [None]:
# Display the l1_ratio_ attribute of the fitted ElasticNetCV -- per the
# scikit-learn docs, the L1/L2 mix selected by cross-validation from the
# candidates listed in params['l1_ratio'].
net.l1_ratio_

In [None]:
from sklearn.linear_model import MultiTaskElasticNetCV

# data
# Reload the table so this cell does not depend on earlier cells having run,
# and keep only rows with no missing value in any column.
# Disabled alternative (sentinel imputation instead of dropping rows):
#   df = df.fillna( { 'kcat': -5, 'km': -5, 'eff': -5 } )
df = pandas.read_csv( 'data.csv' ).dropna()

# Columns excluded from the feature matrix: the 'index' column, the three
# target columns and the four SR_*_all_cst columns.
drop = [ 'index', 'kcat', 'km', 'eff', 'SR_1_all_cst', 
         'SR_2_all_cst', 'SR_3_all_cst', 'SR_4_all_cst', ]

# Three targets are fitted jointly, so y is 2-D with one column per target.
# `.values` replaces `as_matrix()` (removed in pandas 1.0); `axis` is passed
# by keyword because the positional form was removed in pandas 2.0.
y = df[['kcat', 'km', 'eff']].values
X = df.drop( drop, axis=1 ).values

# multi elastic net
# `params` is read again by the results cell for the figure title.
params = {
    # NOTE(review): 779 presumably equals the number of rows left after
    # dropna(), i.e. leave-one-out -- confirm against len( X ).
    'cv': 779,
    # Candidate L1/L2 mixing ratios searched by cross-validation.
    'l1_ratio': [ 0.95, 0.99 ],
    'n_jobs': 4, 
}

multi_net = MultiTaskElasticNetCV( **params )
multi_net.fit( X, y )

In [None]:
# results
# Import kept so that any other cell relying on `concatenate` still finds it;
# it is no longer needed here.
from numpy import concatenate

# One batched call returns one row per sample and one column per target. The
# previous per-row form passed 1-D samples to predict(), which current
# scikit-learn rejects, and then had to concatenate the pieces back together.
preds = multi_net.predict( X )

# Wrap predictions and measurements in frames that share df's index and the
# target column names, so each target can be selected by name below.
targets = ['kcat', 'km', 'eff']
predicts = pandas.DataFrame( preds, index=df.index, columns=targets )
actuals = pandas.DataFrame( y, index=df.index, columns=targets )

fig, ax = plt.subplots( nrows=1, ncols=3, figsize=(16,3) )
fig.suptitle( 'Elastic net with {}-fold cross validation'.format( params['cv'] ) ) 

# One measured-vs-predicted panel per target: (column, marker colour, label).
panels = [
    ( 'kcat', 'g', '$k_{cat}$' ),
    ( 'km', 'b', '$K_{M}$' ),
    ( 'eff', 'r', '$k_{cat}/K_M$' ),
]
for axis, ( column, colour, label ) in zip( ax, panels ):
    axis.scatter( actuals[ column ], predicts[ column ], color=colour, alpha=0.19 )
    axis.set_xlabel( 'Measured ' + label )
    axis.set_ylabel( 'Predicted ' + label )

# Disabled: per-feature weight table. NOTE(review): as written it reads
# net.coef_ (the single-target model), not multi_net.coef_.
#weights = pandas.DataFrame( zip( df.columns[1:-3], net.coef_ ), columns=['feature', 'weight'] )
#weights.sort( 'weight' )[0:11]

In [None]:
# Whitespace-delimited table of held-out samples; `test` is kept as a frame
# because its 'description' column is used later to index the predictions.
test = pandas.read_csv( 'test_set/test_set.sc', sep=r'\s+' )

# Columns removed before prediction.
# NOTE(review): the remaining columns must match the training feature matrix
# in number and order for the fitted model to apply -- confirm. SR_3 is not in
# this list while SR_1/2/4/5 are; verify that is intended.
drop_test = [ 'SR_1_all_cst', 'SR_2_all_cst', 'SR_3_all_cst', 'SR_4_all_cst', 'description', 'SR_1', 'SR_2', 'SR_4', 'SR_5', ]

# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
test_set = test.drop( drop_test, axis=1 ).values

In [None]:
# test set results
# Import kept so that any other cell relying on `concatenate` still finds it;
# it is no longer needed here.
from numpy import concatenate 

# One batched call returns one row per test sample and one column per target.
# The previous per-row form passed 1-D samples to predict(), which current
# scikit-learn rejects, and then concatenated the single-row results.
combined = multi_net.predict( test_set )

# Label rows by the test set's 'description' column. Column order follows the
# order of the targets the model was fitted on.
predictions = pandas.DataFrame( combined, index=test.description, columns=['pred_kcat', 'pred_1/km', 'pred_eff'] )
predictions.describe()

In [None]:
# Draw one histogram per column of the test-set predictions, 20 bins each.
predictions.hist(bins=20)