In [58]:
from sklearn.ensemble import (AdaBoostRegressor, 
                              BaggingRegressor, 
                              ExtraTreesRegressor, 
                              GradientBoostingRegressor, 
                              RandomForestRegressor)

import pandas as pd
import custom_funcs as cf
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Purpose

In this notebook, I intend to setup an automated ensemble learning model selection procedure for any given drug.

In [57]:
## GREAT! I THINK WE HAVE THE MASTER FUNCTION FOR READING DATA COMPOSED!
drug = 'FPV'
protein = 'protease'
data, feat_cols = cf.get_cleaned_data(protein, drug) 

# Just checking:
cf.test_data_integrity(data)

## Now, let's do data transformations.
from isoelectric_point import isoelectric_points

# Replace amino acids with pKa values
for col in feat_cols:
    data[col] = data[col].replace(isoelectric_points.keys(), isoelectric_points.values())
    
# log10-transform the drug column.
data[drug] = data[drug].apply(lambda x:np.log10(x))
data.head()

Unnamed: 0_level_0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P91,P92,P93,P94,P95,P96,P97,P98,P99,FPV
SeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4432,6.3,5.65,6.04,5.6,6.04,5.88,5.65,10.76,6.3,6.04,...,5.6,5.65,6.04,6.06,5.02,5.6,6.04,10.76,5.91,0.176091
4664,6.3,5.65,6.04,5.6,6.04,5.88,5.65,10.76,6.3,6.04,...,5.6,5.65,6.04,6.06,5.02,5.6,6.04,10.76,5.91,0.491362
5279,6.3,5.65,6.04,5.6,6.04,5.88,5.65,10.76,6.3,6.04,...,5.6,5.65,6.04,6.06,5.02,5.6,6.04,10.76,5.91,0.919078
5444,6.3,5.65,6.04,5.6,6.04,5.88,5.65,10.76,6.3,6.04,...,5.6,5.65,6.04,6.06,5.02,5.6,6.04,10.76,5.91,0.431364
5462,6.3,5.65,6.04,5.6,6.04,5.88,5.65,10.76,6.3,6.04,...,5.6,5.65,6.04,6.06,5.02,5.6,6.04,10.76,5.91,0.322219


In [165]:
X = data[feat_cols]
Y = data[drug]

from sklearn.cross_validation import train_test_split, ShuffleSplit

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

sscv = ShuffleSplit(n=len(X_train), n_iter=3, test_size=0.2)

In [166]:
np.arange(0.1, 1.1, 0.1)

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])

In [183]:
params = dict()
params['gbr'] = {'n_estimators': np.arange(200, 601, 100),
                 'max_depth': np.arange(3, 5),
                 'min_samples_split': np.arange(4, 6),
                 'min_samples_leaf': np.arange(1, 6)
                }

models = dict()
models['gbr'] = GradientBoostingRegressor()

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

gs = GridSearchCV(models['gbr'], params['gbr'], n_jobs=-1, verbose=3, cv=sscv, scoring='mean_squared_error')
gs.fit(X, Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV] n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV]  n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1, score=-0.101825 -   0.4s
[CV]  n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1, score=-0.099655 -   0.5s
[CV]  n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=1, score=-0.123134 -   0.5s
[CV] n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV] n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV] n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=1 
[CV]  n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=1, score=-0.09

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.7s


[CV] n_estimators=600, min_samples_split=5, max_depth=3, min_samples_leaf=1 
[CV]  n_estimators=500, min_samples_split=5, max_depth=3, min_samples_leaf=1, score=-0.123916 -   0.8s
[CV] n_estimators=600, min_samples_split=5, max_depth=3, min_samples_leaf=1 
[CV]  n_estimators=500, min_samples_split=5, max_depth=3, min_samples_leaf=1, score=-0.107507 -   0.8s
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=2 
[CV]  n_estimators=600, min_samples_split=5, max_depth=3, min_samples_leaf=1, score=-0.104539 -   0.9s
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=2 
[CV]  n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=2, score=-0.094793 -   0.3s
[CV] n_estimators=200, min_samples_split=4, max_depth=3, min_samples_leaf=2 
[CV]  n_estimators=600, min_samples_split=5, max_depth=3, min_samples_leaf=1, score=-0.102411 -   0.9s
[CV] n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=2 
[CV]  n_estimators=200,

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   21.5s


[CV]  n_estimators=600, min_samples_split=5, max_depth=3, min_samples_leaf=4, score=-0.104628 -   0.9s
[CV] n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=5 
[CV]  n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=5, score=-0.151159 -   1.0s
[CV] n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=5 
[CV]  n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=5, score=-0.090865 -   1.2s
[CV] n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=5 
[CV]  n_estimators=300, min_samples_split=4, max_depth=3, min_samples_leaf=5, score=-0.094377 -   1.3s
[CV] n_estimators=500, min_samples_split=4, max_depth=3, min_samples_leaf=5 
[CV]  n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=5, score=-0.111231 -   1.6s
[CV] n_estimators=500, min_samples_split=4, max_depth=3, min_samples_leaf=5 
[CV]  n_estimators=400, min_samples_split=4, max_depth=3, min_samples_leaf=5, score=-0.152017 -   1.

[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   55.9s


[CV]  n_estimators=500, min_samples_split=4, max_depth=4, min_samples_leaf=5, score=-0.089705 -   1.2s
[CV] n_estimators=200, min_samples_split=5, max_depth=4, min_samples_leaf=5 
[CV]  n_estimators=600, min_samples_split=4, max_depth=4, min_samples_leaf=5, score=-0.116403 -   1.5s
[CV] n_estimators=200, min_samples_split=5, max_depth=4, min_samples_leaf=5 
[CV]  n_estimators=200, min_samples_split=5, max_depth=4, min_samples_leaf=5, score=-0.085468 -   0.5s
[CV] n_estimators=200, min_samples_split=5, max_depth=4, min_samples_leaf=5 
[CV]  n_estimators=600, min_samples_split=4, max_depth=4, min_samples_leaf=5, score=-0.137920 -   1.4s
[CV] n_estimators=300, min_samples_split=5, max_depth=4, min_samples_leaf=5 
[CV]  n_estimators=600, min_samples_split=4, max_depth=4, min_samples_leaf=5, score=-0.110527 -   1.3s
[CV] n_estimators=300, min_samples_split=5, max_depth=4, min_samples_leaf=5 
[CV]  n_estimators=200, min_samples_split=5, max_depth=4, min_samples_leaf=5, score=-0.132454 -   0.

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


GridSearchCV(cv=ShuffleSplit(653, n_iter=3, test_size=0.2, random_state=None),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([200, 300, 400, 500, 600]), 'min_samples_split': array([4, 5]), 'min_samples_leaf': array([1, 2, 3, 4, 5]), 'max_depth': array([3, 4])},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_squared_error',
       verbose=3)

In [190]:
-gs.best_score_

0.093685128269463708

In [186]:
gs.grid_scores_

[mean: -0.10820, std: 0.01059, params: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.11489, std: 0.02547, params: {'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.11234, std: 0.00513, params: {'n_estimators': 400, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.09912, std: 0.00663, params: {'n_estimators': 500, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.10656, std: 0.02382, params: {'n_estimators': 600, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.09620, std: 0.01614, params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.09834, std: 0.00254, params: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 3},
 mean: -0.11513, std: 0.01771, params: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 3},


In [191]:
preds = gs.best_estimator_.predict(X_test)
mse(preds, Y_test)

0.033424047103745184