#Grid vs. Randomized Searches for Hyperparameter Optimization
###Author: Andrew Mehrmann
###ASTR 596

In [1]:
%matplotlib inline

In [170]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
from collections import Counter
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

###Load in and Inspect the Data

In [38]:
dgts = datasets.load_digits()
X = dgts.data
y = dgts.target

In [None]:
Counter(y)

In [41]:
X.shape

(1797, 64)

###Define the classifier

In [175]:
forest = RandomForestClassifier(n_estimators = 100)

##Grid (Exhaustive) Search

In [197]:
feat_range = np.arange(1,X.shape[1],10)
depth_range = np.arange(1,51,10)
boot_range = [True, False]

params = {'max_features':feat_range,
         'max_depth':depth_range,
         'bootstrap':boot_range}

n_combos = len(feat_range) + len(depth_range) + len(boot_range)

print('the exhaustive search will examine {0} candidates'.format(n_combos))

the exhaustive search will examine 14 candidates


In [198]:
grid = GridSearchCV(forest, params, cv = 5, n_jobs=-1, scoring='accuracy')

start = datetime.datetime.now()

temp = grid.fit(X,y)

elapsed = datetime.datetime.now() - start
print('{0} seconds elapsed during exhaustive search'.format(elapsed.seconds))

df = pd.DataFrame(grid.grid_scores_)
df.columns = ['params','mean_score','scores']
df['sd'] = df.scores.apply(np.std)
df = df.drop('scores',axis=1)
df_sorted = df.sort('mean_score', ascending=False)
df_sorted.head(10)

115 seconds elapsed during exhaustive search


Unnamed: 0,params,mean_score,sd
49,"{u'max_features': 1, u'bootstrap': False, u'ma...",0.947134,0.023032
64,"{u'max_features': 11, u'bootstrap': False, u'm...",0.941569,0.021652
57,"{u'max_features': 11, u'bootstrap': False, u'm...",0.941569,0.019424
21,"{u'max_features': 1, u'bootstrap': True, u'max...",0.941013,0.020036
63,"{u'max_features': 1, u'bootstrap': False, u'ma...",0.940456,0.017386
43,"{u'max_features': 11, u'bootstrap': False, u'm...",0.9399,0.0192
8,"{u'max_features': 11, u'bootstrap': True, u'ma...",0.9399,0.020229
56,"{u'max_features': 1, u'bootstrap': False, u'ma...",0.9399,0.021638
50,"{u'max_features': 11, u'bootstrap': False, u'm...",0.939343,0.026921
14,"{u'max_features': 1, u'bootstrap': True, u'max...",0.938787,0.023742


##Randomized Search

In [194]:
n_iter = 5

feat_range = sp_randint(1,X.shape[1])
depth_range = sp_randint(1,51)
boot_range = [True, False]


params = {'max_features':feat_range,
         'max_depth':depth_range,
         'bootstrap':boot_range}

print('the randomized search will examine {0} candidates'.format(n_iter))

the randomized search will examine 5 candidates


In [195]:
rand = RandomizedSearchCV(forest, params, cv=5, n_jobs=-1, n_iter=n_iter, scoring='accuracy')

start = datetime.datetime.now()

temp = rand.fit(X,y)

elapsed = datetime.datetime.now() - start
print('{0} seconds elapsed during randomized search'.format(elapsed.seconds))

df_rand = pd.DataFrame(rand.grid_scores_)
df_rand.columns = ['params','mean_score','scores']
df_rand['sd'] = df_rand.scores.apply(np.std)
df_rand = df_rand.drop('scores',axis=1)
df_rand_sorted = df_rand.sort('mean_score', ascending=False)
df_rand_sorted


8 seconds elapsed during randomized search


Unnamed: 0,params,mean_score,scores,sd
1,"{u'max_features': 8, u'bootstrap': True, u'cri...",0.942682,"[0.93956043956, 0.906077348066, 0.961002785515...",0.021952
3,"{u'max_features': 13, u'bootstrap': True, u'cr...",0.940456,"[0.936813186813, 0.922651933702, 0.95821727019...",0.015103
2,"{u'max_features': 8, u'bootstrap': False, u'cr...",0.9399,"[0.934065934066, 0.911602209945, 0.95821727019...",0.019892
0,"{u'max_features': 42, u'bootstrap': False, u'c...",0.90039,"[0.903846153846, 0.872928176796, 0.94150417827...",0.029601
4,"{u'max_features': 10, u'bootstrap': True, u'cr...",0.844741,"[0.879120879121, 0.812154696133, 0.82729805013...",0.02876


##Pipelines

In [200]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from scipy.stats import uniform

In [221]:
pca = PCA()
pipeline = Pipeline([("components", pca), ("rfc", forest)])

feat_range = uniform(0,1)
depth_range = sp_randint(1,51)
boot_range = [True, False]
pc_range = sp_randint(1,X.shape[1])

params = {'rfc__max_features':feat_range,
         'rfc__max_depth':depth_range,
         'rfc__bootstrap':boot_range,
         'components__n_components':pc_range}


In [222]:
search = RandomizedSearchCV(pipeline, params, cv=5, n_jobs=-1, n_iter=20, scoring='accuracy')

start = datetime.datetime.now()

temp = search.fit(X,y)

elapsed = datetime.datetime.now() - start
print('{0} seconds elapsed during randomized search'.format(elapsed.seconds))

df_pipe = pd.DataFrame(search.grid_scores_)
df_pipe.columns = ['params','mean_score','scores']
df_pipe['sd'] = df_pipe.scores.apply(np.std)
df_pipe = df_pipe.drop('scores',axis=1)
df_pipe_sorted = df_pipe.sort('mean_score', ascending=False)
df_pipe_sorted


61 seconds elapsed during randomized search


Unnamed: 0,params,mean_score,sd
6,"{u'rfc__max_depth': 12, u'rfc__bootstrap': Tru...",0.934335,0.018038
10,"{u'rfc__max_depth': 37, u'rfc__bootstrap': Fal...",0.93044,0.016398
8,"{u'rfc__max_depth': 50, u'rfc__bootstrap': Tru...",0.928214,0.018737
11,"{u'rfc__max_depth': 32, u'rfc__bootstrap': Fal...",0.918753,0.023948
4,"{u'rfc__max_depth': 28, u'rfc__bootstrap': Fal...",0.914858,0.017474
13,"{u'rfc__max_depth': 35, u'rfc__bootstrap': Tru...",0.914302,0.017552
3,"{u'rfc__max_depth': 49, u'rfc__bootstrap': Tru...",0.913745,0.020179
17,"{u'rfc__max_depth': 36, u'rfc__bootstrap': Tru...",0.90985,0.020246
9,"{u'rfc__max_depth': 45, u'rfc__bootstrap': Tru...",0.907624,0.021491
5,"{u'rfc__max_depth': 9, u'rfc__bootstrap': True...",0.907067,0.026705


In [228]:
df_pipe_sorted.iloc[5].params

{'components__n_components': 38,
 'rfc__bootstrap': True,
 'rfc__max_depth': 35,
 'rfc__max_features': 0.012961589710068444}

In [199]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

###References

* http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf
* http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html