In [1]:
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

In [2]:
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [3]:
# get some data
digits = load_digits()
X, y = digits.data, digits.target

In [4]:
# build a classifier
clf = RandomForestClassifier(n_estimators=20)


In [None]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [5]:
# specify parameters and distributions to sample from
'''
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}'''

param_dist = {"max_features": sp_randint(1, 11)}

In [6]:
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,verbose=1)

In [8]:
random_search

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000007A70D50780>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=1)

In [9]:
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    2.2s finished


Fitting 3 folds for each of 20 candidates, totalling 60 fits
RandomizedSearchCV took 2.41 seconds for 20 candidates parameter settings.


In [10]:
#Try grid search now
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [13]:
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid,verbose=1)
start = time()
grid_search.fit(X, y)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    5.3s
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed:   13.5s
[Parallel(n_jobs=1)]: Done 648 out of 648 | elapsed:   22.3s finished


Fitting 3 folds for each of 216 candidates, totalling 648 fits


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_features': [1, 3, 10], 'min_samples_split': [1, 3, 10], 'max_depth': [3, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [17]:
clf.feature_importances_

NotFittedError: Estimator not fitted, call `fit` before `feature_importances_`.