In [1]:
from time import time

import numpy as np
from scipy.stats import randint
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from machine_learning.utils.utils_optimization import report_hyperparameter_search_result as report

### data

In [2]:
digits = load_digits()
X = digits.data  # (1797, 64)
Y = digits.target  # (1797,)

### random forest

In [3]:
n_estimators = 20
random_state = 0
clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

In [4]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 20,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### grid search for best radom forest

In [5]:
# set up the search
param_grid = {'max_depth': [3, None],
              'max_features': [1, 3, 9],
              'min_samples_split': [2, 4, 8, 16],
              'min_samples_leaf': [2, 4, 8, 16],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)

In [6]:
# running search
grid_start = time()
grid_search.fit(X, Y)
grid_end = time()

In [7]:
n_params_settings = len(grid_search.cv_results_['params'])
print(f"GridSearchCV searched {n_params_settings} candidate parameter settings in {grid_end - grid_start:.2f} seconds")

GridSearchCV searched 384 candidate parameter settings in 10.99 seconds


In [8]:
report(grid_search.cv_results_)

model with rank: 1
mean validation score: 0.938
std validation score:  0.015
parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 2, 'min_samples_split': 2}


model with rank: 1
mean validation score: 0.938
std validation score:  0.015
parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 2, 'min_samples_split': 4}


model with rank: 3
mean validation score: 0.929
std validation score:  0.026
parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 2, 'min_samples_split': 2}


model with rank: 3
mean validation score: 0.929
std validation score:  0.026
parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 2, 'min_samples_split': 4}




### search for best radom forest with randomized search

In [9]:
# set up the search
n_iter = 300
n_jobs = -1
random_state = 0
param_distribution = {'max_depth': [3, None],
                      'max_features': randint(1, 10),
                      'min_samples_split': randint(2, 17),
                      'min_samples_leaf': randint(2, 17),
                      'bootstrap': [True, False],
                      'criterion': ['gini', 'entropy']}
randomized_search = RandomizedSearchCV(clf, param_distributions=param_distribution,
                                       n_iter=n_iter, n_jobs=-n_jobs, random_state=random_state)

In [10]:
# running search
randomized_start = time()
randomized_search.fit(X, Y)
randomized_end = time()

In [11]:
print(f"RandomizedSearchCV searched {n_iter} candidates parameter settings in {randomized_end - randomized_start:.2f} seconds")

RandomizedSearchCV searched 300 candidates parameter settings in 47.85 seconds


In [12]:
report(randomized_search.cv_results_)

model with rank: 1
mean validation score: 0.929
std validation score:  0.019
parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 4}


model with rank: 2
mean validation score: 0.926
std validation score:  0.023
parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 7}


model with rank: 3
mean validation score: 0.925
std validation score:  0.020
parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 7}


