<h1>GridSearchCV vs RandomizedSearchCV on digits dataset with RandomForestClassifier</h1>

In [33]:
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score

In [20]:
# Load the digits dataset as a (features, labels) pair
X, y = load_digits(return_X_y=True)

# Hold out 20% of the samples as a test split; the fixed random_state
# makes the split identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [14]:
# Base random forest classifier shared by both searches below.
# n_estimators must be passed as a keyword argument (`=`); the previous
# `n_estimators==50` was a comparison against an undefined name and
# raised NameError on a fresh kernel.
rf = RandomForestClassifier(n_estimators=50)

<h2>1. Random Search</h2>

In [59]:
# Search space for the randomized search: each entry is either a list of
# candidate values or a scipy integer distribution to sample from
# (sp_randint(low, high) draws integers in [low, high), i.e. high is excluded)

param_matrix = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(3, 11),
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

In [60]:
# Build the randomized search: 10 parameter settings are sampled from
# param_matrix and each is scored with 5-fold cross-validation

randomcv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_matrix,
    n_iter=10,
    cv=5,
    n_jobs=-1,
)

In [61]:
# Run the randomized search on the training split

randomcv.fit(X=X_train, y=y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=50,
                                                    n_jobs=None,
  

In [62]:
# best_estimator_ holds the model the randomized search selected as best.
# Leaving it as the last expression makes the notebook display its repr.

randomcv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=9, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [63]:
# best_params_ is the parameter setting the randomized search selected as
# best, displayed as a dict keyed by parameter name.

randomcv.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 9,
 'min_samples_leaf': 3,
 'min_samples_split': 5}

In [64]:
# Keep a handle on the best random-search model; it is evaluated on the test split next
final_model = randomcv.best_estimator_

In [65]:
# Score the best random-search model on the held-out test split
pred = final_model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))

Accuracy: 0.9666666666666667


<h2>2. Grid Search</h2>

In [35]:
# Search space for the grid search: every listed value of every parameter
# is combined with every other (3 * 3 * 3 * 3 * 2 * 2 = 324 combinations)

param_grid = dict(
    max_depth=[2, 3, None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [39]:
# Build the exhaustive search: each combination in param_grid is scored
# with 5-fold cross-validation

gridsearch = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [40]:
# Run the grid search on the training split

gridsearch.fit(X=X_train, y=y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='wa

In [41]:
# best_params_ is the parameter combination the grid search selected as
# best, displayed as a dict keyed by parameter name.

gridsearch.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 3}

In [42]:
# best_estimator_ holds the model the grid search selected as best.
# Leaving it as the last expression makes the notebook display its repr.

gridsearch.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=None, max_features=10, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
# Predict on the held-out test split with the fitted grid search and
# report the resulting accuracy

pred = gridsearch.predict(X_test)
print("Accuracy score for grid search", accuracy_score(y_true=y_test, y_pred=pred))

Accuracy score for grid search 0.975


<h1>Notes:</h1>
<ol>
    <li>Here, grid search performs slightly better than random search on the test set (0.975 vs. 0.967 accuracy), but this will not hold in every case.</li>
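    <li>Grid search also took much longer to train than random search: it evaluates all 3 × 3 × 3 × 3 × 2 × 2 = 324 parameter combinations in the grid, whereas random search samples only <code>n_iter=10</code> candidates.</li>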
    <li>RandomizedSearchCV samples values from the given distributions, so it can land on good values that were never explicitly listed; GridSearchCV only tries the values we specify by hand, so the best value must already be in the grid.</li>
</ol>