# Exercise 4 - Hyperparameter Tuning

## Original

Below is the original exercise

In [1]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [2]:
iris = datasets.load_iris()
gbc = GradientBoostingClassifier()
parameters = {'learning_rate':[0.01, 0.05, 0.1, 0.5, 1], 
              'min_samples_split':[2,5,10,20], 
              'max_depth':[2,3,5,10]}

In [3]:
clf = GridSearchCV(gbc, parameters, cv=5)
clf.fit(iris.data, iris.target)
print(clf.best_params_)
# {'learning_rate': 1, 'max_depth': 2, 'min_samples_split': 2}

{'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 5}


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, 
                                                 test_size=0.33, 
                                                 random_state=42)

clf = GridSearchCV(gbc, parameters, cv=3, iid=False)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.5, 1], 'min_samples_split': [2, 5, 10, 20], 'max_depth': [2, 3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [6]:
print(clf.best_params_)
# {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 2}

{'learning_rate': 0.05, 'max_depth': 10, 'min_samples_split': 2}


In [7]:
clf.best_score_
# 0.95
cross_val_score(estimator=clf.best_estimator_, X=X_train, y=y_train, cv=5)  
# array([1.        , 0.8       , 0.95      , 1.        , 0.94736842])

array([1.        , 0.8       , 0.9       , 1.        , 0.94736842])

## Exercise

The following is the solution to the exercise. Rather than using GraidentBoostingClassifier, RandomForestClassifier is used instead.

In [29]:
# Import from sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numpy import mean

In [42]:
# Use RandomForestClassifier as our estimator and assign parameter values
rfc = RandomForestClassifier(random_state=42, n_jobs=1, n_estimators=100)
rf_parameters = {'max_depth':[2,3,5,10],
                 'n_estimators':[4,6,8,11],
                 'min_samples_split':[2,5,7,9],
                 'max_leaf_nodes':[None,2,3,4],
                 'max_features':[None,'auto','log2',0.5,1,2,3,4]}

In [43]:
# Load dataset and apply it and parameters to GridSearchCV
rf_iris = datasets.load_iris()
rf_clf = GridSearchCV(rfc, rf_parameters, cv=5)

In [44]:
# Training via fit
rf_clf.fit(rf_iris.data, rf_iris.target) 

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 5, 10], 'n_estimators': [4, 6, 8, 11], 'min_samples_split': [2, 5, 7, 9], 'max_leaf_nodes': [None, 2, 3, 4], 'max_features': [None, 'auto', 'log2', 0.5, 1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
# Print the best parameters
print(rf_clf.best_params_)

{'max_depth': 2, 'max_features': None, 'max_leaf_nodes': 4, 'min_samples_split': 2, 'n_estimators': 8}


In [46]:
# Train the data using train_test_split 
rf_X_train, rf_X_test, rf_y_train, rf_y_test = train_test_split(rf_iris.data, rf_iris.target, 
                                                 test_size=0.33, 
                                                 random_state=42)

rf_clf = GridSearchCV(rfc, rf_parameters, cv=3, iid=False)
rf_clf.fit(rf_X_train, rf_y_train)

# Print the best parameters
print(rf_clf.best_params_)

{'max_depth': 2, 'max_features': None, 'max_leaf_nodes': 2, 'min_samples_split': 2, 'n_estimators': 6}


In [47]:
# Return the best score
rf_clf.best_score_

0.9491792929292929

In [48]:
# Return cross value score
cross_val_score(estimator=rf_clf.best_estimator_, X=rf_X_train, y=rf_y_train, cv=5)  

array([1.        , 0.8       , 0.65      , 0.95      , 0.89473684])

In [49]:
# Determine the mean value
mean(cross_val_score(estimator=rf_clf.best_estimator_, X=rf_X_train, y=rf_y_train, cv=5))

0.8589473684210527

In [50]:
# Return param_grid
rf_clf.param_grid

{'max_depth': [2, 3, 5, 10],
 'n_estimators': [4, 6, 8, 11],
 'min_samples_split': [2, 5, 7, 9],
 'max_leaf_nodes': [None, 2, 3, 4],
 'max_features': [None, 'auto', 'log2', 0.5, 1, 2, 3, 4]}