# Hyperparameter tuning
This notebook explores hyperparameter tuning with scikit-learn's grid search. It uses the Pima Indians classification dataset, loaded from a local CSV file.

## Import

In [55]:
# Core libraries
import pandas as pd
import numpy as np
# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Sklearn classification algorithms
from sklearn.neighbors import KNeighborsClassifier

# Sklearn regression model evaluation functions
from sklearn.metrics import r2_score

## Load data and split into X and y

In [56]:
# Load the Pima Indians classification training set.
# Forward slashes keep the relative path portable: the earlier backslash form
# relied on invalid string escapes ("\.", "\d", "\p") and only resolved on Windows.
dataframe = pd.read_csv("../../../datasets/pima-indians_classification_train.csv")

# Presumed column order (TODO confirm against the CSV header):
# names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
array = dataframe.to_numpy()
X = array[:, 0:8]  # first eight columns are used as the features
y = array[:, 8]    # ninth column is used as the target

## Build a model with default hyperparameters

In [57]:
# Instantiate a Gaussian Naive Bayes classifier with no arguments,
# i.e. with its default hyperparameters; it is not fitted here.
model = GaussianNB()

In [58]:
# Display the model's repr. It shows only "GaussianNB()" with no arguments
# listed, so call get_params() to see the actual hyperparameter values.
model

GaussianNB()

In [59]:
# What hyperparameters can we tune?
# get_params() returns a dict mapping each hyperparameter name to its current value.
model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

## Tune hyperparameters with grid search 

In [60]:
# Estimator whose hyperparameters will be tuned
algorithm = GaussianNB()

# 5-fold cross-validation, shuffled with a fixed seed so the splits are repeatable
seed = 13
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# Candidate grid: 100 log-spaced var_smoothing values from 1e0 down to 1e-9
hp_candidates = [{'var_smoothing': np.logspace(0, -9, num=100)}]

# Score every candidate by cross-validated accuracy
grid = GridSearchCV(
    estimator=algorithm,
    param_grid=hp_candidates,
    cv=kfold,
    scoring='accuracy',
)
grid.fit(X, y)

# Report the best score, the best estimator and its hyperparameters, one per line
print(grid.best_score_, grid.best_estimator_, grid.best_params_, sep="\n")

0.7562515915457091
GaussianNB(var_smoothing=0.01)
{'var_smoothing': 0.01}


### Get a full breakdown of the grid search

In [63]:
# Full grid-search log: a dict of arrays with one element per hyperparameter
# candidate (e.g. 'mean_fit_time', 'mean_test_score', 'std_test_score').
grid.cv_results_

{'mean_fit_time': array([0.00200038, 0.00159483, 0.0013998 , 0.00119963, 0.00099936,
        0.00120006, 0.00099998, 0.0009994 , 0.00099921, 0.00140047,
        0.00100007, 0.00119929, 0.00119967, 0.00140357, 0.0009995 ,
        0.00099559, 0.00119958, 0.00119524, 0.00119901, 0.00120034,
        0.00119982, 0.00140319, 0.0009995 , 0.00099978, 0.00119953,
        0.00119934, 0.00119915, 0.0010005 , 0.00159907, 0.00099955,
        0.00100403, 0.00119958, 0.00099969, 0.00099931, 0.00119929,
        0.00119982, 0.00140004, 0.00139952, 0.00099945, 0.00119886,
        0.00100002, 0.00119963, 0.00139947, 0.00119948, 0.00099974,
        0.0009995 , 0.00100002, 0.00099964, 0.00119929, 0.00100002,
        0.00099983, 0.0009994 , 0.00119958, 0.00100064, 0.00120158,
        0.00099912, 0.00119939, 0.00100064, 0.00099978, 0.00119877,
        0.00119939, 0.00099959, 0.00099912, 0.00119977, 0.00119939,
        0.00099974, 0.00119944, 0.00119934, 0.00099936, 0.00119925,
        0.00119972, 0.0009994 ,

### Prove that best_score_ is the mean of all the k-fold scores
Here's a little check to see how best_score_ is derived from cv_results_

In [64]:
# Get the index of the best hyperparameter combination chosen by GridSearchCV.
# It is used to index into the per-candidate arrays of cv_results_.
grid.best_index_

22

In [65]:
# Mean and standard deviation of the k-fold test scores for the winning
# candidate, printed one per line; the mean should match grid.best_score_.
print(
    grid.cv_results_['mean_test_score'][grid.best_index_],
    grid.cv_results_['std_test_score'][grid.best_index_],
    sep="\n",
)

0.7562515915457091
0.025033231190582886
