# Hyper Parameter Tuning

Using hyperparameter tuning (grid search) to pick the best-performing configuration of an algorithm

We will use credit card default data from https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset

References

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
- https://www.analyticsvidhya.com/blog/2021/06/tune-hyperparameters-with-gridsearchcv/


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show floats in pandas output with a thousands separator and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

## Step-1: Download Data

In [None]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'

# The local copy is stored in the working directory under the URL's file name
data_location = os.path.basename(data_url)

# Download only when no local copy exists yet
already_downloaded = os.path.exists(data_location)
if not already_downloaded:
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)

print('data_location:', data_location)

In [None]:
# Load the downloaded CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_location)

# Display 10 randomly sampled rows as the cell output
data.sample(n=10)

## Step-2: EDA

In [None]:
## Check data skew: number of rows for each value of the 'default' label
data['default'].value_counts()

In [None]:
## Share of rows for each value of the 'default' label (proportions instead of raw counts)
data['default'].value_counts(normalize=True)

## Step-3: Clean up 

TODO

## Step-4: Shape data

In [None]:
label_col = 'default'

# Every column except the row identifier and the label is used as a feature
non_feature_columns = ['ID', label_col]
feature_columns = data.columns.drop(non_feature_columns)

In [None]:
# Feature matrix: all feature columns
X = data.loc[:, feature_columns]
# Label kept as a single-column DataFrame (note the list selector)
y = data.loc[:, [label_col]]

# Print the shapes of X and then y
for frame in (X, y):
    print (frame.shape)

## Step-5: Build a Parameter Grid

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

# NOTE: 'max_iter' is also searched in param_grid below, so during the grid
# search each candidate uses the grid's value rather than the 500 set here.
algo = LogisticRegression(max_iter=500)

# find out which parameters the estimator exposes
# (print is required: a bare expression in the middle of a cell is not displayed)
print ("Available params : ", algo.get_params())

# build a param-grid: every combination of the values below is one candidate
param_grid = {
    # 'penalty' left out -- presumably because not every solver listed below
    # supports 'l1'; TODO confirm before enabling
    # 'penalty' : ['l1', 'l2'],
    'C' : [0.5, 1.0],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter' : [50, 100, 200, 300],
}

# ParameterGrid expands the dict into the concrete candidates: 2 x 5 x 4 = 40
print ("Number of candidates : ", len(ParameterGrid(param_grid)))

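Let's try a different algorithm. The cell below is commented out; uncomment it to replace `algo` and `param_grid` with a RandomForest setup before running the grid search.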

In [None]:
# ## Build a param grid for RandomForest

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import ParameterGrid

# algo = RandomForestClassifier()
# algo.get_params()

# # build a param-grid
# param_grid =  {
#                    'n_estimators' : (50, 100, 150),
#                    'max_depth' : (10,15,20,25),
#               }

## Step-6: Grid Search

In [None]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,np.ravel(y))

## Step-7: Get Grid Search Results

In [None]:
# Report the winning candidate of the grid search

best_model = grid_search_results.best_estimator_

# (label, value) pairs printed one per line
summary = [
    ("Best model : ", best_model),
    ("Best hyper params : ", grid_search_results.best_params_),
    ("Best score : ", grid_search_results.best_score_),
]
for label, value in summary:
    print (label, value)

In [None]:
# internal details: the full results record kept by the grid search,
# displayed as the cell output (train scores were requested via
# return_train_score=True when the search was built)

grid_search_results.cv_results_