### Grid Search Implementation using Scikit-learn GridSearchCV

This notebook demonstrates the implementation of Grid Search using Sklearn GridSearchCV method

A RandomForestClassifier is used to train the model while GridSearchCV method of SKlearn was used to tuned it to understand the best hyparameter that produces the best accuracy.


### Import Necessary Libaries

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

### Dataset Preprocessing

In [5]:
dataset = pd.read_csv('credit-card-full.csv')
dataset.head(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,7,500000,1,1,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,8,100000,2,2,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
9,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0


In [6]:
#drop personal attributes in dataset
dataset = pd.get_dummies(dataset, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)

X = dataset.drop(['ID', 'default payment next month'], axis=1)
y = dataset['default payment next month']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [7]:
X.head()

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,1,0,1,0,0,0,0,1,0,0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,1,0,1,0,0,0,0,0,1,0
2,90000,34,0,0,0,0,0,0,29239,14027,...,1,0,1,0,0,0,0,0,1,0
3,50000,37,0,0,0,0,0,0,46990,48233,...,1,0,1,0,0,0,0,1,0,0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,0,0,1,0,0,0,0,1,0,0


In [8]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: default payment next month, dtype: int64

In [9]:
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,1,20000,24,2,2,-1,-1,-2,-2,3913,...,1,0,1,0,0,0,0,1,0,0
1,2,120000,26,-1,2,0,0,0,2,2682,...,1,0,1,0,0,0,0,0,1,0
2,3,90000,34,0,0,0,0,0,0,29239,...,1,0,1,0,0,0,0,0,1,0
3,4,50000,37,0,0,0,0,0,0,46990,...,1,0,1,0,0,0,0,1,0,0
4,5,50000,57,-1,0,-1,0,0,0,8617,...,0,0,1,0,0,0,0,1,0,0


## Model Definition

In [10]:
# Create a Random Forest Classifier with specified criterion

rf = RandomForestClassifier()

# Create the parameter grid
param_grid = {
    'max_depth':[2, 4, 8, 15],
    'max_features':['auto', 'sqrt', 'log2'],
    'criterion':['gini', 'entropy', 'log_loss']
}

#Build GridSearchCV object
grid_rf_class = GridSearchCV(
    estimator=rf,
    param_grid = param_grid,
    scoring='roc_auc',
    n_jobs = 4,
    cv = 5,
    refit = True,
    return_train_score = True
)

print(grid_rf_class)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [2, 4, 8, 15],
                         'max_features': ['auto', 'sqrt', 'log2']},
             return_train_score=True, scoring='roc_auc')


In [11]:
from datetime import datetime, timedelta
start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

grid_rf_class.fit(X_train, y_train)

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
print(str(timedelta(seconds=(end_time-start_time).seconds)))

Start Time           2023-01-17 17:51:34.014059


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Muheeb\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Muheeb\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\Muheeb\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Muheeb\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Muheeb\

Start Time           2023-01-17 17:51:34.014059
End Time             2023-01-17 17:57:23.283812
0:05:49


In [12]:
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
print(cv_results_df)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        2.436381      0.134584         0.102204        0.010903   
1        2.355999      0.141299         0.113198        0.029821   
2        2.195596      0.084444         0.101802        0.021929   
3        3.977884      0.156796         0.167505        0.035916   
4        3.866335      0.268256         0.129202        0.019362   
5        3.287163      0.229740         0.151208        0.030389   
6        6.914326      0.094755         0.196806        0.030486   
7        7.370319      0.463633         0.212919        0.048188   
8        6.893634      0.585031         0.211201        0.046238   
9       10.411196      0.869213         0.281808        0.055340   
10      15.047590      0.861966         0.443404        0.138153   
11      10.972789      0.343521         0.315600        0.106520   
12       3.700793      0.307875         0.160004        0.049458   
13       3.746806      0.427284         0.131595

## Viewing the best parameter for the model


In [13]:
best_score = grid_rf_class.best_score_
print('Best Score obtained by the Parameters', best_score)

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
best_row = cv_results_df.loc[[grid_rf_class.best_index_]]
print("\n", best_row)

Best Score obtained by the Parameters 0.7775407642607253

     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
23       9.322544      0.113254         0.168001        0.017842   

   param_criterion param_max_depth param_max_features  \
23         entropy              15               log2   

                                               params  split0_test_score  \
23  {'criterion': 'entropy', 'max_depth': 15, 'max...           0.790771   

    split1_test_score  ...  mean_test_score  std_test_score  rank_test_score  \
23           0.773215  ...         0.777541        0.007109                1   

    split0_train_score  split1_train_score  split2_train_score  \
23            0.972874            0.976928              0.9723   

    split3_train_score  split4_train_score  mean_train_score  std_train_score  
23            0.975185            0.973062           0.97407         0.001731  

[1 rows x 23 columns]


In [18]:
# Get the max_depth parameter from the best-performing square and print
best_max_depth = grid_rf_class.best_params_['max_depth']
print("\n", best_max_depth)

# Row 23 and max _depth 15 gives the best accuracy

best_max_features = grid_rf_class.best_params_['max_features']
print(best_max_features)


 15
log2


In [15]:
print(type(grid_rf_class.best_estimator_))

# Create an array of predictions directly using the best_estimator_property
predictions = grid_rf_class.best_estimator_.predict(X_test)

#view result
print(predictions[0:5])

#show result in confusion matrix
print("Confusion Matrix \n", confusion_matrix(y_test, predictions))

# Get the ROC-AUC score
predictions_proba = grid_rf_class.best_estimator_.predict_proba(X_test)[:, 1]
print("ROC-AUC Score \n", roc_auc_score(y_test, predictions_proba))

print("Accuracy Score \n", accuracy_score(y_test, predictions))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
[0 0 1 0 0]
Confusion Matrix 
 [[6692  328]
 [1286  694]]
ROC-AUC Score 
 0.7845121082621083
Accuracy Score 
 0.8206666666666667
