### Random Search Implementation using Scikit-learn RandomizedSearchCV

This notebook demonstrates the implementation of Random Search using Sklearn RandomizedSearchCV method

A RandomForestClassifier is used to train the model while RandomSearchCV method of SKlearn was used to tuned it to understand the best hyparameter that produces the best accuracy.


### Import Necessary Libaries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

### Dataset Preprocessing

In [2]:
dataset = pd.read_csv('credit-card-full.csv')
dataset.head(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,7,500000,1,1,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,8,100000,2,2,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
9,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0


In [3]:
#drop personal attributes in dataset
dataset = pd.get_dummies(dataset, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)

X = dataset.drop(['ID', 'default payment next month'], axis=1)
y = dataset['default payment next month']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [4]:
X.head()

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,1,0,1,0,0,0,0,1,0,0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,1,0,1,0,0,0,0,0,1,0
2,90000,34,0,0,0,0,0,0,29239,14027,...,1,0,1,0,0,0,0,0,1,0
3,50000,37,0,0,0,0,0,0,46990,48233,...,1,0,1,0,0,0,0,1,0,0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,0,0,1,0,0,0,0,1,0,0


In [5]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: default payment next month, dtype: int64

In [6]:
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,1,20000,24,2,2,-1,-1,-2,-2,3913,...,1,0,1,0,0,0,0,1,0,0
1,2,120000,26,-1,2,0,0,0,2,2682,...,1,0,1,0,0,0,0,0,1,0
2,3,90000,34,0,0,0,0,0,0,29239,...,1,0,1,0,0,0,0,0,1,0
3,4,50000,37,0,0,0,0,0,0,46990,...,1,0,1,0,0,0,0,1,0,0
4,5,50000,57,-1,0,-1,0,0,0,8617,...,0,0,1,0,0,0,0,1,0,0


## Model Definition

In [7]:
# Create a Random Forest Classifier with specified criterion

# Create the parameter grid
param_grid = {
    'max_depth': list(range(5, 26)),
    'max_features':['auto', 'sqrt']
}

#Build RandomSearchCV object
random_rf_class = RandomizedSearchCV(
    estimator=RandomForestClassifier(n_estimators=750),
    param_distributions = param_grid,
    scoring='roc_auc',
    n_jobs = 4,
    n_iter= 5,
    cv = 3,
    refit = True,
    return_train_score = True
)

print(random_rf_class)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=750),
                   n_iter=5, n_jobs=4,
                   param_distributions={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12,
                                                      13, 14, 15, 16, 17, 18,
                                                      19, 20, 21, 22, 23, 24,
                                                      25],
                                        'max_features': ['auto', 'sqrt']},
                   return_train_score=True, scoring='roc_auc')


In [8]:
from datetime import datetime, timedelta
start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

random_rf_class.fit(X_train, y_train)

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
print(str(timedelta(seconds=(end_time-start_time).seconds)))

Start Time           2023-01-17 18:36:02.765577
Start Time           2023-01-17 18:36:02.765577
End Time             2023-01-17 18:39:27.465801
0:03:24


In [9]:
cv_results_df = pd.DataFrame(random_rf_class.cv_results_)
print(cv_results_df)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      27.380716      0.184428         1.580338        0.028193   
1      50.753515      1.242097         2.543667        0.245860   
2      26.786464      0.747188         1.423667        0.108725   
3      51.331866      0.289887         2.599003        0.158753   
4      27.819317      3.667299         1.184673        0.242771   

  param_max_features param_max_depth  \
0               sqrt               8   
1               auto              17   
2               sqrt               7   
3               sqrt              18   
4               sqrt               9   

                                      params  split0_test_score  \
0   {'max_features': 'sqrt', 'max_depth': 8}           0.770885   
1  {'max_features': 'auto', 'max_depth': 17}           0.768506   
2   {'max_features': 'sqrt', 'max_depth': 7}           0.770485   
3  {'max_features': 'sqrt', 'max_depth': 18}           0.768716   
4   {'max_features': 

## Viewing the best parameter for the model


In [10]:
best_score = random_rf_class.best_score_
print('Best Score obtained by the Parameters', best_score)

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(random_rf_class.cv_results_)
best_row = cv_results_df.loc[[random_rf_class.best_index_]]
print("\n", best_row)

Best Score obtained by the Parameters 0.7771550475329022

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
4      27.819317      3.667299         1.184673        0.242771   

  param_max_features param_max_depth  \
4               sqrt               9   

                                     params  split0_test_score  \
4  {'max_features': 'sqrt', 'max_depth': 9}           0.771788   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
4           0.779847            0.77983         0.777155        0.003795   

   rank_test_score  split0_train_score  split1_train_score  \
4                1            0.857573            0.854483   

   split2_train_score  mean_train_score  std_train_score  
4            0.856614          0.856223         0.001291  


In [11]:
# Get the max_depth parameter from the best-performing square and print
best_max_depth = random_rf_class.best_params_['max_depth']
print("\n", best_max_depth)

# Row 19 and max _depth 10 gives the best accuracy
best_max_features = random_rf_class.best_params_['max_features']
print(best_max_features)


 9
sqrt


In [12]:
print(random_rf_class.cv_results_['param_max_depth'])
print(random_rf_class.cv_results_['param_max_features'])

[8 17 7 18 9]
['sqrt' 'auto' 'sqrt' 'sqrt' 'sqrt']


In [13]:
print(type(random_rf_class.best_estimator_))

# Create an array of predictions directly using the best_estimator_property
predictions = random_rf_class.best_estimator_.predict(X_test)

#view result
print(predictions[0:5])

#show result in confusion matrix
print("Confusion Matrix \n", confusion_matrix(y_test, predictions))

# Get the ROC-AUC score
predictions_proba = random_rf_class.best_estimator_.predict_proba(X_test)[:, 1]
print("ROC-AUC Score \n", roc_auc_score(y_test, predictions_proba))

print("Accuracy Score \n", accuracy_score(y_test, predictions))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
[0 0 0 0 0]
Confusion Matrix 
 [[6674  332]
 [1306  688]]
ROC-AUC Score 
 0.7847663029052901
Accuracy Score 
 0.818
