### Bayesian Search Implementation using Scikit-learn and Hyperopt

This notebook demonstrates the implementation of Bayesian optimization strategy in Scikit-Learn that adopts use the knowlegde gained from previous tuning to improve the performance of subsequent ones. Scikit-Learn does not offer a specific method to do Bayesian hyperoptimization search, the solution presented herein is the implementation using a third-party Bayesian optimization libary called Hyperopt. 

### Import Necessary Libaries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

### Dataset Preprocessing

In [2]:
dataset = pd.read_csv('credit-card-full.csv')
dataset.head(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,7,500000,1,1,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,8,100000,2,2,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
9,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0


In [3]:
#drop personal attributes in dataset
dataset = pd.get_dummies(dataset, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)

X = dataset.drop(['ID', 'default payment next month'], axis=1)
y = dataset['default payment next month']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [4]:
X.head()

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,1,0,1,0,0,0,0,1,0,0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,1,0,1,0,0,0,0,0,1,0
2,90000,34,0,0,0,0,0,0,29239,14027,...,1,0,1,0,0,0,0,0,1,0
3,50000,37,0,0,0,0,0,0,46990,48233,...,1,0,1,0,0,0,0,1,0,0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,0,0,1,0,0,0,0,1,0,0


In [5]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: default payment next month, dtype: int64

In [6]:
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,1,20000,24,2,2,-1,-1,-2,-2,3913,...,1,0,1,0,0,0,0,1,0,0
1,2,120000,26,-1,2,0,0,0,2,2682,...,1,0,1,0,0,0,0,0,1,0
2,3,90000,34,0,0,0,0,0,0,29239,...,1,0,1,0,0,0,0,0,1,0
3,4,50000,37,0,0,0,0,0,0,46990,...,1,0,1,0,0,0,0,1,0,0
4,5,50000,57,-1,0,-1,0,0,0,8617,...,0,0,1,0,0,0,0,1,0,0


## Model Definition

In [7]:
from hyperopt import tpe, hp, fmin
from sklearn.model_selection import cross_val_score

#Define hyperparameters
space = {'max_depth': hp.choice('max_depth', [2, 4, 8]),
        'n_estimators': hp.choice('n_estimators', [10, 20, 50])}

# Set up objective function
def objective(params):
    params = {'max_depth': int(params['max_depth']),
             'n_estimators': int(params['n_estimators'])}
    rfc = RandomForestClassifier(**params)
    best_score = cross_val_score(rfc, X_train, y_train, scoring='roc_auc', cv=3, n_jobs=4).mean()
    
    loss = 1 - best_score
    return loss

from datetime import datetime, timedelta
start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

# Run the algorithm
best = fmin(fn=objective, space=space, max_evals=20, algo=tpe.suggest)

print(best)

end_time = datetime.now()
print("%-20s %s" % ("End Time", end_time))
print(str(timedelta(seconds=(end_time-start_time).seconds)))

Start Time           2023-01-17 17:47:27.519613
100%|███████████████████████████████████████████████| 20/20 [00:27<00:00,  1.37s/trial, best loss: 0.22144362944250717]
{'max_depth': 2, 'n_estimators': 2}
End Time             2023-01-17 17:47:54.926731
0:00:27


## Viewing the best parameter for the model


In [8]:
# Evaluate Best Model
print('Best Model Test with Accuracy', best)

Best Model Test with Accuracy {'max_depth': 2, 'n_estimators': 2}
