In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from xgboost import XGBClassifier

from tune_model import tune

In [175]:
# load the train data
data = pd.read_csv('../../data/processed/engineered_features_data/train_imputed_engineered_poly.csv')

# load the selected features from the .pkl file
with open('../../data/processed/selected_features/rfecv_features_to_keep.pkl', 'rb') as f:
    selected_features = pickle.load(f)

In [176]:
data_to_use = data[selected_features]

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(data_to_use.drop('SeriousDlqin2yrs', axis=1), data_to_use['SeriousDlqin2yrs'], test_size=0.2, random_state=42)

In [177]:
# define the space of hyperparameters to search
param_space = {
    'max_iter': [200],
    'max_depth': [10, 20, 30],
    'l2_regularization': [0, 0.001],
    'max_features': [0.5, 0.75, 1.0],
    'learning_rate': [0.1],
    'min_samples_leaf': [10, 20, 30],
    'class_weight': ['balanced'],
}

model = HistGradientBoostingClassifier()

In [178]:
# tune the model

best_params, best_model = tune(X=X_train, y=y_train, space=param_space, 
                               model=model, search_type='grid', n_iter_random=120, 
                               n_splits=3, n_repeats=1, scoring='roc_auc')

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Score: 0.8625496489782695
Best Hyperparameters:
class_weight: balanced
l2_regularization: 0
learning_rate: 0.1
max_depth: 20
max_features: 0.5
max_iter: 200
min_samples_leaf: 10


In [179]:
# save the model
with open('../../models/hist_gradient_boosting_classifier.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [180]:
# evaluate on the test set
y_pred = best_model.predict(X_test)

# print the evaluation metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion matrix: {confusion_matrix(y_test, y_pred)}')

Accuracy: 0.7960444444444444
ROC AUC: 0.7811025400999293
F1 score: 0.3291916386493203
Precision: 0.2098006335010248
Recall: 0.7639077340569878
Confusion matrix: [[16785  4241]
 [  348  1126]]
