In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from xgboost import XGBClassifier

from tune_model import tune

In [2]:
# load the train data
data = pd.read_csv('../../data/processed/engineered_features_data/train_imputed_engineered_poly.csv')

# load the selected features from the .pkl file
with open('../../data/processed/selected_features/rfecv_features_to_keep.pkl', 'rb') as f:
    selected_features = pickle.load(f)

In [3]:
data_to_use = data[selected_features]

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(data_to_use.drop('SeriousDlqin2yrs', axis=1), data_to_use['SeriousDlqin2yrs'], test_size=0.2, random_state=42)

In [6]:
# find the scale_pos_weight
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

In [7]:
# define the space of hyperparameters to search for xgboost
param_space = {
    'n_estimators': [150],
    'max_depth': [2, 3, 5],
    'gamma': [0],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'colsample_bylevel': [0.6, 0.8, 1],
    'scale_pos_weight': [scale_pos_weight],
    'objective': ['binary:logistic'],
}
model = XGBClassifier()

# define a multi metric to use in the tuning
scoring = {
    'Accuracy': 'accuracy',
    'AUC': 'roc_auc',
    'F1': 'f1',
    'Precision': 'precision',
    'Recall': 'recall'
}

In [8]:
# tune the model
best_params, best_model = tune(X=X_train, y=y_train, space=param_space, 
                               model=model, search_type='grid', n_iter_random=120, 
                               n_splits=5, n_repeats=1, scoring=scoring)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Score: 0.8637367302594668
Best Hyperparameters:
colsample_bylevel: 0.6
colsample_bytree: 0.8
gamma: 0
learning_rate: 0.1
max_depth: 3
n_estimators: 150
objective: binary:logistic
scale_pos_weight: 13.885874958650348
subsample: 0.8


In [9]:
# save the model
with open('../../models/xgboost_classifier.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [10]:
# evaluate on the test set
y_pred = best_model.predict(X_test)

# print the evaluation metrics
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion matrix: {confusion_matrix(y_test, y_pred)}')

Accuracy: 0.792
ROC AUC: 0.7802002844317193
F1 score: 0.3256484149855908
Precision: 0.20673252835711672
Recall: 0.7666214382632293
Confusion matrix: [[16690  4336]
 [  344  1130]]
