## Preamble, Data Ingestion

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import  LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, classification_report

In [2]:
os.chdir('data')

In [3]:
cc_train = pd.read_csv(r'Data/cc_train.csv')
cc_test = pd.read_csv(r'Data/cc_test.csv')

ros_train = pd.read_csv(r'Data/lr_train.csv')
ros_test = pd.read_csv(r'Data/lr_test.csv')
cc_train.head()

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,LIMIT_UTIL4,PERC_PAID4,LIMIT_UTIL5,PERC_PAID5,LIMIT_UTIL6,PERC_PAID6,EXCEEDED_LIMIT,OVERPAID,NEG_BILL,default.payment.next.month
0,50000.0,0,0,0,0,0,-2,46727.0,47973.0,48990.0,...,0.98522,0.0,0.0,1.0,0.0,1.0,0,0,0,1
1,200000.0,-1,-1,-1,-1,-1,-1,780.0,0.0,390.0,...,0.00195,1.0,0.00195,1.0,0.00195,1.0,1,1,0,0
2,400000.0,0,0,-1,0,0,0,10260.0,17794.0,6402.0,...,0.05193,0.24249,0.031015,0.407384,0.027363,0.2836,1,1,0,0
3,160000.0,4,3,2,0,0,0,103398.0,100925.0,97701.0,...,0.621463,0.004345,0.622044,0.005486,0.102069,0.033678,0,0,0,0
4,10000.0,2,2,6,6,6,6,142.0,142.0,142.0,...,0.0142,0.0,0.0142,0.0,0.0142,0.0,0,0,0,1


In [4]:
X_train = cc_train.drop(columns=['default.payment.next.month'])
y_train = cc_train[['default.payment.next.month']].values

X_test = cc_test.drop(columns=['default.payment.next.month'])
y_test = cc_test[['default.payment.next.month']].values

X_train_ros = ros_train.drop(columns=['default.payment.next.month'])
y_train_ros = ros_train[['default.payment.next.month']].values

In [5]:
def get_metrics(classifier, name, Xtrain = X_train, ytrain=y_train, 
                Xtest = X_test, ytest=y_test):
    results = pd.DataFrame(index=[name])
    results['Test_F1'] = f1_score(ytest, classifier.predict(Xtest), average='macro')
    results['Test_recall'] = recall_score(ytest, classifier.predict(Xtest))
    results['Test_precision'] = precision_score(ytest, classifier.predict(Xtest))
    results['Test_ROC_AUC'] = roc_auc_score(ytest, classifier.predict(Xtest))
    print(classification_report(ytest, classifier.predict(Xtest)))
    return results

In [6]:
results = pd.DataFrame()

### Logistic Regression

In [7]:
lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('lr', LogisticRegression(random_state=42))])
lr_pipe.fit(X_train_ros, y_train_ros)

r = get_metrics(lr_pipe, 'LR', X_train_ros, y_train_ros)
results = pd.concat([results, r])
results

              precision    recall  f1-score   support

           0       0.87      0.72      0.79      7009
           1       0.39      0.62      0.48      1991

    accuracy                           0.70      9000
   macro avg       0.63      0.67      0.64      9000
weighted avg       0.77      0.70      0.72      9000



Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403


### Decision Tree and Random Forests

In [8]:
# set parameters for GridSearch tunning
dt_params = {
    'dt__criterion': ['gini','entropy'],
    'dt__max_depth': [3, 4, 5, 6, 7, 8]
}

dt_pipe = Pipeline([('scaler', StandardScaler()),
                     ('dt', DecisionTreeClassifier(random_state=7))])

dt_grid = GridSearchCV(dt_pipe, dt_params, scoring='f1_macro')
dt_grid.fit(X_train_ros, y_train_ros)


dt_best_params = dt_grid.best_params_
print(dt_best_params)
print(dt_grid.best_score_)

dt_pipe = Pipeline([('scaler', StandardScaler()),
                     ('dt', DecisionTreeClassifier(criterion=dt_best_params.get('dt__criterion', 'gini'),
                                                    max_depth=dt_best_params.get('dt__max_depth', None),
                                                   random_state=7))])
dt_pipe.fit(X_train_ros, y_train_ros)
r = get_metrics(dt_pipe, 'DT', X_train_ros, y_train_ros)
results = pd.concat([results, r])
results

{'dt__criterion': 'gini', 'dt__max_depth': 8}
0.7319089516553341
              precision    recall  f1-score   support

           0       0.87      0.79      0.83      7009
           1       0.45      0.59      0.51      1991

    accuracy                           0.75      9000
   macro avg       0.66      0.69      0.67      9000
weighted avg       0.78      0.75      0.76      9000



Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561


In [9]:
# set parameters for GridSearch tunning
rf_params = {
    'rf__criterion': ['gini','entropy'],
    'rf__max_depth': [3,4,5,6,7,8],
    'rf__min_samples_split': [3,4,5],
    'rf__n_estimators': [100,150]
}
rf_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(random_state=7))])

rf_grid = RandomizedSearchCV(rf_pipe, rf_params, scoring='f1_macro')
rf_grid.fit(X_train_ros, y_train_ros)


rf_best_params = rf_grid.best_params_
print(rf_best_params)
print(rf_grid.best_score_)

rf_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rf', RandomForestClassifier(criterion = rf_best_params.get('rf__criterion', 'gini'),
                                                   max_depth = rf_best_params.get('rf__max_depth', None),
                                                   min_samples_split = rf_best_params.get('rf__min_samples_split', 2),
                                                   n_estimators = rf_best_params.get('rf__n_estimators', 100),
                                                   random_state=7))])

rf_pipe.fit(X_train_ros, y_train_ros)
r = get_metrics(rf_pipe, 'RF', X_train_ros, y_train_ros)
results = pd.concat([results, r])
results

{'rf__n_estimators': 100, 'rf__min_samples_split': 4, 'rf__max_depth': 8, 'rf__criterion': 'entropy'}
0.741606870270824
              precision    recall  f1-score   support

           0       0.88      0.83      0.85      7009
           1       0.49      0.59      0.54      1991

    accuracy                           0.77      9000
   macro avg       0.68      0.71      0.69      9000
weighted avg       0.79      0.77      0.78      9000



Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561
RF,0.693345,0.589151,0.491412,0.707973


### KNN

In [10]:
# set parameters for GridSearch tunning
kbest = SelectKBest(k=20)
sampler = SMOTE(random_state=42)
X_train_smote, y_train_smote = sampler.fit_resample(X_train, y_train)

knn_params= {'feat_select__k': [10, 15, 20, 25, 30],
              'knn__n_neighbors': np.arange(2, 11)}

knn = KNeighborsClassifier(n_neighbors=3)
knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('feat_select', kbest),
                     ('knn', knn)])

knn_grid = RandomizedSearchCV(knn_pipe, param_distributions=knn_params,
                     scoring='f1_macro',
                     verbose=1)
knn_grid.fit(X_train_ros, y_train_ros)


knn_best_params = knn_grid.best_params_
print(knn_best_params)
print(knn_grid.best_score_)

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('feat_select', SelectKBest(k=knn_best_params.get('feat_select__k', 20))),
                     ('knn', KNeighborsClassifier(n_neighbors = knn_best_params.get('knn__n_neighbors', 5)))])


knn_pipe.fit(X_train_smote, y_train_smote)
r = get_metrics(knn_pipe, 'KNN', X_train_smote, y_train_smote)
results = pd.concat([results, r])
results

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'knn__n_neighbors': 4, 'feat_select__k': 15}
0.7802852115580696
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      7009
           1       0.41      0.44      0.42      1991

    accuracy                           0.74      9000
   macro avg       0.63      0.63      0.63      9000
weighted avg       0.74      0.74      0.74      9000



Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561
RF,0.693345,0.589151,0.491412,0.707973
KNN,0.627293,0.43546,0.413054,0.629843


### SVM

In [None]:
svm_params = {'feat_select__k': [10, 15, 20, 25, 30],
            'svm__C': [0.1, 1, 10, 100],
            'svm__gamma': ['auto', 1, 0.1, 0.01, 0.001]}

svm_pipe = Pipeline([('scaler', StandardScaler()),
                     ('feat_select', SelectKBest(k=15)),
                     ('svm', SVC(kernel='rbf', random_state=42))])


svm_grid = RandomizedSearchCV(svm_pipe, param_distributions=svm_params,
                     scoring='f1_macro',
                     verbose=1)
svm_grid.fit(X_train_smote, y_train_smote)

svm_best_params = svm_grid.best_params_
print(svm_best_params)
print(svm_grid.best_score_)

svm_pipe = Pipeline([('scaler', StandardScaler()),
                     ('feat_select', SelectKBest(k=svm_best_params.get('feat_select__k', 15))),
                     ('svm', SVC(C = svm_best_params.get('svm__C', 1.0),
                                 gamma = svm_best_params.get('svm__gamma', 'auto'),
                                 random_state=42))])


svm_pipe.fit(X_train_smote, y_train_smote)
r = get_metrics(svm_pipe, 'SVM', X_train_smote, y_train_smote)
results = pd.concat([results, r])
results

Fitting 5 folds for each of 10 candidates, totalling 50 fits


### XGBClassifier

In [11]:
xgb_pipe = Pipeline([('scaler', StandardScaler()),
                     ('xgb', XGBClassifier(booster='gbtree', 
                                           objective='binary:logistic', 
                                           use_label_encoder=True, 
                                           random_state = 0))])

kf = StratifiedKFold(n_splits=5, shuffle=False)
xgb_params = {'xgb__learning_rate': sp.stats.uniform(loc = 0.01, scale = 0.09),
              'xgb__max_depth': np.arange(4, 11),
              'xgb__n_estimators': np.arange(200, 601, 100),
              'xgb__min_child_weights':sp.stats.uniform(loc = 1, scale = 2),
              'xgb__gamma': sp.stats.uniform(loc = 0, scale = 10),
              'xgb__reg_alpha': sp.stats.uniform(loc = 0.01, scale = 0.09),
              'xgb__reg_lambda': sp.stats.uniform(loc = 0.01, scale = 0.09)}

xgb_grid = RandomizedSearchCV(xgb_pipe, param_distributions=xgb_params,
                              scoring='f1_macro', verbose=10,
                              n_iter=50, cv = kf,
                              random_state=0, n_jobs=10)
xgb_grid.fit(X_train_smote, y_train_smote)

xgb_best_params = xgb_grid.best_params_
print(xgb_best_params)
print(xgb_grid.best_score_)

xgb_pipe = Pipeline([('scaler', StandardScaler()),
                     ('xgb', XGBClassifier(booster='gbtree', 
                                           objective='binary:logistic', 
                                           use_label_encoder=True, 
                                           random_state = 0,
                                           learning_rate = xgb_best_params.get('xgb__learning_rate', 0.1),
                                           max_depth = xgb_best_params.get('xgb__max_depth', 3),
                                           n_estimators = xgb_best_params.get('xgb__n_estimators', 100),
                                           min_child_weights = xgb_best_params.get('xgb__min_child_weights',1.0),
                                           gamma = xgb_best_params.get('xgb__gamma', 0), 
                                           reg_alpha = xgb_best_params.get('xgb__reg_alpha', 0.0),
                                           reg_lambda = xgb_best_params.get('xgb__reg_lambda', 1.0)))])

xgb_pipe.fit(X_train, y_train)
r = get_metrics(xgb_pipe, 'XGBClassifier')
results = pd.concat([results, r])
results

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Parameters: { min_child_weights } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'xgb__gamma': 2.212626851837828, 'xgb__learning_rate': 0.022713751443330703, 'xgb__max_depth': 8, 'xgb__min_child_weights': 1.8717298505312536, 'xgb__n_estimators': 500, 'xgb__reg_alpha': 0.0190204198581071, 'xgb__reg_lambda': 0.0927534352370206}
0.847264180747931
Parameters: { min_child_weights } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


              precision    recall  f1-score   support

           0       0.84      0.

Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561
RF,0.693345,0.589151,0.491412,0.707973
KNN,0.627293,0.43546,0.413054,0.629843
XGBClassifier,0.679311,0.36213,0.662684,0.654884


### LGBMClassifier

In [12]:
lgbm_pipe = Pipeline([('scaler', StandardScaler()),
                     ('lgbm', LGBMClassifier(random_state = 42))])

kf = StratifiedKFold(n_splits=5, shuffle=False)
lgbm_params = {'lgbm__num_leaves': np.arange(5, 20, 5),
               'lgbm__max_depth': np.arange(1, 20, 2),
               'lgbm__min_data': np.arange(10, 110, 10)}

lgbm_grid = RandomizedSearchCV(lgbm_pipe, param_distributions=lgbm_params,
                              scoring='f1_macro', n_iter=20,
                              random_state=42)
lgbm_grid.fit(X_train_ros, y_train_ros)

lgbm_best_params = lgbm_grid.best_params_
print(lgbm_best_params)
print(lgbm_grid.best_score_)

lgbm_pipe = Pipeline([('scaler', StandardScaler()),
                     ('lgbm', LGBMClassifier(num_leaves = lgbm_best_params.get('lgbm__num_leaves', 31),
                                             max_depth = lgbm_best_params.get('lgbm__max_depth', -1),
                                             min_data = lgbm_best_params.get('lgbm__min_data', 20),
                                             random_state = 42))])

lgbm_pipe.fit(X_train_ros, y_train_ros)
r = get_metrics(lgbm_pipe, 'LGBMClassifier', X_train_ros, y_train_ros)
results = pd.concat([results, r])
results

{'lgbm__num_leaves': 15, 'lgbm__min_data': 10, 'lgbm__max_depth': 13}
0.7536645605112765
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      7009
           1       0.47      0.62      0.53      1991

    accuracy                           0.76      9000
   macro avg       0.67      0.71      0.69      9000
weighted avg       0.79      0.76      0.77      9000



Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561
RF,0.693345,0.589151,0.491412,0.707973
KNN,0.627293,0.43546,0.413054,0.629843
XGBClassifier,0.679311,0.36213,0.662684,0.654884
LGBMClassifier,0.685944,0.616775,0.468881,0.709158


In [13]:
results

Unnamed: 0,Test_F1,Test_recall,Test_precision,Test_ROC_AUC
LR,0.636295,0.624309,0.391619,0.674403
DT,0.669332,0.587142,0.447378,0.690561
RF,0.693345,0.589151,0.491412,0.707973
KNN,0.627293,0.43546,0.413054,0.629843
XGBClassifier,0.679311,0.36213,0.662684,0.654884
LGBMClassifier,0.685944,0.616775,0.468881,0.709158


### Save Models (and Corresponding Data Files)
Logistic Regression, RF, XGB

In [14]:
import joblib

In [15]:
joblib.dump(lr_pipe, '../logreg.sav')
joblib.dump(rf_pipe, '../rf.sav')
joblib.dump(xgb_pipe, '../xgb.sav')

['../xgb.sav']