In [1]:
import pandas as pd
import numpy as np

# Tools
from collections import Counter
import pickle
import joblib

# Sampling & Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Model Selection
from sklearn.model_selection import train_test_split, KFold 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

Using TensorFlow backend.


In [2]:
# Load train and test sets
# df            = pd.read_pickle('../data/df.pkl')
X_train_under = pd.read_pickle('../data/03_X_train_under.pkl')
X_test        = pd.read_pickle('../data/03_X_test.pkl')
y_train_under = pd.read_pickle('../data/03_y_train_under.pkl')
y_test        = pd.read_pickle('../data/03_y_test.pkl')

# Modeling

Objective: Maximize F1 & ROC AUC Score as both recall and precision are equally important and the classes are imbalanced

In [4]:
# Create list to store model performance
model_performance = []

User RandomizedSearchCV instead:
https://blog.usejournal.com/a-comparison-of-grid-search-and-randomized-search-using-scikit-learn-29823179bc85


## Baseline Models

In [5]:
#Dummy Classifier
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy= 'most_frequent').fit(X_train_under, y_train_under)
y_pred = clf.predict(X_test)

#Distribution of y test
print('y actual\t:', Counter(y_test))

#Distribution of y predicted
print('y predicted\t:', Counter(y_pred))

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, y_pred)))

y actual	: Counter({0: 83653, 1: 22301})
y predicted	: Counter({0: 105954})

Confusion Matrix
[[83653     0]
 [22301     0]]


In [None]:
# base_models = []

# # Instantiate the models
# base_models.append(('LogisticRegression', LogisticRegression(solver='liblinear')))
# base_models.append(('KNeighbors', KNeighborsClassifier()))
# base_models.append(('RandomForest', RandomForestClassifier(n_estimators=10)))
# base_models.append(('XGBoost', XGBClassifier()))

# cv_results = []
# names = []

# # Cross Validate - 5 fold
# for name, model in base_models:
#     names.append(name)
#     cv_results.append(np.round_(cross_val_score(model, X_train_under, y_train_under, 
#                                                 cv=5, scoring='roc_auc', n_jobs=-1), 3))

# for i in range(len(names)):
#     print(names[i], round(cv_results[i].mean(), 3))

## Logistic Regression

In [7]:
# Grid searching key hyperparameters for logistic regression

# Instantiate model and sampler
lg_model = LogisticRegression()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('lgr', lg_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'lgr__solver': ['saga', 'liblinear'],
    'lgr__penalty': ['l1', 'l2'],
    'lgr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2020)

# Cross Validation
# lg_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
lg_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='f1')

# Train with balanced classes
grid_result = lg_rndm.fit(X_train_under, y_train_under)

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)

Best Score: 0.743
Params: {'und__random_state': 2020, 'lgr__solver': 'saga', 'lgr__penalty': 'l2', 'lgr__C': 10}

Train 0.744	Test 0.743	Params: {'und__random_state': 2020, 'lgr__solver': 'liblinear', 'lgr__penalty': 'l2', 'lgr__C': 10}
Train 0.741	Test 0.741	Params: {'und__random_state': 2020, 'lgr__solver': 'liblinear', 'lgr__penalty': 'l2', 'lgr__C': 0.1}
Train 0.743	Test 0.742	Params: {'und__random_state': 2020, 'lgr__solver': 'liblinear', 'lgr__penalty': 'l1', 'lgr__C': 0.1}
Train 0.744	Test 0.743	Params: {'und__random_state': 2020, 'lgr__solver': 'saga', 'lgr__penalty': 'l2', 'lgr__C': 100}
Train 0.744	Test 0.743	Params: {'und__random_state': 2020, 'lgr__solver': 'liblinear', 'lgr__penalty': 'l2', 'lgr__C': 1000}
Train 0.744	Test 0.743	Params: {'und__random_state': 2020, 'lgr__solver': 'liblinear', 'lgr__penalty': 'l1', 'lgr__C': 10}
Train 0.744	Test 0.743	Params: {'und__random_state': 2020, 'lgr__solver': 'saga', 'lgr__penalty': 'l2', 'lgr__C': 10}
Train 0.744	Test 0.743	Params:

In [8]:
# Instantiate model with best paramaters
lgr_best = LogisticRegression(C=10, penalty='l2', solver='saga', random_state=2020)

# Train with balanced classes
lgr_best.fit(X_train_under, y_train_under)

# Get predictions
lgr_best_preds = lgr_best.predict(X_test)
# lgr_best_y_score = lgr_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test, lgr_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test, lgr_best_preds), 3)
precision = round(precision_score(y_test, lgr_best_preds), 3)
recall    = round(recall_score(y_test, lgr_best_preds), 3)
f1        = round(f1_score(y_test, lgr_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, lgr_best_preds)))

Accuracy  : 0.74
ROC_AUC   : 0.741
Precision : 0.432
Recall    : 0.741
F-score   : 0.546

Confusion Matrix
[[61912 21741]
 [ 5778 16523]]


In [9]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'Logistic Regression'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Save model for later use
joblib.dump(lgr_best, '../models/lgr_best.sav')

['../models/lgr_best.sav']

## KNN

In [10]:
# Grid searching key hyperparameters for KNN

# Instantiate model and RandomUnderSampler
knn_model = KNeighborsClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('knn', knn_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'knn__n_neighbors' : [3, 5, 7]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2020)

# Cross Validation
# knn_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
knn_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='f1')

# Train with balanced classes
grid_result = knn_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(knn_grid.best_score_)
# print(knn_grid.best_params_)



Best Score: 0.745
Params: {'und__random_state': 2020, 'knn__n_neighbors': 7}

Train 0.852	Test 0.731	Params: {'und__random_state': 2020, 'knn__n_neighbors': 3}
Train 0.82	Test 0.74	Params: {'und__random_state': 2020, 'knn__n_neighbors': 5}
Train 0.805	Test 0.745	Params: {'und__random_state': 2020, 'knn__n_neighbors': 7}


In [11]:
# Instantiate model with best paramaters
knn_best = KNeighborsClassifier(n_neighbors=7)

# Train with balanced classes
knn_best.fit(X_train_under, y_train_under)

# Get predictions
knn_best_preds = knn_best.predict(X_test)
# knn_best_y_score = knn_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  knn_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   knn_best_preds), 3)
precision = round(precision_score(y_test, knn_best_preds), 3)
recall    = round(recall_score(y_test,    knn_best_preds), 3)
f1        = round(f1_score(y_test,        knn_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, knn_best_preds)))

Accuracy  : 0.732
ROC_AUC   : 0.743
Precision : 0.424
Recall    : 0.761
F-score   : 0.544

Confusion Matrix
[[60605 23048]
 [ 5339 16962]]


In [12]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'KNN'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Save model for later use
joblib.dump(knn_best, '../models/knn_best.sav')

['../models/knn_best.sav']

## Random Forest

In [13]:
# Grid searching key hyperparameters for Random Forest

# Instantiate model and RandomUnderSampler
rf_model = RandomForestClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('rfc', rf_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'rfc__n_estimators': [50, 100, 150, 200, 1000],
    'rfc__max_depth' : [2, 3, 4],
    'rfc__max_features' : [5, 10, 15],
    'rfc__criterion' : ['gini', 'entropy'],
    'rfc__random_state' :[2020]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2020)

# Cross Validation
# rf_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
rf_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='f1')

# Train with balanced classes
grid_result = rf_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(rf_grid.best_score_)
# print(rf_grid.best_params_)

Best Score: 0.726
Params: {'und__random_state': 2020, 'rfc__random_state': 2020, 'rfc__n_estimators': 100, 'rfc__max_features': 5, 'rfc__max_depth': 2, 'rfc__criterion': 'gini'}

Train 0.701	Test 0.701	Params: {'und__random_state': 2020, 'rfc__random_state': 2020, 'rfc__n_estimators': 100, 'rfc__max_features': 15, 'rfc__max_depth': 4, 'rfc__criterion': 'gini'}
Train 0.694	Test 0.694	Params: {'und__random_state': 2020, 'rfc__random_state': 2020, 'rfc__n_estimators': 200, 'rfc__max_features': 15, 'rfc__max_depth': 3, 'rfc__criterion': 'entropy'}
Train 0.706	Test 0.706	Params: {'und__random_state': 2020, 'rfc__random_state': 2020, 'rfc__n_estimators': 1000, 'rfc__max_features': 15, 'rfc__max_depth': 3, 'rfc__criterion': 'gini'}
Train 0.698	Test 0.697	Params: {'und__random_state': 2020, 'rfc__random_state': 2020, 'rfc__n_estimators': 50, 'rfc__max_features': 10, 'rfc__max_depth': 3, 'rfc__criterion': 'entropy'}
Train 0.726	Test 0.726	Params: {'und__random_state': 2020, 'rfc__random_state':

In [14]:
# Instantiate model with best paramaters
rfc_best = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=2, max_features=5, random_state=2020)

# Train with balanced classes
rfc_best.fit(X_train_under, y_train_under)

# Get predictions
rfc_best_preds = rfc_best.predict(X_test)
# rfc_best_y_score = rfc_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  rfc_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   rfc_best_preds), 3)
precision = round(precision_score(y_test, rfc_best_preds), 3)
recall    = round(recall_score(y_test,    rfc_best_preds), 3)
f1        = round(f1_score(y_test,        rfc_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, rfc_best_preds)))

Accuracy  : 0.654
ROC_AUC   : 0.702
Precision : 0.354
Recall    : 0.784
F-score   : 0.488

Confusion Matrix
[[51788 31865]
 [ 4806 17495]]


In [15]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'Random Forest'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Save model for later use
joblib.dump(rfc_best, '../models/rfc_best.sav')

['../models/rfc_best.sav']

## XGBoost

In [17]:
# Grid searching key hyperparameters for XGBoost
# Instantiate model and RandomUnderSampler
xgb_model = XGBClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('xgb', xgb_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'xgb__n_estimators': [100, 250, 500, 1000], 
    'xgb__max_depth': [3, 4, 5], 
    'xgb__learning_rate': [0.1] # 0.001, 0.01, 
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2020)

# Cross Validation
# xgb_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
xgb_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=3, n_jobs=-1, return_train_score=True, scoring='f1')
# # Dask HyperbandSearchCV
# search = HyperbandSearchCV(xgb_model, param_grid, max_iter=3, patience=True)

# Train with balanced classes
grid_result = xgb_rndm.fit(X_train_under, y_train_under)

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')

# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)



Best Score: 0.82
Params: {'xgb__n_estimators': 1000, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}

Train 0.798	Test 0.795	Params: {'xgb__n_estimators': 500, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}
Train 0.811	Test 0.806	Params: {'xgb__n_estimators': 1000, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}
Train 0.81	Test 0.804	Params: {'xgb__n_estimators': 500, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}
Train 0.826	Test 0.815	Params: {'xgb__n_estimators': 1000, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}
Train 0.824	Test 0.814	Params: {'xgb__n_estimators': 500, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}
Train 0.839	Test 0.82	Params: {'xgb__n_estimators': 1000, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.1, 'und__random_state': 2020}


In [18]:
# Instantiate model with best paramaters
xgb_best = XGBClassifier(n_estimators=1000, max_depth=5, learning_rate=0.1)

# Train with balanced classes
xgb_best.fit(X_train_under, y_train_under)

# Get predictions
xgb_best_preds = xgb_best.predict(X_test)
# xgb_best_y_score = xgb_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  xgb_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   xgb_best_preds), 3)
precision = round(precision_score(y_test, xgb_best_preds), 3)
recall    = round(recall_score(y_test,    xgb_best_preds), 3)
f1        = round(f1_score(y_test,        xgb_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, xgb_best_preds)))

Accuracy  : 0.834
ROC_AUC   : 0.824
Precision : 0.576
Recall    : 0.807
F-score   : 0.672

Confusion Matrix
[[70383 13270]
 [ 4300 18001]]


In [19]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'XGBoost'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Save model for later use
joblib.dump(xgb_best, '../models/xgb_best.sav')

# Save predictions for later use
with open('../data/04_xgb_best_preds.pkl', 'wb') as f:
    pickle.dump(xgb_best_preds, f)
    f.close()

## Ensemble

In [None]:
# Instantiate model
ensemble = VotingClassifier(estimators=[('lrg', lgr_best), ('rfc', rfc_best), ('xgb', xgb_best)], voting='soft', weights=[1, 1, 1])

# Train with balanced classes
ensemble.fit(X_train_under, y_train_under)

# Get predictions
ensemble_preds = ensemble.predict(X_test)
# xgb_best_y_score = xgb_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  ensemble), 3)
roc_auc   = round(roc_auc_score(y_test,   ensemble), 3)
precision = round(precision_score(y_test, ensemble), 3)
recall    = round(recall_score(y_test,    ensemble), 3)
f1        = round(f1_score(y_test,        ensemble), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, ensemble)))

In [None]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'Ensemble'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Save model for later use
joblib.dump(xgb_best, '../models/ensemble.sav')

# Save predictions for later use
with open('../data/04_ensemble_preds.pkl', 'wb') as f:
    pickle.dump(ensemble_preds, f)
    f.close()

In [21]:
# Pickel model performance
with open('../data/04_model_performance.pkl', 'wb') as f:
    pickle.dump(model_performance, f)
    f.close()

# Appendix

In [None]:
# cat /proc/cpuinfo

# # Import Dask libraries

# from dask import delayed
# import joblib
# import dask.dataframe as dd
# import dask.array as da

# from dask_ml.model_selection import train_test_split
# from dask_ml.linear_model import LogisticRegression
# from dask_ml.xgboost import XGBClassifier
# from dask_ml.model_selection import RandomizedSearchCV
# from dask_ml.model_selection import HyperbandSearchCV 
# # HyperbandSearchCVis Dask-ML’s meta-estimator to find the best hyperparameters.
# # It can be used as an alternative to RandomizedSearchCV to find similar hyper-parameters
# # in less time by not wasting time on hyper-parameters that are not promising. 

# from dask.distributed import Client, progress
# from sklearn.externals.joblib import parallel_backend

# client = Client(processes=False)
# # client = Client(processes=False, n_workers=4, threads_per_worker=8)
# client
# # client.close()