In [12]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score, precision_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV, GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from xgboost import XGBClassifier

#### Load Data

Load the cleaned data and convert all categorical columns into as many binary columns as necessary.

In [2]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [3]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [4]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)

In [5]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']
XTrain, XTest, yTrain, yTest = train_test_split(X, y, stratify=y, random_state=42)

In [6]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#### Helper Functions

In [7]:
def print_scoring_metrics(fittedModel):
    train_preds = fittedModel.predict(XTrain)
    test_preds = fittedModel.predict(XTest)
    print('Training Scores')
    print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
    print(f'Recall: {recall_score(yTrain, train_preds)}')
    print(f'Precision: {precision_score(yTrain, train_preds)}')
    print(f'F1: {f1_score(yTrain, train_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTrain, train_preds)}')
    print('--------------------------------')
    print('Test Scores')
    print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
    print(f'Recall: {recall_score(yTest, test_preds)}')
    print(f'Precision: {precision_score(yTest, test_preds)}')
    print(f'F1: {f1_score(yTest, test_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTest, test_preds)}')

## Decision Tree

In [23]:
decision_tree_model = imbPipeline(
    [
        ('smote', SMOTE(random_state=42)),
        ('model', DecisionTreeClassifier(random_state=42))
    ]
)

In [45]:
decision_tree_param_grid = {
    'model__max_depth': range(1, 15),
    'model__min_samples_split': range(2, 10),
    'model__criterion': ['gini', 'entropy'],
    'model__max_features': ['sqrt', 'log2']
}
decision_tree_gscv = GridSearchCV(
    estimator=decision_tree_model, 
    param_grid=decision_tree_param_grid, 
    scoring='roc_auc',
    cv=kf
)
decision_tree_gscv.fit(XTrain, yTrain)
print(decision_tree_gscv.best_params_)

{'model__criterion': 'entropy', 'model__max_depth': 8, 'model__max_features': 'sqrt', 'model__min_samples_split': 9}


  _data = np.array(data, dtype=dtype, copy=copy,


In [54]:
print_scoring_metrics(decision_tree_gscv)

Training Scores
Accuracy: 0.6644951140065146
Recall: 0.6867030965391621
Precision: 0.5236111111111111
F1: 0.5941686367218282
ROC AUC: 0.6694164569916906
--------------------------------
Test Scores
Accuracy: 0.55859375
Recall: 0.5737704918032787
Precision: 0.4150197628458498
F1: 0.481651376146789
ROC AUC: 0.5619612337435846


## Random Forest

In [57]:
random_forest_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

In [67]:
random_forest_param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': range(2, 5),
    'model__class_weight': ['balanced', None],
    'model__n_estimators': [50, 100, 150, 200, 250, 300, 350]
}
random_forest_gscv = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=random_forest_param_grid,
    scoring='roc_auc',
    cv=kf
)
random_forest_gscv.fit(XTrain, yTrain)
print(random_forest_gscv.best_params_)

{'model__class_weight': 'balanced', 'model__criterion': 'gini', 'model__max_depth': 4, 'model__n_estimators': 250}


In [70]:
print_scoring_metrics(random_forest_gscv)

Training Scores
Accuracy: 0.6697068403908795
Recall: 0.692167577413479
Precision: 0.5292479108635098
F1: 0.5998421468034728
ROC AUC: 0.6746841943862527
--------------------------------
Test Scores
Accuracy: 0.591796875
Recall: 0.5519125683060109
Precision: 0.44298245614035087
F1: 0.49148418491484186
ROC AUC: 0.582947165611972


## KNN

In [64]:
knn_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', KNeighborsClassifier())
])

In [75]:
knn_param_grid = {
    'model__n_neighbors': range(5, 100),
    'model__weights': ['uniform'],
    'model__p': [1, 2, 3]
}

knn_gridsearchcv = GridSearchCV(
    estimator=knn_pipeline, 
    param_grid=knn_param_grid, 
    scoring='roc_auc',
    cv=kf
).fit(XTrain, yTrain)
print(knn_gridsearchcv.best_params_)

{'model__n_neighbors': 99, 'model__p': 1, 'model__weights': 'uniform'}


In [76]:
print_scoring_metrics(knn_gridsearchcv)

Training Scores
Accuracy: 0.6123778501628665
Recall: 0.6794171220400729
Precision: 0.47095959595959597
F1: 0.5563012677106637
ROC AUC: 0.6272339159896104
--------------------------------
Test Scores
Accuracy: 0.564453125
Recall: 0.6284153005464481
Precision: 0.42592592592592593
F1: 0.5077262693156733
ROC AUC: 0.5786453402428289


## Naive Bayes

In [78]:
nb_model = imbPipeline(
    [('scaler', StandardScaler()),
     ('smote', SMOTE(random_state=42)),
     ('nb_model', GaussianNB())]
).fit(XTrain, yTrain)

print_scoring_metrics(nb_model)

Training Scores
Accuracy: 0.4254071661237785
Recall: 0.9726775956284153
Precision: 0.3811563169164882
F1: 0.5476923076923077
ROC AUC: 0.5466836254004145
--------------------------------
Test Scores
Accuracy: 0.419921875
Recall: 0.9672131147540983
Precision: 0.3782051282051282
F1: 0.543778801843318
ROC AUC: 0.5413573172554687


## Gradient Boosting

In [94]:
gb_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', GradientBoostingClassifier(random_state=42))
])

gb_param_grid = {
    'model__max_depth': range(2, 15),
    'model__n_estimators': [50, 100, 150],
    'model__max_features': ['sqrt', 'log2', None]
}

gb_gridsearchcv = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1
).fit(XTrain, yTrain)
print(gb_gridsearchcv.best_params_)

{'model__max_depth': 2, 'model__max_features': 'log2', 'model__n_estimators': 100}


In [95]:
print_scoring_metrics(gb_gridsearchcv)

Training Scores
Accuracy: 0.6827361563517915
Recall: 0.5774134790528234
Precision: 0.5541958041958042
F1: 0.5655664585191793
ROC AUC: 0.6593963946988255
--------------------------------
Test Scores
Accuracy: 0.615234375
Recall: 0.4644808743169399
Precision: 0.46195652173913043
F1: 0.46321525885558584
ROC AUC: 0.5817845101067982


In [96]:
gb_threshold_tuned = TunedThresholdClassifierCV(
    gb_gridsearchcv,
    cv=kf,
    scoring='roc_auc'
).fit(XTrain, yTrain)

print_scoring_metrics(gb_threshold_tuned)

Training Scores
Accuracy: 0.6201954397394137
Recall: 0.8342440801457195
Precision: 0.48210526315789476
F1: 0.6110740493662442
ROC AUC: 0.6676291394643404
--------------------------------
Test Scores
Accuracy: 0.546875
Recall: 0.7431693989071039
Precision: 0.4236760124610592
F1: 0.5396825396825397
ROC AUC: 0.5904296842559835


In [97]:
print(gb_threshold_tuned.best_threshold_)

0.3933515992692537


## XGBoost

In [None]:
xgb_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier())
])

xgb_param_grid = {
    'model__n_estimators': [150, 250],
    'model__max_depth': range(2, 15),
    'model__grow_policy': ['depthwise', 'lossguide']
}

xgb_gridsearchcv = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1
).fit(XTrain, yTrain)
print(xgb_gridsearchcv.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}


In [11]:
print_scoring_metrics(xgb_gridsearchcv)

Training Scores
Accuracy: 0.7667752442996743
Recall: 0.5883424408014571
Precision: 0.7098901098901099
F1: 0.6434262948207171
ROC AUC: 0.7272341007252721
--------------------------------
Test Scores
Accuracy: 0.62109375
Recall: 0.36065573770491804
Precision: 0.46153846153846156
F1: 0.4049079754601227
ROC AUC: 0.5633065922567143


In [12]:
xgb_threshold_tuned = TunedThresholdClassifierCV(
    xgb_gridsearchcv,
    cv=kf,
    scoring='roc_auc'
).fit(XTrain, yTrain)

print_scoring_metrics(xgb_threshold_tuned)

Training Scores
Accuracy: 0.7355048859934853
Recall: 0.7978142076502732
Precision: 0.597544338335607
F1: 0.6833073322932918
ROC AUC: 0.7493127833383212
--------------------------------
Test Scores
Accuracy: 0.58984375
Recall: 0.546448087431694
Precision: 0.44052863436123346
F1: 0.4878048780487805
ROC AUC: 0.5802149251748137


In [15]:
print(xgb_threshold_tuned.best_threshold_)

0.38885078


### Nested Cross-Validation with XGBoost

In [None]:
NUM_TRIALS = 20
scores = np.zeros(NUM_TRIALS)
xgb_nested_param_grid = {
    'model__n_estimators': [150, 250],
    'model__max_depth': range(2, 15),
    'model__grow_policy': ['depthwise', 'lossguide']
}

for i in range(NUM_TRIALS):
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True)

    xgb_nested_pipeline = imbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', XGBClassifier())
    ])

    xgb_nested_gridsearchcv = GridSearchCV(
        estimator=xgb_nested_pipeline,
        param_grid=xgb_nested_param_grid,
        cv=inner_cv,
        scoring='roc_auc',
        n_jobs=-2
    ).fit(XTrain, yTrain)
    
    nested_cv_score = cross_val_score(
        estimator=xgb_nested_gridsearchcv,
        scoring='roc_auc',
        X=XTrain,
        y=yTrain,
        cv=outer_cv,
        n_jobs=-2
    )

    print(xgb_nested_gridsearchcv.best_params_)
    scores[i] = nested_cv_score.mean()

print(scores)
print(scores.mean())

{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model__max_depth': 2, 'model__n_estimators': 150}
{'model__grow_policy': 'depthwise', 'model_