In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score, precision_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV, GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from xgboost import XGBClassifier

#### Load Data

Load the cleaned data and convert all categorical columns into as many binary columns as necessary.

In [3]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [4]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [5]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)

In [6]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']
XTrain, XTest, yTrain, yTest = train_test_split(X, y, stratify=y, random_state=42)

In [7]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#### Helper Functions

In [8]:
def print_scoring_metrics(fittedModel):
    train_preds = fittedModel.predict(XTrain)
    test_preds = fittedModel.predict(XTest)
    print('Training Scores')
    print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
    print(f'Recall: {recall_score(yTrain, train_preds)}')
    print(f'Specificity: {recall_score(yTrain, train_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTrain, train_preds)}')
    print(f'F1: {f1_score(yTrain, train_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTrain, train_preds)}')
    print('--------------------------------')
    print('Test Scores')
    print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
    print(f'Recall: {recall_score(yTest, test_preds)}')
    print(f'Specificity: {recall_score(yTest, test_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTest, test_preds)}')
    print(f'F1: {f1_score(yTest, test_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTest, test_preds)}')

## Decision Tree

In [20]:
decision_tree_model = imbPipeline(
    [
        ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
    ]
)

In [23]:
decision_tree_param_grid = {
    'model__max_depth': range(1, 15),
    'model__min_samples_split': range(2, 10),
    'model__criterion': ['gini', 'entropy'],
    'model__max_features': ['sqrt', 'log2'],
}
decision_tree_gscv = GridSearchCV(
    estimator=decision_tree_model, 
    param_grid=decision_tree_param_grid, 
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1
)
decision_tree_gscv.fit(XTrain, yTrain)
print(decision_tree_gscv.best_params_)

{'model__criterion': 'entropy', 'model__max_depth': 6, 'model__max_features': 'sqrt', 'model__min_samples_split': 2}


In [24]:
print_scoring_metrics(decision_tree_gscv)

Training Scores
Accuracy: 0.6592833876221499
Recall: 0.6247723132969034
Specificity: 0.678498985801217
Precision: 0.5196969696969697
F1: 0.5674110835401158
ROC AUC: 0.6516356495490603
--------------------------------
Test Scores
Accuracy: 0.5625
Recall: 0.453551912568306
Specificity: 0.6231003039513677
Precision: 0.40096618357487923
F1: 0.4256410256410256
ROC AUC: 0.538326108259837


## Random Forest

In [12]:
random_forest_pipeline = imbPipeline([
    ('model', RandomForestClassifier(random_state=42))
])

In [13]:
random_forest_param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': range(2, 5),
    'model__class_weight': ['balanced', None],
    'model__n_estimators': [200, 250, 300]
}
random_forest_gscv = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=random_forest_param_grid,
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1
)
random_forest_gscv.fit(XTrain, yTrain)
print(random_forest_gscv.best_params_)

{'model__class_weight': 'balanced', 'model__criterion': 'entropy', 'model__max_depth': 4, 'model__n_estimators': 200}


In [16]:
print_scoring_metrics(random_forest_gscv)

Training Scores
Accuracy: 0.6592833876221499
Recall: 0.7340619307832422
Specificity: 0.6176470588235294
Precision: 0.5166666666666667
F1: 0.6064710308502633
ROC AUC: 0.6758544948033858
--------------------------------
Test Scores
Accuracy: 0.59375
Recall: 0.6174863387978142
Specificity: 0.5805471124620061
Precision: 0.450199203187251
F1: 0.5207373271889401
ROC AUC: 0.5990167256299102


## KNN

In [64]:
knn_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', KNeighborsClassifier())
])

In [75]:
knn_param_grid = {
    'model__n_neighbors': range(5, 100),
    'model__weights': ['uniform'],
    'model__p': [1, 2, 3]
}

knn_gridsearchcv = GridSearchCV(
    estimator=knn_pipeline, 
    param_grid=knn_param_grid, 
    scoring='roc_auc',
    cv=kf
).fit(XTrain, yTrain)
print(knn_gridsearchcv.best_params_)

{'model__n_neighbors': 99, 'model__p': 1, 'model__weights': 'uniform'}


In [76]:
print_scoring_metrics(knn_gridsearchcv)

Training Scores
Accuracy: 0.6123778501628665
Recall: 0.6794171220400729
Precision: 0.47095959595959597
F1: 0.5563012677106637
ROC AUC: 0.6272339159896104
--------------------------------
Test Scores
Accuracy: 0.564453125
Recall: 0.6284153005464481
Precision: 0.42592592592592593
F1: 0.5077262693156733
ROC AUC: 0.5786453402428289


## Naive Bayes

In [9]:
nb_model = imbPipeline(
    [('scaler', StandardScaler()),
     ('nb_model', GaussianNB())]
).fit(XTrain, yTrain)

print_scoring_metrics(nb_model)

Training Scores
Accuracy: 0.4201954397394137
Recall: 0.9799635701275046
Precision: 0.3796753705010586
F1: 0.5473041709053916
ROC AUC: 0.5442414199521904
--------------------------------
Test Scores
Accuracy: 0.419921875
Recall: 0.9781420765027322
Precision: 0.3792372881355932
F1: 0.5465648854961832
ROC AUC: 0.543782284451974


## Gradient Boosting

In [94]:
gb_pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', GradientBoostingClassifier(random_state=42))
])

gb_param_grid = {
    'model__max_depth': range(2, 15),
    'model__n_estimators': [50, 100, 150],
    'model__max_features': ['sqrt', 'log2', None]
}

gb_gridsearchcv = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1
).fit(XTrain, yTrain)
print(gb_gridsearchcv.best_params_)

{'model__max_depth': 2, 'model__max_features': 'log2', 'model__n_estimators': 100}


In [95]:
print_scoring_metrics(gb_gridsearchcv)

Training Scores
Accuracy: 0.6827361563517915
Recall: 0.5774134790528234
Precision: 0.5541958041958042
F1: 0.5655664585191793
ROC AUC: 0.6593963946988255
--------------------------------
Test Scores
Accuracy: 0.615234375
Recall: 0.4644808743169399
Precision: 0.46195652173913043
F1: 0.46321525885558584
ROC AUC: 0.5817845101067982


In [96]:
gb_threshold_tuned = TunedThresholdClassifierCV(
    gb_gridsearchcv,
    cv=kf,
    scoring='roc_auc'
).fit(XTrain, yTrain)

print_scoring_metrics(gb_threshold_tuned)

Training Scores
Accuracy: 0.6201954397394137
Recall: 0.8342440801457195
Precision: 0.48210526315789476
F1: 0.6110740493662442
ROC AUC: 0.6676291394643404
--------------------------------
Test Scores
Accuracy: 0.546875
Recall: 0.7431693989071039
Precision: 0.4236760124610592
F1: 0.5396825396825397
ROC AUC: 0.5904296842559835


In [97]:
print(gb_threshold_tuned.best_threshold_)

0.3933515992692537


## XGBoost

In [38]:
true_scale_pos_weight = (yTrain.shape[0] - np.count_nonzero(yTrain)) / np.count_nonzero(yTrain)
xgb_pipeline = imbPipeline([
    ('model', XGBClassifier(scale_pos_weight = true_scale_pos_weight))
])

In [None]:
xgb_param_grid = {
    'model__n_estimators': [50, 150, 250],
    'model__max_depth': range(2, 7),
    'model__eta': [0.25, 0.5],
    'model__gamma': [0, 5, 20, 100],
    'model__lambda': [0, 5],
    'model__alpha': [0, 5, 20, 100]
}

xgb_gridsearchcv = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    scoring='roc_auc',
    cv=kf,
    n_jobs=-1,
    verbose=1
).fit(XTrain, yTrain)
print(xgb_gridsearchcv.best_params_)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.669 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.660 total time=   0.1s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.697 total time=   0.1s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.669 total time=   0.1s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.639 total time=   0.1s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.656 total time=   0.1s
[CV 1

In [42]:
print_scoring_metrics(xgb_gridsearchcv)

Training Scores
Accuracy: 0.7192182410423453
Recall: 0.7577413479052824
Specificity: 0.6977687626774848
Precision: 0.5826330532212886
F1: 0.6587490102929533
ROC AUC: 0.7277550552913835
--------------------------------
Test Scores
Accuracy: 0.58203125
Recall: 0.5245901639344263
Specificity: 0.6139817629179332
Precision: 0.4304932735426009
F1: 0.4729064039408867
ROC AUC: 0.5692859634261798


In [43]:
xgb_threshold_tuned = TunedThresholdClassifierCV(
    xgb_gridsearchcv,
    cv=kf,
    scoring='roc_auc'
).fit(XTrain, yTrain)

print_scoring_metrics(xgb_threshold_tuned)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.669 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.660 total time=   0.1s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.697 total time=   0.1s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.669 total time=   0.1s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.639 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.660 total time=   0.1s
[CV 2

  _data = np.array(data, dtype=dtype, copy=copy,


[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.678 total time=   0.1s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.706 total time=   0.1s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.642 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.637 total time=   0.1s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.693 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.626 total time=   0.1s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__

  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.680 total time=   0.0s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.689 total time=   0.0s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.689 total time=   0.0s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.672 total time=   0.0s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.626 total time=   0.0s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.687 total time=   0.1s
[CV 2

  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.707 total time=   0.0s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.670 total time=   0.0s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.651 total time=   0.0s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.664 total time=   0.0s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.672 total time=   0.0s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.651 total time=   0.0s
[CV 1

  _data = np.array(data, dtype=dtype, copy=copy,


[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.711 total time=   0.0s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.679 total time=   0.0s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.671 total time=   0.0s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.642 total time=   0.0s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.643 total time=   0.1s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.709 total time=   0.1s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, mod

  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.665 total time=   0.0s
[CV 2/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.605 total time=   0.0s
[CV 3/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.685 total time=   0.0s
[CV 5/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.665 total time=   0.0s
[CV 4/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=50;, score=0.659 total time=   0.0s
[CV 1/5] END model__alpha=0, model__eta=0.25, model__gamma=0, model__lambda=0, model__max_depth=2, model__n_estimators=150;, score=0.630 total time=   0.1s
[CV 2

  _data = np.array(data, dtype=dtype, copy=copy,


In [44]:
print(xgb_threshold_tuned.best_threshold_)

0.5080163


## Nested Cross-Validation with XGBoost

In [16]:
import tqdm.notebook

NUM_TRIALS = 5
scores = np.zeros(NUM_TRIALS)

xgb_nested_param_grid = {
    'model__n_estimators': [150, 250],
    'model__max_depth': range(2, 8, 2)
}

pbar = tqdm.notebook.tqdm(
    total=NUM_TRIALS*len(xgb_nested_param_grid['model__n_estimators'])*len(xgb_nested_param_grid['model__n_estimators']), 
    initial=0,
    unit='fits', 
    unit_divisor=NUM_TRIALS, 
    desc='Nested Gridsearch'
)

for i in range(NUM_TRIALS):
    inner_cv = StratifiedKFold(n_splits=10, shuffle=True)
    outer_cv = StratifiedKFold(n_splits=10, shuffle=True)

    xgb_nested_pipeline = imbPipeline([
        ('model', XGBClassifier())
    ])

    xgb_nested_gridsearchcv = GridSearchCV(
        estimator=xgb_nested_pipeline,
        param_grid=xgb_nested_param_grid,
        cv=inner_cv,
        scoring='roc_auc',
        n_jobs=2
    ).fit(XTrain, yTrain)
    print(f'gridsearch {i} done')
    nested_cv_score = cross_val_score(
        estimator=xgb_nested_gridsearchcv,
        scoring='roc_auc',
        X=XTrain,
        y=yTrain,
        cv=outer_cv,
        n_jobs=2
    )
    print(f'cv {i} done')
    print(xgb_nested_gridsearchcv.best_params_)
    scores[i] = nested_cv_score.mean()
    pbar.update(1)

print(scores)
print(scores.mean())
pbar.close()

Nested Gridsearch:   0%|          | 0/20 [00:00<?, ?fits/s]

gridsearch 0 done
cv 0 done
{'model__max_depth': 2, 'model__n_estimators': 150}
gridsearch 1 done
cv 1 done
{'model__max_depth': 2, 'model__n_estimators': 150}
gridsearch 2 done


KeyboardInterrupt: 

## LOOCV w/ XGBoost

In [17]:
true_scale_pos_weight = (yTrain.shape[0] - np.count_nonzero(yTrain)) / np.count_nonzero(yTrain)
xgb_pipeline = imbPipeline([
    ('model', XGBClassifier(scale_pos_weight = true_scale_pos_weight))
])

In [27]:
loo = LeaveOneOut()

loocv_param_grid = {
    'model__max_depth': range(2, 10, 3),
    'model__n_estimators': [100, 200],
    'model__lambda': [5, 50, 500]
}

loocv_gridsearch = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=loocv_param_grid,
    cv=loo,
    n_jobs=-1,
    verbose=1
).fit(XTrain, yTrain)

print(loocv_gridsearch.best_params_)

print_scoring_metrics(loocv_gridsearch)

Fitting 1535 folds for each of 18 candidates, totalling 27630 fits


KeyboardInterrupt: 

In [None]:
true_scale_pos_weight = (yTrain.shape[0] - np.count_nonzero(yTrain)) / np.count_nonzero(yTrain)
xgb_pipeline = imbPipeline([
    ('model', XGBClassifier(scale_pos_weight = true_scale_pos_weight))
])

In [31]:
from sklearn.ensemble import BaggingClassifier

xgb_bagging = BaggingClassifier(
    estimator=xgb_pipeline,
    n_estimators=50,
    random_state=42,
    n_jobs=3
).fit(XTrain, yTrain)

print_scoring_metrics(xgb_bagging)

Training Scores
Accuracy: 0.9882736156351791
Recall: 0.9872495446265938
Specificity: 0.9888438133874239
Precision: 0.9801084990958409
F1: 0.9836660617059891
ROC AUC: 0.9880466790070088
--------------------------------
Test Scores
Accuracy: 0.62890625
Recall: 0.39344262295081966
Specificity: 0.7598784194528876
Precision: 0.4768211920529801
F1: 0.4311377245508982
ROC AUC: 0.5766605212018536
