In [93]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from xgboost import XGBClassifier

In [32]:
# Load the pre-cleaned dataset; the path is relative and points one directory up.
cleaned_data_path = '../Fully-Cleaned-Data.csv'
cleanedData = pd.read_csv(cleaned_data_path)

In [33]:
# Separate object-dtype (categorical) columns from the remaining columns,
# then one-hot encode the categorical part on its own.
categoricalColumns = list(cleanedData.select_dtypes(include='object').columns)
numericalData = cleanedData.drop(categoricalColumns, axis=1)
oneHotData = pd.get_dummies(cleanedData.loc[:, categoricalColumns])

In [34]:
# Put the non-categorical columns and the one-hot columns side by side.
encoded_parts = [numericalData, oneHotData]
cleanedDataOneHotEncoded = pd.concat(encoded_parts, axis='columns')
print(cleanedDataOneHotEncoded.shape)

(2047, 48)


In [35]:
# Separate the target column from the features, then hold out a test set
# stratified on the target with a fixed seed.
targetColumn = 'Future Relapse Binary'
y = cleanedDataOneHotEncoded[targetColumn]
X = cleanedDataOneHotEncoded.drop(columns=[targetColumn])
# No test_size is passed, so train_test_split applies its default proportion.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=42, stratify=y)

In [36]:
def print_scoring_statistics(yTrue, yPred):
    """Print four classification scores for the given labels, one per line.

    The values are printed in this order: balanced accuracy, F1, precision,
    recall. Each is computed with the corresponding ``sklearn.metrics``
    function using its default arguments.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels.
    yPred : array-like
        Predicted labels, aligned with ``yTrue``.
    """
    score_functions = (
        metrics.balanced_accuracy_score,
        metrics.f1_score,
        metrics.precision_score,
        metrics.recall_score,
    )
    for score_function in score_functions:
        print(score_function(yTrue, yPred))

In [41]:
# Shared 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Decision Tree

In [51]:
# Base decision tree: default hyper-parameters, fixed seed.
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
)

In [67]:
# Grid search over the decision tree's depth, split size and criterion,
# scored by ROC AUC on the shared stratified folds (`kf`).
decision_tree_param_grid = dict(
    max_depth=range(1, 15),
    # NOTE(review): this tuple tries exactly two candidates, 2 and 50.
    # If a full sweep was intended it should be range(2, 50) — confirm.
    min_samples_split=(2, 50),
    criterion=['gini', 'entropy'],
)
decision_tree_gscv = GridSearchCV(
    decision_tree_model,
    decision_tree_param_grid,
    scoring='roc_auc',
    cv=kf,
)
decision_tree_gscv.fit(XTrain, yTrain)
print(decision_tree_gscv.best_params_)

{'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 50}


In [71]:
# Rebuild a decision tree from the winning grid-search parameters
# (criterion, max_depth, min_samples_split) and fit it on the training split.
decision_tree_grid_search_params = DecisionTreeClassifier(
    random_state=42,
    **decision_tree_gscv.best_params_,
)
decision_tree_grid_search_params.fit(XTrain, yTrain)

In [72]:
# Compare the tuned tree on the training split and the held-out test split.
decision_tree_train_preds = decision_tree_grid_search_params.predict(XTrain)
decision_tree_test_preds = decision_tree_grid_search_params.predict(XTest)

train_report = classification_report(yTrain, decision_tree_train_preds)
test_report = classification_report(yTest, decision_tree_test_preds)
print(train_report)
print(test_report)

              precision    recall  f1-score   support

           0       0.75      0.71      0.73       986
           1       0.52      0.56      0.54       549

    accuracy                           0.66      1535
   macro avg       0.63      0.64      0.63      1535
weighted avg       0.66      0.66      0.66      1535

              precision    recall  f1-score   support

           0       0.70      0.68      0.69       329
           1       0.46      0.48      0.47       183

    accuracy                           0.61       512
   macro avg       0.58      0.58      0.58       512
weighted avg       0.61      0.61      0.61       512



In [13]:
# 15-fold cross-validation of the base decision tree on the training split.
# A single cross_validate call scores every metric on the same fitted folds,
# instead of refitting all 15 folds once per metric. The splits and the
# estimator are deterministic, so the printed means are unchanged.
decision_tree_cv_metrics = ['balanced_accuracy', 'f1', 'precision', 'recall']
decision_tree_cv_results = cross_validate(
    decision_tree_model,
    XTrain,
    yTrain,
    cv=15,
    scoring=decision_tree_cv_metrics,
)
# cross_validate stores each metric's per-fold scores under 'test_<metric>'.
for metric_name in decision_tree_cv_metrics:
    print(decision_tree_cv_results[f'test_{metric_name}'].mean())

0.6391278673648076
0.6400311259501071
0.630084751013117
0.6525961538461538


In [10]:
# Wrap the base decision tree so its decision threshold is chosen by
# 15-fold cross-validation with macro-averaged recall as the objective.
decision_tree_threshold_tuned = TunedThresholdClassifierCV(
    decision_tree_model,
    cv=15,
    scoring='recall_macro',
    random_state=42,
)
decision_tree_threshold_tuned.fit(XTrain, yTrain)

# Predict once on the test split and reuse the labels for both summaries.
decision_tree_tuned_test_preds = decision_tree_threshold_tuned.predict(XTest)
print_scoring_statistics(yTest, decision_tree_tuned_test_preds)
print(confusion_matrix(yTest, decision_tree_tuned_test_preds))

0.6005208333333333
0.5588822355289421
0.45307443365695793
0.7291666666666666
[[151 169]
 [ 52 140]]


## Random Forest

In [11]:
# Single-step pipeline around a class-weighted random forest capped at depth 11.
random_forest_estimator = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    max_depth=11,
)
random_forest_pipeline = Pipeline(steps=[('model', random_forest_estimator)])

In [12]:
# Fit the random forest pipeline, then summarise how it does on its own training data.
random_forest_pipeline.fit(XTrain, yTrain)
random_forest_train_preds = random_forest_pipeline.predict(XTrain)

print_scoring_statistics(yTrain, random_forest_train_preds)
for train_summary in (confusion_matrix, classification_report):
    print(train_summary(yTrain, random_forest_train_preds))

0.9371812767541411
0.9052997393570807
0.8527004909983633
0.9648148148148148
[[905  90]
 [ 19 521]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       995
           1       0.85      0.96      0.91       540

    accuracy                           0.93      1535
   macro avg       0.92      0.94      0.92      1535
weighted avg       0.93      0.93      0.93      1535



In [13]:
# 15-fold cross-validation of the random forest pipeline on the training split.
# A single cross_validate call scores every metric on the same fitted folds,
# instead of refitting the forest on all 15 folds once per metric. The splits
# and the seeded estimator are deterministic, so the printed means are unchanged.
random_forest_cv_metrics = ['balanced_accuracy', 'f1', 'precision', 'recall']
random_forest_cv_results = cross_validate(
    random_forest_pipeline,
    XTrain,
    yTrain,
    cv=15,
    scoring=random_forest_cv_metrics,
)
# cross_validate stores each metric's per-fold scores under 'test_<metric>'.
for metric_name in random_forest_cv_metrics:
    print(random_forest_cv_results[f'test_{metric_name}'].mean())

0.6316850092969497
0.5237586288059768
0.5228178301947463
0.5296296296296296


In [14]:
# For each objective, tune the random forest's decision threshold with 15-fold CV
# and print that same metric on the test split.
# NOTE(review): optimising precision or recall alone can pick a degenerate
# threshold (e.g. predicting every sample positive maximises recall) — interpret with care.
test_metric_by_scoring = {
    'balanced_accuracy': metrics.balanced_accuracy_score,
    'f1': metrics.f1_score,
    'precision': metrics.precision_score,
    'recall': metrics.recall_score,
}
for scoring_str, test_metric in test_metric_by_scoring.items():
    random_forest_threshold_tuned = TunedThresholdClassifierCV(
        random_forest_pipeline,
        cv=15,
        scoring=scoring_str,
        random_state=42,
    ).fit(XTrain, yTrain)
    print(test_metric(yTest, random_forest_threshold_tuned.predict(XTest)))

0.6515625
0.5703125
0.4444444444444444
1.0


In [80]:
# Grid search over the random forest's criterion, depth, class weighting and size,
# scored by ROC AUC.
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 10),
    class_weight=['balanced', None],
    n_estimators=[50, 100, 150, 200, 250],
)

# NOTE(review): no cv argument is passed, so GridSearchCV's default splitter is
# used rather than the shared `kf` folds — confirm this is intended.
rf_gridsearchcv = GridSearchCV(rf_model, rf_param_grid, scoring='roc_auc')
rf_gridsearchcv.fit(XTrain, yTrain)
print(rf_gridsearchcv.best_params_)

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 50}


In [81]:
# Rebuild a random forest from the winning grid-search parameters
# (class_weight, criterion, max_depth, n_estimators).
rf_cv_model = RandomForestClassifier(
    random_state=42,
    **rf_gridsearchcv.best_params_,
)
# Fit once: the same seeded model on the same data gives the same forest, so
# refitting before each report only repeated the training work.
rf_cv_model.fit(XTrain, yTrain)

print('Training Classification Report')
print(classification_report(yTrain, rf_cv_model.predict(XTrain)))
print('------------------------------------------------------')
print('Test Classification Report')
print(classification_report(yTest, rf_cv_model.predict(XTest)))

Training Classification Report
              precision    recall  f1-score   support

           0       0.84      0.69      0.76       986
           1       0.58      0.76      0.66       549

    accuracy                           0.72      1535
   macro avg       0.71      0.73      0.71      1535
weighted avg       0.75      0.72      0.72      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       329
           1       0.46      0.58      0.51       183

    accuracy                           0.61       512
   macro avg       0.59      0.60      0.59       512
weighted avg       0.63      0.61      0.61       512



## KNN

In [74]:
# Grid search over the KNN neighbour count and Minkowski power, scored by recall.
# NOTE(review): the features are passed to this distance-based model without any
# scaling step in this notebook — confirm the CSV is already scaled.
# NOTE(review): this search optimises 'recall' while the other grid searches use
# 'roc_auc', and it does not use the shared `kf` folds — confirm intended.
knn_model = KNeighborsClassifier()
knn_param_grid = dict(
    n_neighbors=range(5, 25),
    weights=['uniform'],
    p=[1, 2],
)

knn_gridsearchcv = GridSearchCV(knn_model, knn_param_grid, scoring='recall')
knn_gridsearchcv.fit(XTrain, yTrain)
print(knn_gridsearchcv.best_params_)

{'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [75]:
# Rebuild a KNN classifier from the winning grid-search parameters
# (n_neighbors, p, weights) and fit it on the training split.
knn_cv_model = KNeighborsClassifier(**knn_gridsearchcv.best_params_)
knn_cv_model.fit(XTrain, yTrain)

knn_train_report = classification_report(yTrain, knn_cv_model.predict(XTrain))
knn_test_report = classification_report(yTest, knn_cv_model.predict(XTest))

print('Training Classification Report')
print(knn_train_report)
print('------------------------------------------------------')
print('Test Classification Report')
print(knn_test_report)

Training Classification Report
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       986
           1       0.71      0.51      0.59       549

    accuracy                           0.75      1535
   macro avg       0.74      0.70      0.71      1535
weighted avg       0.75      0.75      0.74      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.66      0.76      0.71       329
           1       0.41      0.30      0.34       183

    accuracy                           0.59       512
   macro avg       0.53      0.53      0.52       512
weighted avg       0.57      0.59      0.58       512



## Naive Bayes

In [73]:
# Gaussian naive Bayes preceded by standardisation, fitted on the training split.
nb_steps = [
    ('scaler', StandardScaler()),
    ('nb_model', GaussianNB()),
]
nb_model = Pipeline(nb_steps)
nb_model.fit(XTrain, yTrain)

nb_train_report = classification_report(yTrain, nb_model.predict(XTrain))
nb_test_report = classification_report(yTest, nb_model.predict(XTest))

print('Training Classification Report')
print(nb_train_report)
print('------------------------------------------------------')
print('Test Classification Report')
print(nb_test_report)

Training Classification Report
              precision    recall  f1-score   support

           0       0.91      0.11      0.19       986
           1       0.38      0.98      0.55       549

    accuracy                           0.42      1535
   macro avg       0.64      0.54      0.37      1535
weighted avg       0.72      0.42      0.32      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.90      0.11      0.20       329
           1       0.38      0.98      0.55       183

    accuracy                           0.42       512
   macro avg       0.64      0.54      0.37       512
weighted avg       0.71      0.42      0.32       512



## Gradient Boosting

In [84]:
# Grid search over gradient boosting depth and number of stages, scored by ROC AUC.
gb_model = GradientBoostingClassifier(random_state=42)
gb_param_grid = dict(
    max_depth=range(2, 15),
    n_estimators=[50, 150, 250, 350],
)

# NOTE(review): no cv argument is passed, so GridSearchCV's default splitter is
# used rather than the shared `kf` folds — confirm this is intended.
gb_gridsearchcv = GridSearchCV(gb_model, gb_param_grid, scoring='roc_auc')
gb_gridsearchcv.fit(XTrain, yTrain)
print(gb_gridsearchcv.best_params_)

{'max_depth': 3, 'n_estimators': 50}


In [85]:
# Rebuild a gradient boosting classifier from the winning grid-search
# parameters (max_depth, n_estimators).
gb_cv_model = GradientBoostingClassifier(
    random_state=42,
    **gb_gridsearchcv.best_params_,
)
# Fit once: the same seeded model on the same data gives the same ensemble, so
# refitting before each report only repeated the training work.
gb_cv_model.fit(XTrain, yTrain)

print('Training Classification Report')
print(classification_report(yTrain, gb_cv_model.predict(XTrain)))
print('------------------------------------------------------')
print('Test Classification Report')
print(classification_report(yTest, gb_cv_model.predict(XTest)))

Training Classification Report
              precision    recall  f1-score   support

           0       0.73      0.90      0.80       986
           1       0.69      0.40      0.50       549

    accuracy                           0.72      1535
   macro avg       0.71      0.65      0.65      1535
weighted avg       0.71      0.72      0.70      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.69      0.84      0.76       329
           1       0.54      0.33      0.41       183

    accuracy                           0.66       512
   macro avg       0.62      0.59      0.59       512
weighted avg       0.64      0.66      0.64       512



In [87]:
# Tune the decision threshold of the selected gradient boosting model on the
# shared stratified folds.
# The threshold search scores hard class labels. On hard labels ROC AUC reduces
# to (TPR + TNR) / 2, which is balanced accuracy, so the objective is named
# explicitly instead of passing the ranking metric 'roc_auc'.
gb_threshold_tuned = TunedThresholdClassifierCV(
    gb_cv_model,
    cv=kf,
    scoring='balanced_accuracy',
)
gb_threshold_tuned.fit(XTrain, yTrain)

print(classification_report(yTest, gb_threshold_tuned.predict(XTest)))
# Threshold on the positive-class score selected by the cross-validated search.
print(gb_threshold_tuned.best_threshold_)

              precision    recall  f1-score   support

           0       0.72      0.64      0.68       329
           1       0.46      0.55      0.50       183

    accuracy                           0.61       512
   macro avg       0.59      0.60      0.59       512
weighted avg       0.63      0.61      0.62       512

0.38243074254402926


## XGBoost

In [94]:
# Grid search over XGBoost size, depth, growth policy and base score, scored by ROC AUC.
# NOTE(review): unlike the other models, this estimator is created without a
# random_state, and the search does not use the shared `kf` folds — confirm intended.
xgb_model = XGBClassifier()
xgb_param_grid = dict(
    n_estimators=[50, 150, 250],
    max_depth=range(2, 15),
    grow_policy=['depthwise', 'lossguide'],
    base_score=[0.25, 0.35, 0.45, 0.5, 0.55, 0.65, 0.75],
)

xgb_gridsearchcv = GridSearchCV(xgb_model, xgb_param_grid, scoring='roc_auc')
xgb_gridsearchcv.fit(XTrain, yTrain)
print(xgb_gridsearchcv.best_params_)

{'base_score': 0.55, 'grow_policy': 'depthwise', 'max_depth': 3, 'n_estimators': 50}


  _data = np.array(data, dtype=dtype, copy=copy,


In [96]:
# Rebuild an XGBoost classifier from the winning grid-search parameters
# (base_score, grow_policy, max_depth, n_estimators) and fit it on the training split.
xgb_cv_model = XGBClassifier(**xgb_gridsearchcv.best_params_)
xgb_cv_model.fit(XTrain, yTrain)

xgb_train_report = classification_report(yTrain, xgb_cv_model.predict(XTrain))
xgb_test_report = classification_report(yTest, xgb_cv_model.predict(XTest))

print('Training Classification Report')
print(xgb_train_report)
print('------------------------------------------------------')
print('Test Classification Report')
print(xgb_test_report)

Training Classification Report
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       986
           1       0.79      0.52      0.63       549

    accuracy                           0.78      1535
   macro avg       0.78      0.72      0.74      1535
weighted avg       0.78      0.78      0.77      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.68      0.84      0.75       329
           1       0.50      0.30      0.38       183

    accuracy                           0.64       512
   macro avg       0.59      0.57      0.56       512
weighted avg       0.62      0.64      0.62       512

