In [43]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

In [32]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [33]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [34]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)
print(cleanedDataOneHotEncoded.shape)

(2047, 48)


In [35]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']
XTrain, XTest, yTrain, yTest = train_test_split(X, y, stratify=y, random_state=42)

In [36]:
def print_scoring_statistics(yTrue, yPred):
    print(metrics.balanced_accuracy_score(yTrue, yPred))
    print(metrics.f1_score(yTrue, yPred))
    print(metrics.precision_score(yTrue, yPred))
    print(metrics.recall_score(yTrue, yPred))

In [41]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Decision Tree

In [51]:
decision_tree_pipeline = imbPipeline([
    #('smote', SMOTE(random_state=42)),
    ('model', DecisionTreeClassifier(random_state=42))
])

In [52]:
decision_tree_param_grid = {
    'model__max_depth': [i for i in range(1, 15)],
    'model__min_samples_split': [i for i in range(2, 50)],
    'model__criterion': ['gini', 'entropy']
}
decision_tree_gscv = GridSearchCV(estimator=decision_tree_pipeline, param_grid=decision_tree_param_grid, scoring='roc_auc', cv=kf)
decision_tree_gscv.fit(XTrain, yTrain)
print(decision_tree_gscv.best_params_)
print(decision_tree_gscv.best_estimator_)
print(decision_tree_gscv.best_score_)

{'model__criterion': 'gini', 'model__max_depth': 3, 'model__min_samples_split': 12}
Pipeline(steps=[('model',
                 DecisionTreeClassifier(max_depth=3, min_samples_split=12,
                                        random_state=42))])
0.6391323423778135


In [53]:
decision_tree_gscv_test_score = decision_tree_gscv.score(XTest, yTest)
print(decision_tree_gscv_test_score)

0.6124537013968476


In [19]:
decision_tree_grid_search_params = DecisionTreeClassifier(
    random_state=42, 
    criterion=decision_tree_gscv.best_params_['criterion'], 
    min_samples_split=decision_tree_gscv.best_params_['min_samples_split'],
    max_depth=decision_tree_gscv.best_params_['max_depth']
)

In [18]:
decision_tree_train_preds = decision_tree_grid_search_params.fit(XTrain, yTrain).predict(XTrain)
print_scoring_statistics(yTrain, decision_tree_train_preds)
print(confusion_matrix(yTrain, decision_tree_train_preds))
print(classification_report(yTrain, decision_tree_train_preds))

decision_tree_grid_search_params.predict(XTest)
print_scoring_statistics(yTest, decision_tree_grid_search_params.predict(XTest))
print(classification_report(yTest, decision_tree_grid_search_params.predict(XTest)))

0.8197378438998293
0.8200202224469161
0.8045634920634921
0.8360824742268042
[[805 197]
 [159 811]]
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      1002
           1       0.80      0.84      0.82       970

    accuracy                           0.82      1972
   macro avg       0.82      0.82      0.82      1972
weighted avg       0.82      0.82      0.82      1972

0.6940269481872483
0.7074235807860262
0.7105263157894737
0.7043478260869566
              precision    recall  f1-score   support

           0       0.68      0.68      0.68       313
           1       0.71      0.70      0.71       345

    accuracy                           0.69       658
   macro avg       0.69      0.69      0.69       658
weighted avg       0.69      0.69      0.69       658



In [13]:
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='balanced_accuracy').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='f1').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='precision').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='recall').mean())

0.6391278673648076
0.6400311259501071
0.630084751013117
0.6525961538461538


In [10]:
decision_tree_threshold_tuned = TunedThresholdClassifierCV(decision_tree_pipeline, 
                                                           cv=15, 
                                                           scoring='recall_macro', 
                                                           random_state=42
                                                        ).fit(XTrain, yTrain)
print_scoring_statistics(yTest, decision_tree_threshold_tuned.predict(XTest))
print(confusion_matrix(yTest, decision_tree_threshold_tuned.predict(XTest)))

0.6005208333333333
0.5588822355289421
0.45307443365695793
0.7291666666666666
[[151 169]
 [ 52 140]]


## Random Forest

In [11]:
random_forest_pipeline = Pipeline([
    ('model', RandomForestClassifier(class_weight='balanced', random_state=42, max_depth=11))
])

In [12]:
random_forest_train_preds = random_forest_pipeline.fit(XTrain, yTrain).predict(XTrain)
print_scoring_statistics(yTrain, random_forest_train_preds)
print(confusion_matrix(yTrain, random_forest_train_preds))
print(classification_report(yTrain, random_forest_train_preds))

0.9371812767541411
0.9052997393570807
0.8527004909983633
0.9648148148148148
[[905  90]
 [ 19 521]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       995
           1       0.85      0.96      0.91       540

    accuracy                           0.93      1535
   macro avg       0.92      0.94      0.92      1535
weighted avg       0.93      0.93      0.93      1535



In [13]:
print(cross_val_score(random_forest_pipeline, XTrain, yTrain, cv=15, scoring='balanced_accuracy').mean())
print(cross_val_score(random_forest_pipeline, XTrain, yTrain, cv=15, scoring='f1').mean())
print(cross_val_score(random_forest_pipeline, XTrain, yTrain, cv=15, scoring='precision').mean())
print(cross_val_score(random_forest_pipeline, XTrain, yTrain, cv=15, scoring='recall').mean())

0.6316850092969497
0.5237586288059768
0.5228178301947463
0.5296296296296296


In [14]:
for scoring_str in ['balanced_accuracy', 'f1', 'precision', 'recall']:
    random_forest_threshold_tuned = TunedThresholdClassifierCV(random_forest_pipeline, 
                                                            cv=15, 
                                                            scoring=scoring_str, 
                                                            random_state=42
                                                            ).fit(XTrain, yTrain)
    if scoring_str == 'balanced_accuracy':
        print(metrics.balanced_accuracy_score(yTest, random_forest_threshold_tuned.predict(XTest)))
    elif scoring_str == 'f1':
        print(metrics.f1_score(yTest, random_forest_threshold_tuned.predict(XTest)))
    elif scoring_str == 'precision':
        print(metrics.precision_score(yTest, random_forest_threshold_tuned.predict(XTest)))
    elif scoring_str == 'recall':
        print(metrics.recall_score(yTest, random_forest_threshold_tuned.predict(XTest)))

0.6515625
0.5703125
0.4444444444444444
1.0


## KNN

In [None]:
# for i in range(6, 17):
#     knn_pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', KNeighborsClassifier(weights='uniform'))
#     ]).fit(XTrain, yTrain)
#     print(i)
#     print('Training:')
#     knn_training_preds = knn_pipeline.predict(XTrain)
#     print_scoring_statistics(yTrain, knn_training_preds)
#     print('---------------')
#     print('Test:')
#     knn_test_preds = knn_pipeline.predict(XTest)
#     print_scoring_statistics(yTest, knn_test_preds)
#     print('Cross Validated:')

In [64]:
knn_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier(weights='uniform'))
])

In [66]:
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn_pipeline['model'], param_grid, cv=5)
knn_gscv.fit(X, y)
best_n = knn_gscv.best_params_['n_neighbors']
print(knn_gscv.best_params_)
print(knn_gscv.best_estimator_)
print(knn_gscv.best_score_)

{'n_neighbors': np.int64(16)}
KNeighborsClassifier(n_neighbors=np.int64(16))
0.6150647027252669


In [68]:
knn_split_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier(n_neighbors=best_n, weights='uniform'))
]).fit(XTrain, yTrain)