In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint

In [3]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [4]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [5]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)
print(cleanedDataOneHotEncoded.shape)

(2047, 48)


In [6]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']

XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=42)

In [6]:
def print_scoring_statistics(yTrue, yPred):
    print(metrics.balanced_accuracy_score(yTrue, yPred))
    print(metrics.f1_score(yTrue, yPred))
    print(metrics.precision_score(yTrue, yPred))
    print(metrics.recall_score(yTrue, yPred))

## Decision Tree

In [7]:
decision_tree_pipeline = Pipeline([
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42, max_depth=8))
])

In [8]:
decision_tree_train_preds = decision_tree_pipeline.fit(XTrain, yTrain).predict(XTrain)
print_scoring_statistics(yTrain, decision_tree_train_preds)
print(confusion_matrix(yTrain, decision_tree_train_preds))
print(classification_report(yTrain, decision_tree_train_preds))

0.7439186674111298
0.6701479547432551
0.632183908045977
0.7129629629629629
[[771 224]
 [155 385]]
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       995
           1       0.63      0.71      0.67       540

    accuracy                           0.75      1535
   macro avg       0.73      0.74      0.74      1535
weighted avg       0.76      0.75      0.76      1535



In [9]:
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='balanced_accuracy').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='f1').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='precision').mean())
print(cross_val_score(decision_tree_pipeline, XTrain, yTrain, cv=15, scoring='recall').mean())

0.6050819136640032
0.5152978379531878
0.45757981938817177
0.5981481481481482


In [10]:
decision_tree_threshold_tuned = TunedThresholdClassifierCV(decision_tree_pipeline, 
                                                           cv=15, 
                                                           scoring='recall_macro', 
                                                           random_state=42
                                                        ).fit(XTrain, yTrain)
print_scoring_statistics(yTest, decision_tree_threshold_tuned.predict(XTest))
print(confusion_matrix(yTest, decision_tree_threshold_tuned.predict(XTest)))

0.6005208333333333
0.5588822355289421
0.45307443365695793
0.7291666666666666
[[151 169]
 [ 52 140]]


## Random Forest

In [25]:
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 10),
    'class_weight': ['balanced', None],
    'n_estimators': [50, 100, 150]
}
rf_gridsearchcv = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, scoring='recall').fit(XTrain, yTrain)
print(rf_gridsearchcv.best_params_)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 4, 'n_estimators': 50}


In [26]:
rf_cv_model = RandomForestClassifier(
    random_state=42, 
    class_weight=rf_gridsearchcv.best_params_['class_weight'], 
    criterion=rf_gridsearchcv.best_params_['criterion'], 
    max_depth=rf_gridsearchcv.best_params_['max_depth'], 
    n_estimators=rf_gridsearchcv.best_params_['n_estimators']
)

print('Training Classification Report')
print(classification_report(yTrain, rf_cv_model.fit(XTrain, yTrain).predict(XTrain)))
print('------------------------------------------------------')
print('Test Classification Report')
print(classification_report(yTest, rf_cv_model.fit(XTrain, yTrain).predict(XTest)))

Training Classification Report
              precision    recall  f1-score   support

           0       0.81      0.62      0.70       995
           1       0.51      0.73      0.60       540

    accuracy                           0.66      1535
   macro avg       0.66      0.67      0.65      1535
weighted avg       0.70      0.66      0.66      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.77      0.55      0.64       320
           1       0.49      0.72      0.59       192

    accuracy                           0.62       512
   macro avg       0.63      0.64      0.62       512
weighted avg       0.67      0.62      0.62       512



## KNN

In [None]:
# for i in range(6, 17):
#     knn_pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', KNeighborsClassifier(weights='uniform'))
#     ]).fit(XTrain, yTrain)
#     print(i)
#     print('Training:')
#     knn_training_preds = knn_pipeline.predict(XTrain)
#     print_scoring_statistics(yTrain, knn_training_preds)
#     print('---------------')
#     print('Test:')
#     knn_test_preds = knn_pipeline.predict(XTest)
#     print_scoring_statistics(yTest, knn_test_preds)
#     print('Cross Validated:')

In [64]:
knn_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier(weights='uniform'))
])

In [66]:
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn_pipeline['model'], param_grid, cv=5)
knn_gscv.fit(X, y)
best_n = knn_gscv.best_params_['n_neighbors']
print(knn_gscv.best_params_)
print(knn_gscv.best_estimator_)
print(knn_gscv.best_score_)

{'n_neighbors': np.int64(16)}
KNeighborsClassifier(n_neighbors=np.int64(16))
0.6150647027252669


In [68]:
knn_split_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier(n_neighbors=best_n, weights='uniform'))
]).fit(XTrain, yTrain)

In [45]:
knn_model = KNeighborsClassifier()
knn_param_grid = {
    'n_neighbors': range(5, 25),
    'weights': ['uniform'],
    'p': [1, 2]
}
knn_gridsearchcv = GridSearchCV(estimator=knn_model, param_grid=knn_param_grid, scoring='recall').fit(XTrain, yTrain)
print(knn_gridsearchcv.best_params_)

{'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [46]:
knn_cv_model = KNeighborsClassifier(
    n_neighbors=knn_gridsearchcv.best_params_['n_neighbors'],
    weights=knn_gridsearchcv.best_params_['weights'],
    p=knn_gridsearchcv.best_params_['p']
).fit(XTrain, yTrain)

print('Training Classification Report')
print(classification_report(yTrain, knn_cv_model.predict(XTrain)))
print('------------------------------------------------------')
print('Test Classification Report')
print(classification_report(yTest, knn_cv_model.predict(XTest)))

Training Classification Report
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       995
           1       0.67      0.51      0.58       540

    accuracy                           0.74      1535
   macro avg       0.72      0.69      0.70      1535
weighted avg       0.73      0.74      0.73      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.66      0.76      0.71       320
           1       0.47      0.35      0.40       192

    accuracy                           0.61       512
   macro avg       0.57      0.56      0.56       512
weighted avg       0.59      0.61      0.60       512



## Naive Bayes

In [7]:
nb_model = Pipeline(
    [('scaler', StandardScaler()),
     ('nb_model', GaussianNB())]
).fit(XTrain, yTrain)

print('Training Classification Report')
print(classification_report(yTrain, nb_model.predict(XTrain)))
print('------------------------------------------------------')
print('Test Classification Report')
print(classification_report(yTest, nb_model.predict(XTest)))

Training Classification Report
              precision    recall  f1-score   support

           0       0.97      0.07      0.12       995
           1       0.37      1.00      0.54       540

    accuracy                           0.39      1535
   macro avg       0.67      0.53      0.33      1535
weighted avg       0.76      0.39      0.27      1535

------------------------------------------------------
Test Classification Report
              precision    recall  f1-score   support

           0       0.84      0.05      0.09       320
           1       0.38      0.98      0.55       192

    accuracy                           0.40       512
   macro avg       0.61      0.52      0.32       512
weighted avg       0.67      0.40      0.27       512

