In [63]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, TunedThresholdClassifierCV
from pprint import pprint

In [3]:
# Read the CSV into a DataFrame; the path is resolved relative to the
# notebook's working directory.
cleanedData = pd.read_csv(filepath_or_buffer='../Fully-Cleaned-Data.csv')

In [4]:
# Split the frame by dtype: object-typed columns are treated as categorical
# and one-hot encoded, every other column is carried over unchanged.
categoricalColumns = list(cleanedData.select_dtypes(include=['object']).columns)
numericalData = cleanedData.drop(categoricalColumns, axis=1)
oneHotData = pd.get_dummies(data=cleanedData.loc[:, categoricalColumns])

In [5]:
# Place the untouched columns and the one-hot columns side by side, then
# report the (rows, columns) of the combined frame.
cleanedDataOneHotEncoded = pd.concat(
    objs=[numericalData, oneHotData],
    axis='columns',
)
print(cleanedDataOneHotEncoded.shape)

(2047, 48)


In [8]:
# Separate the target column from the predictor columns.
y = cleanedDataOneHotEncoded.loc[:, 'Future Relapse Binary']
X = cleanedDataOneHotEncoded.drop(columns=['Future Relapse Binary'])

# Hold out a quarter of the rows for testing (0.25 is scikit-learn's default
# when neither size is given; it is spelled out here for the reader).
# NOTE(review): the split is not stratified on y - consider whether
# `stratify=y` is wanted given the class imbalance seen in the reports below.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [60]:
def print_scoring_statistics(yTrue, yPred):
    """Print four classification scores for a set of predictions.

    The scores are printed one per line, unlabeled, in this order:
    balanced accuracy, F1, precision, recall. Each is computed by the
    corresponding ``sklearn.metrics`` function called with its default
    arguments.

    Args:
        yTrue: Ground-truth labels.
        yPred: Predicted labels, aligned with ``yTrue``.

    Returns:
        None. The scores are written to standard output only.
    """
    scorers = (
        metrics.balanced_accuracy_score,
        metrics.f1_score,
        metrics.precision_score,
        metrics.recall_score,
    )
    # Compute and print one score at a time so the output order is fixed.
    for scorer in scorers:
        print(scorer(yTrue, yPred))

## Decision Tree

In [49]:
# Single-step pipeline wrapping a class-weighted decision tree whose depth is
# capped at 8; the seed is fixed so repeated fits give the same tree.
decision_tree_pipeline = Pipeline(steps=[
    (
        'model',
        DecisionTreeClassifier(
            max_depth=8,
            class_weight='balanced',
            random_state=42,
        ),
    ),
])

In [61]:
# Fit on the training split, then predict that same split: these numbers
# describe how well the tree fits the data it was trained on.
decision_tree_pipeline.fit(XTrain, yTrain)
decision_tree_train_preds = decision_tree_pipeline.predict(XTrain)

print_scoring_statistics(yTrain, decision_tree_train_preds)
print(confusion_matrix(yTrain, decision_tree_train_preds))
print(classification_report(yTrain, decision_tree_train_preds))

0.7439186674111298
0.6701479547432551
0.632183908045977
0.7129629629629629
[[771 224]
 [155 385]]
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       995
           1       0.63      0.71      0.67       540

    accuracy                           0.75      1535
   macro avg       0.73      0.74      0.74      1535
weighted avg       0.76      0.75      0.76      1535



In [64]:
# Run the 15-fold cross-validation once and score every fold with all four
# metrics, rather than refitting the pipeline on the same folds once per
# metric. The printed means keep the original order:
# balanced accuracy, F1, precision, recall.
decision_tree_cv_metrics = ['balanced_accuracy', 'f1', 'precision', 'recall']
decision_tree_cv_results = cross_validate(
    decision_tree_pipeline,
    XTrain,
    yTrain,
    cv=15,
    scoring=decision_tree_cv_metrics,
)
for metric_name in decision_tree_cv_metrics:
    # cross_validate stores per-fold test scores under 'test_<metric>'.
    print(decision_tree_cv_results[f'test_{metric_name}'].mean())

0.6050819136640032
0.5152978379531878
0.45757981938817177
0.5981481481481482


In [None]:
# Wrap the decision tree pipeline in a threshold tuner that is fitted on the
# training split with 15-fold cross-validation, using macro-averaged recall
# as the objective.
decision_tree_threshold_tuned = TunedThresholdClassifierCV(
    decision_tree_pipeline,
    cv=15,
    scoring='recall_macro',
    random_state=42,
).fit(XTrain, yTrain)

# Predict the held-out test split once and reuse the result for both the
# score summary and the confusion matrix.
decision_tree_test_preds = decision_tree_threshold_tuned.predict(XTest)
print_scoring_statistics(yTest, decision_tree_test_preds)
print(confusion_matrix(yTest, decision_tree_test_preds))

0.6005208333333333
0.5588822355289421
0.45307443365695793
0.7291666666666666
[[151 169]
 [ 52 140]]


## Random Forest

In [99]:
# Single-step pipeline wrapping a class-weighted random forest whose trees
# are capped at depth 11; the seed is fixed for repeatable fits.
random_forest_pipeline = Pipeline(steps=[
    (
        'model',
        RandomForestClassifier(
            max_depth=11,
            class_weight='balanced',
            random_state=42,
        ),
    ),
])

In [100]:
# Fit on the training split, then predict that same split: these numbers
# describe how well the forest fits the data it was trained on.
random_forest_pipeline.fit(XTrain, yTrain)
random_forest_train_preds = random_forest_pipeline.predict(XTrain)

print_scoring_statistics(yTrain, random_forest_train_preds)
print(confusion_matrix(yTrain, random_forest_train_preds))
print(classification_report(yTrain, random_forest_train_preds))

0.9371812767541411
0.9052997393570807
0.8527004909983633
0.9648148148148148
[[905  90]
 [ 19 521]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       995
           1       0.85      0.96      0.91       540

    accuracy                           0.93      1535
   macro avg       0.92      0.94      0.92      1535
weighted avg       0.93      0.93      0.93      1535



In [104]:
# Run the 15-fold cross-validation once and score every fold with all four
# metrics, rather than refitting the forest on the same folds once per
# metric. The printed means keep the original order:
# balanced accuracy, F1, precision, recall.
random_forest_cv_metrics = ['balanced_accuracy', 'f1', 'precision', 'recall']
random_forest_cv_results = cross_validate(
    random_forest_pipeline,
    XTrain,
    yTrain,
    cv=15,
    scoring=random_forest_cv_metrics,
)
for metric_name in random_forest_cv_metrics:
    # cross_validate stores per-fold test scores under 'test_<metric>'.
    print(random_forest_cv_results[f'test_{metric_name}'].mean())

0.6316850092969497
0.5237586288059768
0.5228178301947463
0.5296296296296296


In [113]:
# Pair each threshold-tuning objective with the metric function used to
# report the tuned model's score on the held-out test split.
test_metric_by_objective = {
    'balanced_accuracy': metrics.balanced_accuracy_score,
    'f1': metrics.f1_score,
    'precision': metrics.precision_score,
    'recall': metrics.recall_score,
}

# For every objective: tune the decision threshold on the training split with
# 15-fold cross-validation, then print only the matching test-set metric.
# NOTE(review): an objective of 'recall' alone can be met by a threshold that
# labels every sample positive; the 1.0 recorded for it looks consistent with
# that - confirm before interpreting the number.
for scoring_str, test_metric in test_metric_by_objective.items():
    random_forest_threshold_tuned = TunedThresholdClassifierCV(
        random_forest_pipeline,
        cv=15,
        scoring=scoring_str,
        random_state=42,
    ).fit(XTrain, yTrain)
    print(test_metric(yTest, random_forest_threshold_tuned.predict(XTest)))

0.6515625
0.5703125
0.4444444444444444
1.0
