# Intro

This notebook uses a dataset kindly provided to my professor by Harvard Medical School to classify whether a patient with Multiple Sclerosis (MS) is likely to relapse within the next 3 years.

## Imports

In [72]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import TunedThresholdClassifierCV, FixedThresholdClassifier, train_test_split

## Loading Data and Preparing for Classification

In [73]:
# Read the already-cleaned patient table from a CSV one directory above the
# working directory; every later cell derives its data from this frame.
cleanedData = pd.read_csv(filepath_or_buffer='../Fully-Cleaned-Data.csv')

In [74]:
# Columns stored with object dtype are treated as categorical.
categoricalColumns = list(cleanedData.select_dtypes(include=['object']).columns)

# Everything that is not object-typed is kept as-is ...
numericalData = cleanedData.drop(categoricalColumns, axis=1)

# ... while the object-typed columns are expanded into one indicator column
# per distinct value.
oneHotData = pd.get_dummies(cleanedData.loc[:, categoricalColumns])

In [75]:
# Stitch the pass-through columns and the indicator columns back together
# side by side, then report the resulting (rows, columns).
cleanedDataOneHotEncoded = pd.concat(
    objs=[numericalData, oneHotData],
    axis='columns',
)
print(cleanedDataOneHotEncoded.shape)

# Features are every column except the 'Future Relapse Binary' label,
# which becomes the target vector.
XData = cleanedDataOneHotEncoded.drop(columns='Future Relapse Binary')
yData = cleanedDataOneHotEncoded['Future Relapse Binary']

(2047, 48)


In [76]:
# Hold out a test set with a fixed seed; stratifying on the label keeps the
# class proportions the same in the train and test partitions.
XTrain, XTest, yTrain, yTest = train_test_split(
    XData,
    yData,
    random_state=42,
    stratify=yData,
)

# Show how many training rows fall into each class.
print(yTrain.to_frame().value_counts())

Future Relapse Binary
0                        986
1                        549
Name: count, dtype: int64


## Basic Random Forest

In [77]:
# Baseline: a shallow forest (trees capped at depth 5) with class weights
# set to 'balanced', seeded for repeatable results.
classifier = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    random_state=42,
)
classifier.fit(XTrain, yTrain)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_true=yTest, y_pred=classifier.predict(XTest)))

              precision    recall  f1-score   support

           0       0.73      0.62      0.67       329
           1       0.47      0.59      0.52       183

    accuracy                           0.61       512
   macro avg       0.60      0.61      0.60       512
weighted avg       0.64      0.61      0.62       512



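Class 1 is outnumbered by class 0 by roughly 2:1 (549 vs. 986 training samples), so the classes must be rebalanced; both forests in this section do so with `class_weight='balanced'`. The next cell keeps that weighting and increases `max_depth` from 5 to 10.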

In [78]:
# Same configuration as the baseline forest but with deeper trees
# (max_depth=10). This rebinds `classifier`, so every later cell that refers
# to `classifier` uses this depth-10 model.
classifier = RandomForestClassifier(
    class_weight='balanced',
    max_depth=10,
    random_state=42,
)
classifier.fit(XTrain, yTrain)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_true=yTest, y_pred=classifier.predict(XTest)))

              precision    recall  f1-score   support

           0       0.72      0.75      0.73       329
           1       0.51      0.48      0.49       183

    accuracy                           0.65       512
   macro avg       0.62      0.61      0.61       512
weighted avg       0.65      0.65      0.65       512



In [79]:
# Confusion matrix of the current `classifier` on the test set, kept in
# `baseCM` as the reference point for the threshold experiments.
# Rows are true labels, columns are predicted labels.
baseCM = confusion_matrix(y_true=yTest, y_pred=classifier.predict(XTest))
print(baseCM)

[[246  83]
 [ 96  87]]


## Manually Setting Decision Threshold in Trees

In [86]:
# Wrap the forest so the positive class is predicted whenever its
# predict_proba score reaches 0.45, a hand-picked cut-off.
classifierFixedThreshold = FixedThresholdClassifier(
    estimator=classifier,
    response_method='predict_proba',
    threshold=0.45,
)
classifierFixedThreshold.fit(XTrain, yTrain)

# Confusion matrix on the test set with the manual threshold applied.
print(confusion_matrix(y_true=yTest, y_pred=classifierFixedThreshold.predict(XTest)))

[[212 117]
 [ 73 110]]


## Base Accuracy

In [None]:
# Let cross-validation on the training data choose the decision threshold
# that scores best on plain accuracy.
classifier_tuned = TunedThresholdClassifierCV(
    estimator=classifier,
    scoring='accuracy',
)
classifier_tuned.fit(XTrain, yTrain)

# Report the selected cut-off, then evaluate on the held-out test set.
print(f'Cut-off point found at {classifier_tuned.best_threshold_:.3f}')
print(classification_report(y_true=yTest, y_pred=classifier_tuned.predict(XTest)))

Cut-off point found at 0.446
              precision    recall  f1-score   support

           0       0.70      0.75      0.72       329
           1       0.48      0.41      0.44       183

    accuracy                           0.63       512
   macro avg       0.59      0.58      0.58       512
weighted avg       0.62      0.63      0.62       512



## Balanced Accuracy (Changing Probability Threshold for Split)

In [None]:
# Same threshold search as before, but scored on balanced accuracy so that
# both classes count equally regardless of their size.
# Note: this rebinds `classifier_tuned`.
classifier_tuned = TunedThresholdClassifierCV(
    estimator=classifier,
    scoring='balanced_accuracy',
)
classifier_tuned.fit(XTrain, yTrain)

# Report the selected cut-off, then evaluate on the held-out test set.
print(f'Cut-off point found at {classifier_tuned.best_threshold_:.3f}')
print(classification_report(y_true=yTest, y_pred=classifier_tuned.predict(XTest)))

Cut-off point found at 0.399
              precision    recall  f1-score   support

           0       0.69      0.66      0.68       329
           1       0.44      0.48      0.46       183

    accuracy                           0.59       512
   macro avg       0.57      0.57      0.57       512
weighted avg       0.60      0.59      0.60       512



In [None]:
# Threshold search scored on precision alone, using 15 cross-validation folds.
# NOTE(review): in the recorded run this picks a high cut-off and class-1
# recall drops sharply, so read the result with that trade-off in mind.
# This rebinds `classifier_tuned`.
classifier_tuned = TunedThresholdClassifierCV(
    estimator=classifier,
    scoring='precision',
    cv=15,
)
classifier_tuned.fit(XTrain, yTrain)

# Report the selected cut-off, then evaluate on the held-out test set.
print(f'Cut-off point found at {classifier_tuned.best_threshold_:.3f}')
print(classification_report(y_true=yTest, y_pred=classifier_tuned.predict(XTest)))

Cut-off point found at 0.639
              precision    recall  f1-score   support

           0       0.65      0.95      0.77       329
           1       0.44      0.08      0.13       183

    accuracy                           0.63       512
   macro avg       0.54      0.51      0.45       512
weighted avg       0.57      0.63      0.54       512

