### Global Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, FixedThresholdClassifier

### Data Loading

In [4]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [5]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [6]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)

In [7]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']
XTrain, XTest, yTrain, yTest = train_test_split(X, y, stratify=y, random_state=42)

### Helper Funcs

In [57]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

def print_scoring_metrics(fittedModel):
    train_preds = fittedModel.predict(XTrain)
    test_preds = fittedModel.predict(XTest)
    print('Training Scores')
    print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
    print(f'Recall: {recall_score(yTrain, train_preds)}')
    print(f'Specificity: {recall_score(yTrain, train_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTrain, train_preds)}')
    print(f'F1: {f1_score(yTrain, train_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTrain, train_preds)}')
    print('--------------------------------')
    print('Test Scores')
    print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
    print(f'Recall: {recall_score(yTest, test_preds)}')
    print(f'Specificity: {recall_score(yTest, test_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTest, test_preds)}')
    print(f'F1: {f1_score(yTest, test_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTest, test_preds)}')

    print(f'{accuracy_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds, pos_label=0):.4f}\t{precision_score(yTest, test_preds):.4f}\t{f1_score(yTest, test_preds):.4f}\t{roc_auc_score(yTest, test_preds):.4f}')

In [11]:
from sklearn.frozen import FrozenEstimator

def find_recall_threshold(fittedModel):
    recall_dict = {}
    for threshold in np.linspace(0, 1, 99):
        new_threshold_model = FixedThresholdClassifier(
            estimator = FrozenEstimator(fittedModel), 
            threshold = threshold
            ).fit(XTrain, yTrain)
        y_hat = new_threshold_model.predict(XTest)
        calculated_recall_score = recall_score(yTest, y_hat)
        if calculated_recall_score > 0.78 and calculated_recall_score < 0.82:
            recall_dict[threshold] = calculated_recall_score
    recall_dict_value_sorted = dict(sorted(recall_dict.items(), key=lambda item: item[1], reverse=True))
    print(recall_dict_value_sorted)

In [19]:
def using_recall_threshold(fittedModel, new_threshold):
    tuned_threshold_model = FixedThresholdClassifier(
        estimator = FrozenEstimator(fittedModel),
        threshold = new_threshold
    ).fit(XTrain, yTrain)
    print_scoring_metrics(tuned_threshold_model)

### Decision Tree

criterion: entropy, max_depth: 8, max_features: sqrt, min_samples_split: 9

In [8]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=8,
    max_features='sqrt',
    min_samples_split=9,
    criterion='entropy'
).fit(XTrain, yTrain)

In [56]:
print_scoring_metrics(decision_tree)

Training Scores
Accuracy: 0.609771986970684
Recall: 0.8397085610200364
Specificity: 0.4817444219066937
Precision: 0.4742798353909465
F1: 0.6061801446416831
ROC AUC: 0.6607264914633652
--------------------------------
Test Scores
Accuracy: 0.53125
Recall: 0.7486338797814208
Specificity: 0.41033434650455924
Precision: 0.41389728096676737
F1: 0.5330739299610895
ROC AUC: 0.57948411314299
0.5312	0.7486, 0.4103, 0.4139, 0.5331, 0.5795


In [12]:
find_recall_threshold(decision_tree)

{np.float64(0.3877551020408163): 0.7978142076502732, np.float64(0.39795918367346933): 0.7978142076502732, np.float64(0.4081632653061224): 0.7978142076502732, np.float64(0.4183673469387755): 0.7923497267759563, np.float64(0.42857142857142855): 0.7923497267759563, np.float64(0.4387755102040816): 0.7923497267759563, np.float64(0.44897959183673464): 0.7923497267759563, np.float64(0.4591836734693877): 0.7923497267759563, np.float64(0.4693877551020408): 0.7923497267759563}


In [20]:
using_recall_threshold(decision_tree, 0.39795918367346933)

Training Scores
Accuracy: 0.5837133550488599
Recall: 0.9143897996357013
Specificity: 0.3995943204868154
Precision: 0.45886654478976235
F1: 0.6110772976262934
ROC AUC: 0.6569920600612584
--------------------------------
Test Scores
Accuracy: 0.4765625
Recall: 0.7978142076502732
Specificity: 0.2978723404255319
Precision: 0.38726790450928383
F1: 0.5214285714285715
ROC AUC: 0.5478432740379026
0.4766, 0.7978, 0.2979, 0.3873, 0.5214, 0.5478


### Random Forest

class_weight: balanced, criterion: gini, max_depth: 4, n_estimators: 250

In [16]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=4,
    n_estimators=250
).fit(XTrain, yTrain)

In [17]:
print_scoring_metrics(random_forest)

Training Scores
Accuracy: 0.6671009771986971
Recall: 0.7340619307832422
Specificity: 0.6298174442190669
Precision: 0.5247395833333334
F1: 0.6119969627942293
ROC AUC: 0.6819396875011545
--------------------------------
Test Scores
Accuracy: 0.603515625
Recall: 0.6284153005464481
Specificity: 0.5896656534954408
Precision: 0.46
F1: 0.5311778290993071
ROC AUC: 0.6090404770209444
0.6035, 0.6284, 0.5897, 0.4600, 0.5312, 0.6090


In [18]:
find_recall_threshold(random_forest)

{np.float64(0.4387755102040816): 0.8087431693989071, np.float64(0.44897959183673464): 0.7814207650273224}


In [21]:
using_recall_threshold(random_forest, 0.4387755102040816)

Training Scores
Accuracy: 0.5700325732899023
Recall: 0.8779599271402551
Specificity: 0.39858012170385393
Precision: 0.4483720930232558
F1: 0.5935960591133005
ROC AUC: 0.6382700244220545
--------------------------------
Test Scores
Accuracy: 0.51953125
Recall: 0.8087431693989071
Specificity: 0.3586626139817629
Precision: 0.41225626740947074
F1: 0.5461254612546126
ROC AUC: 0.5837028916903351
0.5195, 0.8087, 0.3587, 0.4123, 0.5461, 0.5837


### KNN

n_neighbors: 99, weights: uniform, p: 1

In [42]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(
    n_neighbors=3
).fit(XTrain, yTrain)

In [43]:
print_scoring_metrics(knn)

Training Scores
Accuracy: 0.7993485342019544
Recall: 0.6211293260473588
Specificity: 0.8985801217038539
Precision: 0.7732426303854876
F1: 0.6888888888888889
ROC AUC: 0.7598547238756064
--------------------------------
Test Scores
Accuracy: 0.58984375
Recall: 0.31693989071038253
Specificity: 0.7416413373860182
Precision: 0.40559440559440557
F1: 0.3558282208588957
ROC AUC: 0.5292906140482003
0.5898, 0.3169, 0.7416, 0.4056, 0.3558, 0.5293


In [44]:
find_recall_threshold(knn)

{np.float64(0.01020408163265306): 0.7868852459016393, np.float64(0.02040816326530612): 0.7868852459016393, np.float64(0.030612244897959183): 0.7868852459016393, np.float64(0.04081632653061224): 0.7868852459016393, np.float64(0.0510204081632653): 0.7868852459016393, np.float64(0.061224489795918366): 0.7868852459016393, np.float64(0.07142857142857142): 0.7868852459016393, np.float64(0.08163265306122448): 0.7868852459016393, np.float64(0.09183673469387754): 0.7868852459016393, np.float64(0.1020408163265306): 0.7868852459016393, np.float64(0.11224489795918366): 0.7868852459016393, np.float64(0.12244897959183673): 0.7868852459016393, np.float64(0.13265306122448978): 0.7868852459016393, np.float64(0.14285714285714285): 0.7868852459016393, np.float64(0.1530612244897959): 0.7868852459016393, np.float64(0.16326530612244897): 0.7868852459016393, np.float64(0.17346938775510204): 0.7868852459016393, np.float64(0.18367346938775508): 0.7868852459016393, np.float64(0.19387755102040816): 0.78688524590

In [45]:
using_recall_threshold(knn, 0.32653061224489793)

Training Scores
Accuracy: 0.6651465798045603
Recall: 1.0
Specificity: 0.4787018255578093
Precision: 0.5164628410159925
F1: 0.6811414392059554
ROC AUC: 0.7393509127789046
--------------------------------
Test Scores
Accuracy: 0.486328125
Recall: 0.7868852459016393
Specificity: 0.3191489361702128
Precision: 0.391304347826087
F1: 0.5226860254083484
ROC AUC: 0.553017091035926
0.4863, 0.7869, 0.3191, 0.3913, 0.5227, 0.5530


### XGBoost

grow_policy: depthwise, max_depth: 2, n_estimators: 150

In [28]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    grow_policy = 'depthwise',
    max_depth = 2,
    n_estimators = 150
).fit(XTrain, yTrain)

In [29]:
print_scoring_metrics(xgb)

Training Scores
Accuracy: 0.7713355048859935
Recall: 0.5373406193078324
Specificity: 0.9016227180527383
Precision: 0.7525510204081632
F1: 0.6269925611052072
ROC AUC: 0.7194816686802854
--------------------------------
Test Scores
Accuracy: 0.626953125
Recall: 0.3005464480874317
Specificity: 0.8085106382978723
Precision: 0.4661016949152542
F1: 0.3654485049833887
ROC AUC: 0.5545285431926521
0.6270, 0.3005, 0.8085, 0.4661, 0.3654, 0.5545


In [59]:
y_train_pred_proba = xgb.predict_proba(XTrain)
y_test_pred_proba = xgb.predict_proba(XTest)
for threshold in np.linspace(0, 1, 99):
    y_train_pred = (y_train_pred_proba[:, 1] >= threshold).astype(int)
    y_test_pred = (y_test_pred_proba[:,1]>=threshold).astype(int)
    if recall_score(yTest, y_test_pred) > 0.78 and recall_score(yTest, y_test_pred) < 0.80:
        print(threshold)
        print('Training Scores')
        print(f'Accuracy: {accuracy_score(yTrain, y_train_pred)}')
        print(f'Recall: {recall_score(yTrain, y_train_pred)}')
        print(f'Specificity: {recall_score(yTrain, y_train_pred, pos_label=0)}')
        print(f'Precision: {precision_score(yTrain, y_train_pred)}')
        print(f'F1: {f1_score(yTrain, y_train_pred)}')
        print(f'ROC AUC: {roc_auc_score(yTrain, y_train_pred)}')
        print('--------------------------------')
        print('Test Scores')
        print(f'Accuracy: {accuracy_score(yTest, y_test_pred)}')
        print(f'Recall: {recall_score(yTest, y_test_pred)}')
        print(f'Specificity: {recall_score(yTest, y_test_pred, pos_label=0)}')
        print(f'Precision: {precision_score(yTest, y_test_pred)}')
        print(f'F1: {f1_score(yTest, y_test_pred)}')
        print(f'ROC AUC: {roc_auc_score(yTest, y_test_pred)}')
        print(f'{accuracy_score(yTest, y_test_pred):.4f}\t{recall_score(yTest, y_test_pred):.4f}\t{recall_score(yTest, y_test_pred, pos_label=0):.4f}\t{precision_score(yTest, y_test_pred):.4f}\t{f1_score(yTest, y_test_pred):.4f}\t{roc_auc_score(yTest, y_test_pred):.4f}')


0.2346938775510204
Training Scores
Accuracy: 0.6312703583061889
Recall: 0.9435336976320583
Specificity: 0.45740365111561865
Precision: 0.4919278252611586
F1: 0.6466916354556804
ROC AUC: 0.7004686743738384
--------------------------------
Test Scores
Accuracy: 0.537109375
Recall: 0.7978142076502732
Specificity: 0.39209726443769
Precision: 0.42196531791907516
F1: 0.5519848771266541
ROC AUC: 0.5949557360439816
0.5371	0.7978	0.3921	0.4220	0.5520	0.5950


In [None]:
train_preds = xgb.predict(XTrain)
test_preds = xgb.predict_proba(XTest)
print('Training Scores')
print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
print(f'Recall: {recall_score(yTrain, train_preds)}')
print(f'Specificity: {recall_score(yTrain, train_preds, pos_label=0)}')
print(f'Precision: {precision_score(yTrain, train_preds)}')
print(f'F1: {f1_score(yTrain, train_preds)}')
print(f'ROC AUC: {roc_auc_score(yTrain, train_preds)}')
print('--------------------------------')
print('Test Scores')
print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
print(f'Recall: {recall_score(yTest, test_preds)}')
print(f'Specificity: {recall_score(yTest, test_preds, pos_label=0)}')
print(f'Precision: {precision_score(yTest, test_preds)}')
print(f'F1: {f1_score(yTest, test_preds)}')
print(f'ROC AUC: {roc_auc_score(yTest, test_preds)}')

### Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

nb = GaussianNB().fit(XTrain, yTrain)

In [50]:
print_scoring_metrics(nb)

Training Scores
Accuracy: 0.43778501628664496
Recall: 0.9617486338797814
Specificity: 0.1460446247464503
Precision: 0.3854014598540146
F1: 0.5502866076081292
ROC AUC: 0.5538966293131159
--------------------------------
Test Scores
Accuracy: 0.439453125
Recall: 0.9672131147540983
Specificity: 0.1458966565349544
Precision: 0.3864628820960699
F1: 0.5522620904836193
ROC AUC: 0.5565548856445264
0.4395, 0.9672, 0.1459, 0.3865, 0.5523, 0.5566


In [51]:
find_recall_threshold(nb)

{}


In [53]:
y_pred_proba = nb.predict_proba(XTest)
for threshold in np.linspace(0, 1, 99):
    y_pred = (y_pred_proba[:,1]>=threshold).astype(int)
    if recall_score(yTest, y_pred) > 0.70 and recall_score(yTest, y_pred) < 0.89:
        print(threshold)
        print(recall_score(yTest, y_pred))

0.9897959183673468
0.8797814207650273


In [62]:
using_recall_threshold(nb, 0.9897959183673468)

Training Scores
Accuracy: 0.4762214983713355
Recall: 0.9143897996357013
Specificity: 0.23225152129817445
Precision: 0.3987291501191422
F1: 0.5553097345132744
ROC AUC: 0.5733206604669379
--------------------------------
Test Scores
Accuracy: 0.4609375
Recall: 0.8797814207650273
Specificity: 0.22796352583586627
Precision: 0.38795180722891565
F1: 0.5384615384615384
ROC AUC: 0.5538724733004468
0.4609	0.8798	0.2280	0.3880	0.5385	0.5539


### Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression(
    max_iter=500,
    class_weight='balanced'
).fit(XTrain, yTrain)

print_scoring_metrics(log_reg)

Training Scores
Accuracy: 0.6371335504885993
Recall: 0.6939890710382514
Specificity: 0.6054766734279919
Precision: 0.4948051948051948
F1: 0.577710386656558
ROC AUC: 0.6497328722331217
--------------------------------
Test Scores
Accuracy: 0.6171875
Recall: 0.6338797814207651
Specificity: 0.60790273556231
Precision: 0.47346938775510206
F1: 0.5420560747663551
ROC AUC: 0.6208912584915376
0.6172, 0.6339, 0.6079, 0.4735, 0.5421, 0.6209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
find_recall_threshold(log_reg)

{np.float64(0.3877551020408163): 0.8142076502732241, np.float64(0.39795918367346933): 0.8087431693989071, np.float64(0.4081632653061224): 0.7923497267759563}


In [48]:
using_recall_threshold(log_reg, 0.4081632653061224)

Training Scores
Accuracy: 0.5830618892508144
Recall: 0.8469945355191257
Specificity: 0.43610547667342797
Precision: 0.455435847208619
F1: 0.5923566878980892
ROC AUC: 0.6415500060962769
--------------------------------
Test Scores
Accuracy: 0.55859375
Recall: 0.7923497267759563
Specificity: 0.42857142857142855
Precision: 0.43543543543543545
F1: 0.562015503875969
ROC AUC: 0.6104605776736924
0.5586, 0.7923, 0.4286, 0.4354, 0.5620, 0.6105
