### Global Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, FixedThresholdClassifier

### Data Loading

In [2]:
cleanedData = pd.read_csv('../Fully-Cleaned-Data.csv')

In [3]:
categoricalColumns = cleanedData.select_dtypes(include=['object']).columns.tolist()
numericalData = cleanedData.drop(columns=categoricalColumns)
oneHotData = pd.get_dummies(cleanedData[categoricalColumns])

In [4]:
cleanedDataOneHotEncoded = pd.concat([numericalData, oneHotData], axis=1)

In [5]:
X = cleanedDataOneHotEncoded.drop(['Future Relapse Binary'], axis=1)
y = cleanedDataOneHotEncoded['Future Relapse Binary']
XTrain, XTest, yTrain, yTest = train_test_split(X, y, stratify=y, random_state=42)

### Helper Funcs

In [29]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

def print_scoring_metrics(fittedModel, return_scores = False):
    train_preds = fittedModel.predict(XTrain)
    train_proba_preds = fittedModel.predict_proba(XTrain)[:, 1]
    test_preds = fittedModel.predict(XTest)
    test_proba_preds = fittedModel.predict_proba(XTest)[:, 1]
    print('Training Scores')
    print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
    print(f'Recall: {recall_score(yTrain, train_preds)}')
    print(f'Specificity: {recall_score(yTrain, train_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTrain, train_preds)}')
    print(f'F1: {f1_score(yTrain, train_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTrain, train_proba_preds)}')
    print('--------------------------------')
    print('Test Scores')
    print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
    print(f'Recall: {recall_score(yTest, test_preds)}')
    print(f'Specificity: {recall_score(yTest, test_preds, pos_label=0)}')
    print(f'Precision: {precision_score(yTest, test_preds)}')
    print(f'F1: {f1_score(yTest, test_preds)}')
    print(f'ROC AUC: {roc_auc_score(yTest, test_proba_preds)}')
    if return_scores:
        return f'{accuracy_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds, pos_label=0):.4f}\t{precision_score(yTest, test_preds):.4f}\t{f1_score(yTest, test_preds):.4f}\t{roc_auc_score(yTest, test_proba_preds):.4f}'
    else:
        print(f'{accuracy_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds):.4f}\t{recall_score(yTest, test_preds, pos_label=0):.4f}\t{precision_score(yTest, test_preds):.4f}\t{f1_score(yTest, test_preds):.4f}\t{roc_auc_score(yTest, test_proba_preds):.4f}')

In [26]:
from sklearn.metrics import make_scorer

cross_validation_scoring = {
    'acc': 'accuracy', 
    'rec': 'recall', 
    'spec': make_scorer(recall_score, pos_label=0), 
    'prec': 'precision',
    'f1_score': 'f1', 
    'auc': 'roc_auc'
}

In [46]:
from sklearn.model_selection import cross_validate
from sklearn.frozen import FrozenEstimator

def cross_validated_threshold(fittedModel, new_threshold):
    tuned_threshold_model = FixedThresholdClassifier(
        estimator = FrozenEstimator(fittedModel),
        threshold = new_threshold
    )

    cv_results = cross_validate(tuned_threshold_model, X, y, cv=10, scoring=cross_validation_scoring)
    
    return tuned_threshold_model, cv_results

In [44]:
def find_recall_threshold(fittedModel):
    results_dict = {}
    #recall_dict = {}
    for threshold in np.linspace(0, 1, 99):
        # new_threshold_model = FixedThresholdClassifier(
        #     estimator = FrozenEstimator(fittedModel), 
        #     threshold = threshold
        # )
        tuned_threshold_model, cv_results = cross_validated_threshold(fittedModel, threshold)
        print(np.average(cv_results['test_rec']))
        if np.abs(np.average(cv_results['test_rec']) - 80) < 2:
            print(cv_results)
            results_dict[threshold] = print_scoring_metrics(tuned_threshold_model, True)

    print(results_dict)

        #y_hat = new_threshold_model.predict(XTest)
        #calculated_recall_score = recall_score(yTest, y_hat)
        #if calculated_recall_score > 0.78 and calculated_recall_score < 0.82:
        #    results_dict['accuracy'].append(accuracy_score(yTest, y_hat))
            #recall_dict[threshold] = calculated_recall_score
    #recall_dict_value_sorted = dict(sorted(recall_dict.items(), key=lambda item: item[1], reverse=True))
    #print(results_dict)

In [8]:
def using_recall_threshold(fittedModel, new_threshold):
    tuned_threshold_model = FixedThresholdClassifier(
        estimator = FrozenEstimator(fittedModel),
        threshold = new_threshold
    ).fit(XTrain, yTrain)
    print_scoring_metrics(tuned_threshold_model)

In [None]:
from sklearn.metrics import make_scorer

cross_validation_scoring = {
    'acc': 'accuracy', 
    'rec': 'recall', 
    'spec': make_scorer(recall_score, pos_label=0), 
    'prec': 'precision',
    'f1_score': 'f1', 
    'auc': 'roc_auc'
}

In [24]:
from sklearn.model_selection import cross_validate

def cross_validated_threshold(fittedModel, new_threshold):
    tuned_threshold_model = FixedThresholdClassifier(
        estimator = FrozenEstimator(fittedModel),
        threshold = new_threshold
    )

    cv_results = cross_validate(tuned_threshold_model, XTrain, yTrain, cv=10, scoring=cross_validation_scoring)

    return cv_results

### Decision Tree

criterion: entropy, max_depth: 8, max_features: sqrt, min_samples_split: 9

In [13]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=8,
    max_features='sqrt',
    min_samples_split=9,
    criterion='entropy'
).fit(XTrain, yTrain)

In [28]:
print_scoring_metrics(decision_tree)

Training Scores
Accuracy: 0.6742671009771987
Recall: 0.6575591985428051
Specificity: 0.6835699797160243
Precision: 0.5364041604754829
F1: 0.5908346972176759
ROC AUC: 0.7536559187458665
--------------------------------
Test Scores
Accuracy: 0.578125
Recall: 0.44808743169398907
Specificity: 0.6504559270516718
Precision: 0.41624365482233505
F1: 0.43157894736842106
ROC AUC: 0.5612304217117611
0.5781	0.4481	0.6505	0.4162	0.4316	0.5612


In [24]:
print(roc_auc_score(yTest, decision_tree.predict_proba(XTest)[:, 1]))

0.5612304217117611


In [47]:
find_recall_threshold(decision_tree)

1.0
0.9781192151055166
0.9781192151055166
0.9781192151055166
0.9781192151055166
0.9781192151055166
0.9781192151055166
0.9781192151055166
0.9740096260644207
0.9740096260644207
0.9740096260644207
0.972639763050722
0.9699000370233248
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9671603109959275
0.9657904479822289
0.9657904479822289
0.9603295075897818
0.9603295075897818
0.9603295075897818
0.958959644576083
0.958959644576083
0.958959644576083
0.958959644576083
0.958959644576083
0.9548500555349871
0.9466308774527953
0.9466308774527953
0.9466308774527953
0.9466308774527953
0.9466308774527953
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9425398000740467
0.9357089966679007
0.5927064050351721
0.5927064050351721
0.5927064050351721
0.522843391336542
0.5022954461310626
0.5022954461310

In [None]:
using_recall_threshold(decision_tree, 0.39795918367346933)

Training Scores
Accuracy: 0.5413680781758957
Recall: 0.97632058287796
Specificity: 0.29918864097363085
Precision: 0.43683781581092096
F1: 0.6036036036036037
ROC AUC: 0.7536559187458665
--------------------------------
Test Scores
Accuracy: 0.44140625
Recall: 0.8032786885245902
Specificity: 0.24012158054711247
Precision: 0.3702770780856423
F1: 0.506896551724138
ROC AUC: 0.5612304217117611
0.4414	0.8033	0.2401	0.3703	0.5069	0.5612


In [22]:
cross_validated_threshold(decision_tree, 0.39795918367346933)

fit_time
0.0009387016296386718
score_time
0.007428741455078125
test_acc
0.5179271708683473
test_rec
0.9726599326599328
test_spec
0.2647186147186147
test_prec
0.4250003580723882
test_f1_score
0.5911953502356774
test_auc
0.7036059974443812


### Random Forest

class_weight: balanced, criterion: gini, max_depth: 4, n_estimators: 250

In [32]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=4,
    n_estimators=250
).fit(XTrain, yTrain)

In [17]:
print_scoring_metrics(random_forest)

Training Scores
Accuracy: 0.6671009771986971
Recall: 0.7340619307832422
Specificity: 0.6298174442190669
Precision: 0.5247395833333334
F1: 0.6119969627942293
ROC AUC: 0.6819396875011545
--------------------------------
Test Scores
Accuracy: 0.603515625
Recall: 0.6284153005464481
Specificity: 0.5896656534954408
Precision: 0.46
F1: 0.5311778290993071
ROC AUC: 0.6090404770209444
0.6035, 0.6284, 0.5897, 0.4600, 0.5312, 0.6090


In [33]:
find_recall_threshold(random_forest)

{np.float64(0.4387755102040816): 0.8032786885245902}


In [34]:
using_recall_threshold(random_forest, 0.4387755102040816)

Training Scores
Accuracy: 0.5641693811074918
Recall: 0.8652094717668488
Specificity: 0.39655172413793105
Precision: 0.4439252336448598
F1: 0.586781964175417
ROC AUC: 0.7416813531517751
--------------------------------
Test Scores
Accuracy: 0.51953125
Recall: 0.8032786885245902
Specificity: 0.3617021276595745
Precision: 0.4117647058823529
F1: 0.5444444444444444
ROC AUC: 0.6397096683109939
0.5195	0.8033	0.3617	0.4118	0.5444	0.6397


### KNN

n_neighbors: 99, weights: uniform, p: 1

In [35]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(
    n_neighbors=3
).fit(XTrain, yTrain)

In [36]:
print_scoring_metrics(knn)

Training Scores
Accuracy: 0.7993485342019544
Recall: 0.6211293260473588
Specificity: 0.8985801217038539
Precision: 0.7732426303854876
F1: 0.6888888888888889
ROC AUC: 0.8599592842601521
--------------------------------
Test Scores
Accuracy: 0.58984375
Recall: 0.31693989071038253
Specificity: 0.7416413373860182
Precision: 0.40559440559440557
F1: 0.3558282208588957
ROC AUC: 0.5610228046572657
0.5898	0.3169	0.7416	0.4056	0.3558	0.5610


In [37]:
find_recall_threshold(knn)

{np.float64(0.01020408163265306): 0.7868852459016393, np.float64(0.02040816326530612): 0.7868852459016393, np.float64(0.030612244897959183): 0.7868852459016393, np.float64(0.04081632653061224): 0.7868852459016393, np.float64(0.0510204081632653): 0.7868852459016393, np.float64(0.061224489795918366): 0.7868852459016393, np.float64(0.07142857142857142): 0.7868852459016393, np.float64(0.08163265306122448): 0.7868852459016393, np.float64(0.09183673469387754): 0.7868852459016393, np.float64(0.1020408163265306): 0.7868852459016393, np.float64(0.11224489795918366): 0.7868852459016393, np.float64(0.12244897959183673): 0.7868852459016393, np.float64(0.13265306122448978): 0.7868852459016393, np.float64(0.14285714285714285): 0.7868852459016393, np.float64(0.1530612244897959): 0.7868852459016393, np.float64(0.16326530612244897): 0.7868852459016393, np.float64(0.17346938775510204): 0.7868852459016393, np.float64(0.18367346938775508): 0.7868852459016393, np.float64(0.19387755102040816): 0.78688524590

In [38]:
using_recall_threshold(knn, 0.32653061224489793)

Training Scores
Accuracy: 0.6651465798045603
Recall: 1.0
Specificity: 0.4787018255578093
Precision: 0.5164628410159925
F1: 0.6811414392059554
ROC AUC: 0.8599592842601521
--------------------------------
Test Scores
Accuracy: 0.486328125
Recall: 0.7868852459016393
Specificity: 0.3191489361702128
Precision: 0.391304347826087
F1: 0.5226860254083484
ROC AUC: 0.5610228046572657
0.4863	0.7869	0.3191	0.3913	0.5227	0.5610


### XGBoost

grow_policy: depthwise, max_depth: 2, n_estimators: 150

In [39]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    grow_policy = 'depthwise',
    max_depth = 2,
    n_estimators = 150
).fit(XTrain, yTrain)

In [40]:
print_scoring_metrics(xgb)

Training Scores
Accuracy: 0.7713355048859935
Recall: 0.5373406193078324
Specificity: 0.9016227180527383
Precision: 0.7525510204081632
F1: 0.6269925611052072
ROC AUC: 0.8427234470196595
--------------------------------
Test Scores
Accuracy: 0.626953125
Recall: 0.3005464480874317
Specificity: 0.8085106382978723
Precision: 0.4661016949152542
F1: 0.3654485049833887
ROC AUC: 0.6213729300579667
0.6270	0.3005	0.8085	0.4661	0.3654	0.6214


In [41]:
y_train_pred_proba = xgb.predict_proba(XTrain)
y_test_pred_proba = xgb.predict_proba(XTest)
for threshold in np.linspace(0, 1, 99):
    y_train_pred = (y_train_pred_proba[:, 1] >= threshold).astype(int)
    y_test_pred = (y_test_pred_proba[:,1]>=threshold).astype(int)
    if recall_score(yTest, y_test_pred) > 0.78 and recall_score(yTest, y_test_pred) < 0.80:
        print(threshold)
        print('Training Scores')
        print(f'Accuracy: {accuracy_score(yTrain, y_train_pred)}')
        print(f'Recall: {recall_score(yTrain, y_train_pred)}')
        print(f'Specificity: {recall_score(yTrain, y_train_pred, pos_label=0)}')
        print(f'Precision: {precision_score(yTrain, y_train_pred)}')
        print(f'F1: {f1_score(yTrain, y_train_pred)}')
        print(f'ROC AUC: {roc_auc_score(yTrain, y_train_pred)}')
        print('--------------------------------')
        print('Test Scores')
        print(f'Accuracy: {accuracy_score(yTest, y_test_pred)}')
        print(f'Recall: {recall_score(yTest, y_test_pred)}')
        print(f'Specificity: {recall_score(yTest, y_test_pred, pos_label=0)}')
        print(f'Precision: {precision_score(yTest, y_test_pred)}')
        print(f'F1: {f1_score(yTest, y_test_pred)}')
        print(f'ROC AUC: {roc_auc_score(yTest, y_test_pred)}')
        print(f'{accuracy_score(yTest, y_test_pred):.4f}\t{recall_score(yTest, y_test_pred):.4f}\t{recall_score(yTest, y_test_pred, pos_label=0):.4f}\t{precision_score(yTest, y_test_pred):.4f}\t{f1_score(yTest, y_test_pred):.4f}\t{roc_auc_score(yTest, y_test_pred):.4f}')


0.2346938775510204
Training Scores
Accuracy: 0.6312703583061889
Recall: 0.9435336976320583
Specificity: 0.45740365111561865
Precision: 0.4919278252611586
F1: 0.6466916354556804
ROC AUC: 0.7004686743738384
--------------------------------
Test Scores
Accuracy: 0.537109375
Recall: 0.7978142076502732
Specificity: 0.39209726443769
Precision: 0.42196531791907516
F1: 0.5519848771266541
ROC AUC: 0.5949557360439816
0.5371	0.7978	0.3921	0.4220	0.5520	0.5950


In [None]:
train_preds = xgb.predict(XTrain)
test_preds = xgb.predict_proba(XTest)
print('Training Scores')
print(f'Accuracy: {accuracy_score(yTrain, train_preds)}')
print(f'Recall: {recall_score(yTrain, train_preds)}')
print(f'Specificity: {recall_score(yTrain, train_preds, pos_label=0)}')
print(f'Precision: {precision_score(yTrain, train_preds)}')
print(f'F1: {f1_score(yTrain, train_preds)}')
print(f'ROC AUC: {roc_auc_score(yTrain, train_preds)}')
print('--------------------------------')
print('Test Scores')
print(f'Accuracy: {accuracy_score(yTest, test_preds)}')
print(f'Recall: {recall_score(yTest, test_preds)}')
print(f'Specificity: {recall_score(yTest, test_preds, pos_label=0)}')
print(f'Precision: {precision_score(yTest, test_preds)}')
print(f'F1: {f1_score(yTest, test_preds)}')
print(f'ROC AUC: {roc_auc_score(yTest, test_preds)}')

### Naive Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

nb = GaussianNB().fit(XTrain, yTrain)

In [43]:
print_scoring_metrics(nb)

Training Scores
Accuracy: 0.43778501628664496
Recall: 0.9617486338797814
Specificity: 0.1460446247464503
Precision: 0.3854014598540146
F1: 0.5502866076081292
ROC AUC: 0.6476351987940456
--------------------------------
Test Scores
Accuracy: 0.439453125
Recall: 0.9672131147540983
Specificity: 0.1458966565349544
Precision: 0.3864628820960699
F1: 0.5522620904836193
ROC AUC: 0.6206587273905029
0.4395	0.9672	0.1459	0.3865	0.5523	0.6207


In [46]:
find_recall_threshold(nb)

{}


In [47]:
y_pred_proba = nb.predict_proba(XTest)
for threshold in np.linspace(0, 1, 99):
    y_pred = (y_pred_proba[:,1]>=threshold).astype(int)
    if recall_score(yTest, y_pred) > 0.70 and recall_score(yTest, y_pred) < 0.89:
        print(threshold)
        print(recall_score(yTest, y_pred))

0.9897959183673468
0.8797814207650273


In [49]:
using_recall_threshold(nb, 0.9897959183673468)

Training Scores
Accuracy: 0.4762214983713355
Recall: 0.9143897996357013
Specificity: 0.23225152129817445
Precision: 0.3987291501191422
F1: 0.5553097345132744
ROC AUC: 0.6476351987940456
--------------------------------
Test Scores
Accuracy: 0.4609375
Recall: 0.8797814207650273
Specificity: 0.22796352583586627
Precision: 0.38795180722891565
F1: 0.5384615384615384
ROC AUC: 0.6206587273905029
0.4609	0.8798	0.2280	0.3880	0.5385	0.6207


### Logistic Regression

In [50]:
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression(
    max_iter=500,
    class_weight='balanced'
).fit(XTrain, yTrain)

print_scoring_metrics(log_reg)

Training Scores
Accuracy: 0.6371335504885993
Recall: 0.6939890710382514
Specificity: 0.6054766734279919
Precision: 0.4948051948051948
F1: 0.577710386656558
ROC AUC: 0.7107261219920417
--------------------------------
Test Scores
Accuracy: 0.615234375
Recall: 0.6284153005464481
Specificity: 0.60790273556231
Precision: 0.4713114754098361
F1: 0.5386416861826698
ROC AUC: 0.6645572773929943
0.6152	0.6284	0.6079	0.4713	0.5386	0.6646


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
find_recall_threshold(log_reg)

{np.float64(0.3877551020408163): 0.8142076502732241, np.float64(0.39795918367346933): 0.8032786885245902, np.float64(0.4081632653061224): 0.7923497267759563}


In [52]:
using_recall_threshold(log_reg, 0.4081632653061224)

Training Scores
Accuracy: 0.5811074918566775
Recall: 0.8451730418943534
Specificity: 0.4340770791075051
Precision: 0.45401174168297453
F1: 0.5907065563335455
ROC AUC: 0.7107261219920417
--------------------------------
Test Scores
Accuracy: 0.55859375
Recall: 0.7923497267759563
Specificity: 0.42857142857142855
Precision: 0.43543543543543545
F1: 0.562015503875969
ROC AUC: 0.6645572773929943
0.5586	0.7923	0.4286	0.4354	0.5620	0.6646
