# OOBS add a combined ROC curve plot and a combined PRC plot!!! I lost this code somehow.. 

# Analysing Data

OOOBS: I could try to add 'scoring='roc_auc' in the GridSearchCV. That would mean that the GridSearchCV would use the roc_auc to evaluate the best model. 
But it is not the same as the model optimising the roc_auc. The model optimises the log loss (I think??)
- Maybe see this kaggle exampel: https://www.kaggle.com/code/arindambanerjee/grid-search-simplified

In [71]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve, auc 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import tensorflow as tf
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from pathlib import Path
from sklearn.dummy import DummyClassifier

## Model Functions

In [73]:
# Decorator for running a function on multiple dataset splits
def run_on_splits(func):
    def _run_loop(model, splits, **kwargs):
        results = {}
        roc_data = {}
        prc_data = {}
        test_roc_data = {}
        test_prc_data = {}
        model_name = kwargs.get('model_name', 'model')
        for split in splits:
            X, y, nsplit = split
            result, roc_info, prc_info = func(model, X, y, nsplit, **kwargs)
            results[nsplit] = result
            roc_data[nsplit] = roc_info
            prc_data[nsplit] = prc_info
            if nsplit == 'test':
                test_roc_data[model_name] = roc_info
                test_prc_data[model_name] = prc_info
        return results, roc_data, prc_data, test_roc_data, test_prc_data
    return _run_loop

@run_on_splits
def evaluate_classification(model, X, y, nsplit, model_name):
    preds = model.predict(X)
    pred_probs = model.predict_proba(X)[:, 1]
    accuracy = accuracy_score(y, preds)
    roc_auc = roc_auc_score(y, pred_probs)
    fpr, tpr, _ = roc_curve(y, pred_probs)
    precision, recall, _ = precision_recall_curve(y, pred_probs)
    prc_auc = auc(recall, precision)
    report = classification_report(y, preds, output_dict=True)
    print(f"{model_name} - {nsplit} - Accuracy: {accuracy}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}\n{report}")
    return (accuracy, report), (fpr, tpr, roc_auc), (precision, recall, prc_auc)

def save_model_results(results, model_name, results_dir):
    directory = results_dir
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, f'{model_name}_results.txt')
    with open(filepath, 'w') as f:
        for split, (accuracy, report) in results.items():
            f.write(f"{model_name} - {split} - Accuracy: {accuracy}\n")
            f.write("Classification Report:\n")
            for key, value in report.items():
                f.write(f"{key}: {value}\n")
            f.write("\n")  
    
def plot_roc_curves(roc_data, model_name, results_dir, filename='roc_curves.png'):
    plt.figure(figsize=(10, 8))
    for split, (fpr, tpr, roc_auc) in roc_data.items():
        plt.plot(fpr, tpr, label=f'{model_name} - {split} (ROC AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()

def plot_prc_curves(prc_data, model_name, results_dir, filename='prc_curves.png'):
    plt.figure(figsize=(10, 8))
    for split, (precision, recall, prc_auc) in prc_data.items():
        plt.plot(recall, precision, label=f'{model_name} - {split} (PRC AUC = {prc_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend(loc="lower left")
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()
    
def plot_feature_importances(model, model_name, feature_names, results_dir, filename='feature_importances.png'):
    feature_importances = model.feature_importances_
    indices = np.argsort(feature_importances)[-10:]
    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Relative Importance')
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()
    
def plot_combined_prc_curves(all_prc_data, results_dir, filename='all_prc_curves.png'):
    plt.figure(figsize=(10, 8))
    for model_name, prc_data in all_prc_data.items():
        precision, recall, prc_auc = prc_data
        plt.plot(recall, precision, label=f'{model_name} (PRC AUC = {prc_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Combined Precision-Recall Curves')
    plt.legend(loc="lower left")
    full_path = os.path.join(results_dir, filename)
    plt.savefig(full_path)
    plt.close()

def plot_combined_roc_curves(all_roc_data, results_dir, filename='all_roc_curves.png'):
    plt.figure(figsize=(10, 8))
    for model_name, roc_data in all_roc_data.items():
        fpr, tpr, roc_auc = roc_data
        plt.plot(fpr, tpr, label=f'{model_name} (ROC AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Combined ROC Curves')
    plt.legend(loc="lower right")
    full_path = os.path.join(results_dir, filename)
    plt.savefig(full_path)
    plt.close()


## Load Data

In [74]:
root = Path.cwd().parents[1] # root dir microbiome-ML

# Load data
train_data_path = root / "data" / "reduced_0_1" / "train.csv"
val_data_path = root / "data" / "reduced_0_1" / "val.csv"
test_data_path = root / "data" / "reduced_0_1" / "test.csv"

train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)

# Prepare datasets
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values
X_val = val_data.iloc[:, :-1].values
y_val = val_data.iloc[:, -1].values
X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

feature_names = train_data.columns[:-1]

# Results dir
results_dir = root / "results" / "model_reports"

all_roc_data = {}
all_prc_data = {}

## Random Forest

In [75]:
def tune_and_evaluate_rf(X_train, y_train, X_val, y_val, X_test, y_test, feature_names, results_dir):
    # Basic Random Forest model
    basic_rfc = RandomForestClassifier(random_state=42)
    basic_rfc.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_rfc, splits, model_name="Random_Forest_Basic")
    save_model_results(basic_results, "Random_Forest_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "Random_Forest_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "Random_Forest_Basic", results_dir, filename='prc_curves.png')
    plot_feature_importances(basic_rfc, "Random_Forest_Basic", feature_names, results_dir, filename='feature_importances.png')

    all_roc_data["Random_Forest_Basic"] = test_roc_data["Random_Forest_Basic"]
    all_prc_data["Random_Forest_Basic"] = test_prc_data["Random_Forest_Basic"]

    # Hyperparameter-tuned Random Forest model
    rfc = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'max_features': ['sqrt', 'log2', None]
    }
    cv_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, scoring='accuracy', n_iter=10, cv=3, random_state=42)
    cv_rfc.fit(X_train, y_train)
    best_params = cv_rfc.best_params_
    print("Best parameters:", best_params)

    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(cv_rfc.best_estimator_, splits, model_name="Random_Forest_Optimized")
    save_model_results(results, "Random_Forest_Optimized", results_dir)

    plot_roc_curves(roc_data, "Random_Forest_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "Random_Forest_Optimized", results_dir, filename='prc_curves.png')
    plot_feature_importances(cv_rfc.best_estimator_, "Random_Forest_Optimized", feature_names, results_dir, filename='feature_importances.png')

    all_roc_data["Random_Forest_Optimized"] = test_roc_data["Random_Forest_Optimized"]
    all_prc_data["Random_Forest_Optimized"] = test_prc_data["Random_Forest_Optimized"]

    return results, roc_data, prc_data

results_rf, roc_data_rf, prc_data_rf = tune_and_evaluate_rf(X_train, y_train, X_val, y_val, X_test, y_test, feature_names, results_dir)


Random_Forest_Basic - train - Accuracy: 1.0, ROC_AUC: 1.0, PRC_AUC: 0.9999999999999999
{'0.0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1938.0}, '1.0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1912.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3850.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3850.0}}
Random_Forest_Basic - val - Accuracy: 0.715496368038741, ROC_AUC: 0.7954517462831002, PRC_AUC: 0.80797354874184
{'0.0': {'precision': 0.6855895196506551, 'recall': 0.7753086419753087, 'f1-score': 0.727694090382387, 'support': 405.0}, '1.0': {'precision': 0.7527173913043478, 'recall': 0.6579572446555819, 'f1-score': 0.7021546261089987, 'support': 421.0}, 'accuracy': 0.715496368038741, 'macro avg': {'precision': 0.7191534554775014, 'recall': 0.7166329433154452, 'f1-score': 0.7149243582456928, 'support': 826.0}, 'weighted avg': {'precision': 0.7198036043555033, 're

## XGBoost

In [76]:
def tune_clf_hyperparameters(clf, param_grid, X_train, y_train):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    clf_grid = GridSearchCV(clf, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    clf_grid.fit(X_train, y_train)
    print("Best hyperparameters:\n", clf_grid.best_params_)
    return clf_grid.best_estimator_

def tune_and_evaluate_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Basic XGBoost model
    basic_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    basic_model.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_model, splits, model_name="XGBoost_Basic")
    save_model_results(basic_results, "XGBoost_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "XGBoost_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "XGBoost_Basic", results_dir, filename='prc_curves.png')
    
    all_roc_data["XGBoost_Basic"] = test_roc_data["XGBoost_Basic"]
    all_prc_data["XGBoost_Basic"] = test_prc_data["XGBoost_Basic"]

    # Hyperparameter-tuned XGBoost model
    xgb_param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2),
        'learning_rate': [0.0001, 0.01, 0.1],
        'n_estimators': [50, 200]
    }
    xgb_clf = xgb.XGBClassifier(random_state=0)
    xgb_opt = tune_clf_hyperparameters(xgb_clf, xgb_param_grid, X_train, y_train)

    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(xgb_opt, splits, model_name="XGBoost_Optimized")
    save_model_results(results, "XGBoost_Optimized", results_dir)

    plot_roc_curves(roc_data, "XGBoost_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "XGBoost_Optimized", results_dir, filename='prc_curves.png')

    all_roc_data["XGBoost_Optimized"] = test_roc_data["XGBoost_Optimized"]
    all_prc_data["XGBoost_Optimized"] = test_prc_data["XGBoost_Optimized"]

    return results, roc_data, prc_data

results_xgb, roc_data_xgb, prc_data_xgb = tune_and_evaluate_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)


XGBoost_Basic - train - Accuracy: 1.0, ROC_AUC: 1.0, PRC_AUC: 0.9999999999999999
{'0.0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1938.0}, '1.0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1912.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3850.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3850.0}}
XGBoost_Basic - val - Accuracy: 0.7433414043583535, ROC_AUC: 0.8143397554323921, PRC_AUC: 0.8184862925022971
{'0.0': {'precision': 0.7198177676537585, 'recall': 0.7802469135802469, 'f1-score': 0.7488151658767772, 'support': 405.0}, '1.0': {'precision': 0.7700258397932817, 'recall': 0.7078384798099763, 'f1-score': 0.7376237623762376, 'support': 421.0}, 'accuracy': 0.7433414043583535, 'macro avg': {'precision': 0.7449218037235201, 'recall': 0.7440426966951116, 'f1-score': 0.7432194641265074, 'support': 826.0}, 'weighted avg': {'precision': 0.7454080804512636, 'recall': 

## SVM Classifier

In [77]:
def tune_and_evaluate_svm(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Train a basic SVM model
    basic_svm = SVC(probability=True, random_state=42)
    basic_svm.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_svm, splits, model_name="SVM_Basic")
    save_model_results(basic_results, "SVM_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "SVM_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "SVM_Basic", results_dir, filename='prc_curves.png')

    all_roc_data["SVM_Basic"] = test_roc_data["SVM_Basic"]
    all_prc_data["SVM_Basic"] = test_prc_data["SVM_Basic"]

    # Hyperparameter-tuned SVM model
    svm = SVC(probability=True, random_state=42)
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'poly', 'sigmoid']
    }
    cv_svm = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
    cv_svm.fit(X_train, y_train)
    best_params = cv_svm.best_params_
    print("Best parameters:", best_params)

    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(cv_svm.best_estimator_, splits, model_name="SVM_Optimized")
    save_model_results(results, "SVM_Optimized", results_dir)

    plot_roc_curves(roc_data, "SVM_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "SVM_Optimized", results_dir, filename='prc_curves.png')

    all_roc_data["SVM_Optimized"] = test_roc_data["SVM_Optimized"]
    all_prc_data["SVM_Optimized"] = test_prc_data["SVM_Optimized"]

    return results, roc_data, prc_data

results_svm, roc_data_svm, prc_data_svm = tune_and_evaluate_svm(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)

SVM_Basic - train - Accuracy: 0.7475324675324675, ROC_AUC: 0.8324802669361072, PRC_AUC: 0.8042438594898911
{'0.0': {'precision': 0.75, 'recall': 0.7476780185758514, 'f1-score': 0.7488372093023256, 'support': 1938.0}, '1.0': {'precision': 0.7450469238790407, 'recall': 0.7473849372384938, 'f1-score': 0.7462140992167102, 'support': 1912.0}, 'accuracy': 0.7475324675324675, 'macro avg': {'precision': 0.7475234619395204, 'recall': 0.7475314779071726, 'f1-score': 0.7475256542595179, 'support': 3850.0}, 'weighted avg': {'precision': 0.7475401866121365, 'recall': 0.7475324675324675, 'f1-score': 0.7475345115143524, 'support': 3850.0}}
SVM_Basic - val - Accuracy: 0.6610169491525424, ROC_AUC: 0.7211284126565203, PRC_AUC: 0.7205657414026476
{'0.0': {'precision': 0.6477541371158393, 'recall': 0.6765432098765433, 'f1-score': 0.6618357487922706, 'support': 405.0}, '1.0': {'precision': 0.674937965260546, 'recall': 0.6460807600950119, 'f1-score': 0.6601941747572816, 'support': 421.0}, 'accuracy': 0.6610

## Neural Network Classifier

In [57]:
def tune_and_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Define the neural network model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    # Compile the model, i.e., define the loss function and the optimizer
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Neural Network Test accuracy:', test_acc)

    # Prepare results for consistency, this step is to compare with other models
    test_predictions = (model.predict(X_test) > 0.5).astype("int32")
    test_pred_probs = model.predict(X_test).flatten()
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # Calculate ROC and PRC data
    fpr, tpr, _ = roc_curve(y_test, test_pred_probs)
    precision, recall, _ = precision_recall_curve(y_test, test_pred_probs)
    roc_auc = roc_auc_score(y_test, test_pred_probs)
    prc_auc = auc(recall, precision)

    results = {
        'train': ('Not Evaluated', {}),
        'val': ('Not Evaluated', {}),
        'test': (test_acc, test_report)
    }
    save_model_results(results, "Neural_Network", results_dir)

    # Store ROC and PRC data for the test set
    test_roc_data = {"Neural_Network": (fpr, tpr, roc_auc)}
    test_prc_data = {"Neural_Network": (precision, recall, prc_auc)}

    all_roc_data["Neural_Network"] = test_roc_data["Neural_Network"]
    all_prc_data["Neural_Network"] = test_prc_data["Neural_Network"]

    # Plot ROC and PRC curves
    plot_roc_curves(test_roc_data, "Neural_Network", results_dir, filename='roc_curves.png')
    plot_prc_curves(test_prc_data, "Neural_Network", results_dir, filename='prc_curves.png')

    return results, test_roc_data, test_prc_data

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5118 - loss: 1.2697 - val_accuracy: 0.6138 - val_loss: 0.6660
Epoch 2/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6440 - loss: 0.6209 - val_accuracy: 0.6320 - val_loss: 0.6450
Epoch 3/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6840 - loss: 0.5822 - val_accuracy: 0.6453 - val_loss: 0.6356
Epoch 4/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7268 - loss: 0.5361 - val_accuracy: 0.6441 - val_loss: 0.6336
Epoch 5/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7312 - loss: 0.5280 - val_accuracy: 0.6610 - val_loss: 0.6277
Epoch 6/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7558 - loss: 0.5011 - val_accuracy: 0.6731 - val_loss: 0.6217
Epoch 7/10
[1m61/61[0m [32m━━━━━━━━━━

In [78]:
results_nn, roc_data_nn, prc_data_nn = tune_and_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5583 - loss: 0.7527 - val_accuracy: 0.6017 - val_loss: 0.6666
Epoch 2/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 919us/step - accuracy: 0.6598 - loss: 0.6060 - val_accuracy: 0.6356 - val_loss: 0.6478
Epoch 3/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7061 - loss: 0.5597 - val_accuracy: 0.6441 - val_loss: 0.6361
Epoch 4/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step - accuracy: 0.7375 - loss: 0.5229 - val_accuracy: 0.6501 - val_loss: 0.6380
Epoch 5/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7652 - loss: 0.4985 - val_accuracy: 0.6501 - val_loss: 0.6310
Epoch 6/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7837 - loss: 0.4691 - val_accuracy: 0.6610 - val_loss: 0.6344
Epoch 7/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━

## Basic Logistic Regression

In [79]:
def evaluate_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train a basic Logistic Regression model
    basic_lr = LogisticRegression(random_state=42, max_iter=10000)
    basic_lr.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_lr, splits, model_name="Logistic_Regression_Basic")
    save_model_results(basic_results, "Logistic_Regression_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "Logistic_Regression_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "Logistic_Regression_Basic", results_dir, filename='prc_curves.png')

    all_roc_data["Logistic_Regression_Basic"] = test_roc_data["Logistic_Regression_Basic"]
    all_prc_data["Logistic_Regression_Basic"] = test_prc_data["Logistic_Regression_Basic"]

    return basic_results, basic_roc_data, basic_prc_data

In [80]:
basic_results, basic_roc_data, basic_prc_data = evaluate_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)

Logistic_Regression_Basic - train - Accuracy: 0.7212987012987013, ROC_AUC: 0.7982855551381531, PRC_AUC: 0.7839181751407068
{'0.0': {'precision': 0.7161419290354822, 'recall': 0.739422084623323, 'f1-score': 0.7275958365067275, 'support': 1938.0}, '1.0': {'precision': 0.7268793942671714, 'recall': 0.702928870292887, 'f1-score': 0.714703536293539, 'support': 1912.0}, 'accuracy': 0.7212987012987013, 'macro avg': {'precision': 0.7215106616513268, 'recall': 0.721175477458105, 'f1-score': 0.7211496864001332, 'support': 3850.0}, 'weighted avg': {'precision': 0.7214744052752198, 'recall': 0.7212987012987013, 'f1-score': 0.7211932188424116, 'support': 3850.0}}
Logistic_Regression_Basic - val - Accuracy: 0.6743341404358354, ROC_AUC: 0.7304184628016774, PRC_AUC: 0.7202329710737574
{'0.0': {'precision': 0.6538461538461539, 'recall': 0.7135802469135802, 'f1-score': 0.6824085005903188, 'support': 405.0}, '1.0': {'precision': 0.6979166666666666, 'recall': 0.6365795724465558, 'f1-score': 0.665838509316

## Elastic Net Logistic Regression

In [81]:
def evaluate_elastic_net_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train an Elastic Net Logistic Regression model
    elastic_net_lr = LogisticRegressionCV(cv=5, penalty='elasticnet', solver='saga', l1_ratios=[0.5], random_state=42, max_iter=10000)
    elastic_net_lr.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    enet_results, enet_roc_data, enet_prc_data, test_roc_data, test_prc_data = evaluate_classification(elastic_net_lr, splits, model_name="Elastic_Net_Logistic_Regression")
    save_model_results(enet_results, "Elastic_Net_Logistic_Regression", results_dir)
    
    plot_roc_curves(enet_roc_data, "Elastic_Net_Logistic_Regression", results_dir, filename='roc_curves.png')
    plot_prc_curves(enet_prc_data, "Elastic_Net_Logistic_Regression", results_dir, filename='prc_curves.png')

    all_roc_data["Elastic_Net_Logistic_Regression"] = test_roc_data["Elastic_Net_Logistic_Regression"]
    all_prc_data["Elastic_Net_Logistic_Regression"] = test_prc_data["Elastic_Net_Logistic_Regression"]

    return enet_results, enet_roc_data, enet_prc_data

enet_results, enet_roc_data, enet_prc_data = evaluate_elastic_net_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)


Elastic_Net_Logistic_Regression - train - Accuracy: 0.6971428571428572, ROC_AUC: 0.7653781342971014, PRC_AUC: 0.742435903355464
{'0.0': {'precision': 0.6918489065606361, 'recall': 0.718266253869969, 'f1-score': 0.7048101265822785, 'support': 1938.0}, '1.0': {'precision': 0.7029379760609358, 'recall': 0.6757322175732218, 'f1-score': 0.6890666666666667, 'support': 1912.0}, 'accuracy': 0.6971428571428572, 'macro avg': {'precision': 0.697393441310786, 'recall': 0.6969992357215954, 'f1-score': 0.6969383966244727, 'support': 3850.0}, 'weighted avg': {'precision': 0.6973559976994862, 'recall': 0.6971428571428572, 'f1-score': 0.6969915563592526, 'support': 3850.0}}
Elastic_Net_Logistic_Regression - val - Accuracy: 0.6731234866828087, ROC_AUC: 0.7345708336998915, PRC_AUC: 0.7281277817451977
{'0.0': {'precision': 0.6503340757238307, 'recall': 0.7209876543209877, 'f1-score': 0.6838407494145199, 'support': 405.0}, '1.0': {'precision': 0.7002652519893899, 'recall': 0.6270783847980997, 'f1-score': 0

## Baseline: Predicting Majority Class maybe? (can't define precision when a label has no predicted samples)

In [82]:
def evaluate_majority_class_classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train a dummy classifier that predicts the majority class
    majority_class_clf = DummyClassifier(strategy='most_frequent', random_state=42)
    majority_class_clf.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    majority_results, majority_roc_data, majority_prc_data, test_roc_data, test_prc_data = evaluate_classification(majority_class_clf, splits, model_name="Majority_Class_Classifier")
    save_model_results(majority_results, "Majority_Class_Classifier", results_dir)
    
    plot_roc_curves(majority_roc_data, "Majority_Class_Classifier", results_dir, filename='roc_curves.png')
    plot_prc_curves(majority_prc_data, "Majority_Class_Classifier", results_dir, filename='prc_curves.png')

    all_roc_data["Majority_Class_Classifier"] = test_roc_data["Majority_Class_Classifier"]
    all_prc_data["Majority_Class_Classifier"] = test_prc_data["Majority_Class_Classifier"]

    return majority_results, majority_roc_data, majority_prc_data

majority_results, majority_roc_data, majority_prc_data = evaluate_majority_class_classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)


Majority_Class_Classifier - train - Accuracy: 0.5033766233766234, ROC_AUC: 0.5, PRC_AUC: 0.7483116883116883
{'0.0': {'precision': 0.5033766233766234, 'recall': 1.0, 'f1-score': 0.6696613683483068, 'support': 1938.0}, '1.0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1912.0}, 'accuracy': 0.5033766233766234, 'macro avg': {'precision': 0.2516883116883117, 'recall': 0.5, 'f1-score': 0.3348306841741534, 'support': 3850.0}, 'weighted avg': {'precision': 0.25338802496205093, 'recall': 0.5033766233766234, 'f1-score': 0.3370918784049399, 'support': 3850.0}}
Majority_Class_Classifier - val - Accuracy: 0.49031476997578693, ROC_AUC: 0.5, PRC_AUC: 0.7548426150121066
{'0.0': {'precision': 0.49031476997578693, 'recall': 1.0, 'f1-score': 0.6580016246953696, 'support': 405.0}, '1.0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 421.0}, 'accuracy': 0.49031476997578693, 'macro avg': {'precision': 0.24515738498789347, 'recall': 0.5, 'f1-score': 0.3290008123476848, 'supp

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Baseline: Predicting Random Class maybe?

In [83]:
def evaluate_Chance_Class_Classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train a dummy classifier that predicts a random class
    random_class_clf = DummyClassifier(strategy='uniform', random_state=42)
    random_class_clf.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    random_results, random_roc_data, random_prc_data, test_roc_data, test_prc_data = evaluate_classification(random_class_clf, splits, model_name="Chance_Class_Classifier")
    save_model_results(random_results, "Chance_Class_Classifier", results_dir)
    
    plot_roc_curves(random_roc_data, "Chance_Class_Classifier", results_dir, filename='roc_curves.png')
    plot_prc_curves(random_prc_data, "Chance_Class_Classifier", results_dir, filename='prc_curves.png')

    all_roc_data["Chance_Class_Classifier"] = test_roc_data["Chance_Class_Classifier"]
    all_prc_data["Chance_Class_Classifier"] = test_prc_data["Chance_Class_Classifier"]

    return random_results, random_roc_data, random_prc_data

random_results, random_roc_data, random_prc_data = evaluate_Chance_Class_Classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir)



Chance_Class_Classifier - train - Accuracy: 0.5038961038961038, ROC_AUC: 0.5, PRC_AUC: 0.7483116883116883
{'0.0': {'precision': 0.5072840790842872, 'recall': 0.5030959752321982, 'f1-score': 0.5051813471502591, 'support': 1938.0}, '1.0': {'precision': 0.5005186721991701, 'recall': 0.5047071129707112, 'f1-score': 0.5026041666666666, 'support': 1912.0}, 'accuracy': 0.5038961038961038, 'macro avg': {'precision': 0.5039013756417287, 'recall': 0.5039015441014547, 'f1-score': 0.5038927569084628, 'support': 3850.0}, 'weighted avg': {'precision': 0.5039242198727693, 'recall': 0.5038961038961038, 'f1-score': 0.5039014590763295, 'support': 3850.0}}
Chance_Class_Classifier - val - Accuracy: 0.5145278450363197, ROC_AUC: 0.5, PRC_AUC: 0.7548426150121066
{'0.0': {'precision': 0.505, 'recall': 0.49876543209876545, 'f1-score': 0.5018633540372671, 'support': 405.0}, '1.0': {'precision': 0.5234741784037559, 'recall': 0.5296912114014252, 'f1-score': 0.526564344746163, 'support': 421.0}, 'accuracy': 0.5145

## Plotting combined PRC and ROCAUC curves

In [84]:
plot_combined_roc_curves(all_roc_data, results_dir, filename='all_roc_curves.png')
plot_combined_prc_curves(all_prc_data, results_dir, filename='all_prc_curves.png')

## Baseline: Predicting Outcome Variable from Demographic Variables Only 