In [7]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import tensorflow as tf
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from pathlib import Path
from sklearn.dummy import DummyClassifier

In [8]:
# Module-level accumulators for the test-split curve data of each evaluated model.
# run_all_models() rebinds both names to fresh dicts for every dataset, and the
# evaluate_*/tune_* functions insert one entry per model name.
all_roc_data = {}  # model name -> (fpr, tpr, roc_auc)
all_prc_data = {}  # model name -> (precision, recall, prc_auc)

In [9]:
# Decorator for running a function on multiple dataset splits
def run_on_splits(func):
    """Wrap a per-split evaluator so that it is applied to every split in a list.

    ``func`` is called as ``func(model, X, y, nsplit, **kwargs)`` and must return
    a 3-tuple ``(result, roc_info, prc_info)``.

    The wrapper is called as ``wrapper(model, splits, **kwargs)`` where ``splits``
    is an iterable of ``(X, y, nsplit)`` triples. It returns five dicts:
    ``results``, ``roc_data`` and ``prc_data`` keyed by split name, followed by
    ``test_roc_data`` and ``test_prc_data``, which hold the curve info of the
    split named ``'test'`` under the key ``kwargs['model_name']`` (``'model'``
    when that keyword is absent). The last two stay empty if no split is named
    ``'test'``.
    """
    def _run_loop(model, splits, **kwargs):
        label = kwargs.get('model_name', 'model')
        results, roc_data, prc_data = {}, {}, {}
        test_roc_data, test_prc_data = {}, {}
        for features, targets, split_name in splits:
            outcome, roc_info, prc_info = func(model, features, targets, split_name, **kwargs)
            results[split_name] = outcome
            roc_data[split_name] = roc_info
            prc_data[split_name] = prc_info
            if split_name == 'test':
                # Only the test split is additionally exposed under the model's name.
                test_roc_data, test_prc_data = {label: roc_info}, {label: prc_info}
        return results, roc_data, prc_data, test_roc_data, test_prc_data
    return _run_loop

@run_on_splits
def evaluate_classification(model, X, y, nsplit, model_name):
    """Compute accuracy, ROC and precision-recall metrics for one data split.

    Because of the ``run_on_splits`` decorator, callers invoke this as
    ``evaluate_classification(model, splits, model_name=...)``; the body below
    handles a single ``(X, y, nsplit)`` split.

    The model must provide ``predict`` and ``predict_proba``; column 1 of the
    probability output is used as the score for the curves.

    Returns ``((accuracy, report), (fpr, tpr, roc_auc), (precision, recall, prc_auc))``
    where ``report`` is the dict form of ``classification_report``. A one-line
    summary followed by the report dict is printed as a side effect.
    """
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, predicted_labels)
    roc_auc = roc_auc_score(y, positive_scores)
    # Both curve helpers also return thresholds, which are not needed here.
    fpr, tpr = roc_curve(y, positive_scores)[:2]
    precision, recall = precision_recall_curve(y, positive_scores)[:2]
    prc_auc = auc(recall, precision)
    report = classification_report(y, predicted_labels, output_dict=True)

    print(f"{model_name} - {nsplit} - Accuracy: {accuracy}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}\n{report}")

    summary = (accuracy, report)
    roc_info = (fpr, tpr, roc_auc)
    prc_info = (precision, recall, prc_auc)
    return summary, roc_info, prc_info

def save_model_results(results, model_name, results_dir):
    """Write per-split accuracy and classification reports to a text file.

    ``results`` maps a split name to ``(accuracy, report)`` where ``report`` is a
    dict; each of its items is written as ``key: value`` on its own line. The
    file is ``<results_dir>/<model_name>_results.txt``; ``results_dir`` is
    created if missing and an existing file is overwritten.
    """
    os.makedirs(results_dir, exist_ok=True)
    report_path = os.path.join(results_dir, f'{model_name}_results.txt')
    with open(report_path, 'w') as out:
        for split, (accuracy, report) in results.items():
            section = [
                f"{model_name} - {split} - Accuracy: {accuracy}\n",
                "Classification Report:\n",
            ]
            section.extend(f"{key}: {value}\n" for key, value in report.items())
            section.append("\n")  # blank line separating one split from the next
            out.writelines(section)

def plot_feature_importances(model, model_name, feature_names, results_dir, filename='feature_importances.png'):
    """Save a horizontal bar chart of the model's largest feature importances.

    Reads ``model.feature_importances_``, keeps the (at most) ten largest values
    in ascending order, labels the bars via ``feature_names`` and writes the
    figure to ``<results_dir>/<model_name>_<filename>``. The figure is closed
    after saving.
    """
    importances = model.feature_importances_
    top_idx = np.argsort(importances)[-10:]  # ascending sort, so the tail holds the largest
    bar_positions = range(len(top_idx))
    bar_labels = [feature_names[i] for i in top_idx]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Feature Importances')
    ax.barh(bar_positions, importances[top_idx], color='b', align='center')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(bar_labels)
    ax.set_xlabel('Relative Importance')

    fig.savefig(os.path.join(results_dir, f'{model_name}_{filename}'))
    plt.close(fig)
    
def plot_roc_curves(roc_data, model_name, results_dir, filename='roc_curves.png'):
    """Plot one ROC curve per entry of ``roc_data`` and save the figure.

    ``roc_data`` maps a label (a split name in most callers) to
    ``(fpr, tpr, roc_auc)``. A dashed diagonal is drawn for reference. The image
    is written to ``<results_dir>/<model_name>_<filename>`` and the figure is
    closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    for split, (fpr, tpr, roc_auc) in roc_data.items():
        ax.plot(fpr, tpr, label=f'{model_name} - {split} (ROC AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves')
    ax.legend(loc="lower right")
    fig.savefig(os.path.join(results_dir, f'{model_name}_{filename}'))
    plt.close(fig)

def plot_prc_curves(prc_data, model_name, results_dir, filename='prc_curves.png'):
    """Plot one precision-recall curve per entry of ``prc_data`` and save the figure.

    ``prc_data`` maps a label (a split name in most callers) to
    ``(precision, recall, prc_auc)``. The image is written to
    ``<results_dir>/<model_name>_<filename>`` and the figure is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    for split, (precision, recall, prc_auc) in prc_data.items():
        ax.plot(recall, precision, label=f'{model_name} - {split} (PRC AUC = {prc_auc:.2f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curves')
    ax.legend(loc="lower left")
    fig.savefig(os.path.join(results_dir, f'{model_name}_{filename}'))
    plt.close(fig)

def plot_combined_roc_curves(all_roc_data, results_dir, filename='all_roc_curves.png'):
    """Overlay the ROC curves of several models in one figure and save it.

    ``all_roc_data`` maps a model name to ``(fpr, tpr, roc_auc)``. A dashed
    diagonal is drawn for reference. The image is written to
    ``<results_dir>/<filename>`` and the figure is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    for model_name, (fpr, tpr, roc_auc) in all_roc_data.items():
        ax.plot(fpr, tpr, label=f'{model_name} (ROC AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Combined ROC Curves')
    ax.legend(loc="lower right")
    fig.savefig(os.path.join(results_dir, filename))
    plt.close(fig)

def plot_combined_prc_curves(all_prc_data, results_dir, filename='all_prc_curves.png'):
    """Overlay the precision-recall curves of several models in one figure and save it.

    ``all_prc_data`` maps a model name to ``(precision, recall, prc_auc)``. The
    image is written to ``<results_dir>/<filename>`` and the figure is closed
    afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    for model_name, (precision, recall, prc_auc) in all_prc_data.items():
        ax.plot(recall, precision, label=f'{model_name} (PRC AUC = {prc_auc:.2f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Combined Precision-Recall Curves')
    ax.legend(loc="lower left")
    fig.savefig(os.path.join(results_dir, filename))
    plt.close(fig)


def load_data(data_dir):
    """Load the train/val/test CSV files of one dataset directory.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Directory containing ``train.csv``, ``val.csv`` and ``test.csv``.
        Plain strings are accepted as well as ``pathlib.Path`` objects.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, feature_names)``.
        In every file the last column is taken as the target and all preceding
        columns as features; X/y are NumPy arrays. ``feature_names`` is the
        pandas Index of the training file's feature columns.
    """
    # Coerce once so the "/" joins below also work when a plain string is passed.
    data_dir = Path(data_dir)

    def _split_features_target(frame):
        # Last column is the label; everything before it is a feature.
        return frame.iloc[:, :-1].values, frame.iloc[:, -1].values

    train_data = pd.read_csv(data_dir / "train.csv")
    val_data = pd.read_csv(data_dir / "val.csv")
    test_data = pd.read_csv(data_dir / "test.csv")

    X_train, y_train = _split_features_target(train_data)
    X_val, y_val = _split_features_target(val_data)
    X_test, y_test = _split_features_target(test_data)
    feature_names = train_data.columns[:-1]
    return X_train, y_train, X_val, y_val, X_test, y_test, feature_names


In [10]:
def evaluate_deterministic_model(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Score the rule-based "Deterministic" baseline on the train, val and test splits.

    Nothing is fitted: a row is predicted as class 1 when
    ``1 - sum(row features) >= 0.01`` and as class 0 otherwise.

    Side effects: prints a summary per split, writes the results text file plus
    ROC/PRC figures into ``results_dir``, and stores the test-split curves in the
    module-level ``all_roc_data`` / ``all_prc_data`` under ``"Deterministic"``.

    Returns ``(results, roc_data, prc_data)``, each keyed by split name.
    """
    model_label = "Deterministic"

    # calculate predictions for the deterministic model
    def deterministic_predict(X):
        # 1 minus the row-wise feature total, thresholded at 0.01.
        return ((1 - np.sum(X, axis=1)) >= 0.01).astype(int)

    results, roc_data, prc_data = {}, {}, {}
    for X, y, nsplit in (
        (X_train, y_train, 'train'),
        (X_val, y_val, 'val'),
        (X_test, y_test, 'test'),
    ):
        preds = deterministic_predict(X)
        # The hard 0/1 labels double as the scores for the curves.
        # NOTE(review): this gives each curve a single operating point; confirm
        # that is intended rather than scoring with the un-thresholded difference.
        pred_probs = preds
        accuracy = accuracy_score(y, preds)
        roc_auc = roc_auc_score(y, pred_probs)
        fpr, tpr = roc_curve(y, pred_probs)[:2]
        precision, recall = precision_recall_curve(y, pred_probs)[:2]
        prc_auc = auc(recall, precision)
        report = classification_report(y, preds, output_dict=True)
        print(f"Deterministic - {nsplit} - Accuracy: {accuracy}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}\n{report}")
        results[nsplit] = (accuracy, report)
        roc_data[nsplit] = (fpr, tpr, roc_auc)
        prc_data[nsplit] = (precision, recall, prc_auc)

    save_model_results(results, model_label, results_dir)
    plot_roc_curves(roc_data, model_label, results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, model_label, results_dir, filename='prc_curves.png')

    # The combined cross-model plots use the test split only.
    all_roc_data[model_label] = roc_data['test']
    all_prc_data[model_label] = prc_data['test']

    return results, roc_data, prc_data


In [11]:
def tune_and_evaluate_rf(X_train, y_train, X_val, y_val, X_test, y_test, feature_names, results_dir):
    """Train a default and a randomized-search-tuned Random Forest and report both.

    Each model is evaluated on train/val/test; its results file, ROC/PRC figures
    and feature-importance figure are written to ``results_dir``, and its
    test-split curves are stored in the module-level ``all_roc_data`` /
    ``all_prc_data``. The search is fitted on the training split with 3-fold CV.

    Returns ``(results, roc_data, prc_data)`` of the tuned model, keyed by split.
    """
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]

    def _evaluate_and_record(estimator, label):
        # Shared reporting sequence for the basic and the tuned forest.
        results, roc_data, prc_data, test_roc, test_prc = evaluate_classification(estimator, splits, model_name=label)
        save_model_results(results, label, results_dir)
        plot_roc_curves(roc_data, label, results_dir, filename='roc_curves.png')
        plot_prc_curves(prc_data, label, results_dir, filename='prc_curves.png')
        plot_feature_importances(estimator, label, feature_names, results_dir, filename='feature_importances.png')
        all_roc_data[label] = test_roc[label]
        all_prc_data[label] = test_prc[label]
        return results, roc_data, prc_data

    # Basic Random Forest model
    basic_rfc = RandomForestClassifier(random_state=42)
    basic_rfc.fit(X_train, y_train)
    _evaluate_and_record(basic_rfc, "Random_Forest_Basic")

    # Hyperparameter-tuned Random Forest model
    search_space = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'max_features': ['sqrt', 'log2', None]
    }
    cv_rfc = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=search_space,
        scoring='accuracy',
        n_iter=10,
        cv=3,
        random_state=42,
    )
    cv_rfc.fit(X_train, y_train)
    print("Best parameters:", cv_rfc.best_params_)

    return _evaluate_and_record(cv_rfc.best_estimator_, "Random_Forest_Optimized")


In [12]:
def tune_clf_hyperparameters(clf, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``clf`` and return the best fitted estimator.

    Uses shuffled 5-fold stratified cross-validation (seed 0) scored by accuracy,
    with ``n_jobs=-1``. The winning parameter set is printed.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    search = GridSearchCV(clf, param_grid, cv=folds, scoring='accuracy', n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best hyperparameters:\n", search.best_params_)
    return search.best_estimator_

def tune_and_evaluate_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Train a default and a grid-search-tuned XGBoost classifier and report both.

    Each model is evaluated on train/val/test; its results file and ROC/PRC
    figures are written to ``results_dir`` and its test-split curves are stored
    in the module-level ``all_roc_data`` / ``all_prc_data``. Tuning is delegated
    to ``tune_clf_hyperparameters`` on the training split.

    Returns ``(results, roc_data, prc_data)`` of the tuned model, keyed by split.
    """
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]

    def _evaluate_and_record(estimator, label):
        # Shared reporting sequence for the basic and the tuned booster.
        results, roc_data, prc_data, test_roc, test_prc = evaluate_classification(estimator, splits, model_name=label)
        save_model_results(results, label, results_dir)
        plot_roc_curves(roc_data, label, results_dir, filename='roc_curves.png')
        plot_prc_curves(prc_data, label, results_dir, filename='prc_curves.png')
        all_roc_data[label] = test_roc[label]
        all_prc_data[label] = test_prc[label]
        return results, roc_data, prc_data

    # Basic XGBoost model
    basic_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    basic_model.fit(X_train, y_train)
    _evaluate_and_record(basic_model, "XGBoost_Basic")

    # Hyperparameter-tuned XGBoost model
    xgb_param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2),
        'learning_rate': [0.0001, 0.01, 0.1],
        'n_estimators': [50, 200]
    }
    xgb_opt = tune_clf_hyperparameters(xgb.XGBClassifier(random_state=0), xgb_param_grid, X_train, y_train)

    return _evaluate_and_record(xgb_opt, "XGBoost_Optimized")

In [13]:
# function to preprocess data for SVM (with imputation)
def preprocess_data_for_svm(X_train, X_val, X_test):
    """Mean-impute missing values in the three feature matrices.

    The imputer is fitted on ``X_train`` only and then applied unchanged to
    ``X_val`` and ``X_test``, so validation/test statistics do not influence
    the imputation.

    Returns ``(X_train_imputed, X_val_imputed, X_test_imputed)``.
    """
    # SimpleImputer is not among the notebook's top-level imports; importing it
    # here keeps the function callable instead of raising NameError.
    from sklearn.impute import SimpleImputer

    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_val_imputed = imputer.transform(X_val)
    X_test_imputed = imputer.transform(X_test)
    return X_train_imputed, X_val_imputed, X_test_imputed

In [14]:
def tune_and_evaluate_svm(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Train a default and a grid-search-tuned SVM and report both.

    Both SVCs are built with ``probability=True`` so ``predict_proba`` is
    available for the ROC/PRC curves. Each model is evaluated on train/val/test;
    its results file and figures are written to ``results_dir`` and its
    test-split curves are stored in the module-level ``all_roc_data`` /
    ``all_prc_data``. The grid search runs 3-fold CV on the training split.

    Returns ``(results, roc_data, prc_data)`` of the tuned model, keyed by split.
    """
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]

    def _evaluate_and_record(estimator, label):
        # Shared reporting sequence for the basic and the tuned SVM.
        results, roc_data, prc_data, test_roc, test_prc = evaluate_classification(estimator, splits, model_name=label)
        save_model_results(results, label, results_dir)
        plot_roc_curves(roc_data, label, results_dir, filename='roc_curves.png')
        plot_prc_curves(prc_data, label, results_dir, filename='prc_curves.png')
        all_roc_data[label] = test_roc[label]
        all_prc_data[label] = test_prc[label]
        return results, roc_data, prc_data

    # Train a basic SVM model
    basic_svm = SVC(probability=True, random_state=42)
    basic_svm.fit(X_train, y_train)
    _evaluate_and_record(basic_svm, "SVM_Basic")

    # Hyperparameter-tuned SVM model
    search_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'poly', 'sigmoid']
    }
    cv_svm = GridSearchCV(
        estimator=SVC(probability=True, random_state=42),
        param_grid=search_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
    )
    cv_svm.fit(X_train, y_train)
    print("Best parameters:", cv_svm.best_params_)

    return _evaluate_and_record(cv_svm.best_estimator_, "SVM_Optimized")


In [15]:
def tune_and_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Train a small dense network and report its test-set performance.

    The network has two ReLU hidden layers (64 and 32 units) and a sigmoid
    output, is trained for 10 epochs with Adam and binary cross-entropy, and
    uses the validation split only for Keras' per-epoch monitoring.

    Side effects: prints the test accuracy, writes the results text file and
    ROC/PRC figures to ``results_dir``, and stores the test-split curves in the
    module-level ``all_roc_data`` / ``all_prc_data`` under ``"Neural_Network"``.

    Returns ``(results, test_roc_data, test_prc_data)``. ``results`` is keyed by
    split, with train/val marked ``'Not Evaluated'``; the two curve dicts are
    keyed by the model name ``"Neural_Network"``.
    """
    model_name = "Neural_Network"

    # Define the neural network model. The input shape is declared with an explicit
    # Input object; passing `input_shape=` to the first Dense layer makes Keras
    # emit a warning.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model, i.e., define the loss function and the optimizer
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

    # Evaluate the model on the test set
    _, test_acc = model.evaluate(X_test, y_test)
    print('Neural Network Test accuracy:', test_acc)

    # Run inference once and derive the hard labels from the same probabilities.
    test_pred_probs = model.predict(X_test).flatten()
    test_predictions = (test_pred_probs > 0.5).astype("int32")
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # Calculate ROC and PRC data
    fpr, tpr, _ = roc_curve(y_test, test_pred_probs)
    precision, recall, _ = precision_recall_curve(y_test, test_pred_probs)
    roc_auc = roc_auc_score(y_test, test_pred_probs)
    prc_auc = auc(recall, precision)

    # Prepare results for consistency with the other models' result files
    results = {
        'train': ('Not Evaluated', {}),
        'val': ('Not Evaluated', {}),
        'test': (test_acc, test_report)
    }
    save_model_results(results, model_name, results_dir)

    # Store ROC and PRC data for the test set
    test_roc_data = {model_name: (fpr, tpr, roc_auc)}
    test_prc_data = {model_name: (precision, recall, prc_auc)}

    all_roc_data[model_name] = test_roc_data[model_name]
    all_prc_data[model_name] = test_prc_data[model_name]

    # Plot ROC and PRC curves. The plot helpers label each curve
    # "<model> - <key>", so key by split name to get "Neural_Network - test"
    # rather than "Neural_Network - Neural_Network".
    plot_roc_curves({'test': test_roc_data[model_name]}, model_name, results_dir, filename='roc_curves.png')
    plot_prc_curves({'test': test_prc_data[model_name]}, model_name, results_dir, filename='prc_curves.png')

    return results, test_roc_data, test_prc_data

In [16]:
def evaluate_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Fit a default Logistic Regression (max_iter=10000) and report it on all splits.

    Writes the results file and ROC/PRC figures to ``results_dir`` and stores
    the test-split curves in the module-level ``all_roc_data`` / ``all_prc_data``.

    Returns ``(results, roc_data, prc_data)``, each keyed by split name.
    """
    label = "Logistic_Regression_Basic"

    # train a basic Logistic Regression model
    classifier = LogisticRegression(random_state=42, max_iter=10000)
    classifier.fit(X_train, y_train)

    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    results, roc_by_split, prc_by_split, test_roc, test_prc = evaluate_classification(classifier, splits, model_name=label)

    save_model_results(results, label, results_dir)
    plot_roc_curves(roc_by_split, label, results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_by_split, label, results_dir, filename='prc_curves.png')

    all_roc_data[label] = test_roc[label]
    all_prc_data[label] = test_prc[label]

    return results, roc_by_split, prc_by_split

In [17]:
def evaluate_elastic_net_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Fit an elastic-net Logistic Regression with built-in CV and report it on all splits.

    The model is ``LogisticRegressionCV`` with 5-fold CV, the ``saga`` solver and
    a single ``l1_ratio`` of 0.5. Writes the results file and ROC/PRC figures to
    ``results_dir`` and stores the test-split curves in the module-level
    ``all_roc_data`` / ``all_prc_data``.

    Returns ``(results, roc_data, prc_data)``, each keyed by split name.
    """
    label = "Elastic_Net_Logistic_Regression"

    # train an Elastic Net Logistic Regression model
    classifier = LogisticRegressionCV(
        cv=5,
        penalty='elasticnet',
        solver='saga',
        l1_ratios=[0.5],
        random_state=42,
        max_iter=10000,
    )
    classifier.fit(X_train, y_train)

    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    results, roc_by_split, prc_by_split, test_roc, test_prc = evaluate_classification(classifier, splits, model_name=label)

    save_model_results(results, label, results_dir)
    plot_roc_curves(roc_by_split, label, results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_by_split, label, results_dir, filename='prc_curves.png')

    all_roc_data[label] = test_roc[label]
    all_prc_data[label] = test_prc[label]

    return results, roc_by_split, prc_by_split


In [18]:
def evaluate_majority_class_classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Report a ``most_frequent`` DummyClassifier baseline on all splits.

    Writes the results file and ROC/PRC figures to ``results_dir`` and stores
    the test-split curves in the module-level ``all_roc_data`` / ``all_prc_data``.

    Returns ``(results, roc_data, prc_data)``, each keyed by split name.
    """
    label = "Majority_Class_Classifier"

    # train a dummy classifier that predicts the majority class
    baseline = DummyClassifier(strategy='most_frequent', random_state=42)
    baseline.fit(X_train, y_train)

    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    results, roc_by_split, prc_by_split, test_roc, test_prc = evaluate_classification(baseline, splits, model_name=label)

    save_model_results(results, label, results_dir)
    plot_roc_curves(roc_by_split, label, results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_by_split, label, results_dir, filename='prc_curves.png')

    all_roc_data[label] = test_roc[label]
    all_prc_data[label] = test_prc[label]

    return results, roc_by_split, prc_by_split

In [19]:
def evaluate_Chance_Class_Classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    """Report a ``uniform`` (random-guess) DummyClassifier baseline on all splits.

    Writes the results file and ROC/PRC figures to ``results_dir`` and stores
    the test-split curves in the module-level ``all_roc_data`` / ``all_prc_data``.

    Returns ``(results, roc_data, prc_data)``, each keyed by split name.
    """
    label = "Chance_Class_Classifier"

    # train a dummy classifier that predicts a random class
    baseline = DummyClassifier(strategy='uniform', random_state=42)
    baseline.fit(X_train, y_train)

    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    results, roc_by_split, prc_by_split, test_roc, test_prc = evaluate_classification(baseline, splits, model_name=label)

    save_model_results(results, label, results_dir)
    plot_roc_curves(roc_by_split, label, results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_by_split, label, results_dir, filename='prc_curves.png')

    all_roc_data[label] = test_roc[label]
    all_prc_data[label] = test_prc[label]

    return results, roc_by_split, prc_by_split

In [20]:
def run_all_models(data_dir, results_dir):
    """Run every model evaluation for one dataset directory and save combined plots.

    Resets the module-level ``all_roc_data`` / ``all_prc_data``, loads the
    dataset's splits, writes all outputs to ``results_dir / data_dir.stem`` and
    finally saves the combined PRC and ROC figures across models.

    ``data_dir`` and ``results_dir`` are used as ``pathlib.Path`` objects
    (``.stem`` and ``/``).
    """
    global all_roc_data, all_prc_data
    all_roc_data, all_prc_data = {}, {}

    *split_arrays, feature_names = load_data(data_dir)

    # create a separate directory for this dataset's results
    dataset_results_dir = results_dir / data_dir.stem
    os.makedirs(dataset_results_dir, exist_ok=True)

    # Deterministic Model (skip if from 'baseline_demographic' folder)
    if "baseline_demographic" not in str(data_dir):
        evaluate_deterministic_model(*split_arrays, dataset_results_dir)

    # Random Forest additionally needs the feature names for its importance plot
    tune_and_evaluate_rf(*split_arrays, feature_names, dataset_results_dir)

    # Remaining models share one call signature and run in this fixed order.
    # NOTE(review): tune_and_evaluate_svm is defined in this notebook but is not
    # invoked here - confirm that leaving it out is intentional.
    remaining_evaluators = (
        tune_and_evaluate_xgboost,
        tune_and_evaluate_neural_network,
        evaluate_logistic_regression,
        evaluate_elastic_net_logistic_regression,
        evaluate_majority_class_classifier,
        evaluate_Chance_Class_Classifier,
    )
    for evaluate in remaining_evaluators:
        evaluate(*split_arrays, dataset_results_dir)

    # Plot combined PRC and ROC curves for all models for the current dataset
    plot_combined_prc_curves(all_prc_data, dataset_results_dir, filename='all_prc_curves.png')
    plot_combined_roc_curves(all_roc_data, dataset_results_dir, filename='all_roc_curves.png')


In [21]:
# ------ NOTE: THIS CELL ONLY CHECKS THAT THE DIFFERENT DATASETS ARE COMPATIBLE (I.E. THAT Y IS INTEGER 0/1 AND NOT FLOAT)
import pandas as pd
from pathlib import Path

# Base directory two levels above the current working directory; the data/
# folders read in this cell are resolved relative to it.
root = Path.cwd().parents[1]

# Function to load datasets and ensure integer type for Lacto_Binary
def load_and_verify(path):
    """Read the CSV at ``path`` and return it with ``Lacto_Binary`` cast to int.

    All other columns keep the dtypes inferred by ``pd.read_csv``.
    """
    return pd.read_csv(path).astype({'Lacto_Binary': int})  # Ensure integer type

def _load_and_report(title, csv_path):
    """Load one split via load_and_verify, print its Lacto_Binary summary, return the frame."""
    frame = load_and_verify(csv_path)
    labels = frame['Lacto_Binary']
    print(f"{title} Dataset Lacto_Binary Counts:")
    print(labels.value_counts())
    print("Unique values:", labels.unique())
    return frame


# BASELINE DATA
baseline_dir = root / 'data' / 'baseline_demographic'
baseline_path_train = baseline_dir / 'train.csv'
baseline_path_val = baseline_dir / 'val.csv'
baseline_path_test = baseline_dir / 'test.csv'

# Training, validation and test splits, reported in that order
train_data_baseline = _load_and_report("Baseline Training", baseline_path_train)
val_data_baseline = _load_and_report("Baseline Validation", baseline_path_val)
test_data_baseline = _load_and_report("Baseline Test", baseline_path_test)
print("-------------------------------------------------------------------")

# REAL DATA
real_dir = root / 'data' / 'reduced_0_1'
path_train = real_dir / 'train.csv'
path_val = real_dir / 'val.csv'
path_test = real_dir / 'test.csv'

# Training, validation and test splits, reported in that order
train_data_real = _load_and_report("Real Training", path_train)
val_data_real = _load_and_report("Real Validation", path_val)
test_data_real = _load_and_report("Real Test", path_test)



Baseline Training Dataset Lacto_Binary Counts:
Lacto_Binary
0    1585
1    1583
Name: count, dtype: int64
Unique values: [1 0]
Baseline Validation Dataset Lacto_Binary Counts:
Lacto_Binary
0    359
1    321
Name: count, dtype: int64
Unique values: [0 1]
Baseline Test Dataset Lacto_Binary Counts:
Lacto_Binary
0    348
1    332
Name: count, dtype: int64
Unique values: [0 1]
-------------------------------------------------------------------
Real Training Dataset Lacto_Binary Counts:
Lacto_Binary
0    1938
1    1912
Name: count, dtype: int64
Unique values: [1 0]
Real Validation Dataset Lacto_Binary Counts:
Lacto_Binary
1    421
0    405
Name: count, dtype: int64
Unique values: [1 0]
Real Test Dataset Lacto_Binary Counts:
Lacto_Binary
0    433
1    393
Name: count, dtype: int64
Unique values: [1 0]


In [22]:

def main():
    """Run the full model comparison for each dataset directory in turn.

    Dataset folders are looked up under ``<root>/data`` and reports are written
    beneath ``<root>/results/model_reports``, where ``<root>`` is two levels
    above the current working directory.
    """
    root = Path.cwd().parents[1]
    data_root = root / "data"
    results_dir = root / "results" / "model_reports"
    dataset_names = (
        "reduced_0_1",
        "reduced_0_1_PCA",
        "reduced_0_1_SVD",
        "baseline_demographic",
        "non_reduced",
    )
    for dataset_name in dataset_names:
        run_all_models(data_root / dataset_name, results_dir)

# Run the full pipeline for every dataset
main()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Deterministic - train - Accuracy: 0.5033766233766234, ROC_AUC: 0.5, PRC_AUC: 0.7483116883116883
{'0.0': {'precision': 0.5033766233766234, 'recall': 1.0, 'f1-score': 0.6696613683483068, 'support': 1938.0}, '1.0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1912.0}, 'accuracy': 0.5033766233766234, 'macro avg': {'precision': 0.2516883116883117, 'recall': 0.5, 'f1-score': 0.3348306841741534, 'support': 3850.0}, 'weighted avg': {'precision': 0.25338802496205093, 'recall': 0.5033766233766234, 'f1-score': 0.3370918784049399, 'support': 3850.0}}
Deterministic - val - Accuracy: 0.49031476997578693, ROC_AUC: 0.5, PRC_AUC: 0.7548426150121066
{'0.0': {'precision': 0.49031476997578693, 'recall': 1.0, 'f1-score': 0.6580016246953696, 'support': 405.0}, '1.0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 421.0}, 'accuracy': 0.49031476997578693, 'macro avg': {'precision': 0.24515738498789347, 'recall': 0.5, 'f1-score': 0.3290008123476848, 'support': 826.0}, 'weighted 



Best hyperparameters:
 {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50}
XGBoost_Optimized - train - Accuracy: 0.8631168831168832, ROC_AUC: 0.9430159203077839, PRC_AUC: 0.9479392783145983
{'0.0': {'precision': 0.8518703241895261, 'recall': 0.8813209494324046, 'f1-score': 0.8663454222673092, 'support': 1938.0}, '1.0': {'precision': 0.8753387533875339, 'recall': 0.8446652719665272, 'f1-score': 0.8597285067873304, 'support': 1912.0}, 'accuracy': 0.8631168831168832, 'macro avg': {'precision': 0.86360453878853, 'recall': 0.8629931106994659, 'f1-score': 0.8630369645273197, 'support': 3850.0}, 'weighted avg': {'precision': 0.8635252947418873, 'recall': 0.8631168831168832, 'f1-score': 0.8630593073588106, 'support': 3850.0}}
XGBoost_Optimized - val - Accuracy: 0.7760290556900726, ROC_AUC: 0.8640860971819008, PRC_AUC: 0.8760349252832931
{'0.0': {'precision': 0.748868778280543, 'recall': 0.817283950617284, 'f1-score': 0.781582054309327, 'support': 405.0}, '1.0': {'

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5561 - loss: 0.8257 - val_accuracy: 0.6259 - val_loss: 0.6617
Epoch 2/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6581 - loss: 0.6268 - val_accuracy: 0.6308 - val_loss: 0.6357
Epoch 3/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - accuracy: 0.6875 - loss: 0.5793 - val_accuracy: 0.6368 - val_loss: 0.6528
Epoch 4/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 969us/step - accuracy: 0.7066 - loss: 0.5591 - val_accuracy: 0.6513 - val_loss: 0.6313
Epoch 5/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - accuracy: 0.7121 - loss: 0.5483 - val_accuracy: 0.6465 - val_loss: 0.6270
Epoch 6/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974us/step - accuracy: 0.7412 - loss: 0.5139 - val_accuracy: 0.6683 - val_loss: 0.6180
Epoch 7/10
[1m61/61[0m [32m━━━━━━━━━━━━━