In [288]:
# import packages
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import tensorflow as tf
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from pathlib import Path
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [289]:
all_roc_data = {}
all_prc_data = {}

In [290]:
# Decorator for running a function on multiple dataset splits
def run_on_splits(func):
    def _run_loop(model, splits, **kwargs):
        results = {}
        roc_data = {}
        prc_data = {}
        test_roc_data = {}
        test_prc_data = {}
        model_name = kwargs.get('model_name', 'model')
        for split in splits:
            X, y, nsplit = split
            result, roc_info, prc_info = func(model, X, y, nsplit, **kwargs)
            results[nsplit] = result
            roc_data[nsplit] = roc_info
            prc_data[nsplit] = prc_info
            if nsplit == 'test':
                test_roc_data[model_name] = roc_info
                test_prc_data[model_name] = prc_info
        return results, roc_data, prc_data, test_roc_data, test_prc_data
    return _run_loop

@run_on_splits
def evaluate_classification(model, X, y, nsplit, model_name, best_params=None):
    preds = model.predict(X)
    pred_probs = model.predict_proba(X)[:, 1]
    accuracy = accuracy_score(y, preds)
    roc_auc = roc_auc_score(y, pred_probs)
    fpr, tpr, _ = roc_curve(y, pred_probs)
    precision, recall, _ = precision_recall_curve(y, pred_probs)
    prc_auc = auc(recall, precision)
    report = classification_report(y, preds, output_dict=True)
    print(f"{model_name} - {nsplit} - Accuracy: {accuracy}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}\n{report}")
    return (accuracy, report), (fpr, tpr, roc_auc), (precision, recall, prc_auc)

def save_model_results(results, model_name, results_dir):
    directory = results_dir
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, f'{model_name}_results.txt')
    with open(filepath, 'w') as f:
        for split, (accuracy, report) in results.items():
            f.write(f"{model_name} - {split} - Accuracy: {accuracy}\n")
            f.write("Classification Report:\n")
            for key, value in report.items():
                f.write(f"{key}: {value}\n")
            f.write("\n")           

def save_roc_auc_scores(roc_data, results_dir, filename='roc_auc_scores.txt'):
    with open(os.path.join(results_dir, filename), 'w') as f:
        for model_name, (fpr, tpr, roc_auc) in roc_data.items():
            f.write(f"{model_name}: ROC AUC = {roc_auc:.2f}\n")

def plot_feature_importances(model, model_name, feature_names, results_dir, filename='feature_importances.png'):
    feature_importances = model.feature_importances_
    indices = np.argsort(feature_importances)[-10:]
    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Relative Importance')
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()
    
def plot_roc_curves(roc_data, model_name, results_dir, filename='roc_curves.png'):
    plt.figure(figsize=(10, 8))
    for split, (fpr, tpr, roc_auc) in roc_data.items():
        plt.plot(fpr, tpr, label=f'{model_name} - {split} (ROC AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()

def plot_prc_curves(prc_data, model_name, results_dir, filename='prc_curves.png'):
    plt.figure(figsize=(10, 8))
    for split, (precision, recall, prc_auc) in prc_data.items():
        plt.plot(recall, precision, label=f'{model_name} - {split} (PRC AUC = {prc_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend(loc="lower left")
    full_path = os.path.join(results_dir, f'{model_name}_{filename}')
    plt.savefig(full_path)
    plt.close()

def plot_combined_roc_curves(all_roc_data, results_dir, filename='all_roc_curves.png'):
    plt.figure(figsize=(10, 8))
    for model_name, (fpr, tpr, roc_auc) in all_roc_data.items():
        plt.plot(fpr, tpr, label=f'{model_name} (ROC AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Combined ROC Curves')
    plt.legend(loc="lower right")
    full_path = os.path.join(results_dir, filename)
    plt.savefig(full_path)
    plt.close()

def plot_selected_roc_curves(selected_roc_data, results_dir, filename='selected_roc_curves.png'):
    plt.figure(figsize=(10, 8))
    for model_name, (fpr, tpr, roc_auc) in selected_roc_data.items():
        plt.plot(fpr, tpr, label=f'{model_name} (ROC AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Selected Models ROC Curves')
    plt.legend(loc="lower right")
    full_path = os.path.join(results_dir, filename)
    plt.savefig(full_path)
    plt.close()

def plot_combined_prc_curves(all_prc_data, results_dir, filename='all_prc_curves.png'):
    plt.figure(figsize=(10, 8))
    for model_name, prc_data in all_prc_data.items():
        precision, recall, prc_auc = prc_data
        plt.plot(recall, precision, label=f'{model_name} (PRC AUC = {prc_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Combined Precision-Recall Curves')
    plt.legend(loc="lower left")
    full_path = os.path.join(results_dir, filename)
    plt.savefig(full_path)
    plt.close()
    
    
def save_best_params(model_name, best_params, results_dir):
    filepath = os.path.join(results_dir, 'best_params.txt')
    with open(filepath, 'a') as f:
        f.write(f"{model_name}:\n")
        for param, value in best_params.items():
            f.write(f"  {param}: {value}\n")
        f.write("\n")

def load_data(data_dir):
    train_data_path = data_dir / "train.csv"
    val_data_path = data_dir / "val.csv"
    test_data_path = data_dir / "test.csv"
    train_data = pd.read_csv(train_data_path)
    val_data = pd.read_csv(val_data_path)
    test_data = pd.read_csv(test_data_path)
    X_train = train_data.iloc[:, :-1].values
    y_train = train_data.iloc[:, -1].values
    X_val = val_data.iloc[:, :-1].values
    y_val = val_data.iloc[:, -1].values
    X_test = test_data.iloc[:, :-1].values
    y_test = test_data.iloc[:, -1].values
    feature_names = train_data.columns[:-1]
    return X_train, y_train, X_val, y_val, X_test, y_test, feature_names


In [291]:
def evaluate_deterministic_model(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    
    # calculate predictions for the deterministic model
    def deterministic_predict(X):
        sum_non_outcome = np.sum(X, axis=1)
        difference = 1 - sum_non_outcome
        return (difference >= 0.01).astype(int)

    splits = [
        (X_train, y_train, 'train'),
        (X_val, y_val, 'val'),
        (X_test, y_test, 'test')
    ]

    results = {}
    roc_data = {}
    prc_data = {}
    test_roc_data = {}
    test_prc_data = {}

    for X, y, nsplit in splits:
        preds = deterministic_predict(X)
        pred_probs = preds  # Since it's deterministic, we use the binary predictions
        accuracy = accuracy_score(y, preds)
        roc_auc = roc_auc_score(y, pred_probs)
        fpr, tpr, _ = roc_curve(y, pred_probs)
        precision, recall, _ = precision_recall_curve(y, pred_probs)
        prc_auc = auc(recall, precision)
        report = classification_report(y, preds, output_dict=True)
        print(f"Deterministic - {nsplit} - Accuracy: {accuracy}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}\n{report}")
        results[nsplit] = (accuracy, report)
        roc_data[nsplit] = (fpr, tpr, roc_auc)
        prc_data[nsplit] = (precision, recall, prc_auc)
        if nsplit == 'test':
            test_roc_data = {"Deterministic": (fpr, tpr, roc_auc)}
            test_prc_data = {"Deterministic": (precision, recall, prc_auc)}

    save_model_results(results, "Deterministic", results_dir)
    plot_roc_curves(roc_data, "Deterministic", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "Deterministic", results_dir, filename='prc_curves.png')

    all_roc_data["Deterministic"] = test_roc_data["Deterministic"]
    all_prc_data["Deterministic"] = test_prc_data["Deterministic"]

    return results, roc_data, prc_data


In [292]:
def tune_and_evaluate_rf(X_train, y_train, X_val, y_val, X_test, y_test, feature_names, results_dir):
    # Basic Random Forest model
    basic_rfc = RandomForestClassifier(random_state=42)
    basic_rfc.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_rfc, splits, model_name="Random_Forest_Basic")
    save_model_results(basic_results, "Random_Forest_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "Random_Forest_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "Random_Forest_Basic", results_dir, filename='prc_curves.png')
    plot_feature_importances(basic_rfc, "Random_Forest_Basic", feature_names, results_dir, filename='feature_importances.png')

    all_roc_data["Random_Forest_Basic"] = test_roc_data["Random_Forest_Basic"]
    all_prc_data["Random_Forest_Basic"] = test_prc_data["Random_Forest_Basic"]

    # Hyperparameter-tuned Random Forest model
    rfc = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [10, 50, 80, 100, 120, 200, 300, 400],
        'max_depth': [None, 3, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'max_features': ['sqrt', 'log2', None]
    }
    #cv_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, scoring='accuracy', n_iter=20, cv=3, random_state=42)
    cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='balanced_accuracy', cv=StratifiedKFold(n_splits=5), n_jobs=-1)
    cv_rfc.fit(X_train, y_train)
    best_params = cv_rfc.best_params_
    save_best_params('Random_Forest_Optimized', best_params, results_dir)
    print("Best parameters:", best_params)

    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(cv_rfc.best_estimator_, splits, model_name="Random_Forest_Optimized")
    save_model_results(results, "Random_Forest_Optimized", results_dir)

    plot_roc_curves(roc_data, "Random_Forest_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "Random_Forest_Optimized", results_dir, filename='prc_curves.png')
    plot_feature_importances(cv_rfc.best_estimator_, "Random_Forest_Optimized", feature_names, results_dir, filename='feature_importances.png')

    all_roc_data["Random_Forest_Optimized"] = test_roc_data["Random_Forest_Optimized"]
    all_prc_data["Random_Forest_Optimized"] = test_prc_data["Random_Forest_Optimized"]

    return results, roc_data, prc_data


In [293]:
def tune_clf_hyperparameters(clf, param_grid, X_train, y_train):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    clf_grid = GridSearchCV(clf, param_grid, cv=cv, scoring='balanced_accuracy', n_jobs=-1)
    clf_grid.fit(X_train, y_train)
    print("Best hyperparameters:\n", clf_grid.best_params_)
    return clf_grid.best_estimator_

def tune_and_evaluate_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Basic XGBoost model
    basic_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    basic_model.fit(X_train, y_train)
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    basic_results, basic_roc_data, basic_prc_data, test_roc_data, test_prc_data = evaluate_classification(basic_model, splits, model_name="XGBoost_Basic")
    save_model_results(basic_results, "XGBoost_Basic", results_dir)
    
    plot_roc_curves(basic_roc_data, "XGBoost_Basic", results_dir, filename='roc_curves.png')
    plot_prc_curves(basic_prc_data, "XGBoost_Basic", results_dir, filename='prc_curves.png')
    
    all_roc_data["XGBoost_Basic"] = test_roc_data["XGBoost_Basic"]
    all_prc_data["XGBoost_Basic"] = test_prc_data["XGBoost_Basic"]

    # Hyperparameter-tuned XGBoost model
    xgb_param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2),
        'learning_rate': [0.0001, 0.01, 0.1],
        'n_estimators': [50, 200]
    }
    
    xgb_clf = xgb.XGBClassifier(random_state=0)
    xgb_opt = tune_clf_hyperparameters(xgb_clf, xgb_param_grid, X_train, y_train)
    # identify the best hyperparameters
    best_params = xgb_opt.get_params()
    save_best_params('XGBoost_Optimized', best_params, results_dir)

    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(xgb_opt, splits, model_name="XGBoost_Optimized")
    save_model_results(results, "XGBoost_Optimized", results_dir)

    plot_roc_curves(roc_data, "XGBoost_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "XGBoost_Optimized", results_dir, filename='prc_curves.png')

    all_roc_data["XGBoost_Optimized"] = test_roc_data["XGBoost_Optimized"]
    all_prc_data["XGBoost_Optimized"] = test_prc_data["XGBoost_Optimized"]

    return results, roc_data, prc_data

In [294]:
from sklearn.calibration import CalibratedClassifierCV

def tune_and_evaluate_linear_svc(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Define the LinearSVC model
    linear_svc = LinearSVC(random_state=42, dual=False)  # dual=False when n_samples > n_features

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [1e-4, 1e-3, 1e-2]
    }

    # Set up the GridSearchCV
    grid_search = GridSearchCV(linear_svc, param_grid, cv=5, n_jobs=-1, verbose=1)

    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)

    # Best estimator
    best_linear_svc = grid_search.best_estimator_

    # Wrap the best LinearSVC model with CalibratedClassifierCV
    calibrated_svc = CalibratedClassifierCV(best_linear_svc, method='sigmoid', cv=5)
    calibrated_svc.fit(X_train, y_train)

    # Save the best parameters
    save_best_params('Linear_SVC_Optimized', grid_search.best_params_, results_dir)

    # Define splits
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]

    # Evaluate the calibrated model using the existing evaluate_classification function
    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(calibrated_svc, splits, model_name="Linear_SVC_Optimized")

    # Save and plot results
    save_model_results(results, "Linear_SVC_Optimized", results_dir)
    plot_roc_curves(roc_data, "Linear_SVC_Optimized", results_dir)
    plot_prc_curves(prc_data, "Linear_SVC_Optimized", results_dir)

    # Store results for combined plotting
    all_roc_data["Linear_SVC_Optimized"] = test_roc_data["Linear_SVC_Optimized"]
    all_prc_data["Linear_SVC_Optimized"] = test_prc_data["Linear_SVC_Optimized"]

    return results, roc_data, prc_data


In [295]:
def tune_and_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Define the neural network model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    # Compile the model, i.e., define the loss function and the optimizer
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Neural Network Test accuracy:', test_acc)

    # Prepare results for consistency, this step is to compare with other models
    test_predictions = (model.predict(X_test) > 0.5).astype("int32")
    test_pred_probs = model.predict(X_test).flatten()
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # Calculate ROC and PRC data
    fpr, tpr, _ = roc_curve(y_test, test_pred_probs)
    precision, recall, _ = precision_recall_curve(y_test, test_pred_probs)
    roc_auc = roc_auc_score(y_test, test_pred_probs)
    prc_auc = auc(recall, precision)

    results = {
        'train': ('Not Evaluated', {}),
        'val': ('Not Evaluated', {}),
        'test': (test_acc, test_report)
    }
    save_model_results(results, "Neural_Network", results_dir)

    # Store ROC and PRC data for the test set
    test_roc_data = {"Neural_Network": (fpr, tpr, roc_auc)}
    test_prc_data = {"Neural_Network": (precision, recall, prc_auc)}

    all_roc_data["Neural_Network"] = test_roc_data["Neural_Network"]
    all_prc_data["Neural_Network"] = test_prc_data["Neural_Network"]

    # Plot ROC and PRC curves
    plot_roc_curves(test_roc_data, "Neural_Network", results_dir, filename='roc_curves.png')
    plot_prc_curves(test_prc_data, "Neural_Network", results_dir, filename='prc_curves.png')

    return results, test_roc_data, test_prc_data

In [296]:
def evaluate_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Grid search for hyperparameter tuning
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear']
    }
    grid_search = GridSearchCV(LogisticRegression(random_state=42, max_iter=10000), param_grid, cv=5, scoring='balanced_accuracy')
    grid_search.fit(X_train, y_train)

    # identify best hyperparameters
    print("Best hyperparameters:", grid_search.best_params_)
    # save the best parameters
    save_best_params('Logistic_Regression_Best', grid_search.best_params_, results_dir)        
    best_lr = grid_search.best_estimator_
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    best_results, best_roc_data, best_prc_data, test_roc_data, test_prc_data = evaluate_classification(best_lr, splits, model_name="Logistic_Regression_Best")
    save_model_results(best_results, "Logistic_Regression_Best", results_dir)
    
    plot_roc_curves(best_roc_data, "Logistic_Regression_Best", results_dir, filename='roc_curves_best.png')
    plot_prc_curves(best_prc_data, "Logistic_Regression_Best", results_dir, filename='prc_curves_best.png')

    all_roc_data["Logistic_Regression_Best"] = test_roc_data["Logistic_Regression_Best"]
    all_prc_data["Logistic_Regression_Best"] = test_prc_data["Logistic_Regression_Best"]

    return best_results, best_roc_data, best_prc_data


In [297]:
def evaluate_elastic_net_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Grid search for hyperparameter tuning with Elastic Net penalty
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.5, 0.7, 0.9],
        'solver': ['saga'],
        'penalty': ['elasticnet']
    }
    grid_search = GridSearchCV(LogisticRegression(random_state=42, max_iter=10000), param_grid, cv=5, scoring='balanced_accuracy')
    grid_search.fit(X_train, y_train)
    
    best_enet_lr = grid_search.best_estimator_
    # identify best hyperparameters
    print("Best hyperparameters:", grid_search.best_params_)
    # save the best hyperparameters
    save_best_params('Elastic_Net_Logistic_Regression_Best', grid_search.best_params_, results_dir)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    enet_results, enet_roc_data, enet_prc_data, test_roc_data, test_prc_data = evaluate_classification(best_enet_lr, splits, model_name="Elastic_Net_Logistic_Regression_Best")
    save_model_results(enet_results, "Elastic_Net_Logistic_Regression_Best", results_dir)
    
    plot_roc_curves(enet_roc_data, "Elastic_Net_Logistic_Regression_Best", results_dir, filename='roc_curves_best.png')
    plot_prc_curves(enet_prc_data, "Elastic_Net_Logistic_Regression_Best", results_dir, filename='prc_curves_best.png')

    all_roc_data["Elastic_Net_Logistic_Regression_Best"] = test_roc_data["Elastic_Net_Logistic_Regression_Best"]
    all_prc_data["Elastic_Net_Logistic_Regression_Best"] = test_prc_data["Elastic_Net_Logistic_Regression_Best"]

    return enet_results, enet_roc_data, enet_prc_data

In [298]:
def tune_and_evaluate_knn(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # Define the KNN model
    knn = KNeighborsClassifier()

    # Define the parameter grid
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }

    # Perform Grid Search
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='balanced_accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best estimator
    best_knn = grid_search.best_estimator_
    print("Best parameters:", grid_search.best_params_)
    # save the best hyperparameters
    save_best_params('KNN_Optimized', grid_search.best_params_, results_dir)
    
    # Define splits
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]

    # Evaluate the model
    results, roc_data, prc_data, test_roc_data, test_prc_data = evaluate_classification(best_knn, splits, model_name="KNN_Optimized")
    
    # Save results
    save_model_results(results, "KNN_Optimized", results_dir)
    
    # Plot ROC and PRC curves
    plot_roc_curves(roc_data, "KNN_Optimized", results_dir, filename='roc_curves.png')
    plot_prc_curves(prc_data, "KNN_Optimized", results_dir, filename='prc_curves.png')

    # Store ROC and PRC data for the test set
    all_roc_data["KNN_Optimized"] = test_roc_data["KNN_Optimized"]
    all_prc_data["KNN_Optimized"] = test_prc_data["KNN_Optimized"]

    return results, roc_data, prc_data

In [299]:
def evaluate_majority_class_classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train a dummy classifier that predicts the majority class
    majority_class_clf = DummyClassifier(strategy='most_frequent', random_state=42)
    majority_class_clf.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    majority_results, majority_roc_data, majority_prc_data, test_roc_data, test_prc_data = evaluate_classification(majority_class_clf, splits, model_name="Majority_Class_Classifier")
    save_model_results(majority_results, "Majority_Class_Classifier", results_dir)
    
    plot_roc_curves(majority_roc_data, "Majority_Class_Classifier", results_dir, filename='roc_curves.png')
    plot_prc_curves(majority_prc_data, "Majority_Class_Classifier", results_dir, filename='prc_curves.png')

    all_roc_data["Majority_Class_Classifier"] = test_roc_data["Majority_Class_Classifier"]
    all_prc_data["Majority_Class_Classifier"] = test_prc_data["Majority_Class_Classifier"]

    return majority_results, majority_roc_data, majority_prc_data

In [300]:
def evaluate_Chance_Class_Classifier(X_train, y_train, X_val, y_val, X_test, y_test, results_dir):
    # train a dummy classifier that predicts a random class
    random_class_clf = DummyClassifier(strategy='uniform', random_state=42)
    random_class_clf.fit(X_train, y_train)
    
    splits = [(X_train, y_train, 'train'), (X_val, y_val, 'val'), (X_test, y_test, 'test')]
    random_results, random_roc_data, random_prc_data, test_roc_data, test_prc_data = evaluate_classification(random_class_clf, splits, model_name="Chance_Class_Classifier")
    save_model_results(random_results, "Chance_Class_Classifier", results_dir)
    
    plot_roc_curves(random_roc_data, "Chance_Class_Classifier", results_dir, filename='roc_curves.png')
    plot_prc_curves(random_prc_data, "Chance_Class_Classifier", results_dir, filename='prc_curves.png')

    all_roc_data["Chance_Class_Classifier"] = test_roc_data["Chance_Class_Classifier"]
    all_prc_data["Chance_Class_Classifier"] = test_prc_data["Chance_Class_Classifier"]

    return random_results, random_roc_data, random_prc_data

In [301]:
def run_all_models(data_dir, results_dir):
    global all_roc_data, all_prc_data
    all_roc_data = {}
    all_prc_data = {}
    
    X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_data(data_dir)
    
    # create a separate directory for this dataset's results
    dataset_name = data_dir.stem
    dataset_results_dir = results_dir / dataset_name
    os.makedirs(dataset_results_dir, exist_ok=True)
    
    # SVC
    SVC_results, SVC_roc_data, SVC_prc_data = tune_and_evaluate_linear_svc(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)

    # Random Forest
    results_rf, roc_data_rf, prc_data_rf = tune_and_evaluate_rf(X_train, y_train, X_val, y_val, X_test, y_test, feature_names, dataset_results_dir)
    
    # XGBoost
    results_xgb, roc_data_xgb, prc_data_xgb = tune_and_evaluate_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Neural Network
    results_nn, roc_data_nn, prc_data_nn = tune_and_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Logistic Regression
    basic_results, basic_roc_data, basic_prc_data = evaluate_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Elastic Net Logistic Regression
    enet_results, enet_roc_data, enet_prc_data = evaluate_elastic_net_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # KNN Classifier
    knn_results, knn_roc_data, knn_prc_data = tune_and_evaluate_knn(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Majority Class Classifier
    majority_results, majority_roc_data, majority_prc_data = evaluate_majority_class_classifier(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Random Class Classifier
    random_results, random_roc_data, random_prc_data = evaluate_Chance_Class_Classifier(X_train, y_train, X_val, y_val, X_test, y_test, dataset_results_dir)
    
    # Plot combined PRC and ROC curves for all models for the current dataset
    plot_combined_prc_curves(all_prc_data, dataset_results_dir, filename='all_prc_curves.png')
    plot_combined_roc_curves(all_roc_data, dataset_results_dir, filename='all_roc_curves.png')
    save_roc_auc_scores(all_roc_data, dataset_results_dir)
    
    selected_roc_data = {
        "RF": all_roc_data["Random_Forest_Optimized"],
        "XG": all_roc_data["XGBoost_Optimized"],
        "SVC": all_roc_data["Linear_SVC_Optimized"],
        "LogReg": all_roc_data["Elastic_Net_Logistic_Regression_Best"],
        "KNN": all_roc_data["KNN_Optimized"],
        "NN": all_roc_data["Neural_Network"]
    }
    
    plot_selected_roc_curves(selected_roc_data, dataset_results_dir, filename='selected_roc_curves.png')



#root / "data" / "backup"]
#root / "data" / "CLR",
#root / "data" / "CLR_nonreduced",
#root / "data" / "CLR_PCA", 
#root / "data" / "CLR_SVD", 
#root / "data" / "reduced_0_1", 
#root / "data" / "reduced_0_1_PCA", 
#root / "data" / "reduced_0_1_SVD", 
#root / "data" / "baseline_demographic",
#root / "data" / "non_reduced"]

In [302]:
def main():
    root = Path.cwd().parents[1]
    data_dir = root / "data"
    results_dir = root / "results" / "model_reports"
    
    for current_dir in data_dir.iterdir():
        if current_dir.is_dir():
            # skip the directory if it's named 'raw'
            if current_dir.name == 'raw':
                print(f"Skipping {current_dir}")
                continue
            print(f"Now processing {current_dir}")
            run_all_models(current_dir, results_dir)

main()


Now processing /Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/data/baseline_smote
Fitting 5 folds for each of 60 candidates, totalling 300 fits


150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.6913010967098704, ROC_AUC: 0.767356268681493, PRC_AUC: 0.761884305854288
{'0': {'precision': 0.7034190299496422, 'recall': 0.6615154536390827, 'f1-score': 0.6818240205523443, 'support': 4012.0}, '1': {'precision': 0.6805457539402493, 'recall': 0.7210867397806581, 'f1-score': 0.7002299406995038, 'support': 4012.0}, 'accuracy': 0.6913010967098704, 'macro avg': {'precision': 0.6919823919449457, 'recall': 0.6913010967098704, 'f1-score': 0.691026980625924, 'support': 8024.0}, 'weighted avg': {'precision': 0.6919823919449458, 'recall': 0.6913010967098704, 'f1-score': 0.6910269806259239, 'support': 8024.0}}
Linear_SVC_Optimized - val - Accuracy: 0.653160453808752, ROC_AUC: 0.7122606019151846, PRC_AUC: 0.5129792179292701
{'0': {'precision': 0.8016759776536313, 'recall': 0.6674418604651163, 'f1-score': 0.7284263959390863, 'support': 860.0}, '1': {'precision': 0.44787644787644787, 'recall': 0.6203208556149733, 'f1-score': 0.5201793721973094, 'support': 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5030 - loss: 0.8979 - val_accuracy: 0.5859 - val_loss: 0.6817
Epoch 2/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 521us/step - accuracy: 0.6181 - loss: 0.6620 - val_accuracy: 0.6418 - val_loss: 0.6563
Epoch 3/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 523us/step - accuracy: 0.6614 - loss: 0.6296 - val_accuracy: 0.6791 - val_loss: 0.5922
Epoch 4/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 520us/step - accuracy: 0.6648 - loss: 0.6139 - val_accuracy: 0.6702 - val_loss: 0.6023
Epoch 5/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step - accuracy: 0.6698 - loss: 0.6177 - val_accuracy: 0.6677 - val_loss: 0.5994
Epoch 6/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527us/step - accuracy: 0.6703 - loss: 0.5996 - val_accuracy: 0.6872 - val_loss: 0.5664
Epoch 7/10
[1m126/126[0m [

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Majority_Class_Classifier - train - Accuracy: 0.5, ROC_AUC: 0.5, PRC_AUC: 0.75
{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 4012.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4012.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 8024.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 8024.0}}
Majority_Class_Classifier - val - Accuracy: 0.6969205834683955, ROC_AUC: 0.5, PRC_AUC: 0.6515397082658023
{'0': {'precision': 0.6969205834683955, 'recall': 1.0, 'f1-score': 0.8213944603629417, 'support': 860.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 374.0}, 'accuracy': 0.6969205834683955, 'macro avg': {'precision': 0.34846029173419774, 'recall': 0.5, 'f1-score': 0.41069723018147086, 'support': 1234.0}, 'weighted avg': {'precision': 0.4856982996619288, 'recall': 0.6969205834683955, 'f1-score': 0.5724

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.5887421022400919, ROC_AUC: 0.6217573862262341, PRC_AUC: 0.5995068922392656
{'0': {'precision': 0.6393146979260595, 'recall': 0.40723721998851237, 'f1-score': 0.4975438596491228, 'support': 1741.0}, '1': {'precision': 0.5651074589127687, 'recall': 0.7702469844916715, 'f1-score': 0.6519202722411278, 'support': 1741.0}, 'accuracy': 0.5887421022400919, 'macro avg': {'precision': 0.6022110784194141, 'recall': 0.5887421022400919, 'f1-score': 0.5747320659451253, 'support': 3482.0}, 'weighted avg': {'precision': 0.602211078419414, 'recall': 0.5887421022400919, 'f1-score': 0.5747320659451254, 'support': 3482.0}}
Linear_SVC_Optimized - val - Accuracy: 0.4740680713128039, ROC_AUC: 0.5590878000248725, PRC_AUC: 0.3604177448452326
{'0': {'precision': 0.747072599531616, 'recall': 0.37093023255813956, 'f1-score': 0.49572649572649574, 'support': 860.0}, '1': {'precision': 0.32961586121437425, 'recall': 0.7112299465240641, 'f1-score': 0.4504657070279424, 'suppo

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5249 - loss: 0.7287 - val_accuracy: 0.5543 - val_loss: 0.6836
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 729us/step - accuracy: 0.5739 - loss: 0.6730 - val_accuracy: 0.5211 - val_loss: 0.6864
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step - accuracy: 0.5833 - loss: 0.6727 - val_accuracy: 0.4968 - val_loss: 0.6796
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step - accuracy: 0.5838 - loss: 0.6689 - val_accuracy: 0.4951 - val_loss: 0.6881
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - accuracy: 0.5987 - loss: 0.6658 - val_accuracy: 0.4862 - val_loss: 0.6981
Epoch 6/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 675us/step - accuracy: 0.6011 - loss: 0.6655 - val_accuracy: 0.4806 - val_loss: 0.6917
Epoch 7/10
[1m55/55[0m [32m━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Majority_Class_Classifier - train - Accuracy: 0.5, ROC_AUC: 0.5, PRC_AUC: 0.75
{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1741.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1741.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}}
Majority_Class_Classifier - val - Accuracy: 0.6969205834683955, ROC_AUC: 0.5, PRC_AUC: 0.6515397082658023
{'0': {'precision': 0.6969205834683955, 'recall': 1.0, 'f1-score': 0.8213944603629417, 'support': 860.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 374.0}, 'accuracy': 0.6969205834683955, 'macro avg': {'precision': 0.34846029173419774, 'recall': 0.5, 'f1-score': 0.41069723018147086, 'support': 1234.0}, 'weighted avg': {'precision': 0.4856982996619288, 'recall': 0.6969205834683955, 'f1-score': 0.5724

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.7020684860073005, ROC_AUC: 0.6469826591449088, PRC_AUC: 0.42636493866330255
{'0': {'precision': 0.7056928034371643, 'recall': 0.9825523429710867, 'f1-score': 0.8214211294019588, 'support': 4012.0}, '1': {'precision': 0.5808383233532934, 'recall': 0.05571510626076967, 'f1-score': 0.10167714884696016, 'support': 1741.0}, 'accuracy': 0.7020684860073005, 'macro avg': {'precision': 0.6432655633952289, 'recall': 0.5191337246159282, 'f1-score': 0.4615491391244595, 'support': 5753.0}, 'weighted avg': {'precision': 0.6679087516683447, 'recall': 0.7020684860073005, 'f1-score': 0.6036088105863404, 'support': 5753.0}}
Linear_SVC_Optimized - val - Accuracy: 0.7058346839546191, ROC_AUC: 0.5984392488496456, PRC_AUC: 0.3926765622761981
{'0': {'precision': 0.707256046705588, 'recall': 0.986046511627907, 'f1-score': 0.8237008256435163, 'support': 860.0}, '1': {'precision': 0.6571428571428571, 'recall': 0.06149732620320856, 'f1-score': 0.11246943765281174, 'supp

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6204 - loss: 0.7395 - val_accuracy: 0.6912 - val_loss: 0.6120
Epoch 2/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 555us/step - accuracy: 0.6928 - loss: 0.5887 - val_accuracy: 0.6961 - val_loss: 0.6016
Epoch 3/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 545us/step - accuracy: 0.7060 - loss: 0.5655 - val_accuracy: 0.6937 - val_loss: 0.5974
Epoch 4/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 550us/step - accuracy: 0.7135 - loss: 0.5506 - val_accuracy: 0.6969 - val_loss: 0.5974
Epoch 5/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 551us/step - accuracy: 0.7120 - loss: 0.5472 - val_accuracy: 0.7018 - val_loss: 0.6053
Epoch 6/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 557us/step - accuracy: 0.7291 - loss: 0.5344 - val_accuracy: 0.6896 - val_loss: 0.5975
Epoch 7/10
[1m90/90[0m [32m━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Majority_Class_Classifier - train - Accuracy: 0.6973752824613245, ROC_AUC: 0.5, PRC_AUC: 0.6513123587693377
{'0': {'precision': 0.6973752824613245, 'recall': 1.0, 'f1-score': 0.8217101894521249, 'support': 4012.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1741.0}, 'accuracy': 0.6973752824613245, 'macro avg': {'precision': 0.34868764123066226, 'recall': 0.5, 'f1-score': 0.41085509472606246, 'support': 5753.0}, 'weighted avg': {'precision': 0.4863322845880122, 'recall': 0.6973752824613245, 'f1-score': 0.5730403754705241, 'support': 5753.0}}
Majority_Class_Classifier - val - Accuracy: 0.6969205834683955, ROC_AUC: 0.5, PRC_AUC: 0.6515397082658023
{'0': {'precision': 0.6969205834683955, 'recall': 1.0, 'f1-score': 0.8213944603629417, 'support': 860.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 374.0}, 'accuracy': 0.6969205834683955, 'macro avg': {'precision': 0.34846029173419774, 'recall': 0.5, 'f1-score': 0.41069723018147086, 'support': 123

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.6099942561746123, ROC_AUC: 0.6507444043890612, PRC_AUC: 0.6240213950259851
{'0': {'precision': 0.6236281471917366, 'recall': 0.5548535324526135, 'f1-score': 0.5872340425531914, 'support': 1741.0}, '1': {'precision': 0.5990688049663735, 'recall': 0.6651349798966112, 'f1-score': 0.6303756124115406, 'support': 1741.0}, 'accuracy': 0.6099942561746123, 'macro avg': {'precision': 0.6113484760790551, 'recall': 0.6099942561746123, 'f1-score': 0.6088048274823661, 'support': 3482.0}, 'weighted avg': {'precision': 0.6113484760790551, 'recall': 0.6099942561746123, 'f1-score': 0.608804827482366, 'support': 3482.0}}
Linear_SVC_Optimized - val - Accuracy: 0.5405186385737439, ROC_AUC: 0.5962162666335032, PRC_AUC: 0.37731686689075306
{'0': {'precision': 0.7470489038785835, 'recall': 0.5151162790697674, 'f1-score': 0.6097728836889195, 'support': 860.0}, '1': {'precision': 0.3494539781591264, 'recall': 0.5989304812834224, 'f1-score': 0.4413793103448276, 'support

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5351 - loss: 0.8111 - val_accuracy: 0.5308 - val_loss: 0.7182
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.5818 - loss: 0.6821 - val_accuracy: 0.5235 - val_loss: 0.7312
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - accuracy: 0.6109 - loss: 0.6528 - val_accuracy: 0.5478 - val_loss: 0.7130
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.6471 - loss: 0.6246 - val_accuracy: 0.5656 - val_loss: 0.6824
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 666us/step - accuracy: 0.6482 - loss: 0.6163 - val_accuracy: 0.5251 - val_loss: 0.7332
Epoch 6/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - accuracy: 0.6724 - loss: 0.6011 - val_accuracy: 0.5251 - val_loss: 0.7251
Epoch 7/10
[1m55/55[0m [32m━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Majority_Class_Classifier - train - Accuracy: 0.5, ROC_AUC: 0.5, PRC_AUC: 0.75
{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1741.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1741.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}}
Majority_Class_Classifier - val - Accuracy: 0.6969205834683955, ROC_AUC: 0.5, PRC_AUC: 0.6515397082658023
{'0': {'precision': 0.6969205834683955, 'recall': 1.0, 'f1-score': 0.8213944603629417, 'support': 860.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 374.0}, 'accuracy': 0.6969205834683955, 'macro avg': {'precision': 0.34846029173419774, 'recall': 0.5, 'f1-score': 0.41069723018147086, 'support': 1234.0}, 'weighted avg': {'precision': 0.4856982996619288, 'recall': 0.6969205834683955, 'f1-score': 0.5724

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.6743251005169443, ROC_AUC: 0.7477147591898732, PRC_AUC: 0.737812385028359
{'0': {'precision': 0.6874613959234095, 'recall': 0.6392877656519241, 'f1-score': 0.6625, 'support': 1741.0}, '1': {'precision': 0.6629092860976918, 'recall': 0.7093624353819644, 'f1-score': 0.6853496115427303, 'support': 1741.0}, 'accuracy': 0.6743251005169443, 'macro avg': {'precision': 0.6751853410105506, 'recall': 0.6743251005169443, 'f1-score': 0.6739248057713652, 'support': 3482.0}, 'weighted avg': {'precision': 0.6751853410105507, 'recall': 0.6743251005169443, 'f1-score': 0.6739248057713652, 'support': 3482.0}}
Linear_SVC_Optimized - val - Accuracy: 0.6555915721231766, ROC_AUC: 0.7323700410396717, PRC_AUC: 0.5488544067054603
{'0': {'precision': 0.8222222222222222, 'recall': 0.6453488372093024, 'f1-score': 0.7231270358306189, 'support': 860.0}, '1': {'precision': 0.4543828264758497, 'recall': 0.679144385026738, 'f1-score': 0.5444801714898178, 'support': 374.0}, 'ac

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5004 - loss: 1.0795 - val_accuracy: 0.4003 - val_loss: 0.7041
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 798us/step - accuracy: 0.5337 - loss: 0.6867 - val_accuracy: 0.7156 - val_loss: 0.6251
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 725us/step - accuracy: 0.6210 - loss: 0.6633 - val_accuracy: 0.6515 - val_loss: 0.6460
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step - accuracy: 0.6395 - loss: 0.6507 - val_accuracy: 0.6669 - val_loss: 0.6117
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 709us/step - accuracy: 0.6497 - loss: 0.6331 - val_accuracy: 0.6823 - val_loss: 0.5834
Epoch 6/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 697us/step - accuracy: 0.6655 - loss: 0.6247 - val_accuracy: 0.6337 - val_loss: 0.6424
Epoch 7/10
[1m55/55[0m [32m━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Majority_Class_Classifier - train - Accuracy: 0.5, ROC_AUC: 0.5, PRC_AUC: 0.75
{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1741.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1741.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 3482.0}}
Majority_Class_Classifier - val - Accuracy: 0.6969205834683955, ROC_AUC: 0.5, PRC_AUC: 0.6515397082658023
{'0': {'precision': 0.6969205834683955, 'recall': 1.0, 'f1-score': 0.8213944603629417, 'support': 860.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 374.0}, 'accuracy': 0.6969205834683955, 'macro avg': {'precision': 0.34846029173419774, 'recall': 0.5, 'f1-score': 0.41069723018147086, 'support': 1234.0}, 'weighted avg': {'precision': 0.4856982996619288, 'recall': 0.6969205834683955, 'f1-score': 0.5724

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/emmaolsen/Desktop/datsci_backup_23/MB-LM-24/venv/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 325, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_libli

Linear_SVC_Optimized - train - Accuracy: 0.7212608158220025, ROC_AUC: 0.8034086245437224, PRC_AUC: 0.8150089585884177
{'0': {'precision': 0.6977463543968184, 'recall': 0.7807169344870211, 'f1-score': 0.7369035118422588, 'support': 4045.0}, '1': {'precision': 0.7511223344556678, 'recall': 0.661804697156984, 'f1-score': 0.703640425811539, 'support': 4045.0}, 'accuracy': 0.7212608158220025, 'macro avg': {'precision': 0.7244343444262431, 'recall': 0.7212608158220025, 'f1-score': 0.7202719688268988, 'support': 8090.0}, 'weighted avg': {'precision': 0.7244343444262431, 'recall': 0.7212608158220025, 'f1-score': 0.7202719688268989, 'support': 8090.0}}
Linear_SVC_Optimized - val - Accuracy: 0.6669354838709678, ROC_AUC: 0.6334128036958357, PRC_AUC: 0.3929819544795854
{'0': {'precision': 0.7451403887688985, 'recall': 0.7958477508650519, 'f1-score': 0.769659788064696, 'support': 867.0}, '1': {'precision': 0.43630573248407645, 'recall': 0.3672922252010724, 'f1-score': 0.3988355167394469, 'support':