In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, balanced_accuracy_score, roc_curve, auc, precision_recall_curve, confusion_matrix
pd.set_option('display.max_colwidth', None)

## Metrics Tables - Standard Thresholds

In [None]:
def evaluate_predictions(predictionfile, outcome_col="outcome", pred_col="Prediction", pred_threshold=0.5,
                         cohort_file="processed_data/cohort.csv"):
    """
    Compute evaluation metrics for the predictions stored in one CSV file.

    The predictions are inner-joined with the cohort table on 'MRN', so only
    rows present in both files are scored. The joined table must contain
    `outcome_col` and `pred_col`.

    Parameters:
    - predictionfile: Path to the CSV file containing the predictions.
    - outcome_col: Name of the column with true binary outcomes (0 or 1).
    - pred_col: Name of the column with predicted probabilities.
    - pred_threshold: Scores >= this value are classified as positive.
    - cohort_file: Path to the cohort CSV that is joined to the predictions.

    Returns:
    - A one-row DataFrame with the columns Model, AUROC, AUPRC, Accuracy,
      Sensitivity, Specificity, PPV (Precision), NPV and F1 Score.
    """
    df_predictions = pd.read_csv(predictionfile)
    cohort = pd.read_csv(cohort_file)
    df = pd.merge(cohort, df_predictions, on='MRN', how='inner')

    # Extract true labels and predicted probabilities
    y_true = df[outcome_col]
    y_scores = df[pred_col]

    # Threshold-independent metrics
    auroc = roc_auc_score(y_true, y_scores)

    # AUPRC is the trapezoidal area under the precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall, precision)

    # Convert probabilities to binary predictions using the requested threshold
    y_pred = (y_scores >= pred_threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)

    # Pin the label order so the matrix is always 2x2 (TN, FP, FN, TP), even when
    # one of the classes is not observed at this threshold.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Each ratio falls back to 0 when its denominator is empty
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0

    # zero_division=0 gives the same value as the default but without the warning,
    # matching the 0 fallback used for the ratios above.
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # NOTE(review): the paths used in this notebook start with "out/", so this prefix
    # is normally absent and Model is the unchanged path; the callers in this notebook
    # overwrite Model via .assign() anyway.
    results = pd.DataFrame([{
        "Model": predictionfile.removeprefix("final_predictions_"),
        "AUROC": auroc,
        "AUPRC": auprc,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "PPV (Precision)": ppv,
        "NPV": npv,
        "F1 Score": f1
    }])

    return results

In [None]:
# Inputs for the figures below: the cohort table, the non-one-hot-encoded
# demographics table, and the test-set predictions of a single model.
cohort = pd.read_csv("processed_data/cohort.csv")
demo_df = pd.read_csv('processed_data/demo_not_1h_encoded.csv')
df_predictions = pd.read_csv(
    'out/final_test_predictions_AoUencoder_StanfordFinetune-15-100pct.csv'
)

In [None]:
# Join predictions to the cohort (inner: only MRNs present in both) and attach
# demographics (left: keep every scored row even without a demographics match).
df = pd.merge(cohort, df_predictions, on='MRN', how='inner')
df = df.merge(demo_df, on='MRN', how='left')

# Collapse small race categories, then let Hispanic ethnicity take precedence over race.
df['race_consolidated'] = df['race'].replace({'aian': 'other', 'nhpi': 'asian'})
df.loc[df['ethnicity'].str.lower() == 'hispanic', 'race_consolidated'] = 'hispanic'

# Rows without a race value (possible after the left join) previously crashed the
# labelling step; group them with 'Other' instead.
# NOTE(review): confirm 'Other' is the intended bucket for missing race.
df['race_consolidated'] = df['race_consolidated'].fillna('other').str.capitalize()

# Every label except 'Hispanic' and 'Other' gets the 'Non-Hispanic ' prefix.
keeps_plain_label = df['race_consolidated'].isin(['Hispanic', 'Other'])
df['race_consolidated'] = df['race_consolidated'].where(
    keeps_plain_label, 'Non-Hispanic ' + df['race_consolidated']
)

In [None]:
# Preview the first rows of the demographics table.
# NOTE(review): this table is keyed by MRN, so the rendered output is patient-level
# data; clear it before sharing the notebook.
demo_df.head()

In [None]:
# Group sizes for the original race categories in the merged table
df['race'].value_counts()

In [None]:
# Group sizes after consolidation; these are the strata used for the per-group curves
df['race_consolidated'].value_counts()

In [None]:
# Four-panel figure: overall ROC (A) and precision-recall (B) curves for the merged
# predictions, and the same curves stratified by consolidated race/ethnicity (C, D).
y_true = df["outcome"]
y_scores = df["Prediction"]

# Overall ROC curve and AUROC
fpr, tpr, _ = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# Overall precision-recall curve and AUPRC (trapezoidal area under the curve)
precision, recall, _ = precision_recall_curve(y_true, y_scores)
pr_auc = auc(recall, precision)

fig, ax = plt.subplots(2, 2, figsize=(14, 12))

ax[0, 0].plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUROC = {roc_auc:.3f})')
ax[0, 1].plot(recall, precision, color='green', lw=2, label=f'PR curve (AUPRC = {pr_auc:.3f})')

# Stratified curves. Group-level variables get their own names so the overall
# y_true / fpr / precision ... above are not silently overwritten.
for race in df['race_consolidated'].dropna().unique():
    race_subset = df[df['race_consolidated'] == race]
    race_true = race_subset["outcome"]
    race_scores = race_subset["Prediction"]

    # ROC and PR curves are undefined when a group contains a single outcome class
    if race_true.nunique() < 2:
        print(f"Skipping {race}: only one outcome class among {len(race_subset)} rows")
        continue

    # ROC and AUROC
    race_fpr, race_tpr, _ = roc_curve(race_true, race_scores)
    race_roc_auc = auc(race_fpr, race_tpr)
    ax[1, 0].plot(race_fpr, race_tpr, lw=2, label=f'{race} (AUROC = {race_roc_auc:.3f})')

    # Precision-Recall and AUPRC
    race_precision, race_recall, _ = precision_recall_curve(race_true, race_scores)
    race_pr_auc = auc(race_recall, race_precision)
    ax[1, 1].plot(race_recall, race_precision, lw=2, label=f'{race} (AUPRC = {race_pr_auc:.3f})')

# Random-chance diagonal on both ROC panels (drawn after the curves, as before)
for roc_ax in (ax[0, 0], ax[1, 0]):
    roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')

# Shared styling: (axes, panel letter, x label, y label, title, legend location)
panels = [
    (ax[0, 0], 'A', 'False Positive Rate', 'True Positive Rate', 'ROC Curve', "lower right"),
    (ax[0, 1], 'B', 'Recall', 'Precision', 'Precision-Recall Curve', "lower left"),
    (ax[1, 0], 'C', 'False Positive Rate', 'True Positive Rate', 'ROC Curves by Race/Ethnicity', "lower right"),
    (ax[1, 1], 'D', 'Recall', 'Precision', 'Precision-Recall Curves by Race/Ethnicity', "lower left"),
]
for panel_ax, letter, xlabel, ylabel, title, legend_loc in panels:
    panel_ax.set_xlim([0.0, 1.0])
    panel_ax.set_ylim([0.0, 1.05])
    panel_ax.set_xlabel(xlabel, fontsize=20)
    panel_ax.set_ylabel(ylabel, fontsize=20)
    panel_ax.set_title(title, fontsize=24)
    panel_ax.legend(loc=legend_loc, fontsize=16)
    panel_ax.grid()
    # Panel letter just outside the top-left corner, in axes coordinates
    panel_ax.text(-0.1, 1.1, letter, transform=panel_ax.transAxes, fontsize=24, fontweight='bold', va='top', ha='right')

plt.tight_layout()
# savefig does not create missing directories
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/rocs_and_prs.tiff', bbox_inches='tight', format='tiff')
plt.show()



In [None]:
# Example for fixed number of trainable layers
# NOTE(review): plot_multiple_roc_pr_curves is neither defined nor imported in the
# visible part of this notebook, so on a fresh kernel this cell raises NameError
# unless the function is provided elsewhere -- TODO confirm / restore its definition.
# glob returns matches in arbitrary order; sort so the files are passed in a stable order.
prediction_files = sorted(glob.glob("out/final_test_predictions_AoUencoder_StanfordFinetune-15-*pct.csv"))

plot_multiple_roc_pr_curves(prediction_files)



In [None]:
# This is a baseline set of metrics assuming a 0.5 threshold for prediction
# (the default pred_threshold of evaluate_predictions).

# glob returns matches in arbitrary order; sort so the rows of the output CSV are reproducible.
prediction_files = sorted(glob.glob("out/final_test_predictions_*.csv"))
# Uncomment this line instead to look at the retrained autoencoder predictions
# prediction_files = sorted(glob.glob("out/retrain_autoencoders_final_test_predictions_*.csv"))

# pd.concat fails with an unhelpful message on an empty list; say what is missing instead.
if not prediction_files:
    raise FileNotFoundError("No prediction files matched the glob pattern under out/")

# Compute metrics for all models; Model holds the full path of each prediction file
all_metrics = pd.concat(
    [evaluate_predictions(file).assign(Model=file) for file in prediction_files],
    ignore_index=True,
)
all_metrics.to_csv("pre_threshold_test_metrics_5_8.csv", index=False)

## Finding the Best Decision Threshold

In [None]:
def evaluate_threshold(predictionfile, outcome_col="outcome", pred_col="Prediction", threshold=0.5,
                       cohort_file="processed_data/cohort.csv"):
    """
    Compute classification metrics at a specified threshold.

    The predictions are inner-joined with the cohort table on 'MRN', so only
    rows present in both files are scored. The joined table must contain
    `outcome_col` and `pred_col`.

    Parameters:
    - predictionfile: Path to the CSV file containing the predictions.
    - outcome_col: Name of the column with true binary outcomes (0 or 1).
    - pred_col: Name of the column with predicted probabilities.
    - threshold: Decision threshold; scores >= threshold are classified as positive.
    - cohort_file: Path to the cohort CSV that is joined to the predictions.

    Returns:
    - Dictionary with the keys Threshold, Accuracy, Precision,
      Recall (Sensitivity), F1-score, Specificity, NPV, AUROC and AUPRC.
    """
    df_predictions = pd.read_csv(predictionfile)
    cohort = pd.read_csv(cohort_file)
    df = pd.merge(cohort, df_predictions, on='MRN', how='inner')

    # Extract true labels and predicted probabilities
    y_true = df[outcome_col]
    y_scores = df[pred_col]

    # Convert probabilities to binary predictions
    y_pred = (y_scores >= threshold).astype(int)

    # Standard metrics; zero_division=0 on all three keeps extreme thresholds
    # (nothing or everything predicted positive) quiet and consistent.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Pin the label order so the matrix is always 2x2 (TN, FP, FN, TP), even when
    # one of the classes is not observed at this threshold.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Specificity (true negative rate) and negative predictive value, 0 when undefined
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0

    # Threshold-independent metrics, reported alongside for convenience
    auroc = roc_auc_score(y_true, y_scores)
    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_scores)
    auprc = auc(recall_curve, precision_curve)

    return {
        "Threshold": threshold,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "F1-score": f1,
        "Specificity": specificity,
        "NPV": npv,
        "AUROC": auroc,
        "AUPRC": auprc
    }

In [None]:
def find_best_threshold(predictionfile, outcome_col="outcome", pred_col="Prediction", thresholds=np.arange(0.00, 1.0, 0.05),
                        cohort_file="processed_data/cohort.csv"):
    """
    Find the threshold that maximizes the F1-score.

    The prediction and cohort files are read and joined on 'MRN' once; each
    candidate threshold is then scored with f1_score on the binarized
    predictions (scores >= threshold are positive).

    Parameters:
    - predictionfile: Path to the CSV file containing predictions.
    - outcome_col: Column name for the ground truth binary labels.
    - pred_col: Column name for the predicted probabilities.
    - thresholds: List or array of threshold values to test.
    - cohort_file: Path to the cohort CSV that is joined to the predictions.

    Returns:
    - Best threshold and corresponding max F1-score. Ties go to the earliest
      threshold in `thresholds`. If `thresholds` is empty, (None, 0) is returned.
    """
    # Load once instead of once per threshold
    df_predictions = pd.read_csv(predictionfile)
    cohort = pd.read_csv(cohort_file)
    df = pd.merge(cohort, df_predictions, on='MRN', how='inner')
    y_true = df[outcome_col]
    y_scores = df[pred_col]

    best_threshold = None
    best_f1 = 0

    for threshold in thresholds:
        y_pred = (y_scores >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        # Always accept the first candidate so a real threshold is returned even
        # when every F1-score is 0; afterwards only a strict improvement wins.
        if best_threshold is None or f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold, best_f1


In [None]:
# Validation-prediction files to scan. N fills the "{N}pct" part of the file name and
# num_unfrozen_layer the number in front of it.
Nlist = [20, 40, 60, 80, 100]
num_unfrozen_layers = [0, 1, 3, 6, 7, 8, 9, 11, 12, 15, 16, 18]

# Collect one record per file and build the DataFrame once, instead of growing it
# with pd.concat inside the loop.
records = []
for N in Nlist:
    for num_unfrozen_layer in num_unfrozen_layers:
        filename = f"out/final_val_predictions_AoUencoder_StanfordFinetune-{num_unfrozen_layer}-{N}pct.csv"
        best_threshold, best_f1 = find_best_threshold(filename)
        records.append({
            "Model": filename,
            "Best Threshold": best_threshold,
            "Best F1-score": best_f1,
        })

results_df = pd.DataFrame(records, columns=["Model", "Best Threshold", "Best F1-score"])


In [None]:
# Preview the per-file best thresholds and F1-scores found on the validation predictions
results_df.head()

In [None]:
# Persist the selected thresholds and their F1-scores (row index is not written)
results_df.to_csv("thresholds_f1s_1_29.csv", index=False)

In [None]:
# Pick the row with the highest "Best F1-score" and show it, followed by its
# (untruncated) file name on a separate line.
best_row_label = results_df['Best F1-score'].idxmax()
max_value_row = results_df.loc[best_row_label]

print(max_value_row, max_value_row['Model'], sep='\n')


In [None]:
# After finding the best thresholds for each model on the validation set, get test metrics.
# Each threshold is applied unchanged to the matching test-prediction file.
# regex=False: this is a literal substring swap, independent of the pandas default.
results_df['Test Model'] = results_df['Model'].str.replace(
    'final_val_predictions', 'final_test_predictions', regex=False
)

# ignore_index gives the combined table a clean 0..n-1 index instead of repeated zeros
test_metrics = pd.concat(
    [
        evaluate_predictions(test_file, pred_threshold=best_threshold).assign(Model=test_file)
        for test_file, best_threshold in zip(results_df['Test Model'], results_df['Best Threshold'])
    ],
    ignore_index=True,
)

# Display the combined results
print(test_metrics.head())


In [None]:
# Persist the test-set metrics computed at the validation-selected thresholds
test_metrics.to_csv("test_metrics_1_20.csv", index=False)