In [None]:
import os
import platform
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from matplotlib import font_manager as fm

import utils
from custom_colors import *

import warnings

# Use Source Sans Pro for all matplotlib text when the font file is available.
# The path points at a user-local Windows font directory, so on any other machine the
# file will be missing; in that case keep matplotlib's default font instead of failing
# in the very first cell.
font_path = r'C:\Users\Felix\AppData\Local\Microsoft\Windows\Fonts\SourceSansPro-Regular.ttf'
if os.path.isfile(font_path):
    fm.fontManager.addfont(font_path)
    source_sans_pro = fm.FontProperties(fname=font_path)
    plt.rcParams['font.family'] = source_sans_pro.get_name()
else:
    warnings.warn(f"Font file not found at {font_path!r}; using matplotlib's default font.")
    # Still define the name so later references to `source_sans_pro` keep working.
    source_sans_pro = fm.FontProperties()

In [None]:
# Toggle for persisting results: when True, later cells write the styled tables (HTML)
# and the bar plot (PDF) into the directory named by the OUTPUT_DIR environment variable.
STORE = False
# STORE = True

In [None]:
# Load the exported runs table and discard the row labelled 0.
# NOTE(review): presumably row 0 is a non-data row (e.g. a secondary header) - confirm.
df_raw = pd.read_csv("data/runs-17_19_25-05-May-25.csv").drop(index=0)

In [None]:
# Display the loaded runs table for a quick visual check.
df_raw

In [None]:
# Print every column whose name contains the substring "ccur"
# (presumably to locate the accuracy-related columns).
for column_name in df_raw.columns:
    if "ccur" not in column_name:
        continue
    print(column_name)

In [None]:
# Other scores: the model name next to its remaining evaluation metrics
df_raw.loc[:, [
    "model_config.model",
    "AUC_val",
    "F1_val",
    "Precision_val",
    "Recall_val",
    "Accuracy_val",
]]

In [None]:
# Other scores: the model name next to its remaining evaluation metrics
# NOTE(review): this cell shows the same selection as the cell before it.
df_raw.loc[:, [
    "model_config.model",
    "AUC_val",
    "F1_val",
    "Precision_val",
    "Recall_val",
    "Accuracy_val",
]]

In [None]:
# Keep only the columns needed for the benchmark tables. The explicit .copy() makes `df`
# an independent frame, so the column assignments in the following cells do not operate
# on a slice of df_raw (which would trigger SettingWithCopyWarning).
df = df_raw[[
    "experiment",
    'AUC_val',
    'AUC_train',
    'training_time_min',
    'avg_epoch_time',
    'nr_params',
    'nr_flops',
    "best_epoch",
    "model_config.model",
    "F1_val",
    "Precision_val",
    "Recall_val",
    "Accuracy_val",
]].copy()

In [None]:
# Runs whose experiment name contains "freq" are labelled "baseline_freq"; every other run
# keeps its configured model name with surrounding double quotes stripped.
model_labels = [
    "baseline_freq" if "freq" in experiment_name else model_name.strip('"')
    for experiment_name, model_name in zip(df["experiment"], df["model_config.model"])
]
df.loc[:, "model_config.model"] = model_labels

# Shorten two column names, then drop the experiment name, which is not used afterwards.
df = (
    df.rename(columns={'model_config.model': 'model', "avg_epoch_time": "avg_epoch_min"})
    .drop(columns="experiment")
)

In [None]:
# Cast every column except the model label to float.
# DataFrame.astype returns a new frame whose columns really carry a float dtype. Assigning
# through df.loc[:, cols] instead writes the values into the existing columns and can leave
# them as object dtype, in which case later numeric steps such as .round() skip them.
cols_to_convert = [col for col in df.columns if col != 'model']
df = df.astype({col: float for col in cols_to_convert})

In [None]:
# Inspect the current column names and their order.
df.columns

In [None]:
# custom colnames
# "val" scores are actually test, because the set to evaluate for this data was test
# The names are mapped explicitly (old -> new) instead of assigned positionally, so a change
# in column order cannot silently attach a label to the wrong metric. errors="raise" makes a
# missing source column fail loudly.
df = df.rename(
    columns={
        "AUC_val": "test AUC",
        "AUC_train": "train AUC",
        "training_time_min": "train time (min.)",
        "avg_epoch_min": "avg epoch time (min.)",
        "nr_params": "# parameters",
        "nr_flops": "# FLOPS",
        "best_epoch": "best epoch",
        "F1_val": "test F1 Score",
        "Precision_val": "test Precision",
        "Recall_val": "test Recall",
        "Accuracy_val": "test Accuracy",
    },
    errors="raise",
)

In [None]:
# Adding validation AUC (tuning) results
# Each value is keyed by the model identifier used in df["model"]; the trailing comment
# gives the name under which the value was reported in the tuning results.
val_auc_by_model = {
    "baseline": 0.6686054843599283,       # "MLP Baseline"
    "baseline_freq": 0.7248128243994304,  # "MLP Freq"
    "cnn": 0.6908984428827338,            # "CNN"
    "LegNet": 0.710789582472096,          # "LegNet"
    "RiboNN": 0.6990592990675669,         # "RiboNN"
    "lstm": 0.6774309861742686,           # "LSTM"
    "gru": 0.6836840292131734,            # "GRU"
    "xlstm": 0.6890615957007028,          # "xLSTM"
    "transformer": 0.680868127325341,     # "Transformer"
    "mamba": 0.686000643057278,           # "Mamba"
}

df_val = pd.DataFrame.from_dict(
    val_auc_by_model, orient="index", columns=["val AUC (tuning)"]
)

# Merge with df on the model column.
# NOTE(review): merge defaults to an inner join, so any model in df without an entry above
# is dropped from the results - confirm that this is intended.
df = df.merge(df_val, left_on="model", right_index=True)

In [None]:
# Results RFC (train+val, and test)

# RandomForestClassifier
# Training Time (s): 18.88788938522339

# Train Accuracy: 1.0
# Train Precision: 1.0
# Train Recall: 1.0
# Train F1: 1.0

# Test ROC AUC: 0.6681921274260418
# Test Accuracy: 0.6119725928597187
# Test Precision: 0.6097212294496068
# Test Recall: 0.6167751265365148
# Test F1: 0.6132278936017254

# train, val data results
# Validation ROC AUC: 0.6634033806439759
# Validation Accuracy: 0.6123348017621145
# Validation Precision: 0.6101131071190952
# Validation Recall: 0.6216949152542373
# Validation F1: 0.615849563465413


# One row for the random forest, using the same column names as df. Quantities that the
# figures above do not provide for this model (train AUC, epoch time, parameters, FLOPS,
# best epoch) are left as NaN.
rfc_data = {
    "test AUC": 0.6681921274260418,
    "test F1 Score": 0.6132278936017254,
    "test Precision": 0.6097212294496068,
    "test Recall": 0.6167751265365148,
    # Test accuracy is listed above, so record it rather than leaving the cell NaN.
    "test Accuracy": 0.6119725928597187,
    "val AUC (tuning)": 0.6634033806439759,
    "train AUC": np.nan,
    "train time (min.)": 18.887889 / 60,  # seconds -> minutes
    "avg epoch time (min.)": np.nan,
    "# parameters": np.nan,
    "# FLOPS": np.nan,
    "best epoch": np.nan,
    "model": "RFC (freq)"
}

rfc_df = pd.DataFrame(rfc_data, index=[0])

# Append the RFC row and renumber the index 0..n-1 in one step.
df = pd.concat([df, rfc_df], axis=0, ignore_index=True)

In [None]:
# Index the results by model (the "model" column is kept) and translate the internal model
# identifiers into display names; identifiers not listed here keep their spelling.
df = df.set_index("model", drop=False).rename(
    index={
        "baseline": "MLP (baseline)",
        "baseline_freq": "MLP (freq)",
        "cnn": "CNN",
        "gru": "GRU",
        "lstm": "LSTM",
        "mamba": "Mamba",
        "transformer": "Transformer",
        "xlstm": "xLSTM",
        "LEGnet": "LEGnet",
        "RFC (freq)": "RFC (freq)",
    }
)

In [None]:
# Display the combined results table (benchmark runs plus the appended RFC row).
df

In [None]:
# Main results table: AUC and cost columns, best test AUC first, rounded to 5 decimals.
# Parameter and FLOP counts use the nullable integer dtype so that missing values are allowed.
df_table = (
    df[[
        'test AUC',
        'val AUC (tuning)',
        'train AUC',
        'train time (min.)',
        'avg epoch time (min.)',
        '# parameters',
        '# FLOPS',
        'best epoch',
    ]]
    .copy()
    .sort_values(by="test AUC", ascending=False)
    .round(5)
    .astype({"# parameters": pd.Int64Dtype(), "# FLOPS": pd.Int64Dtype()})
)
df_table

In [None]:
def transparent_nan(val):
    """Return CSS that hides a missing cell (white text on white), or '' for any other value."""
    # pd.isna also reports True for pd.NA, so no separate identity check is required.
    return 'background-color: white; color: white;' if pd.isna(val) else ''

# Drop the index name so the rendered table has no extra header row for it.
df_table.index.name = None

# Gradients are computed per column (background_gradient's default axis=0), so the columns
# can be grouped by colour map: green for the AUC scores, red for the cost-type columns.
styled_df = (
    df_table.style
    .background_gradient(
        subset=['test AUC', 'val AUC (tuning)', 'train AUC'],
        cmap='Greens',
    )
    .background_gradient(
        subset=['train time (min.)', '# parameters', '# FLOPS', 'avg epoch time (min.)', 'best epoch'],
        cmap='Reds',
    )
    .map(transparent_nan)  # blank out missing cells
    .format(precision=4)
)

styled_df

In [None]:
# write to html
if STORE:
    # OUTPUT_DIR may be unset; os.path.join(None, ...) would raise an opaque TypeError,
    # so fall back to the current working directory and make sure the target exists.
    output_dir = os.getenv("OUTPUT_DIR") or "."
    os.makedirs(output_dir, exist_ok=True)
    html_path = os.path.join(output_dir, "benchmark_results.html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(styled_df.to_html())

## Table of more Metrics

In [None]:
# Metrics table: train/validation/test AUC plus the remaining test metrics,
# best test AUC first, rounded to 5 decimals.
df_table2 = (
    df[[
        'train AUC',
        'val AUC (tuning)',
        'test AUC',
        "test F1 Score",
        "test Precision",
        "test Recall",
    ]]
    .copy()
    .sort_values(by="test AUC", ascending=False)
    .round(5)
)
df_table2

In [None]:
# Add best PTRnet results
# TODO

In [None]:
def transparent_nan(val):
    """Return CSS that hides a missing cell (white text on white), or '' for any other value."""
    # pd.isna also reports True for pd.NA, so no separate identity check is required.
    return 'background-color: white; color: white;' if pd.isna(val) else ''

# Drop the index name so the rendered table has no extra header row for it.
df_table2.index.name = None

# All columns are scores, so each gets the same green scale; the gradient is still computed
# per column (background_gradient's default axis=0).
styled_df = (
    df_table2.style
    .background_gradient(
        subset=[
            'train AUC',
            'val AUC (tuning)',
            'test AUC',
            'test F1 Score',
            'test Precision',
            'test Recall',
        ],
        cmap='Greens',
    )
    .map(transparent_nan)  # blank out missing cells
    .format(precision=4)
    # make columns equally wide
    .set_table_styles([{
        'selector': 'th',
        'props': [('width', '400px')]
    }])
)

styled_df

In [None]:
# write to html
if STORE:
    # OUTPUT_DIR may be unset; os.path.join(None, ...) would raise an opaque TypeError,
    # so fall back to the current working directory and make sure the target exists.
    output_dir = os.getenv("OUTPUT_DIR") or "."
    os.makedirs(output_dir, exist_ok=True)
    html_path = os.path.join(output_dir, "benchmark_results_metrics.html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(styled_df.to_html())

## Results Barplot

In [None]:
# Horizontal grouped bar chart: test vs. validation (tuning) AUC per model,
# sorted so that the best test AUC ends up at the top.
df_barplot = df_table.sort_values(by="test AUC", ascending=True)

fig, ax = plt.subplots(figsize=(8, 5))
bar_height = 0.35
# One slot per plotted row; derived from the frame that is actually plotted so the bar
# positions and tick labels cannot go out of sync with the data.
y = range(len(df_barplot))

# Bars with spacing: test above, validation below the tick position
ax.barh([i + bar_height / 2 for i in y], df_barplot["test AUC"], height=bar_height, label='Test', color=blue_shades[4])
ax.barh([i - bar_height / 2 for i in y], df_barplot["val AUC (tuning)"], height=bar_height, label='Validation', color=blue_shades[0])

# Labels and legend
ax.set_yticks(y)
ax.set_yticklabels(df_barplot.index)
ax.set_xlabel("AUC")
ax.set_title("Benchmarking Results")
ax.legend()
# Start the axis at 0.5 and leave a small margin to the right of the longest bar.
ax.set_xlim(0.5, max(df_barplot["val AUC (tuning)"].max(), df_barplot["test AUC"].max()) * 1.035)
plt.tight_layout()

if STORE:
    # OUTPUT_DIR may be unset; concatenating None with a string would raise a TypeError,
    # so fall back to the current working directory and make sure the target exists.
    output_dir = os.getenv("OUTPUT_DIR") or "."
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "benchmark_barplot.pdf"), format="pdf", bbox_inches="tight")

plt.show()

## Simulating AUC Scores for Random Majority Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Simulated imbalanced dataset: 90 negatives followed by 10 positives
n_samples = 100
n_positive = 10
y_true = np.array([0] * (n_samples - n_positive) + [1] * n_positive)

# Hard predictions of a classifier that always outputs the majority class (0)
y_pred_majority_class = np.zeros(n_samples, dtype=int)

# Scores used for the ranking metrics: 0 for the first half, 0.2 for the second half.
# NOTE(review): despite its name this vector is not constant; every positive sample sits in
# the second half and therefore receives the higher score - confirm this is intended.
half = n_samples // 2
y_scores_constant = np.array([0] * half + [0.2] * half)

# Ranking metrics use the scores, threshold metrics use the hard predictions
roc_auc = roc_auc_score(y_true, y_scores_constant)
pr_auc = average_precision_score(y_true, y_scores_constant)
f1 = f1_score(y_true, y_pred_majority_class, zero_division=0)
balanced_acc = balanced_accuracy_score(y_true, y_pred_majority_class)
precision = precision_score(y_true, y_pred_majority_class, zero_division=0)
recall = recall_score(y_true, y_pred_majority_class, zero_division=0)

metric_values = {
    'Accuracy': np.mean(y_pred_majority_class == y_true),
    'ROC AUC': roc_auc,
    'PR AUC': pr_auc,
    'F1 Score': f1,
    'Balanced Accuracy': balanced_acc,
    'Precision': precision,
    'Recall': recall,
}
metrics_df = pd.DataFrame({
    'Metric': list(metric_values.keys()),
    'Value': list(metric_values.values()),
})


metrics_df

In [None]:
# Simulate random scores in [0, 0.5) to mimic a model that predicts low probabilities for all.
# A seeded local generator keeps the reported numbers identical across kernel restarts, and
# the sample count follows y_true instead of repeating the literal 100.
rng = np.random.default_rng(42)
y_scores_low = rng.uniform(0, 0.5, size=len(y_true))

# New metrics using these random low scores
roc_auc_low = roc_auc_score(y_true, y_scores_low)
pr_auc_low = average_precision_score(y_true, y_scores_low)

# Keep the thresholded class predictions (still all 0s since all probs < 0.5)
y_pred_low = (y_scores_low >= 0.5).astype(int)

f1_low = f1_score(y_true, y_pred_low, zero_division=0)
balanced_acc_low = balanced_accuracy_score(y_true, y_pred_low)
precision_low = precision_score(y_true, y_pred_low, zero_division=0)
recall_low = recall_score(y_true, y_pred_low, zero_division=0)

metrics_df_low = pd.DataFrame({
    'Metric': ['Accuracy', 'ROC AUC', 'PR AUC', 'F1 Score', 'Balanced Accuracy', 'Precision', 'Recall'],
    'Value': [
        np.mean(y_pred_low == y_true),
        roc_auc_low,
        pr_auc_low,
        f1_low,
        balanced_acc_low,
        precision_low,
        recall_low
    ]
})

metrics_df_low


In [None]:
# Show the simulated low scores generated in the previous cell.
y_scores_low