In [4]:
import sys, os
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [11]:
# Directories that get_buckets() scans for result files.
folders = [
    "tests/results/MNIST-OOD",
    "tests/results/R-MNIST",
]

# Filename prefixes used to assign a result file to a method; this list also
# fixes the row order of the summary table.
# NOTE(review): "swg" (like several other entries) shows up as nan±nan in the
# recorded table for the first folder, i.e. no file matched it there. It may be
# a typo for "swag" — confirm against the actual result filenames.
methods = [
    "map",
    "ensemble",
    "csghmc",
    "swg",
    "laplace_all",
    "laplace_last_layer",
    "subspace",
    "swag_laplace",
]

def get_buckets(folders, methods):
    """Group the files of each folder by the method name their filename starts with.

    Parameters
    ----------
    folders : iterable of str
        Directories to scan (non-recursively, via ``os.listdir``).
    methods : iterable of str
        Method names, matched as filename prefixes.

    Returns
    -------
    list of defaultdict(list)
        One mapping per folder, in the order of ``folders``. Each maps a
        method name to the sorted list of ``folder/filename`` paths whose
        filename starts with that name. Files matching no method are skipped.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when a folder cannot be listed
        (e.g. ``FileNotFoundError`` for a missing directory).
    """
    # Try longer names first so that a method whose name is a prefix of
    # another one (e.g. "swag" vs "swag_laplace") cannot claim the other's
    # files. sorted() is stable, so equal-length names keep their given order.
    by_specificity = sorted(methods, key=len, reverse=True)

    buckets = []
    for folder in folders:
        bucket = defaultdict(list)
        # os.listdir() returns entries in arbitrary order; sort them so the
        # per-method path lists are reproducible across runs and machines.
        for fname in sorted(os.listdir(folder)):
            hit = next((m for m in by_specificity if fname.startswith(m)), None)
            if hit:
                bucket[hit].append(os.path.join(folder, fname))
        buckets.append(bucket)
    return buckets

# buckets[i] maps method name -> result-file paths found in folders[i].
buckets = get_buckets(folders, methods)

In [15]:
# Summarise the first scanned folder (buckets[0] corresponds to folders[0]).
bucket = buckets[0]

# Summary-column prefix -> key read from each record of a result file.
_METRIC_KEYS = {"conf": "conf", "auroc": "auroc", "time": "test_time"}


def _run_means(path):
    """Return the per-metric mean over the records of one result file.

    The first element of the loaded array is skipped, as in the original
    analysis. NOTE(review): presumably it is a header/metadata entry rather
    than a measurement — confirm against the code that writes these files.
    """
    # SECURITY: allow_pickle=True unpickles the file and can execute arbitrary
    # code; only load result files produced by trusted runs.
    records = np.load(path, allow_pickle=True)[1:]
    return {
        name: np.mean([rec[key] for rec in records])
        for name, key in _METRIC_KEYS.items()
    }


def _mean_std(values):
    """Return (mean, population std) of ``values``, or (nan, nan) if empty.

    Handling the empty case explicitly replaces the blanket suppression of
    NumPy's "mean of empty slice" warnings for methods that have no runs.
    """
    if len(values) == 0:
        return float("nan"), float("nan")
    return float(np.mean(values)), float(np.std(values))


def _format_mean_std(mean, std, scale=1.0, decimals=1):
    """Render two aligned Series as "mean±std" strings with fixed decimals.

    Fixed-width formatting keeps trailing zeros (e.g. "9.10±0.61" instead of
    "9.1±0.61"); NaN values render as "nan".
    """
    fmt = f"{{:.{decimals}f}}".format
    return (mean * scale).map(fmt) + "±" + (std * scale).map(fmt)


# One row per method, in the order of `methods`; methods without any result
# file get NaN statistics.
results = []
for method in methods:
    per_run = [_run_means(path) for path in bucket.get(method, [])]
    row = {"Method": method}
    for name in _METRIC_KEYS:
        mean, std = _mean_std([run[name] for run in per_run])
        row[f"{name}_mean"] = mean
        row[f"{name}_std"] = std
    results.append(row)

# Rows are already in `methods` order, so no re-indexing is needed.
df = pd.DataFrame(results)
# Confidence and AUROC are multiplied by 100 for display; time is shown as-is.
df["Confidence"] = _format_mean_std(df["conf_mean"], df["conf_std"], scale=100, decimals=1)
df["AUROC"] = _format_mean_std(df["auroc_mean"], df["auroc_std"], scale=100, decimals=1)
df["Test time (s)"] = _format_mean_std(df["time_mean"], df["time_std"], decimals=2)

table = df[["Method", "Confidence", "AUROC", "Test time (s)"]]
print(table.to_markdown(index=False))


| Method             | Confidence   | AUROC    | Test time (s)   |
|:-------------------|:-------------|:---------|:----------------|
| map                | 75.0±0.6     | 96.5±0.2 | 7.35±0.51       |
| ensemble           | 65.7±0.5     | 97.5±0.0 | 9.1±0.61        |
| csghmc             | 69.2±3.2     | 96.1±0.3 | 11.9±0.69       |
| swg                | nan±nan      | nan±nan  | nan±nan         |
| laplace_all        | nan±nan      | nan±nan  | nan±nan         |
| laplace_last_layer | 43.1±0.9     | 95.7±0.4 | 7.18±0.21       |
| subspace           | nan±nan      | nan±nan  | nan±nan         |
| swag_laplace       | nan±nan      | nan±nan  | nan±nan         |
