In [1]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m5.1 MB/s[0m  [33m0:00:02[0mm0:00:01[0m00:01[0m
[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
Installing collected packages: pytz, tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m2/3[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.3 pytz-2025.2 tzdata-2025.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

# Input: directory scanned for RUN/metrics.json files (one sub-entry per run).
RUNS_DIR = Path("../../runs")
# Output: directory that receives the aggregated CSV.
OUT_DIR = Path("./analysis_data")
# parents=True so the cell also works when OUT_DIR is later pointed at a nested path.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Per-k series copied from each metrics.json. Each one is indexed in lockstep
# with that file's "k" list, so they must all have the same length as "k".
METRIC_KEYS = (
    "mean_overlap",
    "std_overlap",
    "mean_dist_divergence",
    "std_dist_divergence",
    "mean_barycenter_shift",
    "std_barycenter_shift",
    "lid_exact",
    "lid_ann",
    "mean_lid_diff",
    "std_lid_diff",
)

rows = []

# ---------------------------------------------------
# Collect per-run, per-k rows
# ---------------------------------------------------
# sorted() makes the row order independent of the filesystem's listing order.
for run_dir in sorted(RUNS_DIR.iterdir()):
    metrics_path = run_dir / "metrics.json"
    if not metrics_path.exists():
        # Entries without a metrics.json contribute nothing.
        continue

    # Explicit UTF-8: do not depend on the platform's default text encoding.
    with open(metrics_path, "r", encoding="utf-8") as fh:
        metrics = json.load(fh)

    ks = metrics["k"]

    # A series shorter than "k" used to raise a bare IndexError without naming
    # the run, and a longer one was silently truncated. Report both explicitly.
    misaligned = [key for key in METRIC_KEYS if len(metrics[key]) != len(ks)]
    if misaligned:
        raise ValueError(
            f"{metrics_path}: expected {len(ks)} values (one per k) for {misaligned}"
        )

    # One row per (run, k); key order here fixes the DataFrame column order.
    for i, k in enumerate(ks):
        rows.append({
            "run": run_dir.name,
            "k": k,
            **{key: metrics[key][i] for key in METRIC_KEYS},
        })

df = pd.DataFrame(rows)

# Without this guard an empty collection surfaces later as an opaque
# KeyError: 'k' from groupby.
if df.empty:
    raise ValueError(
        f"No per-k metric rows were collected from {RUNS_DIR}; "
        "expected RUN/metrics.json files with non-empty 'k' lists."
    )

# ---------------------------------------------------
# Replace +/-inf with NaN before averaging
# ---------------------------------------------------
# A single infinite value would make the whole per-k mean infinite; as NaN it
# is skipped by pandas' mean instead.
df = df.replace([np.inf, -np.inf], np.nan)

# ---------------------------------------------------
# Aggregate across runs (mean over experiments)
# ---------------------------------------------------
# Every column except the identifiers is a metric. Deriving the list from the
# frame keeps it in sync with the collection step and preserves column order.
metric_columns = [col for col in df.columns if col not in ("run", "k")]

# NOTE: the std_* columns are averaged like any other column, i.e. the result
# is the mean of the per-run values, not a pooled standard deviation.
agg = (
    df
    .groupby("k")
    .agg({col: "mean" for col in metric_columns})
    .reset_index()
)

# ---------------------------------------------------
# Write the aggregated table to a single CSV file
# ---------------------------------------------------
out_path = OUT_DIR.joinpath("aggregate_metrics.csv")
agg.to_csv(out_path, index=False)  # index=False: "k" is already a regular column

# Short summary: number of aggregated rows and number of distinct runs read.
print(f"Saved aggregated metrics → {out_path}")
print(f"ks: {len(agg)} | runs: {df['run'].nunique()}")



Saved aggregated metrics → analysis_data/aggregate_metrics.csv
ks: 100 | runs: 10
