# Benchmark Results Analysis

This notebook loads the `bench-runs-*.json` files under `bench/results` (resolved as `results/` relative to the current working directory) and summarizes the benchmark runs.


In [None]:
from __future__ import annotations

from pathlib import Path
import json
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Result files are looked up in ./results (relative to the working directory)
# and must be named bench-runs-<suffix>.json; they are processed in sorted order.
results_dir = Path.cwd().joinpath("results")
json_files = list(results_dir.glob("bench-runs-*.json"))
json_files.sort()
print(f"Found {len(json_files)} results files")

In [None]:
# Flatten every results file into two tables:
#   runs -> one record per entry of the file's "results" list
#   rows -> one record per row found under a run's "uiMetrics" blocks
runs = []
rows = []

for path in json_files:
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    payload = json.loads(path.read_text(encoding="utf-8"))
    file_id = path.stem
    # `or []` also covers an explicit `"results": null`, matching how the
    # nested "uiMetrics" and "rows" lists are handled below.
    for result in payload.get("results") or []:
        runs.append({
            "file": file_id,
            "generatedAt": payload.get("generatedAt"),
            "run": result.get("run"),
            "success": result.get("success"),
            "exitCode": result.get("exitCode"),
            "durationMs": result.get("durationMs"),
            "status": result.get("status"),
        })
        for block in result.get("uiMetrics") or []:
            for row in block.get("rows") or []:
                # "trustworthiness" is scaled by 100 for the "Quality (%)"
                # column; a missing value stays None.
                trustworthiness = row.get("trustworthiness")
                quality_percent = trustworthiness * 100 if trustworthiness is not None else None
                # Prefer "wasmFeatures"; fall back to "wasmMode" when it is
                # absent or empty.
                wasm_features = row.get("wasmFeatures") or row.get("wasmMode", "")
                rows.append({
                    "file": file_id,
                    "scenario": row.get("scope", ""),
                    "Dataset": row.get("datasetName", ""),
                    "Run": row.get("runId"),
                    "Runtime (ms)": row.get("runtimeMs"),
                    "Memory (MB)": row.get("memoryDeltaMb"),
                    "Quality (%)": quality_percent,
                    "FPS": row.get("fpsAvg"),
                    "Latency (ms)": row.get("responsivenessMs"),
                    "WASM Features": wasm_features,
                    "datasetSize": row.get("datasetSize"),
                    "dimensions": row.get("dimensions"),
                })

runs_df = pd.DataFrame(runs)
rows_df = pd.DataFrame(rows)

print(f"Runs: {len(runs_df)}")
print(f"Rows: {len(rows_df)}")


In [None]:
# Run-level overview: number of runs per "success" value.
# dropna=False keeps runs whose "success" field is missing in the count
# instead of silently leaving them out of the overview.
runs_df.groupby("success", dropna=False).size().rename("count")


In [None]:
# Duration by file: count / mean / min / max of durationMs for each results
# file, slowest mean first.
duration_by_file = runs_df.groupby("file")["durationMs"].agg(["count", "mean", "min", "max"])
duration_by_file.sort_values("mean", ascending=False)


In [None]:
# Scenario-level summary: one line per (scenario, WASM Features, Dataset)
# combination, slowest mean runtime first.
if rows_df.empty:
    # Without any UI-metrics rows the frame has no columns, so grouping
    # would raise KeyError; fall back to an empty summary instead.
    print("No UI metrics rows found.")
    summary = pd.DataFrame()
else:
    summary = (
        rows_df.groupby(["scenario", "WASM Features", "Dataset"], dropna=False)
        .agg(
            runs=("Run", "count"),
            runtime_ms_mean=("Runtime (ms)", "mean"),
            runtime_ms_p95=("Runtime (ms)", lambda s: s.quantile(0.95)),
            memory_mb_mean=("Memory (MB)", "mean"),
            quality_mean=("Quality (%)", "mean"),
            fps_mean=("FPS", "mean"),
            latency_ms_mean=("Latency (ms)", "mean"),
        )
        .reset_index()
        .sort_values("runtime_ms_mean", ascending=False)
    )
# Kept as the last top-level expression so the notebook displays it.
summary.head(20)


In [None]:
# Bar chart of mean runtime per scenario/dataset, one bar per "WASM Features"
# value.
if rows_df.empty:
    # Without any UI-metrics rows the frame has no columns, so the grouping
    # below would raise KeyError and there is nothing to plot.
    print("No UI metrics rows found.")
else:
    compare = (
        rows_df.groupby(["scenario", "Dataset", "WASM Features"], dropna=False)
        .agg(runtime_ms_mean=("Runtime (ms)", "mean"))
        .reset_index()
    )

    # Spread the "WASM Features" values into columns so each becomes its own
    # bar series.
    pivot = compare.pivot_table(
        index=["scenario", "Dataset"],
        columns="WASM Features",
        values="runtime_ms_mean",
    )

    # Collapse the two index levels into a single "scenario | Dataset" tick
    # label.
    plot_df = pivot.reset_index()
    plot_df["label"] = plot_df["scenario"].astype(str) + " | " + plot_df["Dataset"].astype(str)
    plot_df = plot_df.set_index("label").drop(columns=["scenario", "Dataset"])

    ax = plot_df.plot(kind="bar", figsize=(10, 6))
    ax.set_title("Mean runtime: WASM vs JS by scenario/dataset")
    ax.set_ylabel("Runtime (ms)")
    ax.set_xlabel("")
    ax.legend(title="WASM Features")
    plt.tight_layout()


In [None]:
# Multi-metric comparison plots: one subplot per metric, bars grouped by
# "scenario | Dataset" and split by "WASM Features".
if rows_df.empty:
    print("No UI metrics rows found.")
else:
    # Subplot title -> rows_df column holding that metric.
    metrics = {
        "Runtime (ms)": "Runtime (ms)",
        "Memory Usage (MB)": "Memory (MB)",
        "Embedding Quality (%)": "Quality (%)",
        "Visualization FPS": "FPS",
        "Responsiveness (ms)": "Latency (ms)",
    }

    group_cols = ["scenario", "Dataset", "WASM Features"]

    # A metric that is None in every row is stored as an object column, which
    # mean(numeric_only=True) drops; the pivot below would then raise
    # KeyError. Coerce the metric columns on a copy so rows_df is untouched.
    numeric_rows = rows_df.copy()
    for column in metrics.values():
        numeric_rows[column] = pd.to_numeric(numeric_rows[column], errors="coerce")

    mean_df = (
        numeric_rows.groupby(group_cols, dropna=False)
        .mean(numeric_only=True)
        .reset_index()
    )
    mean_df["label"] = mean_df["scenario"].astype(str) + " | " + mean_df["Dataset"].astype(str)

    fig, axes = plt.subplots(2, 3, figsize=(16, 8))
    axes = axes.flatten()

    # The legend is captured from the first subplot that has data and drawn
    # once for the whole figure.
    legend_handles = None
    legend_labels = None

    for ax, (title, column) in zip(axes, metrics.items()):
        has_data = bool(mean_df[column].notna().any())
        if has_data:
            pivot = mean_df.pivot_table(
                index="label",
                columns="WASM Features",
                values=column,
            ).sort_index()
            pivot.plot(kind="bar", ax=ax)
        else:
            # Nothing to plot for this metric; keep the panel with a
            # placeholder instead of failing the whole figure.
            ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)

        # Labels are set after plotting so they replace the ones the plot
        # call applied.
        ax.set_title(title)
        ax.set_ylabel(title)
        ax.set_xlabel("")

        if has_data and legend_handles is None:
            legend_handles, legend_labels = ax.get_legend_handles_labels()
        if ax.get_legend() is not None:
            ax.get_legend().remove()

    # The grid has more panels than metrics; hide the leftovers.
    for unused_ax in axes[len(metrics):]:
        unused_ax.set_visible(False)

    if legend_handles and legend_labels:
        fig.legend(legend_handles, legend_labels, title="WASM Features", loc="upper right")
    plt.tight_layout()
