# Benchmark Results Analysis

This notebook loads all JSON files under `bench/results` and summarizes the benchmark runs.


In [1]:
from __future__ import annotations

from pathlib import Path
import json
import pandas as pd

In [2]:
results_dir = Path.cwd() / "results"
json_files = sorted(results_dir.glob("bench-runs-*.json"))
print(f"Found {len(json_files)} results files")

Found 6 results files


In [3]:
runs = []
rows = []

for path in json_files:
    payload = json.loads(path.read_text())
    file_id = path.stem
    for result in payload.get("results", []):
        runs.append({
            "file": file_id,
            "generatedAt": payload.get("generatedAt"),
            "run": result.get("run"),
            "success": result.get("success"),
            "exitCode": result.get("exitCode"),
            "durationMs": result.get("durationMs"),
            "status": result.get("status"),
        })
        for block in result.get("uiMetrics", []) or []:
            for row in block.get("rows", []) or []:
                rows.append({
                    "file": file_id,
                    "scenario": row.get("scenario", ""),
                    "datasetLabel": row.get("datasetLabel", ""),
                    "run": row.get("run"),
                    "runtimeMs": row.get("runtimeMs"),
                    "memoryMb": row.get("memoryMb"),
                    "qualityPercent": row.get("qualityPercent"),
                    "fps": row.get("fps"),
                    "latencyMs": row.get("latencyMs"),
                    "wasmFeatures": row.get("wasmFeatures", ""),
                    "dataset": row.get("dataset", ""),
                })

runs_df = pd.DataFrame(runs)
rows_df = pd.DataFrame(rows)

print(f"Runs: {len(runs_df)}")
print(f"Rows: {len(rows_df)}")


Runs: 60
Rows: 60


In [4]:
# Run-level overview
runs_df.groupby("success").size().rename("count")


success
True    60
Name: count, dtype: int64

In [5]:
# Duration by file
runs_df.groupby("file")["durationMs"].agg(["count", "mean", "min", "max"]).sort_values("mean", ascending=False)


Unnamed: 0_level_0,count,mean,min,max
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bench-runs-1769062118389,10,351645.1,266674,1052903
bench-runs-1769057694118,10,269170.0,265702,271671
bench-runs-1769061162918,10,95453.8,92374,102112
bench-runs-1769060386879,10,77519.0,74951,82843
bench-runs-1769057053231,10,64012.0,60010,72456
bench-runs-1769056507010,10,54548.0,51961,58115


In [6]:
# Scenario-level summary
summary = (
    rows_df.groupby(["scenario", "wasmFeatures", "datasetLabel"], dropna=False)
    .agg(
        runs=("run", "count"),
        runtime_ms_mean=("runtimeMs", "mean"),
        runtime_ms_p95=("runtimeMs", lambda s: s.quantile(0.95)),
        memory_mb_mean=("memoryMb", "mean"),
        quality_mean=("qualityPercent", "mean"),
        fps_mean=("fps", "mean"),
        latency_ms_mean=("latencyMs", "mean"),
    )
    .reset_index()
    .sort_values("runtime_ms_mean", ascending=False)
)
summary.head(20)


Unnamed: 0,scenario,wasmFeatures,datasetLabel,runs,runtime_ms_mean,runtime_ms_p95,memory_mb_mean,quality_mean,fps_mean,latency_ms_mean
0,,,,0,4374.833333,7519.055,,,,


In [7]:
# Quick comparison: WASM vs JS on the same scenario/dataset
compare = (
    rows_df.groupby(["scenario", "datasetLabel", "wasmFeatures"], dropna=False)
    .agg(runtime_ms_mean=("runtimeMs", "mean"))
    .reset_index()
)
compare.pivot_table(
    index=["scenario", "datasetLabel"],
    columns="wasmFeatures",
    values="runtime_ms_mean",
)


Unnamed: 0_level_0,wasmFeatures,Unnamed: 2_level_0
scenario,datasetLabel,Unnamed: 2_level_1
,,4374.833333
