# Benchmark Results Analysis

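This notebook loads every `bench-runs-*.json` file under `bench/results` and summarizes the benchmark runs. The results folder is resolved as `results/` relative to the current working directory, so the notebook is expected to be run from the `bench` directory.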


In [1]:
from __future__ import annotations

from pathlib import Path
import json
import pandas as pd

In [5]:
# Locate the benchmark result files: every `bench-runs-*.json` directly inside
# the `results` folder of the current working directory, in sorted path order.
results_dir = Path.cwd().joinpath("results")
json_files = list(results_dir.glob("bench-runs-*.json"))
json_files.sort()
print(f"Found {len(json_files)} results files")

Found 7 results files


In [6]:
# Column layouts of the two frames built below. Passing them to the DataFrame
# constructor keeps the schema stable when no records are collected (no files,
# or files without results / uiMetrics rows), so the later groupby cells still
# find their columns instead of raising KeyError on an empty, column-less frame.
RUN_COLUMNS = [
    "file", "generatedAt", "run", "success", "exitCode", "durationMs", "status",
]
ROW_COLUMNS = [
    "file", "scenario", "datasetLabel", "run", "runtimeMs", "memoryMb",
    "qualityPercent", "fps", "latencyMs", "wasmFeatures", "dataset",
]

runs = []  # one record per entry of a file's "results" list
rows = []  # one record per row found in a result's "uiMetrics" blocks

for path in json_files:
    # Hand json the raw bytes so it detects the UTF encoding itself instead of
    # decoding with the platform's locale default.
    payload = json.loads(path.read_bytes())
    file_id = path.stem  # file name without ".json"; ties records to their source file
    # `or []` also covers an explicit null, same as the uiMetrics/rows handling.
    for result in payload.get("results") or []:
        runs.append({
            "file": file_id,
            "generatedAt": payload.get("generatedAt"),
            "run": result.get("run"),
            "success": result.get("success"),
            "exitCode": result.get("exitCode"),
            "durationMs": result.get("durationMs"),
            "status": result.get("status"),
        })
        for block in result.get("uiMetrics") or []:
            for row in block.get("rows") or []:
                # Text fields default to "" when the key is absent; metric
                # fields stay None so they become NaN in the frame.
                rows.append({
                    "file": file_id,
                    "scenario": row.get("scenario", ""),
                    "datasetLabel": row.get("datasetLabel", ""),
                    "run": row.get("run"),
                    "runtimeMs": row.get("runtimeMs"),
                    "memoryMb": row.get("memoryMb"),
                    "qualityPercent": row.get("qualityPercent"),
                    "fps": row.get("fps"),
                    "latencyMs": row.get("latencyMs"),
                    "wasmFeatures": row.get("wasmFeatures", ""),
                    "dataset": row.get("dataset", ""),
                })

runs_df = pd.DataFrame(runs, columns=RUN_COLUMNS)
rows_df = pd.DataFrame(rows, columns=ROW_COLUMNS)

print(f"Runs: {len(runs_df)}")
print(f"Rows: {len(rows_df)}")


Runs: 30
Rows: 52


In [7]:
# Run-level overview: number of runs per `success` value.
# dropna=False keeps runs whose `success` field is missing as their own group
# (the scenario-level cells group the same way), so the counts always add up
# to the total number of runs.
runs_df.groupby("success", dropna=False).size().rename("count")


success
False     4
True     26
Name: count, dtype: int64

In [8]:
# Duration by file: run count plus mean / min / max of `durationMs` for each
# results file, ordered from the highest mean duration down.
(
    runs_df
    .groupby("file")["durationMs"]
    .agg(["count", "mean", "min", "max"])
    .sort_values("mean", ascending=False)
)


Unnamed: 0_level_0,count,mean,min,max
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bench-runs-1768978001013,3,58440.333333,58114,58791
bench-runs-1768977757633,3,43632.666667,42185,44612
bench-runs-1768979987507,10,37839.5,34653,41829
bench-runs-1768977401161,3,36732.0,35413,37727
bench-runs-1768979302528,10,36623.4,34479,37574
bench-runs-1768979037005,1,690.0,690,690


In [9]:
# Scenario-level summary: one line per (scenario, wasmFeatures, datasetLabel)
# combination, ordered from the highest mean runtime down. Groups with missing
# key values are kept (dropna=False). `runs` counts the non-null `run` values
# in each group.
summary = rows_df.groupby(
    ["scenario", "wasmFeatures", "datasetLabel"], dropna=False
).agg(
    runs=pd.NamedAgg(column="run", aggfunc="count"),
    runtime_ms_mean=pd.NamedAgg(column="runtimeMs", aggfunc="mean"),
    runtime_ms_p95=pd.NamedAgg(
        column="runtimeMs", aggfunc=lambda values: values.quantile(0.95)
    ),
    memory_mb_mean=pd.NamedAgg(column="memoryMb", aggfunc="mean"),
    quality_mean=pd.NamedAgg(column="qualityPercent", aggfunc="mean"),
    fps_mean=pd.NamedAgg(column="fps", aggfunc="mean"),
    latency_ms_mean=pd.NamedAgg(column="latencyMs", aggfunc="mean"),
)
summary = summary.reset_index().sort_values("runtime_ms_mean", ascending=False)
summary.head(20)


Unnamed: 0,scenario,wasmFeatures,datasetLabel,runs,runtime_ms_mean,runtime_ms_p95,memory_mb_mean,quality_mean,fps_mean,latency_ms_mean
0,mid bench: two moderate datasets @mid,,,6,4569.916667,5470.825,0.0,97.1,53.883333,27.93
2,small bench: sequential lightweight datasets @...,"Dist, Tree, Matrix, NN, Opt",,20,3287.12,3762.88,0.0,91.95,54.69,14.085
3,small bench: sequential lightweight datasets @...,,,6,3006.833333,3549.775,0.0,93.166667,54.366667,15.031667
1,small bench: sequential lightweight datasets @...,Dist,,20,2996.505,3513.98,0.0,92.85,54.695,15.0675


In [11]:
# Quick comparison: WASM vs JS on the same scenario/dataset
# First take the mean runtime of every (scenario, datasetLabel, wasmFeatures)
# group, then spread the wasmFeatures values across columns so the variants
# of one scenario/dataset sit side by side.
compare = (
    rows_df
    .groupby(["scenario", "datasetLabel", "wasmFeatures"], dropna=False)["runtimeMs"]
    .mean()
    .rename("runtime_ms_mean")
    .reset_index()
)
pd.pivot_table(
    compare,
    values="runtime_ms_mean",
    index=["scenario", "datasetLabel"],
    columns="wasmFeatures",
)


Unnamed: 0_level_0,wasmFeatures,Dist,"Dist, Tree, Matrix, NN, Opt",None
scenario,datasetLabel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mid bench: two moderate datasets @mid,,,,4569.916667
small bench: sequential lightweight datasets @small,,2996.505,3287.12,3006.833333
