From 4453417ee53fc1a599cb98fb9ec0cc91fd827212 Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mykhailo.chalyi-contractor@procore.com>
Date: Mon, 25 May 2026 23:45:53 -0500
Subject: [PATCH 1/5] feat(site): add benches history page

---
 site/package.json                       |    2 +
 site/scripts/build-performance-data.mjs |  418 +++
 site/src/components/Header.astro        |    1 +
 site/src/content/home.ts                |    8 +
 site/src/data/performance-timeline.json | 4539 +++++++++++++++++++++++
 site/src/pages/benches.astro            |  836 +++++
 site/src/pages/index.astro              |    8 +-
 7 files changed, 5810 insertions(+), 2 deletions(-)
 create mode 100644 site/scripts/build-performance-data.mjs
 create mode 100644 site/src/data/performance-timeline.json
 create mode 100644 site/src/pages/benches.astro

diff --git a/site/package.json b/site/package.json
index 156532aa..3af29ba9 100644
--- a/site/package.json
+++ b/site/package.json
@@ -10,6 +10,8 @@
   },
   "scripts": {
     "dev": "astro dev",
+    "data:performance": "node scripts/build-performance-data.mjs",
+    "prebuild": "node scripts/build-performance-data.mjs",
     "build": "astro build",
     "postbuild": "node scripts/normalize-generated-html.mjs && node scripts/verify-doc-routes.mjs && node scripts/verify-doc-markdown-routes.mjs && node scripts/verify-public-links.mjs && node scripts/verify-sitemap.mjs && node scripts/verify-robots.mjs && node scripts/verify-agent-skills.mjs && node scripts/verify-link-headers.mjs",
     "preview": "wrangler dev",
diff --git a/site/scripts/build-performance-data.mjs b/site/scripts/build-performance-data.mjs
new file mode 100644
index 00000000..f6b016e4
--- /dev/null
+++ b/site/scripts/build-performance-data.mjs
@@ -0,0 +1,418 @@
+import { mkdir, readFile, readdir, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+// Decision: publish only aggregated history. Raw eval traces and per-iteration
+// benchmark samples are useful locally, but too large for the static site.
+const scriptDir = path.dirname(fileURLToPath(import.meta.url));
+const siteDir = path.resolve(scriptDir, "..");
+const repoRoot = path.resolve(siteDir, "..");
+
+const outputPath = path.join(siteDir, "src/data/performance-timeline.json");
+const benchDir = path.join(repoRoot, "crates/bashkit-bench/results");
+const criterionDir = path.join(repoRoot, "crates/bashkit/benches/results");
+const evalDir = path.join(repoRoot, "crates/bashkit-eval/results");
+
+function round(value, digits = 2) {
+  if (!Number.isFinite(value)) return null;
+  const scale = 10 ** digits;
+  return Math.round(value * scale) / scale;
+}
+
+function percentile(values, p) {
+  const sorted = values.filter(Number.isFinite).toSorted((a, b) => a - b);
+  if (sorted.length === 0) return null;
+  const index = (sorted.length - 1) * p;
+  const lower = Math.floor(index);
+  const upper = Math.ceil(index);
+  if (lower === upper) return sorted[lower];
+  return sorted[lower] + (sorted[upper] - sorted[lower]) * (index - lower);
+}
+
+function unixSecondsToIso(seconds) {
+  const n = Number(seconds);
+  if (!Number.isFinite(n) || n <= 0) return null;
+  return new Date(n * 1000).toISOString();
+}
+
+function dateLabel(iso) {
+  if (!iso) return "unknown";
+  return iso.slice(0, 10);
+}
+
+function parseJsonFileTimestamp(fileName) {
+  const isoMatch = fileName.match(/(\d{4}-\d{2}-\d{2})-(\d{6})/);
+  if (!isoMatch) return null;
+  const [, date, time] = isoMatch;
+  return `${date}T${time.slice(0, 2)}:${time.slice(2, 4)}:${time.slice(4, 6)}Z`;
+}
+
+function parseCriterionTimestamp(fileName, content) {
+  const contentMatch = content.match(/\*\*Timestamp\*\*:\s*([0-9]+)/);
+  if (contentMatch) return unixSecondsToIso(contentMatch[1]);
+  const fileMatch = fileName.match(/-([0-9]+)\.md$/);
+  return fileMatch ? unixSecondsToIso(fileMatch[1]) : null;
+}
+
+function parseTimeToUs(raw) {
+  if (typeof raw !== "string") return null;
+  const match = raw
+    .replaceAll("`", "")
+    .match(/([0-9]+(?:\.[0-9]+)?)\s*(ns|us|µs|ms|s)\b/i);
+  if (!match) return null;
+  const value = Number(match[1]);
+  const unit = match[2].toLowerCase();
+  if (unit === "ns") return value / 1000;
+  if (unit === "us" || unit === "µs") return value;
+  if (unit === "ms") return value * 1000;
+  if (unit === "s") return value * 1_000_000;
+  return null;
+}
+
+function parseMarkdownTables(content) {
+  const lines = content.split(/\r?\n/);
+  const rows = [];
+  let headers = null;
+
+  for (let i = 0; i < lines.length; i += 1) {
+    const line = lines[i].trim();
+    if (!line.startsWith("|") || !line.endsWith("|")) {
+      headers = null;
+      continue;
+    }
+
+    const cells = line
+      .slice(1, -1)
+      .split("|")
+      .map((cell) => cell.trim());
+
+    const next = lines[i + 1]?.trim() ?? "";
+    if (next.startsWith("|") && /^[-:|\s]+$/.test(next.slice(1, -1))) {
+      headers = cells.map((cell) => cell.toLowerCase());
+      i += 1;
+      continue;
+    }
+
+    if (headers && cells.length === headers.length) {
+      rows.push(Object.fromEntries(headers.map((header, index) => [header, cells[index]])));
+    }
+  }
+
+  return rows;
+}
+
+function parsePercent(raw) {
+  const match = raw?.match(/-?[0-9]+(?:\.[0-9]+)?/);
+  return match ? Number(match[0]) : null;
+}
+
+async function readJson(filePath) {
+  return JSON.parse(await readFile(filePath, "utf8"));
+}
+
+async function listFiles(dir, extension) {
+  return (await readdir(dir))
+    .filter((file) => file.endsWith(extension))
+    .toSorted((a, b) => a.localeCompare(b));
+}
+
+async function buildBenchRuns() {
+  const files = await listFiles(benchDir, ".json");
+  const runs = [];
+  // Builds one summary per bashkit-bench result JSON; files lacking bashkit or bash runner stats are skipped.
+  for (const file of files) {
+    const sourcePath = path.join(benchDir, file);
+    const data = await readJson(sourcePath);
+    const timestamp = unixSecondsToIso(data.timestamp) ?? parseJsonFileTimestamp(file); // file name is the fallback
+    const stats = data.summary?.runner_stats ?? {};
+    const bashkit = stats.bashkit;
+    const bash = stats.bash;
+    if (!bashkit || !bash) continue;
+    // Pair the runners' mean times (ns converted to ms) per category/case so each case can yield one ratio.
+    const categoryPairs = new Map();
+    for (const row of data.results ?? []) {
+      if (!row.category || !row.runner || !Number.isFinite(row.mean_ns)) continue;
+      const key = `${row.category}:${row.case_name}`;
+      const existing = categoryPairs.get(key) ?? { category: row.category };
+      existing[row.runner] = row.mean_ns / 1_000_000;
+      categoryPairs.set(key, existing);
+    }
+    // Keep only cases measured by both runners and collect their bash/bashkit ratios per category.
+    const byCategory = new Map();
+    for (const row of categoryPairs.values()) {
+      if (!Number.isFinite(row.bashkit) || !Number.isFinite(row.bash) || row.bashkit <= 0) {
+        continue;
+      }
+      const bucket = byCategory.get(row.category) ?? { ratios: [], cases: 0 };
+      bucket.ratios.push(row.bash / row.bashkit);
+      bucket.cases += 1;
+      byCategory.set(row.category, bucket);
+    }
+    // A category's speedup is the median ratio of its cases; categories are listed highest speedup first.
+    const categories = [...byCategory.entries()]
+      .map(([category, bucket]) => ({
+        category,
+        cases: bucket.cases,
+        speedup: round(percentile(bucket.ratios, 0.5), 1),
+      }))
+      .sort((a, b) => b.speedup - a.speedup);
+    // The overall speedup divides the runners' total times, so it can differ from the per-category medians.
+    const speedup = bashkit.total_time_ms > 0 ? bash.total_time_ms / bashkit.total_time_ms : null;
+    runs.push({
+      id: file.replace(/\.json$/, ""),
+      kind: "bashkit-bench",
+      label: data.moniker ?? data.system?.moniker ?? file,
+      date: dateLabel(timestamp),
+      timestamp,
+      source: `crates/bashkit-bench/results/${file}`,
+      cases: data.summary?.total_cases ?? categories.reduce((sum, item) => sum + item.cases, 0),
+      speedup: round(speedup, 1),
+      bashkitMs: round(bashkit.total_time_ms, 2),
+      bashMs: round(bash.total_time_ms, 2),
+      errorRate: round(bashkit.error_rate * 100, 2),
+      matchRate: round(bashkit.output_match_rate * 100, 2),
+      categories,
+    });
+  }
+  // Oldest run first.
+  return runs.toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
+}
+
+function criterionFamily(fileName) {
+  const base = fileName.replace(/^criterion-/, "").replace(/-[0-9]+\.md$/, "");
+  if (base.startsWith("hotpath")) return "hotpath";
+  if (base.startsWith("file_ops")) return "file-ops";
+  if (base.startsWith("parallel")) return "parallel";
+  if (base.startsWith("sqlite")) return "sqlite";
+  return base.split("-")[0] || "criterion";
+}
+
+async function buildCriterionRuns() {
+  const files = await listFiles(criterionDir, ".md");
+  const runs = [];
+
+  for (const file of files) {
+    if (file === "README.md") continue;
+    const sourcePath = path.join(criterionDir, file);
+    const content = await readFile(sourcePath, "utf8");
+    const timestamp = parseCriterionTimestamp(file, content);
+    const title = content.match(/^#\s+(.+)$/m)?.[1] ?? file;
+    const rows = parseMarkdownTables(content);
+
+    const changes = rows
+      .map((row) => parsePercent(row.change))
+      .filter((value) => Number.isFinite(value));
+    const timesUs = rows
+      .map((row) => parseTimeToUs(row["time (median)"] ?? row.time ?? row.after ?? row["after (µs)"]))
+      .filter((value) => Number.isFinite(value));
+
+    const fastestRow = rows
+      .map((row) => ({
+        name: row.benchmark ?? row.case ?? row["group / case"] ?? row.bench ?? "case",
+        us: parseTimeToUs(row["time (median)"] ?? row.time ?? row.after ?? row["after (µs)"]),
+      }))
+      .filter((row) => Number.isFinite(row.us))
+      .toSorted((a, b) => a.us - b.us)[0];
+
+    const bestChangeRow = rows
+      .map((row) => ({
+        name: row.bench ?? row.case ?? row.benchmark ?? "case",
+        change: parsePercent(row.change),
+      }))
+      .filter((row) => Number.isFinite(row.change))
+      .toSorted((a, b) => a.change - b.change)[0];
+
+    const summaryMedianMatch = content.match(/median change:\s*\*\*(-?[0-9.]+)%\*\*/i);
+    const summaryMeanMatch = content.match(/mean change:\s*\*\*(-?[0-9.]+)%\*\*/i);
+
+    runs.push({
+      id: file.replace(/\.md$/, ""),
+      kind: "criterion",
+      family: criterionFamily(file),
+      label: title,
+      date: dateLabel(timestamp),
+      timestamp,
+      source: `crates/bashkit/benches/results/${file}`,
+      cases: Math.max(changes.length, timesUs.length),
+      medianUs: round(percentile(timesUs, 0.5), 2),
+      p95Us: round(percentile(timesUs, 0.95), 2),
+      medianChangePct: round(
+        summaryMedianMatch ? Number(summaryMedianMatch[1]) : percentile(changes, 0.5),
+        1,
+      ),
+      meanChangePct: round(
+        summaryMeanMatch ? Number(summaryMeanMatch[1]) : changes.reduce((sum, n) => sum + n, 0) / changes.length,
+        1,
+      ),
+      bestChangePct: round(bestChangeRow?.change, 1),
+      fastestCase: fastestRow ? { name: fastestRow.name, us: round(fastestRow.us, 2) } : null,
+      bestImprovement: bestChangeRow
+        ? { name: bestChangeRow.name, changePct: round(bestChangeRow.change, 1) }
+        : null,
+    });
+  }
+
+  return runs.toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
+}
+
+async function buildEvalRuns() {
+  const files = await listFiles(evalDir, ".json");
+  const runs = [];
+
+  for (const file of files) {
+    const data = await readJson(path.join(evalDir, file));
+    const summary = data.summary;
+    if (!summary?.total_tasks || !Number.isFinite(summary.overall_rate)) continue;
+    const timestamp = data.timestamp ?? parseJsonFileTimestamp(file);
+    const categories = Object.entries(summary.by_category ?? {})
+      .map(([category, row]) => ({
+        category,
+        tasks: row.tasks,
+        passed: row.passed,
+        rate: round(row.rate * 100, 1),
+      }))
+      .sort((a, b) => a.rate - b.rate || b.tasks - a.tasks);
+
+    runs.push({
+      id: file.replace(/\.json$/, ""),
+      kind: file.startsWith("scripting-eval") ? "scripting-eval" : "llm-eval",
+      provider: data.provider ?? "unknown",
+      model: data.model ?? "unknown",
+      baseline: data.baseline ?? null,
+      label: `${data.provider ?? "unknown"}/${data.model ?? "unknown"}`,
+      date: dateLabel(timestamp),
+      timestamp,
+      source: `crates/bashkit-eval/results/${file}`,
+      tasks: summary.total_tasks,
+      passed: summary.total_passed,
+      scorePct: round(summary.overall_rate * 100, 1),
+      toolSuccessPct: round(summary.tool_call_success_rate * 100, 1),
+      avgTurns: round(summary.avg_turns_per_task, 2),
+      avgToolCalls: round(summary.avg_tool_calls_per_task, 2),
+      avgDurationMs: round(summary.avg_duration_ms, 0),
+      inputTokens: summary.total_input_tokens ?? null,
+      outputTokens: summary.total_output_tokens ?? null,
+      categories,
+    });
+  }
+
+  return runs.toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
+}
+
+function bestBy(items, score) {
+  return items.reduce((best, item) => {
+    if (!best) return item;
+    return score(item) > score(best) ? item : best;
+  }, null);
+}
+
+function latest(items) {
+  return items.toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp)).at(-1) ?? null;
+}
+
+function buildMilestones({ benchRuns, criterionRuns, evalRuns }) {
+  const points = [];
+
+  for (const run of benchRuns) {
+    points.push({
+      date: run.date,
+      timestamp: run.timestamp,
+      kind: "Benchmark",
+      title: `${run.speedup}x faster than bash`,
+      detail: `${run.cases} parity/perf cases on ${run.label}; output match ${run.matchRate}%.`,
+      metric: run.speedup,
+      source: run.source,
+    });
+  }
+
+  for (const run of criterionRuns) {
+    const improvement = run.bestImprovement
+      ? `${Math.abs(run.bestImprovement.changePct)}% faster in ${run.bestImprovement.name}`
+      : run.fastestCase
+        ? `${run.fastestCase.name} at ${run.fastestCase.us} us median`
+        : `${run.cases} criterion cases`;
+    points.push({
+      date: run.date,
+      timestamp: run.timestamp,
+      kind: "Criterion",
+      title: run.family,
+      detail: improvement,
+      metric: run.medianChangePct ?? run.medianUs,
+      source: run.source,
+    });
+  }
+
+  for (const run of evalRuns) {
+    if (run.tasks < 10 && !run.kind.includes("scripting")) continue;
+    const weakest = run.categories[0];
+    points.push({
+      date: run.date,
+      timestamp: run.timestamp,
+      kind: "Eval",
+      title: `${run.model}: ${run.scorePct}%`,
+      detail: `${run.passed}/${run.tasks} tasks passed. Weakest category: ${weakest?.category ?? "n/a"} (${weakest?.rate ?? "n/a"}%).`,
+      metric: run.scorePct,
+      source: run.source,
+    });
+  }
+
+  return points
+    .filter((point) => point.timestamp)
+    .toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
+}
+
+function buildModelTrends(evalRuns) {
+  const byModel = new Map();
+  for (const run of evalRuns.filter((item) => item.tasks >= 10)) {
+    const key = `${run.provider}/${run.model}`;
+    const bucket = byModel.get(key) ?? [];
+    bucket.push({ date: run.date, timestamp: run.timestamp, scorePct: run.scorePct, passed: run.passed, tasks: run.tasks });
+    byModel.set(key, bucket);
+  }
+
+  return [...byModel.entries()]
+    .map(([model, points]) => ({
+      model,
+      points: points.toSorted((a, b) => new Date(a.timestamp) - new Date(b.timestamp)),
+    }))
+    .sort((a, b) => a.model.localeCompare(b.model));
+}
+
+const benchRuns = await buildBenchRuns();
+const criterionRuns = await buildCriterionRuns();
+const evalRuns = await buildEvalRuns();
+const newestSourceTimestamp = latest([...benchRuns, ...criterionRuns, ...evalRuns])?.timestamp ?? null;
+
+const payload = {
+  generatedAt: newestSourceTimestamp,
+  sources: {
+    bench: "crates/bashkit-bench/results/*.json",
+    criterion: "crates/bashkit/benches/results/*.md",
+    evals: "crates/bashkit-eval/results/*.json",
+  },
+  summary: {
+    benchRuns: benchRuns.length,
+    criterionRuns: criterionRuns.length,
+    evalRuns: evalRuns.length,
+    latestBench: latest(benchRuns),
+    latestEval: latest(evalRuns.filter((run) => run.tasks >= 10)),
+    bestEval: bestBy(evalRuns.filter((run) => run.tasks >= 10), (run) => run.scorePct),
+    bestBenchmark: bestBy(benchRuns, (run) => run.speedup ?? 0),
+    bestCriterionImprovement: bestBy(
+      criterionRuns.filter((run) => Number.isFinite(run.bestChangePct)),
+      (run) => Math.abs(run.bestChangePct),
+    ),
+  },
+  benchRuns,
+  criterionRuns,
+  evalRuns,
+  modelTrends: buildModelTrends(evalRuns),
+  milestones: buildMilestones({ benchRuns, criterionRuns, evalRuns }),
+};
+
+await mkdir(path.dirname(outputPath), { recursive: true });
+await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}\n`);
+
+console.log(
+  `Wrote ${path.relative(repoRoot, outputPath)}: ${benchRuns.length} benchmark runs, ${criterionRuns.length} criterion runs, ${evalRuns.length} eval runs.`,
+);
diff --git a/site/src/components/Header.astro b/site/src/components/Header.astro
index 4352f64d..906d9c12 100644
--- a/site/src/components/Header.astro
+++ b/site/src/components/Header.astro
@@ -44,6 +44,7 @@
     <nav class="nav">
       <a href="/#features">Features</a>
       <a href="/#install">Install</a>
+      <a href="/benches">Benches</a>
       <a href="/docs">Docs</a>
       <a
         href="https://docs.rs/bashkit"
diff --git a/site/src/content/home.ts b/site/src/content/home.ts
index 77818cce..69faab91 100644
--- a/site/src/content/home.ts
+++ b/site/src/content/home.ts
@@ -40,6 +40,8 @@ export const evalSnapshot = {
   href: "https://github.com/everruns/bashkit/blob/main/crates/bashkit-eval/README.md",
 };
 
+export const benchesHref = "/benches";
+
 export const heroStats = [
   { label: "Built-in commands", value: "160", href: "/builtins" },
   {
@@ -283,6 +285,12 @@ export const resources = [
     href: "https://github.com/everruns/bashkit/blob/main/specs/threat-model.md",
     cta: "Security spec",
   },
+  {
+    title: "Benches history",
+    detail: "Interactive trends across benchmarks, criterion benches, and evals.",
+    href: benchesHref,
+    cta: "Benches",
+  },
   {
     title: "CLI reference",
     detail: "One-shot commands, script execution, and interactive shell usage.",
diff --git a/site/src/data/performance-timeline.json b/site/src/data/performance-timeline.json
new file mode 100644
index 00000000..36552373
--- /dev/null
+++ b/site/src/data/performance-timeline.json
@@ -0,0 +1,4539 @@
+{
+  "generatedAt": "2026-05-26T03:01:00.000Z",
+  "sources": {
+    "bench": "crates/bashkit-bench/results/*.json",
+    "criterion": "crates/bashkit/benches/results/*.md",
+    "evals": "crates/bashkit-eval/results/*.json"
+  },
+  "summary": {
+    "benchRuns": 6,
+    "criterionRuns": 5,
+    "evalRuns": 38,
+    "latestBench": {
+      "id": "bench-vm-linux-x86_64-1779764460",
+      "kind": "bashkit-bench",
+      "label": "vm-linux-x86_64",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T03:01:00.000Z",
+      "source": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.json",
+      "cases": 96,
+      "speedup": 20.9,
+      "bashkitMs": 42.95,
+      "bashMs": 898.83,
+      "errorRate": 0,
+      "matchRate": 100,
+      "categories": [
+        {
+          "category": "subshell",
+          "cases": 6,
+          "speedup": 40.3
+        },
+        {
+          "category": "io",
+          "cases": 6,
+          "speedup": 37.7
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 37
+        },
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 36.3
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 32.1
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 31.2
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 30.2
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 29
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 28.8
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 26.6
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 16.5
+        },
+        {
+          "category": "large",
+          "cases": 9,
+          "speedup": 4.4
+        }
+      ]
+    },
+    "latestEval": {
+      "id": "eval-openresponses-gpt-5.3-codex-2026-05-26-023642",
+      "kind": "llm-eval",
+      "provider": "openresponses",
+      "model": "gpt-5.3-codex",
+      "baseline": null,
+      "label": "openresponses/gpt-5.3-codex",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:36:42Z",
+      "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.json",
+      "tasks": 58,
+      "passed": 54,
+      "scorePct": 93,
+      "toolSuccessPct": 86.8,
+      "avgTurns": 2.97,
+      "avgToolCalls": 1.97,
+      "avgDurationMs": 14127,
+      "inputTokens": 91068,
+      "outputTokens": 48606,
+      "categories": [
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 66.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 5,
+          "rate": 68.6
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    "bestEval": {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-27-043856",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:56Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.json",
+      "tasks": 23,
+      "passed": 23,
+      "scorePct": 100,
+      "toolSuccessPct": 90.5,
+      "avgTurns": 4.65,
+      "avgToolCalls": 4.13,
+      "avgDurationMs": 15994,
+      "inputTokens": 143073,
+      "outputTokens": 16086,
+      "categories": [
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "json_processing",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 1,
+          "passed": 1,
+          "rate": 100
+        }
+      ]
+    },
+    "bestBenchmark": {
+      "id": "bench-runsc-linux-x86_64-1770093060",
+      "kind": "bashkit-bench",
+      "label": "runsc-linux-x86_64",
+      "date": "2026-02-03",
+      "timestamp": "2026-02-03T04:31:00.000Z",
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.json",
+      "cases": 75,
+      "speedup": 200.9,
+      "bashkitMs": 8.97,
+      "bashMs": 1802.42,
+      "errorRate": 0,
+      "matchRate": 89.33,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 367
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 239.9
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 216.4
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 201.4
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 177.6
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 172.3
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 162.2
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 153.8
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 131.5
+        }
+      ]
+    },
+    "bestCriterionImprovement": {
+      "id": "criterion-hotpath-perf-linux-x86_64-1779744742",
+      "kind": "criterion",
+      "family": "hotpath",
+      "label": "Hot-path Performance: Before / After",
+      "date": "2026-05-25",
+      "timestamp": "2026-05-25T21:32:22.000Z",
+      "source": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
+      "cases": 23,
+      "medianUs": null,
+      "p95Us": null,
+      "medianChangePct": -36.6,
+      "meanChangePct": -39.7,
+      "bestChangePct": -64.9,
+      "fastestCase": null,
+      "bestImprovement": {
+        "name": "startup/empty",
+        "changePct": -64.9
+      }
+    }
+  },
+  "benchRuns": [
+    {
+      "id": "bench-runsc-linux-x86_64-1769970640",
+      "kind": "bashkit-bench",
+      "label": "runsc-linux-x86_64",
+      "date": "2026-02-01",
+      "timestamp": "2026-02-01T18:30:40.000Z",
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1769970640.json",
+      "cases": 75,
+      "speedup": 0.4,
+      "bashkitMs": 4004.73,
+      "bashMs": 1663.26,
+      "errorRate": 5.33,
+      "matchRate": 80,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 3767.5
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 2401.9
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 1652.9
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 1611.5
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 1307.2
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 1260.4
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 958.1
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 725.9
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 408.7
+        }
+      ]
+    },
+    {
+      "id": "bench-runsc-linux-x86_64-1770093060",
+      "kind": "bashkit-bench",
+      "label": "runsc-linux-x86_64",
+      "date": "2026-02-03",
+      "timestamp": "2026-02-03T04:31:00.000Z",
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.json",
+      "cases": 75,
+      "speedup": 200.9,
+      "bashkitMs": 8.97,
+      "bashMs": 1802.42,
+      "errorRate": 0,
+      "matchRate": 89.33,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 367
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 239.9
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 216.4
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 201.4
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 177.6
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 172.3
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 162.2
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 153.8
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 131.5
+        }
+      ]
+    },
+    {
+      "id": "bench-none-linux-x86_64-1773464548",
+      "kind": "bashkit-bench",
+      "label": "none-linux-x86_64",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T05:02:28.000Z",
+      "source": "crates/bashkit-bench/results/bench-none-linux-x86_64-1773464548.json",
+      "cases": 96,
+      "speedup": 23.8,
+      "bashkitMs": 33.11,
+      "bashMs": 787.61,
+      "errorRate": 0,
+      "matchRate": 100,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 43.1
+        },
+        {
+          "category": "io",
+          "cases": 6,
+          "speedup": 31.8
+        },
+        {
+          "category": "subshell",
+          "cases": 6,
+          "speedup": 31.7
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 28.1
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 26.2
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 22.4
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 21.7
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 21.5
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 21.3
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 20
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 11.8
+        },
+        {
+          "category": "large",
+          "cases": 9,
+          "speedup": 3.7
+        }
+      ]
+    },
+    {
+      "id": "bench-runsc-linux-x86_64-1776121540",
+      "kind": "bashkit-bench",
+      "label": "runsc-linux-x86_64",
+      "date": "2026-04-13",
+      "timestamp": "2026-04-13T23:05:40.000Z",
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1776121540.json",
+      "cases": 96,
+      "speedup": 107.2,
+      "bashkitMs": 41.52,
+      "bashMs": 4449.32,
+      "errorRate": 0,
+      "matchRate": 100,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 220.4
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 167.5
+        },
+        {
+          "category": "io",
+          "cases": 6,
+          "speedup": 162.5
+        },
+        {
+          "category": "subshell",
+          "cases": 6,
+          "speedup": 133.5
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 116.7
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 98.9
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 93.5
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 89.4
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 89.3
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 88.5
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 72.3
+        },
+        {
+          "category": "large",
+          "cases": 9,
+          "speedup": 19.8
+        }
+      ]
+    },
+    {
+      "id": "bench-after-perf-linux-x86_64",
+      "kind": "bashkit-bench",
+      "label": "vm-linux-x86_64",
+      "date": "2026-05-25",
+      "timestamp": "2026-05-25T21:35:05.000Z",
+      "source": "crates/bashkit-bench/results/bench-after-perf-linux-x86_64.json",
+      "cases": 96,
+      "speedup": 25.4,
+      "bashkitMs": 43.16,
+      "bashMs": 1095.06,
+      "errorRate": 0,
+      "matchRate": 100,
+      "categories": [
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 83.1
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 62.9
+        },
+        {
+          "category": "subshell",
+          "cases": 6,
+          "speedup": 60.6
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 52.7
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 51.3
+        },
+        {
+          "category": "io",
+          "cases": 6,
+          "speedup": 50.6
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 49.8
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 49.7
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 43.5
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 36.8
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 22.3
+        },
+        {
+          "category": "large",
+          "cases": 9,
+          "speedup": 8.6
+        }
+      ]
+    },
+    {
+      "id": "bench-vm-linux-x86_64-1779764460",
+      "kind": "bashkit-bench",
+      "label": "vm-linux-x86_64",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T03:01:00.000Z",
+      "source": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.json",
+      "cases": 96,
+      "speedup": 20.9,
+      "bashkitMs": 42.95,
+      "bashMs": 898.83,
+      "errorRate": 0,
+      "matchRate": 100,
+      "categories": [
+        {
+          "category": "subshell",
+          "cases": 6,
+          "speedup": 40.3
+        },
+        {
+          "category": "io",
+          "cases": 6,
+          "speedup": 37.7
+        },
+        {
+          "category": "tools",
+          "cases": 21,
+          "speedup": 37
+        },
+        {
+          "category": "pipes",
+          "cases": 6,
+          "speedup": 36.3
+        },
+        {
+          "category": "startup",
+          "cases": 4,
+          "speedup": 32.1
+        },
+        {
+          "category": "strings",
+          "cases": 8,
+          "speedup": 31.2
+        },
+        {
+          "category": "variables",
+          "cases": 8,
+          "speedup": 30.2
+        },
+        {
+          "category": "arrays",
+          "cases": 6,
+          "speedup": 29
+        },
+        {
+          "category": "arithmetic",
+          "cases": 6,
+          "speedup": 28.8
+        },
+        {
+          "category": "control",
+          "cases": 9,
+          "speedup": 26.6
+        },
+        {
+          "category": "complex",
+          "cases": 7,
+          "speedup": 16.5
+        },
+        {
+          "category": "large",
+          "cases": 9,
+          "speedup": 4.4
+        }
+      ]
+    }
+  ],
+  "criterionRuns": [
+    {
+      "id": "criterion-parallel-(none)-linux-x86_64-1773469129",
+      "kind": "criterion",
+      "family": "parallel",
+      "label": "Criterion Parallel Execution Benchmark",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T06:18:49.000Z",
+      "source": "crates/bashkit/benches/results/criterion-parallel-(none)-linux-x86_64-1773469129.md",
+      "cases": 9,
+      "medianUs": 160.05,
+      "p95Us": 1122.24,
+      "medianChangePct": null,
+      "meanChangePct": null,
+      "bestChangePct": null,
+      "fastestCase": {
+        "name": "single_bash_new",
+        "us": 23.77
+      },
+      "bestImprovement": null
+    },
+    {
+      "id": "criterion-sqlite-vm-linux-x86_64-1777865268",
+      "kind": "criterion",
+      "family": "sqlite",
+      "label": "Criterion SQLite Builtin Benchmark",
+      "date": "2026-05-04",
+      "timestamp": "2026-05-04T03:27:48.000Z",
+      "source": "crates/bashkit/benches/results/criterion-sqlite-vm-linux-x86_64-1777865268.md",
+      "cases": 44,
+      "medianUs": 799.27,
+      "p95Us": 7677.82,
+      "medianChangePct": null,
+      "meanChangePct": null,
+      "bestChangePct": null,
+      "fastestCase": {
+        "name": "sqlite_query/aggregate_in_memory/1000",
+        "us": 739.46
+      },
+      "bestImprovement": null
+    },
+    {
+      "id": "criterion-hotpath-perf-linux-x86_64-1779744742",
+      "kind": "criterion",
+      "family": "hotpath",
+      "label": "Hot-path Performance: Before / After",
+      "date": "2026-05-25",
+      "timestamp": "2026-05-25T21:32:22.000Z",
+      "source": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
+      "cases": 23,
+      "medianUs": null,
+      "p95Us": null,
+      "medianChangePct": -36.6,
+      "meanChangePct": -39.7,
+      "bestChangePct": -64.9,
+      "fastestCase": null,
+      "bestImprovement": {
+        "name": "startup/empty",
+        "changePct": -64.9
+      }
+    },
+    {
+      "id": "criterion-file_ops-linux-x86_64-1779759850",
+      "kind": "criterion",
+      "family": "file-ops",
+      "label": "VFS / File-Ops Bench: Initial Baseline",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:44:10.000Z",
+      "source": "crates/bashkit/benches/results/criterion-file_ops-linux-x86_64-1779759850.md",
+      "cases": 11,
+      "medianUs": 2000,
+      "p95Us": 3590,
+      "medianChangePct": null,
+      "meanChangePct": null,
+      "bestChangePct": null,
+      "fastestCase": {
+        "name": "`for f in /work/d00/*` (shallow glob)",
+        "us": 267
+      },
+      "bestImprovement": null
+    },
+    {
+      "id": "criterion-hotpath-attrs+shopt-linux-x86_64-1779759850",
+      "kind": "criterion",
+      "family": "hotpath",
+      "label": "Hot-path Bench: Attributes + SHOPT Extensions",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:44:10.000Z",
+      "source": "crates/bashkit/benches/results/criterion-hotpath-attrs+shopt-linux-x86_64-1779759850.md",
+      "cases": 43,
+      "medianUs": 624,
+      "p95Us": 2713,
+      "medianChangePct": null,
+      "meanChangePct": null,
+      "bestChangePct": null,
+      "fastestCase": {
+        "name": "`startup/empty`",
+        "us": 34.4
+      },
+      "bestImprovement": null
+    }
+  ],
+  "evalRuns": [
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-2026-02-07-052023",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5",
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:20:23Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-07-052023.json",
+      "tasks": 25,
+      "passed": 19,
+      "scorePct": 91.5,
+      "toolSuccessPct": 80.2,
+      "avgTurns": 5.4,
+      "avgToolCalls": 4.64,
+      "avgDurationMs": 9881,
+      "inputTokens": 312188,
+      "outputTokens": 29393,
+      "categories": [
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 0,
+          "rate": 75
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-07-052037",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:20:37Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-07-052037.json",
+      "tasks": 25,
+      "passed": 19,
+      "scorePct": 86.8,
+      "toolSuccessPct": 57.1,
+      "avgTurns": 4.2,
+      "avgToolCalls": 3.36,
+      "avgDurationMs": 10059,
+      "inputTokens": 147871,
+      "outputTokens": 15067,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 56.3
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 66.7
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-07-052536",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:25:36Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-07-052536.json",
+      "tasks": 25,
+      "passed": 17,
+      "scorePct": 86.8,
+      "toolSuccessPct": 75.2,
+      "avgTurns": 6.2,
+      "avgToolCalls": 5.64,
+      "avgDurationMs": 22480,
+      "inputTokens": 319405,
+      "outputTokens": 27106,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 0,
+          "rate": 56.3
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 71.4
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 80
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 86.7
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-2026-02-08-061414",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5",
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:14:14Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-08-061414.json",
+      "tasks": 25,
+      "passed": 23,
+      "scorePct": 98.1,
+      "toolSuccessPct": 87.1,
+      "avgTurns": 4.48,
+      "avgToolCalls": 3.72,
+      "avgDurationMs": 6893,
+      "inputTokens": 166538,
+      "outputTokens": 19473,
+      "categories": [
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.3
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-08-061445",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:14:45Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-08-061445.json",
+      "tasks": 25,
+      "passed": 18,
+      "scorePct": 81.1,
+      "toolSuccessPct": 77.7,
+      "avgTurns": 4.84,
+      "avgToolCalls": 4.12,
+      "avgDurationMs": 8202,
+      "inputTokens": 84322,
+      "outputTokens": 9621,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 53.3
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 62.5
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 80
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 85.7
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-08-062003",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:20:03Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-08-062003.json",
+      "tasks": 25,
+      "passed": 21,
+      "scorePct": 93.4,
+      "toolSuccessPct": 87.4,
+      "avgTurns": 6.12,
+      "avgToolCalls": 5.72,
+      "avgDurationMs": 20885,
+      "inputTokens": 242338,
+      "outputTokens": 26325,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 68.8
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.3
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "jq_mastery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-09-054424",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T05:44:24Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-09-054424.json",
+      "tasks": 37,
+      "passed": 23,
+      "scorePct": 79.9,
+      "toolSuccessPct": 71.3,
+      "avgTurns": 3.78,
+      "avgToolCalls": 2.92,
+      "avgDurationMs": 7805,
+      "inputTokens": 119122,
+      "outputTokens": 16864,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 16.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 1,
+          "rate": 52.6
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 66.7
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 69.2
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 1,
+          "rate": 80
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 5,
+          "rate": 88.7
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 89.7
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-20251001-2026-02-09-054558",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5-20251001",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5-20251001",
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T05:45:58Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-09-054558.json",
+      "tasks": 37,
+      "passed": 32,
+      "scorePct": 94.6,
+      "toolSuccessPct": 80.7,
+      "avgTurns": 4.84,
+      "avgToolCalls": 4.05,
+      "avgDurationMs": 10345,
+      "inputTokens": 285917,
+      "outputTokens": 35290,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 91.7
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 92.3
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 92.5
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 93.1
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 94.7
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-09-142736",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T14:27:36Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-09-142736.json",
+      "tasks": 37,
+      "passed": 29,
+      "scorePct": 87,
+      "toolSuccessPct": 82.3,
+      "avgTurns": 5.57,
+      "avgToolCalls": 5.35,
+      "avgDurationMs": 40907,
+      "inputTokens": 315328,
+      "outputTokens": 30847,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 1,
+          "rate": 54.2
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 69.2
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 3,
+          "rate": 89.7
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 90.6
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 94.7
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-20250514-2026-02-17-230312",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-20250514",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-20250514",
+      "date": "2026-02-17",
+      "timestamp": "2026-02-17T23:03:12Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-230312.json",
+      "tasks": 3,
+      "passed": 3,
+      "scorePct": 100,
+      "toolSuccessPct": 100,
+      "avgTurns": 2.33,
+      "avgToolCalls": 1.33,
+      "avgDurationMs": 4522,
+      "inputTokens": 4468,
+      "outputTokens": 489,
+      "categories": [
+        {
+          "category": "basic",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-20250514-2026-02-17-231336",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-20250514",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-20250514",
+      "date": "2026-02-17",
+      "timestamp": "2026-02-17T23:13:36Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-231336.json",
+      "tasks": 37,
+      "passed": 32,
+      "scorePct": 92.9,
+      "toolSuccessPct": 89,
+      "avgTurns": 5.73,
+      "avgToolCalls": 4.92,
+      "avgDurationMs": 16511,
+      "inputTokens": 248295,
+      "outputTokens": 30238,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 62.5
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 92.3
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 94.7
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 96.2
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-20251001-2026-02-25-044801",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5-20251001",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5-20251001",
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:48:01Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-25-044801.json",
+      "tasks": 37,
+      "passed": 35,
+      "scorePct": 97.8,
+      "toolSuccessPct": 96.2,
+      "avgTurns": 3.76,
+      "avgToolCalls": 2.81,
+      "avgDurationMs": 5195,
+      "inputTokens": 171357,
+      "outputTokens": 21399,
+      "categories": [
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 89.5
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 96.2
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-25-044904",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:49:04Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-25-044904.json",
+      "tasks": 37,
+      "passed": 27,
+      "scorePct": 86.4,
+      "toolSuccessPct": 72.5,
+      "avgTurns": 3.65,
+      "avgToolCalls": 2.76,
+      "avgDurationMs": 6598,
+      "inputTokens": 87447,
+      "outputTokens": 13514,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 50
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 63.2
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 69.2
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 89.7
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 90
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 92.5
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-20250514-2026-02-25-045328",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-20250514",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-20250514",
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:53:28Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-25-045328.json",
+      "tasks": 37,
+      "passed": 34,
+      "scorePct": 97.3,
+      "toolSuccessPct": 95.4,
+      "avgTurns": 4.97,
+      "avgToolCalls": 4.08,
+      "avgDurationMs": 14049,
+      "inputTokens": 196792,
+      "outputTokens": 24758,
+      "categories": [
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 84.2
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 96.2
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-25-045611",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:56:11Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-25-045611.json",
+      "tasks": 37,
+      "passed": 33,
+      "scorePct": 92.9,
+      "toolSuccessPct": 89.9,
+      "avgTurns": 4.97,
+      "avgToolCalls": 4.57,
+      "avgDurationMs": 18138,
+      "inputTokens": 275924,
+      "outputTokens": 32578,
+      "categories": [
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 70.8
+        },
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 89.5
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 92.5
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-20251001-2026-02-27-040636",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5-20251001",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5-20251001",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:06:36Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-27-040636.json",
+      "tasks": 52,
+      "passed": 43,
+      "scorePct": 91.7,
+      "toolSuccessPct": 92.8,
+      "avgTurns": 5.08,
+      "avgToolCalls": 4.29,
+      "avgDurationMs": 8366,
+      "inputTokens": 397596,
+      "outputTokens": 46276,
+      "categories": [
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 70.8
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 75
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 4,
+          "rate": 86.5
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 92.3
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 93.3
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 94.6
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 95
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-27-043813",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:13Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-27-043813.json",
+      "tasks": 52,
+      "passed": 32,
+      "scorePct": 79.4,
+      "toolSuccessPct": 88.2,
+      "avgTurns": 3.38,
+      "avgToolCalls": 2.44,
+      "avgDurationMs": 6753,
+      "inputTokens": 123013,
+      "outputTokens": 20725,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 53.8
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 3,
+          "rate": 64.9
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 2,
+          "rate": 65
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 66.7
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 78.6
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 4,
+          "rate": 80
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 86.7
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 4,
+          "rate": 89.5
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 6,
+          "rate": 94.6
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-6-2026-02-27-043854",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-6",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:54Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-27-043854.json",
+      "tasks": 26,
+      "passed": 23,
+      "scorePct": 93.9,
+      "toolSuccessPct": 86.5,
+      "avgTurns": 4.5,
+      "avgToolCalls": 4,
+      "avgDurationMs": 15066,
+      "inputTokens": 211595,
+      "outputTokens": 27426,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 57.1
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "json_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-27-043856",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:56Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.json",
+      "tasks": 23,
+      "passed": 23,
+      "scorePct": 100,
+      "toolSuccessPct": 90.5,
+      "avgTurns": 4.65,
+      "avgToolCalls": 4.13,
+      "avgDurationMs": 15994,
+      "inputTokens": 143073,
+      "outputTokens": 16086,
+      "categories": [
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "scripting",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "json_processing",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 1,
+          "passed": 1,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-responses-gpt-5.3-codex-2026-02-27-055543",
+      "kind": "llm-eval",
+      "provider": "openai-responses",
+      "model": "gpt-5.3-codex",
+      "baseline": null,
+      "label": "openai-responses/gpt-5.3-codex",
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T05:55:43Z",
+      "source": "crates/bashkit-eval/results/eval-openai-responses-gpt-5.3-codex-2026-02-27-055543.json",
+      "tasks": 37,
+      "passed": 30,
+      "scorePct": 93,
+      "toolSuccessPct": 71.6,
+      "avgTurns": 3.46,
+      "avgToolCalls": 2.57,
+      "avgDurationMs": 17155,
+      "inputTokens": 96511,
+      "outputTokens": 32865,
+      "categories": [
+        {
+          "category": "scripting",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 78.9
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 85.7
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 5,
+          "passed": 3,
+          "rate": 86.2
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "file_operations",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 93.8
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.2-2026-02-28-204052",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.2",
+      "baseline": null,
+      "label": "openai/gpt-5.2",
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:40:52Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-28-204052.json",
+      "tasks": 58,
+      "passed": 41,
+      "scorePct": 77.3,
+      "toolSuccessPct": 67.3,
+      "avgTurns": 3.55,
+      "avgToolCalls": 2.69,
+      "avgDurationMs": 7194,
+      "inputTokens": 200797,
+      "outputTokens": 28751,
+      "categories": [
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 7.1
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 33.3
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 36.4
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 3,
+          "rate": 45.9
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 3,
+          "rate": 64.9
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 70.8
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 85.7
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 90
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 96.4
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 97.1
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-20251001-2026-02-28-204232",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5-20251001",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5-20251001",
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:42:32Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-28-204232.json",
+      "tasks": 58,
+      "passed": 54,
+      "scorePct": 97.2,
+      "toolSuccessPct": 87.8,
+      "avgTurns": 4.9,
+      "avgToolCalls": 4.1,
+      "avgDurationMs": 8912,
+      "inputTokens": 546648,
+      "outputTokens": 69289,
+      "categories": [
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 64.3
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 91.7
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 91.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 6,
+          "rate": 97.3
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openresponses-gpt-5.3-codex-2026-02-28-205331",
+      "kind": "llm-eval",
+      "provider": "openresponses",
+      "model": "gpt-5.3-codex",
+      "baseline": null,
+      "label": "openresponses/gpt-5.3-codex",
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:53:31Z",
+      "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-02-28-205331.json",
+      "tasks": 58,
+      "passed": 51,
+      "scorePct": 91,
+      "toolSuccessPct": 82.8,
+      "avgTurns": 4.1,
+      "avgToolCalls": 3.21,
+      "avgDurationMs": 20302,
+      "inputTokens": 238948,
+      "outputTokens": 68519,
+      "categories": [
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 21.4
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 66.7
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 72.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 6,
+          "rate": 89.2
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 4,
+          "rate": 91.4
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-6-2026-02-28-205358",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-6",
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:53:58Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-28-205358.json",
+      "tasks": 58,
+      "passed": 50,
+      "scorePct": 91,
+      "toolSuccessPct": 87.7,
+      "avgTurns": 4.98,
+      "avgToolCalls": 4.64,
+      "avgDurationMs": 20817,
+      "inputTokens": 517521,
+      "outputTokens": 61283,
+      "categories": [
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 57.1
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 63.6
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 70.8
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 83.3
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 4,
+          "rate": 86.5
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 6,
+          "rate": 91.9
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 7,
+          "rate": 96.4
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-6-2026-02-28-211120",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-6",
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T21:11:20Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-28-211120.json",
+      "tasks": 58,
+      "passed": 48,
+      "scorePct": 92.5,
+      "toolSuccessPct": 85.1,
+      "avgTurns": 5.16,
+      "avgToolCalls": 4.5,
+      "avgDurationMs": 21165,
+      "inputTokens": 561128,
+      "outputTokens": 67103,
+      "categories": [
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 33.3
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 57.1
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 64.3
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 72.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 4,
+          "rate": 86.5
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 87.5
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 95
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174422",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": false,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:22Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174422.json",
+      "tasks": 4,
+      "passed": 3,
+      "scorePct": 93.1,
+      "toolSuccessPct": 92.9,
+      "avgTurns": 3.75,
+      "avgToolCalls": 3.5,
+      "avgDurationMs": 9262,
+      "inputTokens": 21808,
+      "outputTokens": 1830,
+      "categories": [
+        {
+          "category": "many_tools",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 93.1
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174433",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": false,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:33Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174433.json",
+      "tasks": 3,
+      "passed": 3,
+      "scorePct": 100,
+      "toolSuccessPct": 75,
+      "avgTurns": 2.33,
+      "avgToolCalls": 1.33,
+      "avgDurationMs": 3586,
+      "inputTokens": 5250,
+      "outputTokens": 540,
+      "categories": [
+        {
+          "category": "large_output",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174446",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": false,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:46Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174446.json",
+      "tasks": 3,
+      "passed": 2,
+      "scorePct": 84.6,
+      "toolSuccessPct": 100,
+      "avgTurns": 2,
+      "avgToolCalls": 1,
+      "avgDurationMs": 4256,
+      "inputTokens": 4136,
+      "outputTokens": 779,
+      "categories": [
+        {
+          "category": "paginated_responses",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 84.6
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174458",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": false,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:58Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174458.json",
+      "tasks": 4,
+      "passed": 0,
+      "scorePct": 75,
+      "toolSuccessPct": 100,
+      "avgTurns": 2.25,
+      "avgToolCalls": 1.5,
+      "avgDurationMs": 2898,
+      "inputTokens": 7151,
+      "outputTokens": 471,
+      "categories": [
+        {
+          "category": "discovery",
+          "tasks": 4,
+          "passed": 0,
+          "rate": 75
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174521",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": true,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:21Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174521.json",
+      "tasks": 4,
+      "passed": 3,
+      "scorePct": 96.6,
+      "toolSuccessPct": 100,
+      "avgTurns": 3,
+      "avgToolCalls": 3.75,
+      "avgDurationMs": 5574,
+      "inputTokens": 12002,
+      "outputTokens": 1090,
+      "categories": [
+        {
+          "category": "many_tools",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 96.6
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174530",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": true,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:30Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174530.json",
+      "tasks": 3,
+      "passed": 2,
+      "scorePct": 90,
+      "toolSuccessPct": 100,
+      "avgTurns": 2,
+      "avgToolCalls": 1,
+      "avgDurationMs": 2889,
+      "inputTokens": 8019,
+      "outputTokens": 336,
+      "categories": [
+        {
+          "category": "large_output",
+          "tasks": 3,
+          "passed": 2,
+          "rate": 90
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174541",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-5.4",
+      "baseline": true,
+      "label": "openai/gpt-5.4",
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:41Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174541.json",
+      "tasks": 3,
+      "passed": 3,
+      "scorePct": 100,
+      "toolSuccessPct": 100,
+      "avgTurns": 3.33,
+      "avgToolCalls": 6.33,
+      "avgDurationMs": 3626,
+      "inputTokens": 5673,
+      "outputTokens": 472,
+      "categories": [
+        {
+          "category": "paginated_responses",
+          "tasks": 3,
+          "passed": 3,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "scripting-eval-scripted-openai-gpt-4o-2026-03-24-003610",
+      "kind": "scripting-eval",
+      "provider": "openai",
+      "model": "gpt-4o",
+      "baseline": false,
+      "label": "openai/gpt-4o",
+      "date": "2026-03-24",
+      "timestamp": "2026-03-24T00:36:10Z",
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-4o-2026-03-24-003610.json",
+      "tasks": 4,
+      "passed": 3,
+      "scorePct": 91.7,
+      "toolSuccessPct": 100,
+      "avgTurns": 4,
+      "avgToolCalls": 3.25,
+      "avgDurationMs": 5151,
+      "inputTokens": 12253,
+      "outputTokens": 516,
+      "categories": [
+        {
+          "category": "discovery",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 91.7
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-haiku-4-5-20251001-2026-05-26-012523",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-haiku-4-5-20251001",
+      "baseline": null,
+      "label": "anthropic/claude-haiku-4-5-20251001",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:25:23Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-05-26-012523.json",
+      "tasks": 58,
+      "passed": 54,
+      "scorePct": 98.4,
+      "toolSuccessPct": 92.3,
+      "avgTurns": 4.28,
+      "avgToolCalls": 3.36,
+      "avgDurationMs": 8262,
+      "inputTokens": 372316,
+      "outputTokens": 53645,
+      "categories": [
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 91.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 5,
+          "rate": 94.3
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 97.1
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-sonnet-4-6-2026-05-26-014508",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-sonnet-4-6",
+      "baseline": null,
+      "label": "anthropic/claude-sonnet-4-6",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:45:08Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-05-26-014508.json",
+      "tasks": 58,
+      "passed": 49,
+      "scorePct": 94,
+      "toolSuccessPct": 91,
+      "avgTurns": 4.12,
+      "avgToolCalls": 3.24,
+      "avgDurationMs": 20426,
+      "inputTokens": 413200,
+      "outputTokens": 68169,
+      "categories": [
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 64.3
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 0,
+          "rate": 84.6
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 3,
+          "rate": 85
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 89.3
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 5,
+          "rate": 91.4
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 4,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-anthropic-claude-opus-4-7-2026-05-26-020742",
+      "kind": "llm-eval",
+      "provider": "anthropic",
+      "model": "claude-opus-4-7",
+      "baseline": null,
+      "label": "anthropic/claude-opus-4-7",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:07:42Z",
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-7-2026-05-26-020742.json",
+      "tasks": 58,
+      "passed": 56,
+      "scorePct": 97.8,
+      "toolSuccessPct": 90.3,
+      "avgTurns": 3.95,
+      "avgToolCalls": 3.02,
+      "avgDurationMs": 23349,
+      "inputTokens": 439514,
+      "outputTokens": 62545,
+      "categories": [
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 64.3
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 91.7
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 7,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openai-gpt-5.5-2026-05-26-021853",
+      "kind": "llm-eval",
+      "provider": "openai",
+      "model": "gpt-5.5",
+      "baseline": null,
+      "label": "openai/gpt-5.5",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:18:53Z",
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.5-2026-05-26-021853.json",
+      "tasks": 58,
+      "passed": 50,
+      "scorePct": 92.7,
+      "toolSuccessPct": 91.5,
+      "avgTurns": 3.02,
+      "avgToolCalls": 2.03,
+      "avgDurationMs": 11560,
+      "inputTokens": 117599,
+      "outputTokens": 32240,
+      "categories": [
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 2,
+          "rate": 66.7
+        },
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 66.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 6,
+          "rate": 88.6
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 4,
+          "rate": 90
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 5,
+          "rate": 91.4
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 6,
+          "rate": 92.9
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    },
+    {
+      "id": "eval-openresponses-gpt-5.3-codex-2026-05-26-023642",
+      "kind": "llm-eval",
+      "provider": "openresponses",
+      "model": "gpt-5.3-codex",
+      "baseline": null,
+      "label": "openresponses/gpt-5.3-codex",
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:36:42Z",
+      "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.json",
+      "tasks": 58,
+      "passed": 54,
+      "scorePct": 93,
+      "toolSuccessPct": 86.8,
+      "avgTurns": 2.97,
+      "avgToolCalls": 1.97,
+      "avgDurationMs": 14127,
+      "inputTokens": 91068,
+      "outputTokens": 48606,
+      "categories": [
+        {
+          "category": "system_info",
+          "tasks": 2,
+          "passed": 1,
+          "rate": 50
+        },
+        {
+          "category": "file_operations",
+          "tasks": 4,
+          "passed": 3,
+          "rate": 66.7
+        },
+        {
+          "category": "scripting",
+          "tasks": 7,
+          "passed": 5,
+          "rate": 68.6
+        },
+        {
+          "category": "json_processing",
+          "tasks": 8,
+          "passed": 8,
+          "rate": 100
+        },
+        {
+          "category": "data_transformation",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "complex_tasks",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "text_processing",
+          "tasks": 6,
+          "passed": 6,
+          "rate": 100
+        },
+        {
+          "category": "pipelines",
+          "tasks": 5,
+          "passed": 5,
+          "rate": 100
+        },
+        {
+          "category": "database_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "build_simulation",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "archive_operations",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "config_management",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "environment",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "error_recovery",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        },
+        {
+          "category": "code_search",
+          "tasks": 2,
+          "passed": 2,
+          "rate": 100
+        }
+      ]
+    }
+  ],
+  "modelTrends": [
+    {
+      "model": "anthropic/claude-haiku-4-5",
+      "points": [
+        {
+          "date": "2026-02-07",
+          "timestamp": "2026-02-07T05:20:23Z",
+          "scorePct": 91.5,
+          "passed": 19,
+          "tasks": 25
+        },
+        {
+          "date": "2026-02-08",
+          "timestamp": "2026-02-08T06:14:14Z",
+          "scorePct": 98.1,
+          "passed": 23,
+          "tasks": 25
+        }
+      ]
+    },
+    {
+      "model": "anthropic/claude-haiku-4-5-20251001",
+      "points": [
+        {
+          "date": "2026-02-09",
+          "timestamp": "2026-02-09T05:45:58Z",
+          "scorePct": 94.6,
+          "passed": 32,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-25",
+          "timestamp": "2026-02-25T04:48:01Z",
+          "scorePct": 97.8,
+          "passed": 35,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-27",
+          "timestamp": "2026-02-27T04:06:36Z",
+          "scorePct": 91.7,
+          "passed": 43,
+          "tasks": 52
+        },
+        {
+          "date": "2026-02-28",
+          "timestamp": "2026-02-28T20:42:32Z",
+          "scorePct": 97.2,
+          "passed": 54,
+          "tasks": 58
+        },
+        {
+          "date": "2026-05-26",
+          "timestamp": "2026-05-26T01:25:23Z",
+          "scorePct": 98.4,
+          "passed": 54,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "anthropic/claude-opus-4-6",
+      "points": [
+        {
+          "date": "2026-02-07",
+          "timestamp": "2026-02-07T05:25:36Z",
+          "scorePct": 86.8,
+          "passed": 17,
+          "tasks": 25
+        },
+        {
+          "date": "2026-02-08",
+          "timestamp": "2026-02-08T06:20:03Z",
+          "scorePct": 93.4,
+          "passed": 21,
+          "tasks": 25
+        },
+        {
+          "date": "2026-02-09",
+          "timestamp": "2026-02-09T14:27:36Z",
+          "scorePct": 87,
+          "passed": 29,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-25",
+          "timestamp": "2026-02-25T04:56:11Z",
+          "scorePct": 92.9,
+          "passed": 33,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-27",
+          "timestamp": "2026-02-27T04:38:56Z",
+          "scorePct": 100,
+          "passed": 23,
+          "tasks": 23
+        },
+        {
+          "date": "2026-02-28",
+          "timestamp": "2026-02-28T20:53:58Z",
+          "scorePct": 91,
+          "passed": 50,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "anthropic/claude-opus-4-7",
+      "points": [
+        {
+          "date": "2026-05-26",
+          "timestamp": "2026-05-26T02:07:42Z",
+          "scorePct": 97.8,
+          "passed": 56,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "anthropic/claude-sonnet-4-20250514",
+      "points": [
+        {
+          "date": "2026-02-17",
+          "timestamp": "2026-02-17T23:13:36Z",
+          "scorePct": 92.9,
+          "passed": 32,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-25",
+          "timestamp": "2026-02-25T04:53:28Z",
+          "scorePct": 97.3,
+          "passed": 34,
+          "tasks": 37
+        }
+      ]
+    },
+    {
+      "model": "anthropic/claude-sonnet-4-6",
+      "points": [
+        {
+          "date": "2026-02-27",
+          "timestamp": "2026-02-27T04:38:54Z",
+          "scorePct": 93.9,
+          "passed": 23,
+          "tasks": 26
+        },
+        {
+          "date": "2026-02-28",
+          "timestamp": "2026-02-28T21:11:20Z",
+          "scorePct": 92.5,
+          "passed": 48,
+          "tasks": 58
+        },
+        {
+          "date": "2026-05-26",
+          "timestamp": "2026-05-26T01:45:08Z",
+          "scorePct": 94,
+          "passed": 49,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "openai-responses/gpt-5.3-codex",
+      "points": [
+        {
+          "date": "2026-02-27",
+          "timestamp": "2026-02-27T05:55:43Z",
+          "scorePct": 93,
+          "passed": 30,
+          "tasks": 37
+        }
+      ]
+    },
+    {
+      "model": "openai/gpt-5.2",
+      "points": [
+        {
+          "date": "2026-02-07",
+          "timestamp": "2026-02-07T05:20:37Z",
+          "scorePct": 86.8,
+          "passed": 19,
+          "tasks": 25
+        },
+        {
+          "date": "2026-02-08",
+          "timestamp": "2026-02-08T06:14:45Z",
+          "scorePct": 81.1,
+          "passed": 18,
+          "tasks": 25
+        },
+        {
+          "date": "2026-02-09",
+          "timestamp": "2026-02-09T05:44:24Z",
+          "scorePct": 79.9,
+          "passed": 23,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-25",
+          "timestamp": "2026-02-25T04:49:04Z",
+          "scorePct": 86.4,
+          "passed": 27,
+          "tasks": 37
+        },
+        {
+          "date": "2026-02-27",
+          "timestamp": "2026-02-27T04:38:13Z",
+          "scorePct": 79.4,
+          "passed": 32,
+          "tasks": 52
+        },
+        {
+          "date": "2026-02-28",
+          "timestamp": "2026-02-28T20:40:52Z",
+          "scorePct": 77.3,
+          "passed": 41,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "openai/gpt-5.5",
+      "points": [
+        {
+          "date": "2026-05-26",
+          "timestamp": "2026-05-26T02:18:53Z",
+          "scorePct": 92.7,
+          "passed": 50,
+          "tasks": 58
+        }
+      ]
+    },
+    {
+      "model": "openresponses/gpt-5.3-codex",
+      "points": [
+        {
+          "date": "2026-02-28",
+          "timestamp": "2026-02-28T20:53:31Z",
+          "scorePct": 91,
+          "passed": 51,
+          "tasks": 58
+        },
+        {
+          "date": "2026-05-26",
+          "timestamp": "2026-05-26T02:36:42Z",
+          "scorePct": 93,
+          "passed": 54,
+          "tasks": 58
+        }
+      ]
+    }
+  ],
+  "milestones": [
+    {
+      "date": "2026-02-01",
+      "timestamp": "2026-02-01T18:30:40.000Z",
+      "kind": "Benchmark",
+      "title": "0.4x faster than bash",
+      "detail": "75 parity/perf cases on runsc-linux-x86_64; output match 80%.",
+      "metric": 0.4,
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1769970640.json"
+    },
+    {
+      "date": "2026-02-03",
+      "timestamp": "2026-02-03T04:31:00.000Z",
+      "kind": "Benchmark",
+      "title": "200.9x faster than bash",
+      "detail": "75 parity/perf cases on runsc-linux-x86_64; output match 89.33%.",
+      "metric": 200.9,
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.json"
+    },
+    {
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:20:23Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5: 91.5%",
+      "detail": "19/25 tasks passed. Weakest category: text_processing (50%).",
+      "metric": 91.5,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-07-052023.json"
+    },
+    {
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:20:37Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 86.8%",
+      "detail": "19/25 tasks passed. Weakest category: complex_tasks (56.3%).",
+      "metric": 86.8,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-07-052037.json"
+    },
+    {
+      "date": "2026-02-07",
+      "timestamp": "2026-02-07T05:25:36Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 86.8%",
+      "detail": "17/25 tasks passed. Weakest category: complex_tasks (56.3%).",
+      "metric": 86.8,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-07-052536.json"
+    },
+    {
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:14:14Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5: 98.1%",
+      "detail": "23/25 tasks passed. Weakest category: scripting (93.3%).",
+      "metric": 98.1,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-08-061414.json"
+    },
+    {
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:14:45Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 81.1%",
+      "detail": "18/25 tasks passed. Weakest category: archive_operations (50%).",
+      "metric": 81.1,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-08-061445.json"
+    },
+    {
+      "date": "2026-02-08",
+      "timestamp": "2026-02-08T06:20:03Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 93.4%",
+      "detail": "21/25 tasks passed. Weakest category: complex_tasks (68.8%).",
+      "metric": 93.4,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-08-062003.json"
+    },
+    {
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T05:44:24Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 79.9%",
+      "detail": "23/37 tasks passed. Weakest category: archive_operations (16.7%).",
+      "metric": 79.9,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-09-054424.json"
+    },
+    {
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T05:45:58Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5-20251001: 94.6%",
+      "detail": "32/37 tasks passed. Weakest category: complex_tasks (91.7%).",
+      "metric": 94.6,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-09-054558.json"
+    },
+    {
+      "date": "2026-02-09",
+      "timestamp": "2026-02-09T14:27:36Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 87%",
+      "detail": "29/37 tasks passed. Weakest category: complex_tasks (54.2%).",
+      "metric": 87,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-09-142736.json"
+    },
+    {
+      "date": "2026-02-17",
+      "timestamp": "2026-02-17T23:13:36Z",
+      "kind": "Eval",
+      "title": "claude-sonnet-4-20250514: 92.9%",
+      "detail": "32/37 tasks passed. Weakest category: complex_tasks (62.5%).",
+      "metric": 92.9,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-231336.json"
+    },
+    {
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:48:01Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5-20251001: 97.8%",
+      "detail": "35/37 tasks passed. Weakest category: scripting (89.5%).",
+      "metric": 97.8,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-25-044801.json"
+    },
+    {
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:49:04Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 86.4%",
+      "detail": "27/37 tasks passed. Weakest category: archive_operations (50%).",
+      "metric": 86.4,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-25-044904.json"
+    },
+    {
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:53:28Z",
+      "kind": "Eval",
+      "title": "claude-sonnet-4-20250514: 97.3%",
+      "detail": "34/37 tasks passed. Weakest category: scripting (84.2%).",
+      "metric": 97.3,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-25-045328.json"
+    },
+    {
+      "date": "2026-02-25",
+      "timestamp": "2026-02-25T04:56:11Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 92.9%",
+      "detail": "33/37 tasks passed. Weakest category: complex_tasks (70.8%).",
+      "metric": 92.9,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-25-045611.json"
+    },
+    {
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:06:36Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5-20251001: 91.7%",
+      "detail": "43/52 tasks passed. Weakest category: file_operations (70.8%).",
+      "metric": 91.7,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-27-040636.json"
+    },
+    {
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:13Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 79.4%",
+      "detail": "32/52 tasks passed. Weakest category: archive_operations (50%).",
+      "metric": 79.4,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-27-043813.json"
+    },
+    {
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:54Z",
+      "kind": "Eval",
+      "title": "claude-sonnet-4-6: 93.9%",
+      "detail": "23/26 tasks passed. Weakest category: archive_operations (50%).",
+      "metric": 93.9,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-27-043854.json"
+    },
+    {
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T04:38:56Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 100%",
+      "detail": "23/23 tasks passed. Weakest category: file_operations (100%).",
+      "metric": 100,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.json"
+    },
+    {
+      "date": "2026-02-27",
+      "timestamp": "2026-02-27T05:55:43Z",
+      "kind": "Eval",
+      "title": "gpt-5.3-codex: 93%",
+      "detail": "30/37 tasks passed. Weakest category: scripting (78.9%).",
+      "metric": 93,
+      "source": "crates/bashkit-eval/results/eval-openai-responses-gpt-5.3-codex-2026-02-27-055543.json"
+    },
+    {
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:40:52Z",
+      "kind": "Eval",
+      "title": "gpt-5.2: 77.3%",
+      "detail": "41/58 tasks passed. Weakest category: config_management (7.1%).",
+      "metric": 77.3,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-28-204052.json"
+    },
+    {
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:42:32Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5-20251001: 97.2%",
+      "detail": "54/58 tasks passed. Weakest category: config_management (64.3%).",
+      "metric": 97.2,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-28-204232.json"
+    },
+    {
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:53:31Z",
+      "kind": "Eval",
+      "title": "gpt-5.3-codex: 91%",
+      "detail": "51/58 tasks passed. Weakest category: config_management (21.4%).",
+      "metric": 91,
+      "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-02-28-205331.json"
+    },
+    {
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T20:53:58Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-6: 91%",
+      "detail": "50/58 tasks passed. Weakest category: config_management (57.1%).",
+      "metric": 91,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-28-205358.json"
+    },
+    {
+      "date": "2026-02-28",
+      "timestamp": "2026-02-28T21:11:20Z",
+      "kind": "Eval",
+      "title": "claude-sonnet-4-6: 92.5%",
+      "detail": "48/58 tasks passed. Weakest category: archive_operations (33.3%).",
+      "metric": 92.5,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-28-211120.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T05:02:28.000Z",
+      "kind": "Benchmark",
+      "title": "23.8x faster than bash",
+      "detail": "96 parity/perf cases on none-linux-x86_64; output match 100%.",
+      "metric": 23.8,
+      "source": "crates/bashkit-bench/results/bench-none-linux-x86_64-1773464548.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T06:18:49.000Z",
+      "kind": "Criterion",
+      "title": "parallel",
+      "detail": "single_bash_new at 23.77 us median",
+      "metric": 160.05,
+      "source": "crates/bashkit/benches/results/criterion-parallel-(none)-linux-x86_64-1773469129.md"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:22Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 93.1%",
+      "detail": "3/4 tasks passed. Weakest category: many_tools (93.1%).",
+      "metric": 93.1,
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174422.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:33Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 100%",
+      "detail": "3/3 tasks passed. Weakest category: large_output (100%).",
+      "metric": 100,
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174433.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:46Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 84.6%",
+      "detail": "2/3 tasks passed. Weakest category: paginated_responses (84.6%).",
+      "metric": 84.6,
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174446.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:44:58Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 75%",
+      "detail": "0/4 tasks passed. Weakest category: discovery (75%).",
+      "metric": 75,
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174458.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:21Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 96.6%",
+      "detail": "3/4 tasks passed. Weakest category: many_tools (96.6%).",
+      "metric": 96.6,
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174521.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:30Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 90%",
+      "detail": "2/3 tasks passed. Weakest category: large_output (90%).",
+      "metric": 90,
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174530.json"
+    },
+    {
+      "date": "2026-03-14",
+      "timestamp": "2026-03-14T17:45:41Z",
+      "kind": "Eval",
+      "title": "gpt-5.4: 100%",
+      "detail": "3/3 tasks passed. Weakest category: paginated_responses (100%).",
+      "metric": 100,
+      "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174541.json"
+    },
+    {
+      "date": "2026-03-24",
+      "timestamp": "2026-03-24T00:36:10Z",
+      "kind": "Eval",
+      "title": "gpt-4o: 91.7%",
+      "detail": "3/4 tasks passed. Weakest category: discovery (91.7%).",
+      "metric": 91.7,
+      "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-4o-2026-03-24-003610.json"
+    },
+    {
+      "date": "2026-04-13",
+      "timestamp": "2026-04-13T23:05:40.000Z",
+      "kind": "Benchmark",
+      "title": "107.2x faster than bash",
+      "detail": "96 parity/perf cases on runsc-linux-x86_64; output match 100%.",
+      "metric": 107.2,
+      "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1776121540.json"
+    },
+    {
+      "date": "2026-05-04",
+      "timestamp": "2026-05-04T03:27:48.000Z",
+      "kind": "Criterion",
+      "title": "sqlite",
+      "detail": "sqlite_query/aggregate_in_memory/1000 at 739.46 us median",
+      "metric": 799.27,
+      "source": "crates/bashkit/benches/results/criterion-sqlite-vm-linux-x86_64-1777865268.md"
+    },
+    {
+      "date": "2026-05-25",
+      "timestamp": "2026-05-25T21:32:22.000Z",
+      "kind": "Criterion",
+      "title": "hotpath",
+      "detail": "64.9% faster in startup/empty",
+      "metric": -36.6,
+      "source": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md"
+    },
+    {
+      "date": "2026-05-25",
+      "timestamp": "2026-05-25T21:35:05.000Z",
+      "kind": "Benchmark",
+      "title": "25.4x faster than bash",
+      "detail": "96 parity/perf cases on vm-linux-x86_64; output match 100%.",
+      "metric": 25.4,
+      "source": "crates/bashkit-bench/results/bench-after-perf-linux-x86_64.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:25:23Z",
+      "kind": "Eval",
+      "title": "claude-haiku-4-5-20251001: 98.4%",
+      "detail": "54/58 tasks passed. Weakest category: file_operations (91.7%).",
+      "metric": 98.4,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-05-26-012523.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:44:10.000Z",
+      "kind": "Criterion",
+      "title": "file-ops",
+      "detail": "`for f in /work/d00/*` (shallow glob) at 267 us median",
+      "metric": 2000,
+      "source": "crates/bashkit/benches/results/criterion-file_ops-linux-x86_64-1779759850.md"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:44:10.000Z",
+      "kind": "Criterion",
+      "title": "hotpath",
+      "detail": "`startup/empty` at 34.4 us median",
+      "metric": 624,
+      "source": "crates/bashkit/benches/results/criterion-hotpath-attrs+shopt-linux-x86_64-1779759850.md"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T01:45:08Z",
+      "kind": "Eval",
+      "title": "claude-sonnet-4-6: 94%",
+      "detail": "49/58 tasks passed. Weakest category: system_info (50%).",
+      "metric": 94,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-05-26-014508.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:07:42Z",
+      "kind": "Eval",
+      "title": "claude-opus-4-7: 97.8%",
+      "detail": "56/58 tasks passed. Weakest category: config_management (64.3%).",
+      "metric": 97.8,
+      "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-7-2026-05-26-020742.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:18:53Z",
+      "kind": "Eval",
+      "title": "gpt-5.5: 92.7%",
+      "detail": "50/58 tasks passed. Weakest category: file_operations (66.7%).",
+      "metric": 92.7,
+      "source": "crates/bashkit-eval/results/eval-openai-gpt-5.5-2026-05-26-021853.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T02:36:42Z",
+      "kind": "Eval",
+      "title": "gpt-5.3-codex: 93%",
+      "detail": "54/58 tasks passed. Weakest category: system_info (50%).",
+      "metric": 93,
+      "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.json"
+    },
+    {
+      "date": "2026-05-26",
+      "timestamp": "2026-05-26T03:01:00.000Z",
+      "kind": "Benchmark",
+      "title": "20.9x faster than bash",
+      "detail": "96 parity/perf cases on vm-linux-x86_64; output match 100%.",
+      "metric": 20.9,
+      "source": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.json"
+    }
+  ]
+}
diff --git a/site/src/pages/benches.astro b/site/src/pages/benches.astro
new file mode 100644
index 00000000..e484bbe0
--- /dev/null
+++ b/site/src/pages/benches.astro
@@ -0,0 +1,836 @@
+---
+import BaseLayout from "../layouts/BaseLayout.astro";
+import performanceData from "../data/performance-timeline.json";
+
+// Decision: this page consumes the generated aggregate only. It must not import
+// raw eval traces or per-iteration benchmark samples into the browser bundle.
+const data = performanceData as typeof performanceData;
+const latestBench = data.summary.latestBench;
+const latestEval = data.summary.latestEval;
+const bestCriterion = data.summary.bestCriterionImprovement;
+const latestCriterion = data.criterionRuns.at(-1);
+const latestFullEvalRuns = data.evalRuns
+  .filter((run) => run.tasks >= 10)
+  .toSorted((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime())
+  .slice(0, 8);
+const topBenchCategories = latestBench?.categories.slice(0, 8) ?? [];
+const weakestEvalCategories = latestEval?.categories.slice(0, 8) ?? [];
+
+const repoUrl = (source: string) =>
+  `https://github.com/everruns/bashkit/blob/main/${source}`;
+
+const compactNumber = (value: number | null | undefined, suffix = "") =>
+  typeof value === "number" ? `${value.toLocaleString()}${suffix}` : "n/a";
+
+const pageDescription =
+  "Interactive Bashkit benches history across criterion benches, bash-vs-bashkit benchmarks, and LLM evals.";
+---
+
+<BaseLayout
+  title="Bashkit Benches History"
+  description={pageDescription}
+>
+  <section class="perf-hero section">
+    <div class="container perf-hero__grid">
+      <div class="perf-hero__copy">
+        <span class="atlas-eyebrow">Benches history</span>
+        <h1>Benchmarks, benches, and evals over time.</h1>
+        <p class="perf-lede">
+          Aggregated from repo result artifacts at build time. The site keeps
+          trend points, category summaries, and highlights while leaving raw
+          traces in the repository.
+        </p>
+      </div>
+
+      <div class="perf-snapshot" aria-label="Latest performance snapshot">
+        <article>
+          <span>Latest benchmark</span>
+          <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
+          <small>{latestBench?.date ?? "n/a"} vs bash</small>
+        </article>
+        <article>
+          <span>Latest eval</span>
+          <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
+          <small>
+            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
+          </small>
+        </article>
+        <article>
+          <span>Best criterion delta</span>
+          <strong>
+            {
+              bestCriterion?.bestChangePct
+                ? `${Math.abs(bestCriterion.bestChangePct)}%`
+                : "n/a"
+            }
+          </strong>
+          <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
+        </article>
+      </div>
+    </div>
+  </section>
+
+  <section class="perf-dashboard section section-alt">
+    <div class="container">
+      <div class="perf-toolbar" aria-label="Chart controls">
+        <div class="segmented-control" role="group" aria-label="Metric">
+          <button type="button" class="is-active" data-view="eval">Eval score</button>
+          <button type="button" data-view="bench">Speedup</button>
+          <button type="button" data-view="criterion">Criterion delta</button>
+        </div>
+
+        <label class="perf-select">
+          <span>Series</span>
+          <select id="perf-series-select" aria-label="Series"></select>
+        </label>
+      </div>
+
+      <article class="perf-chart-shell">
+        <div class="perf-chart-head">
+          <div>
+            <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
+            <h2 id="perf-chart-title">Full eval score trend</h2>
+          </div>
+          <p id="perf-chart-note"></p>
+        </div>
+
+        <div class="perf-chart-wrap">
+          <svg
+            id="perf-chart"
+            viewBox="0 0 960 360"
+            role="img"
+            aria-labelledby="perf-chart-title"
+          ></svg>
+          <div id="perf-tooltip" class="perf-tooltip" hidden></div>
+        </div>
+      </article>
+    </div>
+  </section>
+
+  <section class="perf-milestones section">
+    <div class="container">
+      <div class="atlas-section-heading">
+        <div>
+          <span class="atlas-eyebrow">Timeline</span>
+          <h2>Milestones with highlights</h2>
+        </div>
+        <p>
+          Each point links back to the source artifact used by the aggregation
+          script.
+        </p>
+      </div>
+
+      <div class="perf-timeline">
+        {
+          data.milestones.slice(-14).reverse().map((item) => (
+            <article class="perf-milestone">
+              <time datetime={item.timestamp}>{item.date}</time>
+              <div>
+                <span>{item.kind}</span>
+                <h3>{item.title}</h3>
+                <p>{item.detail}</p>
+                <a href={repoUrl(item.source)} target="_blank" rel="noopener noreferrer">
+                  Source artifact
+                </a>
+              </div>
+            </article>
+          ))
+        }
+      </div>
+    </div>
+  </section>
+
+  <section class="perf-tables section section-alt">
+    <div class="container perf-table-grid">
+      <article class="perf-table-card">
+        <div class="perf-table-card__head">
+          <span class="atlas-eyebrow">Latest eval runs</span>
+          <h2>Model progress</h2>
+        </div>
+        <table>
+          <thead>
+            <tr><th>Run</th><th>Score</th><th>Tasks</th><th>Tools</th></tr>
+          </thead>
+          <tbody>
+            {
+              latestFullEvalRuns.map((run) => (
+                <tr>
+                  <td>
+                    <a href={repoUrl(run.source)} target="_blank" rel="noopener noreferrer">
+                      {run.model}
+                    </a>
+                    <small>{run.date}</small>
+                  </td>
+                  <td class="score">{run.scorePct}%</td>
+                  <td>{run.passed}/{run.tasks}</td>
+                  <td>{run.toolSuccessPct}%</td>
+                </tr>
+              ))
+            }
+          </tbody>
+        </table>
+      </article>
+
+      <article class="perf-table-card">
+        <div class="perf-table-card__head">
+          <span class="atlas-eyebrow">Latest benchmark categories</span>
+          <h2>Where bashkit is fastest</h2>
+        </div>
+        <table>
+          <thead>
+            <tr><th>Category</th><th>Cases</th><th>Median speedup</th></tr>
+          </thead>
+          <tbody>
+            {
+              topBenchCategories.map((row) => (
+                <tr>
+                  <td>{row.category}</td>
+                  <td>{row.cases}</td>
+                  <td class="score">{row.speedup}x</td>
+                </tr>
+              ))
+            }
+          </tbody>
+        </table>
+      </article>
+
+      <article class="perf-table-card">
+        <div class="perf-table-card__head">
+          <span class="atlas-eyebrow">Latest eval pressure</span>
+          <h2>Weakest categories</h2>
+        </div>
+        <table>
+          <thead>
+            <tr><th>Category</th><th>Tasks</th><th>Rate</th></tr>
+          </thead>
+          <tbody>
+            {
+              weakestEvalCategories.map((row) => (
+                <tr>
+                  <td>{row.category}</td>
+                  <td>{row.passed}/{row.tasks}</td>
+                  <td class="score">{row.rate}%</td>
+                </tr>
+              ))
+            }
+          </tbody>
+        </table>
+      </article>
+
+      <article class="perf-table-card">
+        <div class="perf-table-card__head">
+          <span class="atlas-eyebrow">Criterion coverage</span>
+          <h2>Latest microbench set</h2>
+        </div>
+        <dl class="perf-definition-list">
+          <div>
+            <dt>Run</dt>
+            <dd>{latestCriterion?.label ?? "n/a"}</dd>
+          </div>
+          <div>
+            <dt>Cases</dt>
+            <dd>{compactNumber(latestCriterion?.cases)}</dd>
+          </div>
+          <div>
+            <dt>Median time</dt>
+            <dd>{compactNumber(latestCriterion?.medianUs, " us")}</dd>
+          </div>
+          <div>
+            <dt>Fastest case</dt>
+            <dd>
+              {
+                latestCriterion?.fastestCase
+                  ? `${latestCriterion.fastestCase.name} (${latestCriterion.fastestCase.us} us)`
+                  : "n/a"
+              }
+            </dd>
+          </div>
+        </dl>
+      </article>
+    </div>
+  </section>
+
+  <script
+    id="performance-data"
+    type="application/json"
+    is:inline
+    set:html={JSON.stringify(data)}
+  />
+
+  <script is:inline>
+    const dataEl = document.getElementById("performance-data");
+    const perfData = JSON.parse(dataEl.textContent);
+    const chart = document.getElementById("perf-chart");
+    const tooltip = document.getElementById("perf-tooltip");
+    const buttons = [...document.querySelectorAll("[data-view]")];
+    const select = document.getElementById("perf-series-select");
+    const kind = document.getElementById("perf-chart-kind");
+    const title = document.getElementById("perf-chart-title");
+    const note = document.getElementById("perf-chart-note");
+
+    const chartConfig = {
+      eval: {
+        label: "Eval score",
+        title: "Full eval score trend",
+        unit: "%",
+        maxY: 100,
+        color: "#1769aa",
+        note: "Full eval runs only; smaller scripting evals stay in the milestone stream.",
+        options: () => [
+          { value: "all", label: "Best run per day" },
+          ...perfData.modelTrends.map((trend) => ({
+            value: trend.model,
+            label: trend.model,
+          })),
+        ],
+        points: (series) => {
+          const runs = perfData.evalRuns.filter((run) => run.tasks >= 10);
+          const source =
+            series === "all"
+              ? bestPerDay(runs, (run) => run.scorePct)
+              : runs.filter((run) => `${run.provider}/${run.model}` === series);
+          return source.map((run) => ({
+            x: new Date(run.timestamp).getTime(),
+            y: run.scorePct,
+            label: `${run.model}: ${run.scorePct}%`,
+            detail: `${run.passed}/${run.tasks} tasks, ${run.toolSuccessPct}% tool success`,
+            date: run.date,
+          }));
+        },
+      },
+      bench: {
+        label: "Speedup",
+        title: "bashkit-bench speedup vs bash",
+        unit: "x",
+        color: "#0f7b58",
+        note: "Aggregated from bashkit-vs-bash result JSON, using total runtime per run.",
+        options: () => [{ value: "all", label: "All benchmark runs" }],
+        points: () =>
+          perfData.benchRuns.map((run) => ({
+            x: new Date(run.timestamp).getTime(),
+            y: run.speedup,
+            label: `${run.speedup}x on ${run.label}`,
+            detail: `${run.cases} cases, ${run.matchRate}% output match`,
+            date: run.date,
+          })),
+      },
+      criterion: {
+        label: "Criterion delta",
+        title: "Criterion improvements and microbench coverage",
+        unit: "%",
+        color: "#a35f00",
+        note: "Before/after reports use improvement percent. Baseline-only reports show median case time separately in the tables.",
+        options: () => [{ value: "all", label: "Improvement reports" }],
+        points: () =>
+          perfData.criterionRuns
+            .filter((run) => Number.isFinite(run.medianChangePct))
+            .map((run) => ({
+              x: new Date(run.timestamp).getTime(),
+              y: Math.abs(run.medianChangePct),
+              label: `${run.family}: ${Math.abs(run.medianChangePct)}% median improvement`,
+              detail: run.bestImprovement
+                ? `${Math.abs(run.bestImprovement.changePct)}% best in ${run.bestImprovement.name}`
+                : `${run.cases} cases`,
+              date: run.date,
+            })),
+      },
+    };
+
+    function bestPerDay(runs, score) {
+      const byDay = new Map();
+      for (const run of runs) {
+        const current = byDay.get(run.date);
+        if (!current || score(run) > score(current)) byDay.set(run.date, run);
+      }
+      return [...byDay.values()].sort(
+        (a, b) => new Date(a.timestamp) - new Date(b.timestamp),
+      );
+    }
+
+    function setOptions(view) {
+      const previous = select.value;
+      const options = chartConfig[view].options();
+      select.innerHTML = options
+        .map((option) => `<option value="${option.value}">${option.label}</option>`)
+        .join("");
+      if (options.some((option) => option.value === previous)) {
+        select.value = previous;
+      }
+    }
+
+    function pathFrom(points) {
+      return points
+        .map((point, index) => `${index === 0 ? "M" : "L"} ${point.sx} ${point.sy}`)
+        .join(" ");
+    }
+
+    function renderChart() {
+      const view = document.querySelector("[data-view].is-active").dataset.view;
+      const config = chartConfig[view];
+      const points = config
+        .points(select.value)
+        .filter((point) => Number.isFinite(point.x) && Number.isFinite(point.y))
+        .sort((a, b) => a.x - b.x);
+
+      kind.textContent = config.label;
+      title.textContent = config.title;
+      note.textContent = config.note;
+
+      const width = 960;
+      const height = 360;
+      const pad = { left: 58, right: 28, top: 28, bottom: 54 };
+      const xs = points.map((point) => point.x);
+      const ys = points.map((point) => point.y);
+      const minX = Math.min(...xs);
+      const maxX = Math.max(...xs);
+      const minY = Math.min(0, ...ys);
+      const maxY = Math.max(...ys, 1);
+      const yTop = Math.max(config.maxY ?? maxY * 1.12, maxY);
+
+      const scaleX = (x) =>
+        pad.left + ((x - minX) / Math.max(1, maxX - minX)) * (width - pad.left - pad.right);
+      const scaleY = (y) =>
+        height - pad.bottom - ((y - minY) / Math.max(1, yTop - minY)) * (height - pad.top - pad.bottom);
+
+      const plotted = points.map((point) => ({
+        ...point,
+        sx: scaleX(point.x),
+        sy: scaleY(point.y),
+      }));
+
+      const yTicks = [0, 0.25, 0.5, 0.75, 1].map((step) => minY + (yTop - minY) * step);
+      const xTicks = plotted.filter((_, index) =>
+        plotted.length < 6 ? true : index % Math.ceil(plotted.length / 5) === 0,
+      );
+
+      chart.innerHTML = `
+        <rect x="0" y="0" width="${width}" height="${height}" fill="transparent"></rect>
+        ${yTicks
+          .map((tick) => {
+            const y = scaleY(tick);
+            return `<line x1="${pad.left}" x2="${width - pad.right}" y1="${y}" y2="${y}" class="grid-line"></line>
+              <text x="${pad.left - 12}" y="${y + 4}" class="axis-label" text-anchor="end">${formatMetric(tick, config.unit)}</text>`;
+          })
+          .join("")}
+        <path d="${pathFrom(plotted)}" fill="none" stroke="${config.color}" stroke-width="3" stroke-linecap="round"></path>
+        ${plotted
+          .map(
+            (point, index) => `<circle cx="${point.sx}" cy="${point.sy}" r="6" fill="${config.color}" data-index="${index}" tabindex="0"></circle>`,
+          )
+          .join("")}
+        ${xTicks
+          .map(
+            (point) => `<text x="${point.sx}" y="${height - 18}" class="axis-label" text-anchor="middle">${point.date.slice(5)}</text>`,
+          )
+          .join("")}
+      `;
+
+      chart.querySelectorAll("circle").forEach((circle) => {
+        const point = plotted[Number(circle.dataset.index)];
+        circle.addEventListener("mouseenter", () => showTooltip(circle, point, config.unit));
+        circle.addEventListener("focus", () => showTooltip(circle, point, config.unit));
+        circle.addEventListener("mouseleave", hideTooltip);
+        circle.addEventListener("blur", hideTooltip);
+      });
+    }
+
+    function formatMetric(value, unit) {
+      const rounded = Math.round(value * 10) / 10;
+      return `${rounded}${unit}`;
+    }
+
+    function showTooltip(node, point, unit) {
+      const chartBox = chart.getBoundingClientRect();
+      const nodeBox = node.getBoundingClientRect();
+      tooltip.innerHTML = `<strong>${point.label}</strong><span>${point.date}</span><p>${point.detail}</p><b>${formatMetric(point.y, unit)}</b>`;
+      tooltip.hidden = false;
+      tooltip.style.left = `${nodeBox.left - chartBox.left + 18}px`;
+      tooltip.style.top = `${nodeBox.top - chartBox.top - 18}px`;
+    }
+
+    function hideTooltip() {
+      tooltip.hidden = true;
+    }
+
+    buttons.forEach((button) => {
+      button.addEventListener("click", () => {
+        buttons.forEach((item) => item.classList.remove("is-active"));
+        button.classList.add("is-active");
+        setOptions(button.dataset.view);
+        renderChart();
+      });
+    });
+
+    select.addEventListener("change", renderChart);
+    setOptions("eval");
+    renderChart();
+  </script>
+</BaseLayout>
+
+<style>
+  .perf-hero {
+    background: var(--color-obsidian);
+    color: var(--color-white);
+  }
+
+  .perf-hero__grid {
+    display: grid;
+    grid-template-columns: minmax(0, 1.2fr) minmax(320px, 0.8fr);
+    gap: var(--space-xl);
+    align-items: end;
+  }
+
+  .perf-hero .atlas-eyebrow,
+  .perf-dashboard .atlas-eyebrow,
+  .perf-milestones .atlas-eyebrow,
+  .perf-tables .atlas-eyebrow {
+    display: inline-flex;
+    margin-bottom: var(--space-sm);
+    color: var(--color-gold);
+    font-size: 0.78rem;
+    font-weight: 700;
+    letter-spacing: 0.08em;
+    text-transform: uppercase;
+  }
+
+  .perf-hero h1 {
+    max-width: 760px;
+    font-size: clamp(2.35rem, 5vw, 4.8rem);
+    line-height: 0.98;
+  }
+
+  .perf-lede {
+    max-width: 720px;
+    margin-top: var(--space-md);
+    color: var(--color-silver);
+    font-size: 1.08rem;
+  }
+
+  .perf-snapshot {
+    display: grid;
+    gap: var(--space-sm);
+  }
+
+  .perf-snapshot article {
+    border: 1px solid rgba(255, 255, 255, 0.16);
+    background: rgba(255, 255, 255, 0.05);
+    padding: var(--space-md);
+  }
+
+  .perf-snapshot span,
+  .perf-snapshot small {
+    display: block;
+    color: var(--color-silver);
+  }
+
+  .perf-snapshot strong {
+    display: block;
+    margin: 0.15rem 0;
+    color: var(--color-white);
+    font-size: 2rem;
+    line-height: 1.1;
+  }
+
+  .perf-toolbar {
+    display: flex;
+    justify-content: space-between;
+    gap: var(--space-md);
+    align-items: center;
+    margin-bottom: var(--space-md);
+  }
+
+  .segmented-control {
+    display: inline-grid;
+    grid-template-columns: repeat(3, minmax(0, 1fr));
+    border: 1px solid #d7d7d7;
+    background: var(--color-white);
+  }
+
+  .segmented-control button {
+    min-width: 9.5rem;
+    min-height: 2.65rem;
+    border: 0;
+    border-right: 1px solid #d7d7d7;
+    background: transparent;
+    color: var(--color-slate);
+    cursor: pointer;
+    font: inherit;
+    font-weight: 600;
+  }
+
+  .segmented-control button:last-child {
+    border-right: 0;
+  }
+
+  .segmented-control button.is-active {
+    background: var(--color-obsidian);
+    color: var(--color-white);
+  }
+
+  .perf-select {
+    display: inline-flex;
+    align-items: center;
+    gap: var(--space-sm);
+    color: var(--color-slate);
+    font-weight: 600;
+  }
+
+  .perf-select select {
+    min-width: 260px;
+    min-height: 2.65rem;
+    border: 1px solid #d7d7d7;
+    background: var(--color-white);
+    color: var(--color-obsidian);
+    padding: 0 var(--space-sm);
+    font: inherit;
+  }
+
+  .perf-chart-shell,
+  .perf-table-card {
+    background: var(--color-white);
+    border: 1px solid #dddddd;
+  }
+
+  .perf-chart-head {
+    display: flex;
+    justify-content: space-between;
+    gap: var(--space-md);
+    padding: var(--space-lg) var(--space-lg) 0;
+  }
+
+  .perf-chart-head p {
+    max-width: 420px;
+    color: var(--color-slate);
+  }
+
+  .perf-chart-wrap {
+    position: relative;
+    padding: var(--space-md) var(--space-lg) var(--space-lg);
+    overflow-x: auto;
+  }
+
+  .perf-chart-wrap svg {
+    display: block;
+    width: 100%;
+    min-width: 720px;
+    height: auto;
+  }
+
+  .perf-chart-wrap :global(.grid-line) {
+    stroke: #e5e5e5;
+    stroke-width: 1;
+  }
+
+  .perf-chart-wrap :global(.axis-label) {
+    fill: #5d5d5d;
+    font-family: var(--font-mono);
+    font-size: 0.78rem;
+  }
+
+  .perf-chart-wrap :global(circle) {
+    cursor: pointer;
+    stroke: var(--color-white);
+    stroke-width: 2;
+  }
+
+  .perf-tooltip {
+    position: absolute;
+    z-index: 5;
+    width: min(280px, 80vw);
+    border: 1px solid #d7d7d7;
+    background: var(--color-white);
+    color: var(--color-obsidian);
+    padding: var(--space-sm);
+    box-shadow: 0 16px 36px rgba(0, 0, 0, 0.16);
+    pointer-events: none;
+  }
+
+  .perf-tooltip strong,
+  .perf-tooltip span,
+  .perf-tooltip b {
+    display: block;
+  }
+
+  .perf-tooltip span,
+  .perf-tooltip p {
+    color: var(--color-slate);
+    font-size: 0.9rem;
+  }
+
+  .perf-tooltip b {
+    margin-top: var(--space-xs);
+    color: var(--color-gold);
+  }
+
+  .atlas-section-heading {
+    display: flex;
+    justify-content: space-between;
+    gap: var(--space-lg);
+    align-items: end;
+    margin-bottom: var(--space-lg);
+  }
+
+  .atlas-section-heading p {
+    max-width: 520px;
+    color: var(--color-slate);
+  }
+
+  .perf-timeline {
+    display: grid;
+    gap: var(--space-md);
+  }
+
+  .perf-milestone {
+    display: grid;
+    grid-template-columns: 8.5rem minmax(0, 1fr);
+    gap: var(--space-md);
+    border-top: 1px solid #dddddd;
+    padding-top: var(--space-md);
+  }
+
+  .perf-milestone time {
+    color: var(--color-slate);
+    font-family: var(--font-mono);
+    font-size: 0.9rem;
+  }
+
+  .perf-milestone span {
+    color: var(--color-gold);
+    font-size: 0.72rem;
+    font-weight: 800;
+    letter-spacing: 0.08em;
+    text-transform: uppercase;
+  }
+
+  .perf-milestone h3 {
+    margin-top: 0.1rem;
+  }
+
+  .perf-milestone p {
+    color: var(--color-slate);
+    max-width: 760px;
+  }
+
+  .perf-milestone a {
+    display: inline-flex;
+    margin-top: var(--space-xs);
+    font-weight: 700;
+  }
+
+  .perf-table-grid {
+    display: grid;
+    grid-template-columns: repeat(2, minmax(0, 1fr));
+    gap: var(--space-md);
+  }
+
+  .perf-table-card {
+    overflow: hidden;
+  }
+
+  .perf-table-card__head {
+    padding: var(--space-lg) var(--space-lg) var(--space-sm);
+  }
+
+  .perf-table-card table {
+    width: 100%;
+    border-collapse: collapse;
+  }
+
+  .perf-table-card th,
+  .perf-table-card td {
+    border-top: 1px solid #eeeeee;
+    padding: 0.85rem var(--space-lg);
+    text-align: left;
+    vertical-align: top;
+  }
+
+  .perf-table-card th {
+    color: var(--color-slate);
+    font-size: 0.78rem;
+    letter-spacing: 0.04em;
+    text-transform: uppercase;
+  }
+
+  .perf-table-card td small {
+    display: block;
+    color: var(--color-slate);
+    font-family: var(--font-mono);
+    font-size: 0.8rem;
+  }
+
+  .score {
+    color: #0f7b58;
+    font-weight: 800;
+  }
+
+  .perf-definition-list {
+    display: grid;
+    gap: 0;
+  }
+
+  .perf-definition-list div {
+    display: grid;
+    grid-template-columns: 9rem minmax(0, 1fr);
+    gap: var(--space-md);
+    border-top: 1px solid #eeeeee;
+    padding: 0.85rem var(--space-lg);
+  }
+
+  .perf-definition-list dt {
+    color: var(--color-slate);
+    font-weight: 700;
+  }
+
+  .perf-definition-list dd {
+    min-width: 0;
+  }
+
+  @media (max-width: 900px) {
+    .perf-hero__grid,
+    .perf-table-grid {
+      grid-template-columns: 1fr;
+    }
+
+    .perf-toolbar,
+    .perf-chart-head,
+    .atlas-section-heading {
+      align-items: stretch;
+      flex-direction: column;
+    }
+
+    .segmented-control {
+      width: 100%;
+    }
+
+    .segmented-control button {
+      min-width: 0;
+      padding: 0 var(--space-xs);
+    }
+
+    .perf-select {
+      align-items: stretch;
+      flex-direction: column;
+    }
+
+    .perf-select select {
+      min-width: 0;
+      width: 100%;
+    }
+  }
+
+  @media (max-width: 620px) {
+    .perf-milestone,
+    .perf-definition-list div {
+      grid-template-columns: 1fr;
+      gap: var(--space-xs);
+    }
+
+    .perf-table-card {
+      overflow-x: auto;
+    }
+
+    .perf-table-card table {
+      min-width: 520px;
+    }
+  }
+</style>
diff --git a/site/src/pages/index.astro b/site/src/pages/index.astro
index 9517d812..22da1d9d 100644
--- a/site/src/pages/index.astro
+++ b/site/src/pages/index.astro
@@ -4,6 +4,7 @@ import { Code } from "astro:components";
 import {
   agentSteps,
   apiSnippet,
+  benchesHref,
   builtinPreview,
   defense,
   evalSnapshot,
@@ -301,6 +302,9 @@ import {
           </a>{" "}
           run:
         </p>
+        <a href={benchesHref} class="atlas-inline-link">
+          Explore historical trends
+        </a>
       </div>
 
       <article class="atlas-panel atlas-table-panel">
@@ -343,8 +347,8 @@ import {
             <a
               href={item.href}
               class="atlas-panel atlas-card atlas-resource-card"
-              target="_blank"
-              rel="noopener noreferrer"
+              target={item.href.startsWith("http") ? "_blank" : undefined}
+              rel={item.href.startsWith("http") ? "noopener noreferrer" : undefined}
             >
               <span class="atlas-eyebrow">{item.cta}</span>
               <h3>{item.title}</h3>

From 7e0e30656dc436eb4f9247e23825fba0e659cc16 Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mykhailo.chalyi-contractor@procore.com>
Date: Tue, 26 May 2026 08:38:35 -0500
Subject: [PATCH 2/5] fix(site): refine benches page layout

---
 site/src/pages/benches.astro | 392 +++++++++++++++++++++++++----------
 1 file changed, 277 insertions(+), 115 deletions(-)

diff --git a/site/src/pages/benches.astro b/site/src/pages/benches.astro
index e484bbe0..cf3c4ce6 100644
--- a/site/src/pages/benches.astro
+++ b/site/src/pages/benches.astro
@@ -5,11 +5,30 @@ import performanceData from "../data/performance-timeline.json";
 // Decision: this page consumes the generated aggregate only. It must not import
 // raw eval traces or per-iteration benchmark samples into the browser bundle.
 const data = performanceData as typeof performanceData;
-const latestBench = data.summary.latestBench;
-const latestEval = data.summary.latestEval;
-const bestCriterion = data.summary.bestCriterionImprovement;
-const latestCriterion = data.criterionRuns.at(-1);
-const latestFullEvalRuns = data.evalRuns
+const historyMonths = 6;
+const generatedAt = new Date(data.generatedAt ?? new Date().toISOString());
+const historyStart = new Date(generatedAt);
+historyStart.setMonth(historyStart.getMonth() - historyMonths);
+const inWindow = (timestamp: string) => new Date(timestamp) >= historyStart;
+function latestByTime<T extends { timestamp: string }>(items: T[]) {
+  return items
+    .toSorted((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime())
+    .at(-1);
+}
+
+const benchRuns = data.benchRuns.filter((run) => inWindow(run.timestamp));
+const evalRuns = data.evalRuns.filter((run) => inWindow(run.timestamp));
+const criterionRuns = data.criterionRuns.filter((run) => inWindow(run.timestamp));
+const milestones = data.milestones.filter((item) => inWindow(item.timestamp));
+
+const latestBench = latestByTime(benchRuns);
+const latestEval = latestByTime(evalRuns.filter((run) => run.tasks >= 10));
+const bestCriterion = criterionRuns
+  .filter((run) => typeof run.bestChangePct === "number")
+  .toSorted((a, b) => Math.abs(b.bestChangePct ?? 0) - Math.abs(a.bestChangePct ?? 0))
+  .at(0);
+const latestCriterion = latestByTime(criterionRuns);
+const latestFullEvalRuns = evalRuns
   .filter((run) => run.tasks >= 10)
   .toSorted((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime())
   .slice(0, 8);
@@ -22,6 +41,30 @@ const repoUrl = (source: string) =>
 const compactNumber = (value: number | null | undefined, suffix = "") =>
   typeof value === "number" ? `${value.toLocaleString()}${suffix}` : "n/a";
 
+const dateLabel = (date: Date) => date.toISOString().slice(0, 10);
+const resultLinks = [
+  {
+    title: "Criterion benches",
+    detail: `${criterionRuns.length} reports in window`,
+    href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit/benches/results",
+  },
+  {
+    title: "bashkit-bench",
+    detail: `${benchRuns.length} bash-vs-bash runs`,
+    href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit-bench/results",
+  },
+  {
+    title: "LLM evals",
+    detail: `${evalRuns.length} eval result files`,
+    href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit-eval/results",
+  },
+  {
+    title: "Aggregation script",
+    detail: "How this page is generated",
+    href: "https://github.com/everruns/bashkit/blob/main/site/scripts/build-performance-data.mjs",
+  },
+];
+
 const pageDescription =
   "Interactive Bashkit benches history across criterion benches, bash-vs-bashkit benchmarks, and LLM evals.";
 ---
@@ -30,99 +73,115 @@ const pageDescription =
   title="Bashkit Benches History"
   description={pageDescription}
 >
-  <section class="perf-hero section">
-    <div class="container perf-hero__grid">
-      <div class="perf-hero__copy">
-        <span class="atlas-eyebrow">Benches history</span>
-        <h1>Benchmarks, benches, and evals over time.</h1>
-        <p class="perf-lede">
-          Aggregated from repo result artifacts at build time. The site keeps
-          trend points, category summaries, and highlights while leaving raw
-          traces in the repository.
-        </p>
-      </div>
-
-      <div class="perf-snapshot" aria-label="Latest performance snapshot">
-        <article>
-          <span>Latest benchmark</span>
-          <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
-          <small>{latestBench?.date ?? "n/a"} vs bash</small>
-        </article>
-        <article>
-          <span>Latest eval</span>
-          <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
-          <small>
-            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
-          </small>
-        </article>
-        <article>
-          <span>Best criterion delta</span>
-          <strong>
-            {
-              bestCriterion?.bestChangePct
-                ? `${Math.abs(bestCriterion.bestChangePct)}%`
-                : "n/a"
-            }
-          </strong>
-          <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
-        </article>
+  <section class="benches-intro">
+    <div class="container benches-intro__grid">
+      <div>
+        <span class="atlas-eyebrow">Benches</span>
+        <h1>History for benchmarks and evals</h1>
       </div>
+      <p>
+        Last {historyMonths} months only, from {dateLabel(historyStart)} to{" "}
+        {dateLabel(generatedAt)}. Raw result files stay in GitHub; this page
+        ships the compact aggregate.
+      </p>
     </div>
   </section>
 
-  <section class="perf-dashboard section section-alt">
-    <div class="container">
-      <div class="perf-toolbar" aria-label="Chart controls">
-        <div class="segmented-control" role="group" aria-label="Metric">
-          <button type="button" class="is-active" data-view="eval">Eval score</button>
-          <button type="button" data-view="bench">Speedup</button>
-          <button type="button" data-view="criterion">Criterion delta</button>
+  <section class="perf-dashboard section-alt">
+    <div class="container benches-grid">
+      <div class="benches-main">
+        <div class="perf-toolbar" aria-label="Chart controls">
+          <div class="segmented-control" role="group" aria-label="Metric">
+            <button type="button" class="is-active" data-view="eval">Eval score</button>
+            <button type="button" data-view="bench">Speedup</button>
+            <button type="button" data-view="criterion">Criterion delta</button>
+          </div>
+
+          <label class="perf-select">
+            <span>Series</span>
+            <select id="perf-series-select" aria-label="Series"></select>
+          </label>
         </div>
 
-        <label class="perf-select">
-          <span>Series</span>
-          <select id="perf-series-select" aria-label="Series"></select>
-        </label>
-      </div>
+        <article class="perf-chart-shell">
+          <div class="perf-chart-head">
+            <div>
+              <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
+              <h2 id="perf-chart-title">Full eval score trend</h2>
+            </div>
+            <p id="perf-chart-note"></p>
+          </div>
 
-      <article class="perf-chart-shell">
-        <div class="perf-chart-head">
-          <div>
-            <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
-            <h2 id="perf-chart-title">Full eval score trend</h2>
+          <div class="perf-chart-wrap">
+            <svg
+              id="perf-chart"
+              viewBox="0 0 960 340"
+              role="img"
+              aria-labelledby="perf-chart-title"
+            ></svg>
+            <div id="perf-tooltip" class="perf-tooltip" hidden></div>
           </div>
-          <p id="perf-chart-note"></p>
+        </article>
+      </div>
+
+      <aside class="benches-side">
+        <div class="perf-snapshot" aria-label="Latest performance snapshot">
+          <a href={latestBench ? repoUrl(latestBench.source) : "#"} target="_blank" rel="noopener noreferrer">
+            <span>Latest benchmark</span>
+            <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
+            <small>{latestBench?.date ?? "n/a"} vs bash</small>
+          </a>
+          <a href={latestEval ? repoUrl(latestEval.source) : "#"} target="_blank" rel="noopener noreferrer">
+            <span>Latest eval</span>
+            <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
+            <small>
+              {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
+            </small>
+          </a>
+          <a href={bestCriterion ? repoUrl(bestCriterion.source) : "#"} target="_blank" rel="noopener noreferrer">
+            <span>Best criterion delta</span>
+            <strong>
+              {
+                bestCriterion?.bestChangePct
+                  ? `${Math.abs(bestCriterion.bestChangePct)}%`
+                  : "n/a"
+              }
+            </strong>
+            <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
+          </a>
         </div>
 
-        <div class="perf-chart-wrap">
-          <svg
-            id="perf-chart"
-            viewBox="0 0 960 360"
-            role="img"
-            aria-labelledby="perf-chart-title"
-          ></svg>
-          <div id="perf-tooltip" class="perf-tooltip" hidden></div>
+        <div class="bench-source-list">
+          <span class="atlas-eyebrow">Source files</span>
+          {
+            resultLinks.map((item) => (
+              <a href={item.href} target="_blank" rel="noopener noreferrer">
+                <strong>{item.title}</strong>
+                <small>{item.detail}</small>
+              </a>
+            ))
+          }
         </div>
-      </article>
+      </aside>
     </div>
   </section>
 
-  <section class="perf-milestones section">
-    <div class="container">
+  <section class="perf-milestones">
+    <div class="container benches-grid benches-grid--timeline">
       <div class="atlas-section-heading">
         <div>
-          <span class="atlas-eyebrow">Timeline</span>
-          <h2>Milestones with highlights</h2>
+          <span class="atlas-eyebrow">Recent timeline</span>
+          <h2>Six-month milestones</h2>
         </div>
         <p>
-          Each point links back to the source artifact used by the aggregation
-          script.
+          Milestones are grouped to the current display window and link back to
+          the exact source artifact.
         </p>
       </div>
 
       <div class="perf-timeline">
         {
-          data.milestones.slice(-14).reverse().map((item) => (
+          milestones.slice(-10).reverse().map((item) => (
             <article class="perf-milestone">
               <time datetime={item.timestamp}>{item.date}</time>
               <div>
@@ -267,6 +326,19 @@ const pageDescription =
     const kind = document.getElementById("perf-chart-kind");
     const title = document.getElementById("perf-chart-title");
     const note = document.getElementById("perf-chart-note");
+    const generatedAt = new Date(perfData.generatedAt);
+    const historyStart = new Date(generatedAt);
+    historyStart.setMonth(historyStart.getMonth() - 6);
+    const inWindow = (run) => new Date(run.timestamp) >= historyStart;
+    const benchRuns = perfData.benchRuns.filter(inWindow);
+    const evalRuns = perfData.evalRuns.filter(inWindow);
+    const criterionRuns = perfData.criterionRuns.filter(inWindow);
+    const modelTrends = perfData.modelTrends
+      .map((trend) => ({
+        ...trend,
+        points: trend.points.filter(inWindow),
+      }))
+      .filter((trend) => trend.points.length > 0);
 
     const chartConfig = {
       eval: {
@@ -278,13 +350,13 @@ const pageDescription =
         note: "Full eval runs only; smaller scripting evals stay in the milestone stream.",
         options: () => [
           { value: "all", label: "Best run per day" },
-          ...perfData.modelTrends.map((trend) => ({
+          ...modelTrends.map((trend) => ({
             value: trend.model,
             label: trend.model,
           })),
         ],
         points: (series) => {
-          const runs = perfData.evalRuns.filter((run) => run.tasks >= 10);
+          const runs = evalRuns.filter((run) => run.tasks >= 10);
           const source =
             series === "all"
               ? bestPerDay(runs, (run) => run.scorePct)
@@ -306,7 +378,7 @@ const pageDescription =
         note: "Aggregated from bashkit-vs-bash result JSON, using total runtime per run.",
         options: () => [{ value: "all", label: "All benchmark runs" }],
         points: () =>
-          perfData.benchRuns.map((run) => ({
+          benchRuns.map((run) => ({
             x: new Date(run.timestamp).getTime(),
             y: run.speedup,
             label: `${run.speedup}x on ${run.label}`,
@@ -322,7 +394,7 @@ const pageDescription =
         note: "Before/after reports use improvement percent. Baseline-only reports show median case time separately in the tables.",
         options: () => [{ value: "all", label: "Improvement reports" }],
         points: () =>
-          perfData.criterionRuns
+          criterionRuns
             .filter((run) => Number.isFinite(run.medianChangePct))
             .map((run) => ({
               x: new Date(run.timestamp).getTime(),
@@ -375,9 +447,13 @@ const pageDescription =
       kind.textContent = config.label;
       title.textContent = config.title;
       note.textContent = config.note;
+      if (points.length === 0) {
+        chart.innerHTML = `<text x="58" y="160" class="axis-label">No results in this six-month window.</text>`;
+        return;
+      }
 
       const width = 960;
-      const height = 360;
+      const height = 340;
       const pad = { left: 58, right: 28, top: 28, bottom: 54 };
       const xs = points.map((point) => point.x);
       const ys = points.map((point) => point.y);
@@ -399,9 +475,12 @@ const pageDescription =
       }));
 
       const yTicks = [0, 0.25, 0.5, 0.75, 1].map((step) => minY + (yTop - minY) * step);
-      const xTicks = plotted.filter((_, index) =>
-        plotted.length < 6 ? true : index % Math.ceil(plotted.length / 5) === 0,
-      );
+      const tickIndexes = [...new Set([
+        0,
+        Math.floor((plotted.length - 1) / 2),
+        plotted.length - 1,
+      ])];
+      const xTicks = tickIndexes.map((index) => plotted[index]).filter(Boolean);
 
       chart.innerHTML = `
         <rect x="0" y="0" width="${width}" height="${height}" fill="transparent"></rect>
@@ -468,19 +547,20 @@ const pageDescription =
 </BaseLayout>
 
 <style>
-  .perf-hero {
-    background: var(--color-obsidian);
-    color: var(--color-white);
+  .benches-intro {
+    border-bottom: 1px solid #dddddd;
+    background: var(--color-white);
+    padding: 2.4rem 0 2rem;
   }
 
-  .perf-hero__grid {
+  .benches-intro__grid {
     display: grid;
-    grid-template-columns: minmax(0, 1.2fr) minmax(320px, 0.8fr);
-    gap: var(--space-xl);
+    grid-template-columns: minmax(0, 0.9fr) minmax(320px, 0.8fr);
+    gap: var(--space-lg);
     align-items: end;
   }
 
-  .perf-hero .atlas-eyebrow,
+  .benches-intro .atlas-eyebrow,
   .perf-dashboard .atlas-eyebrow,
   .perf-milestones .atlas-eyebrow,
   .perf-tables .atlas-eyebrow {
@@ -493,17 +573,31 @@ const pageDescription =
     text-transform: uppercase;
   }
 
-  .perf-hero h1 {
-    max-width: 760px;
-    font-size: clamp(2.35rem, 5vw, 4.8rem);
-    line-height: 0.98;
+  .benches-intro h1 {
+    max-width: 640px;
+    font-size: clamp(2rem, 4vw, 3.2rem);
+    line-height: 1;
   }
 
-  .perf-lede {
-    max-width: 720px;
-    margin-top: var(--space-md);
-    color: var(--color-silver);
-    font-size: 1.08rem;
+  .benches-intro p {
+    color: var(--color-slate);
+    font-size: 1rem;
+  }
+
+  .perf-dashboard {
+    padding: var(--space-lg) 0;
+  }
+
+  .benches-grid {
+    display: grid;
+    grid-template-columns: minmax(0, 1fr) 22rem;
+    gap: var(--space-md);
+    align-items: start;
+  }
+
+  .benches-main,
+  .benches-side {
+    min-width: 0;
   }
 
   .perf-snapshot {
@@ -511,26 +605,62 @@ const pageDescription =
     gap: var(--space-sm);
   }
 
-  .perf-snapshot article {
-    border: 1px solid rgba(255, 255, 255, 0.16);
-    background: rgba(255, 255, 255, 0.05);
+  .perf-snapshot a {
+    display: block;
+    border: 1px solid #d7d7d7;
+    background: var(--color-white);
+    color: var(--color-obsidian);
     padding: var(--space-md);
+    text-decoration: none;
+    transition:
+      border-color 0.12s ease,
+      transform 0.12s ease;
+  }
+
+  .perf-snapshot a:hover,
+  .bench-source-list a:hover {
+    border-color: rgb(10 22 54 / 0.36);
+    text-decoration: none;
+    transform: translateY(-1px);
   }
 
   .perf-snapshot span,
   .perf-snapshot small {
     display: block;
-    color: var(--color-silver);
+    color: var(--color-slate);
   }
 
   .perf-snapshot strong {
     display: block;
     margin: 0.15rem 0;
-    color: var(--color-white);
-    font-size: 2rem;
+    color: var(--color-obsidian);
+    font-size: 1.65rem;
     line-height: 1.1;
   }
 
+  .bench-source-list {
+    display: grid;
+    gap: var(--space-sm);
+    margin-top: var(--space-md);
+  }
+
+  .bench-source-list a {
+    display: grid;
+    gap: 0.15rem;
+    border: 1px solid #d7d7d7;
+    background: var(--color-white);
+    color: var(--color-obsidian);
+    padding: 0.85rem 1rem;
+    text-decoration: none;
+    transition:
+      border-color 0.12s ease,
+      transform 0.12s ease;
+  }
+
+  .bench-source-list small {
+    color: var(--color-slate);
+  }
+
   .perf-toolbar {
     display: flex;
     justify-content: space-between;
@@ -595,7 +725,11 @@ const pageDescription =
     display: flex;
     justify-content: space-between;
     gap: var(--space-md);
-    padding: var(--space-lg) var(--space-lg) 0;
+    padding: 1.2rem 1.2rem 0;
+  }
+
+  .perf-chart-head h2 {
+    font-size: clamp(1.45rem, 2.4vw, 2rem);
   }
 
   .perf-chart-head p {
@@ -605,7 +739,7 @@ const pageDescription =
 
   .perf-chart-wrap {
     position: relative;
-    padding: var(--space-md) var(--space-lg) var(--space-lg);
+    padding: var(--space-sm) 1.2rem 1.2rem;
     overflow-x: auto;
   }
 
@@ -663,11 +797,11 @@ const pageDescription =
   }
 
   .atlas-section-heading {
-    display: flex;
-    justify-content: space-between;
+    display: grid;
+    grid-template-columns: minmax(0, 1fr) 22rem;
     gap: var(--space-lg);
     align-items: end;
-    margin-bottom: var(--space-lg);
+    margin-bottom: 0;
   }
 
   .atlas-section-heading p {
@@ -677,7 +811,15 @@ const pageDescription =
 
   .perf-timeline {
     display: grid;
-    gap: var(--space-md);
+    gap: var(--space-sm);
+  }
+
+  .perf-milestones {
+    padding: var(--space-lg) 0;
+  }
+
+  .benches-grid--timeline {
+    align-items: start;
   }
 
   .perf-milestone {
@@ -685,7 +827,7 @@ const pageDescription =
     grid-template-columns: 8.5rem minmax(0, 1fr);
     gap: var(--space-md);
     border-top: 1px solid #dddddd;
-    padding-top: var(--space-md);
+    padding-top: var(--space-sm);
   }
 
   .perf-milestone time {
@@ -704,6 +846,7 @@ const pageDescription =
 
   .perf-milestone h3 {
     margin-top: 0.1rem;
+    font-size: 1.12rem;
   }
 
   .perf-milestone p {
@@ -786,18 +929,32 @@ const pageDescription =
   }
 
   @media (max-width: 900px) {
-    .perf-hero__grid,
+    .benches-intro__grid,
+    .benches-grid,
+    .atlas-section-heading,
     .perf-table-grid {
       grid-template-columns: 1fr;
     }
 
     .perf-toolbar,
-    .perf-chart-head,
-    .atlas-section-heading {
+    .perf-chart-head {
       align-items: stretch;
       flex-direction: column;
     }
 
+    .benches-side {
+      order: -1;
+    }
+
+    .perf-snapshot,
+    .bench-source-list {
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+    }
+
+    .bench-source-list .atlas-eyebrow {
+      grid-column: 1 / -1;
+    }
+
     .segmented-control {
       width: 100%;
     }
@@ -819,6 +976,11 @@ const pageDescription =
   }
 
   @media (max-width: 620px) {
+    .perf-snapshot,
+    .bench-source-list {
+      grid-template-columns: 1fr;
+    }
+
     .perf-milestone,
     .perf-definition-list div {
       grid-template-columns: 1fr;

From 54536396d9383ec3b4c7f1ac454311cf164ae466 Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mykhailo.chalyi-contractor@procore.com>
Date: Tue, 26 May 2026 20:36:12 -0500
Subject: [PATCH 3/5] fix(site): simplify benches dashboard layout

---
 site/src/pages/benches.astro | 179 ++++++++++++++++++-----------------
 1 file changed, 90 insertions(+), 89 deletions(-)

diff --git a/site/src/pages/benches.astro b/site/src/pages/benches.astro
index cf3c4ce6..066baecb 100644
--- a/site/src/pages/benches.astro
+++ b/site/src/pages/benches.astro
@@ -88,81 +88,77 @@ const pageDescription =
   </section>
 
   <section class="perf-dashboard section-alt">
-    <div class="container benches-grid">
-      <div class="benches-main">
-        <div class="perf-toolbar" aria-label="Chart controls">
-          <div class="segmented-control" role="group" aria-label="Metric">
-            <button type="button" class="is-active" data-view="eval">Eval score</button>
-            <button type="button" data-view="bench">Speedup</button>
-            <button type="button" data-view="criterion">Criterion delta</button>
-          </div>
+    <div class="container dashboard-stack">
+      <div class="perf-snapshot" aria-label="Latest performance snapshot">
+        <a href={latestBench ? repoUrl(latestBench.source) : "#"} target="_blank" rel="noopener noreferrer">
+          <span>Latest benchmark</span>
+          <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
+          <small>{latestBench?.date ?? "n/a"} vs bash</small>
+        </a>
+        <a href={latestEval ? repoUrl(latestEval.source) : "#"} target="_blank" rel="noopener noreferrer">
+          <span>Latest eval</span>
+          <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
+          <small>
+            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
+          </small>
+        </a>
+        <a href={bestCriterion ? repoUrl(bestCriterion.source) : "#"} target="_blank" rel="noopener noreferrer">
+          <span>Best criterion delta</span>
+          <strong>
+            {
+              bestCriterion?.bestChangePct
+                ? `${Math.abs(bestCriterion.bestChangePct)}%`
+                : "n/a"
+            }
+          </strong>
+          <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
+        </a>
+      </div>
 
-          <label class="perf-select">
-            <span>Series</span>
-            <select id="perf-series-select" aria-label="Series"></select>
-          </label>
+      <div class="perf-toolbar" aria-label="Chart controls">
+        <div class="segmented-control" role="group" aria-label="Metric">
+          <button type="button" class="is-active" data-view="eval">Eval score</button>
+          <button type="button" data-view="bench">Speedup</button>
+          <button type="button" data-view="criterion">Criterion delta</button>
         </div>
 
-        <article class="perf-chart-shell">
-          <div class="perf-chart-head">
-            <div>
-              <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
-              <h2 id="perf-chart-title">Full eval score trend</h2>
-            </div>
-            <p id="perf-chart-note"></p>
-          </div>
-
-          <div class="perf-chart-wrap">
-            <svg
-              id="perf-chart"
-              viewBox="0 0 960 340"
-              role="img"
-              aria-labelledby="perf-chart-title"
-            ></svg>
-            <div id="perf-tooltip" class="perf-tooltip" hidden></div>
-          </div>
-        </article>
+        <label class="perf-select">
+          <span>Series</span>
+          <select id="perf-series-select" aria-label="Series"></select>
+        </label>
       </div>
 
-      <aside class="benches-side">
-        <div class="perf-snapshot" aria-label="Latest performance snapshot">
-          <a href={latestBench ? repoUrl(latestBench.source) : "#"} target="_blank" rel="noopener noreferrer">
-            <span>Latest benchmark</span>
-            <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
-            <small>{latestBench?.date ?? "n/a"} vs bash</small>
-          </a>
-          <a href={latestEval ? repoUrl(latestEval.source) : "#"} target="_blank" rel="noopener noreferrer">
-            <span>Latest eval</span>
-            <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
-            <small>
-              {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
-            </small>
-          </a>
-          <a href={bestCriterion ? repoUrl(bestCriterion.source) : "#"} target="_blank" rel="noopener noreferrer">
-            <span>Best criterion delta</span>
-            <strong>
-              {
-                bestCriterion?.bestChangePct
-                  ? `${Math.abs(bestCriterion.bestChangePct)}%`
-                  : "n/a"
-              }
-            </strong>
-            <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
-          </a>
+      <article class="perf-chart-shell">
+        <div class="perf-chart-head">
+          <div>
+            <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
+            <h2 id="perf-chart-title">Full eval score trend</h2>
+          </div>
+          <p id="perf-chart-note"></p>
         </div>
 
-        <div class="bench-source-list">
-          <span class="atlas-eyebrow">Source files</span>
-          {
-            resultLinks.map((item) => (
-              <a href={item.href} target="_blank" rel="noopener noreferrer">
-                <strong>{item.title}</strong>
-                <small>{item.detail}</small>
-              </a>
-            ))
-          }
+        <div class="perf-chart-wrap">
+          <svg
+            id="perf-chart"
+            viewBox="0 0 960 340"
+            role="img"
+            aria-labelledby="perf-chart-title"
+          ></svg>
+          <div id="perf-tooltip" class="perf-tooltip" hidden></div>
         </div>
-      </aside>
+      </article>
+
+      <div class="bench-source-list" aria-label="Source files">
+        <span class="atlas-eyebrow">Source files</span>
+        {
+          resultLinks.map((item) => (
+            <a href={item.href} target="_blank" rel="noopener noreferrer">
+              <strong>{item.title}</strong>
+              <small>{item.detail}</small>
+            </a>
+          ))
+        }
+      </div>
     </div>
   </section>
 
@@ -189,7 +185,7 @@ const pageDescription =
                 <h3>{item.title}</h3>
                 <p>{item.detail}</p>
                 <a href={repoUrl(item.source)} target="_blank" rel="noopener noreferrer">
-                  Source artifact
+                  Open result file
                 </a>
               </div>
             </article>
@@ -585,24 +581,22 @@ const pageDescription =
   }
 
   .perf-dashboard {
-    padding: var(--space-lg) 0;
+    padding: 1.25rem 0 var(--space-lg);
   }
 
-  .benches-grid {
+  .dashboard-stack {
     display: grid;
-    grid-template-columns: minmax(0, 1fr) 22rem;
-    gap: var(--space-md);
-    align-items: start;
+    gap: 0.8rem;
   }
 
-  .benches-main,
-  .benches-side {
+  .dashboard-stack > * {
     min-width: 0;
   }
 
   .perf-snapshot {
     display: grid;
-    gap: var(--space-sm);
+    grid-template-columns: repeat(3, minmax(0, 1fr));
+    gap: 0.65rem;
   }
 
   .perf-snapshot a {
@@ -610,7 +604,7 @@ const pageDescription =
     border: 1px solid #d7d7d7;
     background: var(--color-white);
     color: var(--color-obsidian);
-    padding: var(--space-md);
+    padding: 0.85rem 1rem;
     text-decoration: none;
     transition:
       border-color 0.12s ease,
@@ -634,14 +628,22 @@ const pageDescription =
     display: block;
     margin: 0.15rem 0;
     color: var(--color-obsidian);
-    font-size: 1.65rem;
+    font-size: 1.45rem;
     line-height: 1.1;
   }
 
   .bench-source-list {
     display: grid;
-    gap: var(--space-sm);
-    margin-top: var(--space-md);
+    grid-template-columns: auto repeat(4, minmax(0, 1fr));
+    gap: 0.65rem;
+    align-items: stretch;
+    margin-top: 0.1rem;
+  }
+
+  .bench-source-list .atlas-eyebrow {
+    align-items: center;
+    margin-bottom: 0;
+    padding-right: 0.35rem;
   }
 
   .bench-source-list a {
@@ -650,7 +652,8 @@ const pageDescription =
     border: 1px solid #d7d7d7;
     background: var(--color-white);
     color: var(--color-obsidian);
-    padding: 0.85rem 1rem;
+    min-height: 3.8rem;
+    padding: 0.65rem 0.75rem;
     text-decoration: none;
     transition:
       border-color 0.12s ease,
@@ -666,7 +669,7 @@ const pageDescription =
     justify-content: space-between;
     gap: var(--space-md);
     align-items: center;
-    margin-bottom: var(--space-md);
+    margin-top: 0.35rem;
   }
 
   .segmented-control {
@@ -719,6 +722,7 @@ const pageDescription =
   .perf-table-card {
     background: var(--color-white);
     border: 1px solid #dddddd;
+    min-width: 0;
   }
 
   .perf-chart-head {
@@ -930,7 +934,6 @@ const pageDescription =
 
   @media (max-width: 900px) {
     .benches-intro__grid,
-    .benches-grid,
     .atlas-section-heading,
     .perf-table-grid {
       grid-template-columns: 1fr;
@@ -942,11 +945,6 @@ const pageDescription =
       flex-direction: column;
     }
 
-    .benches-side {
-      order: -1;
-    }
-
-    .perf-snapshot,
     .bench-source-list {
       grid-template-columns: repeat(2, minmax(0, 1fr));
     }
@@ -976,7 +974,10 @@ const pageDescription =
   }
 
   @media (max-width: 620px) {
-    .perf-snapshot,
+    .perf-snapshot {
+      grid-template-columns: 1fr;
+    }
+
     .bench-source-list {
       grid-template-columns: 1fr;
     }

From 5146e48e8dbf6ea2d35be86365ae7ca6557769ff Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mykhailo.chalyi-contractor@procore.com>
Date: Fri, 29 May 2026 19:16:31 -0500
Subject: [PATCH 4/5] fix(site): show latest benches snapshot

---
 site/src/pages/benches.astro | 1115 +++++++++-------------------------
 1 file changed, 284 insertions(+), 831 deletions(-)

diff --git a/site/src/pages/benches.astro b/site/src/pages/benches.astro
index 066baecb..1f996460 100644
--- a/site/src/pages/benches.astro
+++ b/site/src/pages/benches.astro
@@ -2,38 +2,21 @@
 import BaseLayout from "../layouts/BaseLayout.astro";
 import performanceData from "../data/performance-timeline.json";
 
-// Decision: this page consumes the generated aggregate only. It must not import
-// raw eval traces or per-iteration benchmark samples into the browser bundle.
+// Decision: show the latest aggregated snapshot instead of a time-series UI.
+// Raw result files remain linked so benchmark details stay inspectable.
 const data = performanceData as typeof performanceData;
-const historyMonths = 6;
-const generatedAt = new Date(data.generatedAt ?? new Date().toISOString());
-const historyStart = new Date(generatedAt);
-historyStart.setMonth(historyStart.getMonth() - historyMonths);
-const inWindow = (timestamp: string) => new Date(timestamp) >= historyStart;
-function latestByTime<T extends { timestamp: string }>(items: T[]) {
-  return items
-    .toSorted((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime())
-    .at(-1);
-}
-
-const benchRuns = data.benchRuns.filter((run) => inWindow(run.timestamp));
-const evalRuns = data.evalRuns.filter((run) => inWindow(run.timestamp));
-const criterionRuns = data.criterionRuns.filter((run) => inWindow(run.timestamp));
-const milestones = data.milestones.filter((item) => inWindow(item.timestamp));
-
-const latestBench = latestByTime(benchRuns);
-const latestEval = latestByTime(evalRuns.filter((run) => run.tasks >= 10));
-const bestCriterion = criterionRuns
-  .filter((run) => typeof run.bestChangePct === "number")
-  .toSorted((a, b) => Math.abs(b.bestChangePct ?? 0) - Math.abs(a.bestChangePct ?? 0))
+
+const latestBench = data.summary.latestBench;
+const latestEval = data.summary.latestEval;
+const latestCriterion = data.criterionRuns
+  .toSorted((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime())
   .at(0);
-const latestCriterion = latestByTime(criterionRuns);
-const latestFullEvalRuns = evalRuns
+const latestFullEvalRuns = data.evalRuns
   .filter((run) => run.tasks >= 10)
   .toSorted((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime())
-  .slice(0, 8);
+  .slice(0, 6);
 const topBenchCategories = latestBench?.categories.slice(0, 8) ?? [];
-const weakestEvalCategories = latestEval?.categories.slice(0, 8) ?? [];
+const evalCategories = latestEval?.categories.slice(0, 8) ?? [];
 
 const repoUrl = (source: string) =>
   `https://github.com/everruns/bashkit/blob/main/${source}`;
@@ -41,866 +24,384 @@ const repoUrl = (source: string) =>
 const compactNumber = (value: number | null | undefined, suffix = "") =>
   typeof value === "number" ? `${value.toLocaleString()}${suffix}` : "n/a";
 
-const dateLabel = (date: Date) => date.toISOString().slice(0, 10);
-const resultLinks = [
+const latestArtifacts = [
+  ...(latestBench
+    ? [
+        {
+          title: "Latest bashkit-bench run",
+          detail: `${latestBench.cases} cases on ${latestBench.label}`,
+          href: repoUrl(latestBench.source),
+        },
+      ]
+    : []),
+  ...(latestEval
+    ? [
+        {
+          title: "Latest LLM eval run",
+          detail: `${latestEval.model}, ${latestEval.passed}/${latestEval.tasks} tasks`,
+          href: repoUrl(latestEval.source),
+        },
+      ]
+    : []),
+  ...(latestCriterion
+    ? [
+        {
+          title: "Latest criterion bench",
+          detail: `${latestCriterion.family}, ${latestCriterion.cases} cases`,
+          href: repoUrl(latestCriterion.source),
+        },
+      ]
+    : []),
+];
+
+const resultIndexes = [
   {
-    title: "Criterion benches",
-    detail: `${criterionRuns.length} reports in window`,
+    title: "Criterion results",
     href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit/benches/results",
   },
   {
-    title: "bashkit-bench",
-    detail: `${benchRuns.length} bash-vs-bash runs`,
+    title: "bashkit-bench results",
     href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit-bench/results",
   },
   {
-    title: "LLM evals",
-    detail: `${evalRuns.length} eval result files`,
+    title: "Eval results",
     href: "https://github.com/everruns/bashkit/tree/main/crates/bashkit-eval/results",
   },
   {
     title: "Aggregation script",
-    detail: "How this page is generated",
     href: "https://github.com/everruns/bashkit/blob/main/site/scripts/build-performance-data.mjs",
   },
 ];
 
 const pageDescription =
-  "Interactive Bashkit benches history across criterion benches, bash-vs-bashkit benchmarks, and LLM evals.";
+  "Latest Bashkit benchmark, criterion bench, and LLM eval snapshot.";
 ---
 
-<BaseLayout
-  title="Bashkit Benches History"
-  description={pageDescription}
->
-  <section class="benches-intro">
-    <div class="container benches-intro__grid">
-      <div>
-        <span class="atlas-eyebrow">Benches</span>
-        <h1>History for benchmarks and evals</h1>
-      </div>
-      <p>
-        Last {historyMonths} months only, from {dateLabel(historyStart)} to{" "}
-        {dateLabel(generatedAt)}. Raw result files stay in GitHub; this page
-        ships the compact aggregate.
-      </p>
-    </div>
-  </section>
-
-  <section class="perf-dashboard section-alt">
-    <div class="container dashboard-stack">
-      <div class="perf-snapshot" aria-label="Latest performance snapshot">
-        <a href={latestBench ? repoUrl(latestBench.source) : "#"} target="_blank" rel="noopener noreferrer">
-          <span>Latest benchmark</span>
+<BaseLayout title="Bashkit Benches" description={pageDescription}>
+  <section class="bench-page">
+    <div class="container">
+      <header class="bench-header">
+        <div>
+          <span class="bench-eyebrow">Benches</span>
+          <h1>Latest benchmark snapshot</h1>
+        </div>
+        <p>
+          Static aggregate generated from repository result artifacts. Use the
+          linked files for raw measurements and full eval traces.
+        </p>
+      </header>
+
+      <section class="snapshot-grid" aria-label="Latest snapshot">
+        <a
+          href={latestBench ? repoUrl(latestBench.source) : "#"}
+          class="metric-card"
+          target="_blank"
+          rel="noopener noreferrer"
+        >
+          <span>bashkit-bench</span>
           <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
-          <small>{latestBench?.date ?? "n/a"} vs bash</small>
+          <small>
+            {latestBench
+              ? `${latestBench.cases} cases, ${latestBench.matchRate}% match`
+              : "No run"}
+          </small>
         </a>
-        <a href={latestEval ? repoUrl(latestEval.source) : "#"} target="_blank" rel="noopener noreferrer">
-          <span>Latest eval</span>
+        <a
+          href={latestEval ? repoUrl(latestEval.source) : "#"}
+          class="metric-card"
+          target="_blank"
+          rel="noopener noreferrer"
+        >
+          <span>LLM eval</span>
           <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
           <small>
-            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "n/a"}
+            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "No run"}
           </small>
         </a>
-        <a href={bestCriterion ? repoUrl(bestCriterion.source) : "#"} target="_blank" rel="noopener noreferrer">
-          <span>Best criterion delta</span>
-          <strong>
-            {
-              bestCriterion?.bestChangePct
-                ? `${Math.abs(bestCriterion.bestChangePct)}%`
-                : "n/a"
-            }
-          </strong>
-          <small>{bestCriterion?.bestImprovement?.name ?? "n/a"}</small>
+        <a
+          href={latestCriterion ? repoUrl(latestCriterion.source) : "#"}
+          class="metric-card"
+          target="_blank"
+          rel="noopener noreferrer"
+        >
+          <span>Criterion</span>
+          <strong>{compactNumber(latestCriterion?.medianUs, " us")}</strong>
+          <small>
+            {latestCriterion?.fastestCase
+              ? `fastest: ${latestCriterion.fastestCase.name}`
+              : latestCriterion ? `${latestCriterion.cases ?? 0} cases` : "No run"}
+          </small>
         </a>
-      </div>
+      </section>
 
-      <div class="perf-toolbar" aria-label="Chart controls">
-        <div class="segmented-control" role="group" aria-label="Metric">
-          <button type="button" class="is-active" data-view="eval">Eval score</button>
-          <button type="button" data-view="bench">Speedup</button>
-          <button type="button" data-view="criterion">Criterion delta</button>
-        </div>
-
-        <label class="perf-select">
-          <span>Series</span>
-          <select id="perf-series-select" aria-label="Series"></select>
-        </label>
-      </div>
-
-      <article class="perf-chart-shell">
-        <div class="perf-chart-head">
-          <div>
-            <span class="atlas-eyebrow" id="perf-chart-kind">Eval score</span>
-            <h2 id="perf-chart-title">Full eval score trend</h2>
-          </div>
-          <p id="perf-chart-note"></p>
-        </div>
-
-        <div class="perf-chart-wrap">
-          <svg
-            id="perf-chart"
-            viewBox="0 0 960 340"
-            role="img"
-            aria-labelledby="perf-chart-title"
-          ></svg>
-          <div id="perf-tooltip" class="perf-tooltip" hidden></div>
-        </div>
-      </article>
-
-      <div class="bench-source-list" aria-label="Source files">
-        <span class="atlas-eyebrow">Source files</span>
-        {
-          resultLinks.map((item) => (
-            <a href={item.href} target="_blank" rel="noopener noreferrer">
-              <strong>{item.title}</strong>
-              <small>{item.detail}</small>
-            </a>
-          ))
-        }
-      </div>
-    </div>
-  </section>
-
-  <section class="perf-milestones">
-    <div class="container benches-grid benches-grid--timeline">
-      <div class="atlas-section-heading">
+      <section class="artifact-strip" aria-label="Result artifacts">
         <div>
-          <span class="atlas-eyebrow">Recent timeline</span>
-          <h2>Six-month milestones</h2>
-        </div>
-        <p>
-          Milestones are grouped to the current display window and link back to
-          the exact source artifact.
-        </p>
-      </div>
-
-      <div class="perf-timeline">
-        {
-          milestones.slice(-10).reverse().map((item) => (
-            <article class="perf-milestone">
-              <time datetime={item.timestamp}>{item.date}</time>
-              <div>
-                <span>{item.kind}</span>
-                <h3>{item.title}</h3>
-                <p>{item.detail}</p>
-                <a href={repoUrl(item.source)} target="_blank" rel="noopener noreferrer">
-                  Open result file
-                </a>
-              </div>
-            </article>
-          ))
-        }
-      </div>
-    </div>
-  </section>
-
-  <section class="perf-tables section section-alt">
-    <div class="container perf-table-grid">
-      <article class="perf-table-card">
-        <div class="perf-table-card__head">
-          <span class="atlas-eyebrow">Latest eval runs</span>
-          <h2>Model progress</h2>
-        </div>
-        <table>
-          <thead>
-            <tr><th>Run</th><th>Score</th><th>Tasks</th><th>Tools</th></tr>
-          </thead>
-          <tbody>
-            {
-              latestFullEvalRuns.map((run) => (
-                <tr>
-                  <td>
-                    <a href={repoUrl(run.source)} target="_blank" rel="noopener noreferrer">
-                      {run.model}
-                    </a>
-                    <small>{run.date}</small>
-                  </td>
-                  <td class="score">{run.scorePct}%</td>
-                  <td>{run.passed}/{run.tasks}</td>
-                  <td>{run.toolSuccessPct}%</td>
-                </tr>
-              ))
-            }
-          </tbody>
-        </table>
-      </article>
-
-      <article class="perf-table-card">
-        <div class="perf-table-card__head">
-          <span class="atlas-eyebrow">Latest benchmark categories</span>
-          <h2>Where bashkit is fastest</h2>
-        </div>
-        <table>
-          <thead>
-            <tr><th>Category</th><th>Cases</th><th>Median speedup</th></tr>
-          </thead>
-          <tbody>
-            {
-              topBenchCategories.map((row) => (
-                <tr>
-                  <td>{row.category}</td>
-                  <td>{row.cases}</td>
-                  <td class="score">{row.speedup}x</td>
-                </tr>
-              ))
-            }
-          </tbody>
-        </table>
-      </article>
-
-      <article class="perf-table-card">
-        <div class="perf-table-card__head">
-          <span class="atlas-eyebrow">Latest eval pressure</span>
-          <h2>Weakest categories</h2>
+          <span class="bench-eyebrow">Latest files</span>
+          <h2>Open the source results</h2>
         </div>
-        <table>
-          <thead>
-            <tr><th>Category</th><th>Tasks</th><th>Rate</th></tr>
-          </thead>
-          <tbody>
-            {
-              weakestEvalCategories.map((row) => (
-                <tr>
-                  <td>{row.category}</td>
-                  <td>{row.passed}/{row.tasks}</td>
-                  <td class="score">{row.rate}%</td>
-                </tr>
-              ))
-            }
-          </tbody>
-        </table>
-      </article>
-
-      <article class="perf-table-card">
-        <div class="perf-table-card__head">
-          <span class="atlas-eyebrow">Criterion coverage</span>
-          <h2>Latest microbench set</h2>
+        <div class="artifact-links">
+          {
+            latestArtifacts.map((item) => (
+              <a href={item.href} target="_blank" rel="noopener noreferrer">
+                <strong>{item.title}</strong>
+                <small>{item.detail}</small>
+              </a>
+            ))
+          }
         </div>
-        <dl class="perf-definition-list">
-          <div>
-            <dt>Run</dt>
-            <dd>{latestCriterion?.label ?? "n/a"}</dd>
+      </section>
+
+      <section class="snapshot-columns">
+        <article class="snapshot-panel">
+          <div class="panel-head">
+            <span class="bench-eyebrow">Runtime speed</span>
+            <h2>Fastest benchmark categories</h2>
           </div>
-          <div>
-            <dt>Cases</dt>
-            <dd>{compactNumber(latestCriterion?.cases)}</dd>
+          <table>
+            <thead>
+              <tr><th>Category</th><th>Cases</th><th>Speedup</th></tr>
+            </thead>
+            <tbody>
+              {
+                topBenchCategories.map((row) => (
+                  <tr>
+                    <td>{row.category}</td>
+                    <td>{row.cases}</td>
+                    <td class="score">{row.speedup}x</td>
+                  </tr>
+                ))
+              }
+            </tbody>
+          </table>
+        </article>
+
+        <article class="snapshot-panel">
+          <div class="panel-head">
+            <span class="bench-eyebrow">Eval pressure</span>
+            <h2>Lowest eval categories</h2>
           </div>
-          <div>
-            <dt>Median time</dt>
-            <dd>{compactNumber(latestCriterion?.medianUs, " us")}</dd>
+          <table>
+            <thead>
+              <tr><th>Category</th><th>Tasks</th><th>Rate</th></tr>
+            </thead>
+            <tbody>
+              {
+                evalCategories.map((row) => (
+                  <tr>
+                    <td>{row.category}</td>
+                    <td>{row.passed}/{row.tasks}</td>
+                    <td class="score">{row.rate}%</td>
+                  </tr>
+                ))
+              }
+            </tbody>
+          </table>
+        </article>
+      </section>
+
+      <section class="snapshot-columns snapshot-columns--bottom">
+        <article class="snapshot-panel">
+          <div class="panel-head">
+            <span class="bench-eyebrow">Latest full evals</span>
+            <h2>Recent model snapshots</h2>
           </div>
-          <div>
-            <dt>Fastest case</dt>
-            <dd>
+          <table>
+            <thead>
+              <tr><th>Run</th><th>Score</th><th>Tools</th></tr>
+            </thead>
+            <tbody>
               {
-                latestCriterion?.fastestCase
-                  ? `${latestCriterion.fastestCase.name} (${latestCriterion.fastestCase.us} us)`
-                  : "n/a"
+                latestFullEvalRuns.map((run) => (
+                  <tr>
+                    <td>
+                      <a href={repoUrl(run.source)} target="_blank" rel="noopener noreferrer">
+                        {run.model}
+                      </a>
+                      <small>{run.date}</small>
+                    </td>
+                    <td class="score">{run.scorePct}%</td>
+                    <td>{run.toolSuccessPct}%</td>
+                  </tr>
+                ))
               }
-            </dd>
+            </tbody>
+          </table>
+        </article>
+
+        <article class="snapshot-panel source-panel">
+          <div class="panel-head">
+            <span class="bench-eyebrow">Indexes</span>
+            <h2>Browse all result sets</h2>
           </div>
-        </dl>
-      </article>
+          <div class="index-links">
+            {
+              resultIndexes.map((item) => (
+                <a href={item.href} target="_blank" rel="noopener noreferrer">
+                  {item.title}
+                </a>
+              ))
+            }
+          </div>
+        </article>
+      </section>
     </div>
   </section>
-
-  <script
-    id="performance-data"
-    type="application/json"
-    is:inline
-    set:html={JSON.stringify(data)}
-  />
-
-  <script is:inline>
-    const dataEl = document.getElementById("performance-data");
-    const perfData = JSON.parse(dataEl.textContent);
-    const chart = document.getElementById("perf-chart");
-    const tooltip = document.getElementById("perf-tooltip");
-    const buttons = [...document.querySelectorAll("[data-view]")];
-    const select = document.getElementById("perf-series-select");
-    const kind = document.getElementById("perf-chart-kind");
-    const title = document.getElementById("perf-chart-title");
-    const note = document.getElementById("perf-chart-note");
-    const generatedAt = new Date(perfData.generatedAt);
-    const historyStart = new Date(generatedAt);
-    historyStart.setMonth(historyStart.getMonth() - 6);
-    const inWindow = (run) => new Date(run.timestamp) >= historyStart;
-    const benchRuns = perfData.benchRuns.filter(inWindow);
-    const evalRuns = perfData.evalRuns.filter(inWindow);
-    const criterionRuns = perfData.criterionRuns.filter(inWindow);
-    const modelTrends = perfData.modelTrends
-      .map((trend) => ({
-        ...trend,
-        points: trend.points.filter(inWindow),
-      }))
-      .filter((trend) => trend.points.length > 0);
-
-    const chartConfig = {
-      eval: {
-        label: "Eval score",
-        title: "Full eval score trend",
-        unit: "%",
-        maxY: 100,
-        color: "#1769aa",
-        note: "Full eval runs only; smaller scripting evals stay in the milestone stream.",
-        options: () => [
-          { value: "all", label: "Best run per day" },
-          ...modelTrends.map((trend) => ({
-            value: trend.model,
-            label: trend.model,
-          })),
-        ],
-        points: (series) => {
-          const runs = evalRuns.filter((run) => run.tasks >= 10);
-          const source =
-            series === "all"
-              ? bestPerDay(runs, (run) => run.scorePct)
-              : runs.filter((run) => `${run.provider}/${run.model}` === series);
-          return source.map((run) => ({
-            x: new Date(run.timestamp).getTime(),
-            y: run.scorePct,
-            label: `${run.model}: ${run.scorePct}%`,
-            detail: `${run.passed}/${run.tasks} tasks, ${run.toolSuccessPct}% tool success`,
-            date: run.date,
-          }));
-        },
-      },
-      bench: {
-        label: "Speedup",
-        title: "bashkit-bench speedup vs bash",
-        unit: "x",
-        color: "#0f7b58",
-        note: "Aggregated from bashkit-vs-bash result JSON, using total runtime per run.",
-        options: () => [{ value: "all", label: "All benchmark runs" }],
-        points: () =>
-          benchRuns.map((run) => ({
-            x: new Date(run.timestamp).getTime(),
-            y: run.speedup,
-            label: `${run.speedup}x on ${run.label}`,
-            detail: `${run.cases} cases, ${run.matchRate}% output match`,
-            date: run.date,
-          })),
-      },
-      criterion: {
-        label: "Criterion delta",
-        title: "Criterion improvements and microbench coverage",
-        unit: "%",
-        color: "#a35f00",
-        note: "Before/after reports use improvement percent. Baseline-only reports show median case time separately in the tables.",
-        options: () => [{ value: "all", label: "Improvement reports" }],
-        points: () =>
-          criterionRuns
-            .filter((run) => Number.isFinite(run.medianChangePct))
-            .map((run) => ({
-              x: new Date(run.timestamp).getTime(),
-              y: Math.abs(run.medianChangePct),
-              label: `${run.family}: ${Math.abs(run.medianChangePct)}% median improvement`,
-              detail: run.bestImprovement
-                ? `${Math.abs(run.bestImprovement.changePct)}% best in ${run.bestImprovement.name}`
-                : `${run.cases} cases`,
-              date: run.date,
-            })),
-      },
-    };
-
-    function bestPerDay(runs, score) {
-      const byDay = new Map();
-      for (const run of runs) {
-        const current = byDay.get(run.date);
-        if (!current || score(run) > score(current)) byDay.set(run.date, run);
-      }
-      return [...byDay.values()].sort(
-        (a, b) => new Date(a.timestamp) - new Date(b.timestamp),
-      );
-    }
-
-    function setOptions(view) {
-      const previous = select.value;
-      const options = chartConfig[view].options();
-      select.innerHTML = options
-        .map((option) => `<option value="${option.value}">${option.label}</option>`)
-        .join("");
-      if (options.some((option) => option.value === previous)) {
-        select.value = previous;
-      }
-    }
-
-    function pathFrom(points) {
-      return points
-        .map((point, index) => `${index === 0 ? "M" : "L"} ${point.sx} ${point.sy}`)
-        .join(" ");
-    }
-
-    function renderChart() {
-      const view = document.querySelector("[data-view].is-active").dataset.view;
-      const config = chartConfig[view];
-      const points = config
-        .points(select.value)
-        .filter((point) => Number.isFinite(point.x) && Number.isFinite(point.y))
-        .sort((a, b) => a.x - b.x);
-
-      kind.textContent = config.label;
-      title.textContent = config.title;
-      note.textContent = config.note;
-      if (points.length === 0) {
-        chart.innerHTML = `<text x="58" y="160" class="axis-label">No results in this six-month window.</text>`;
-        return;
-      }
-
-      const width = 960;
-      const height = 340;
-      const pad = { left: 58, right: 28, top: 28, bottom: 54 };
-      const xs = points.map((point) => point.x);
-      const ys = points.map((point) => point.y);
-      const minX = Math.min(...xs);
-      const maxX = Math.max(...xs);
-      const minY = Math.min(0, ...ys);
-      const maxY = Math.max(...ys, 1);
-      const yTop = Math.max(config.maxY ?? maxY * 1.12, maxY);
-
-      const scaleX = (x) =>
-        pad.left + ((x - minX) / Math.max(1, maxX - minX)) * (width - pad.left - pad.right);
-      const scaleY = (y) =>
-        height - pad.bottom - ((y - minY) / Math.max(1, yTop - minY)) * (height - pad.top - pad.bottom);
-
-      const plotted = points.map((point) => ({
-        ...point,
-        sx: scaleX(point.x),
-        sy: scaleY(point.y),
-      }));
-
-      const yTicks = [0, 0.25, 0.5, 0.75, 1].map((step) => minY + (yTop - minY) * step);
-      const tickIndexes = [...new Set([
-        0,
-        Math.floor((plotted.length - 1) / 2),
-        plotted.length - 1,
-      ])];
-      const xTicks = tickIndexes.map((index) => plotted[index]).filter(Boolean);
-
-      chart.innerHTML = `
-        <rect x="0" y="0" width="${width}" height="${height}" fill="transparent"></rect>
-        ${yTicks
-          .map((tick) => {
-            const y = scaleY(tick);
-            return `<line x1="${pad.left}" x2="${width - pad.right}" y1="${y}" y2="${y}" class="grid-line"></line>
-              <text x="${pad.left - 12}" y="${y + 4}" class="axis-label" text-anchor="end">${formatMetric(tick, config.unit)}</text>`;
-          })
-          .join("")}
-        <path d="${pathFrom(plotted)}" fill="none" stroke="${config.color}" stroke-width="3" stroke-linecap="round"></path>
-        ${plotted
-          .map(
-            (point, index) => `<circle cx="${point.sx}" cy="${point.sy}" r="6" fill="${config.color}" data-index="${index}" tabindex="0"></circle>`,
-          )
-          .join("")}
-        ${xTicks
-          .map(
-            (point) => `<text x="${point.sx}" y="${height - 18}" class="axis-label" text-anchor="middle">${point.date.slice(5)}</text>`,
-          )
-          .join("")}
-      `;
-
-      chart.querySelectorAll("circle").forEach((circle) => {
-        const point = plotted[Number(circle.dataset.index)];
-        circle.addEventListener("mouseenter", () => showTooltip(circle, point, config.unit));
-        circle.addEventListener("focus", () => showTooltip(circle, point, config.unit));
-        circle.addEventListener("mouseleave", hideTooltip);
-        circle.addEventListener("blur", hideTooltip);
-      });
-    }
-
-    function formatMetric(value, unit) {
-      const rounded = Math.round(value * 10) / 10;
-      return `${rounded}${unit}`;
-    }
-
-    function showTooltip(node, point, unit) {
-      const chartBox = chart.getBoundingClientRect();
-      const nodeBox = node.getBoundingClientRect();
-      tooltip.innerHTML = `<strong>${point.label}</strong><span>${point.date}</span><p>${point.detail}</p><b>${formatMetric(point.y, unit)}</b>`;
-      tooltip.hidden = false;
-      tooltip.style.left = `${nodeBox.left - chartBox.left + 18}px`;
-      tooltip.style.top = `${nodeBox.top - chartBox.top - 18}px`;
-    }
-
-    function hideTooltip() {
-      tooltip.hidden = true;
-    }
-
-    buttons.forEach((button) => {
-      button.addEventListener("click", () => {
-        buttons.forEach((item) => item.classList.remove("is-active"));
-        button.classList.add("is-active");
-        setOptions(button.dataset.view);
-        renderChart();
-      });
-    });
-
-    select.addEventListener("change", renderChart);
-    setOptions("eval");
-    renderChart();
-  </script>
 </BaseLayout>
 
 <style>
-  .benches-intro {
-    border-bottom: 1px solid #dddddd;
-    background: var(--color-white);
-    padding: 2.4rem 0 2rem;
+  .bench-page {
+    background: #f6f6f6;
+    padding: 2.25rem 0 3rem;
   }
 
-  .benches-intro__grid {
+  .bench-header {
     display: grid;
-    grid-template-columns: minmax(0, 0.9fr) minmax(320px, 0.8fr);
+    grid-template-columns: minmax(0, 1fr) minmax(20rem, 0.65fr);
     gap: var(--space-lg);
     align-items: end;
+    margin-bottom: var(--space-lg);
   }
 
-  .benches-intro .atlas-eyebrow,
-  .perf-dashboard .atlas-eyebrow,
-  .perf-milestones .atlas-eyebrow,
-  .perf-tables .atlas-eyebrow {
+  .bench-eyebrow {
     display: inline-flex;
-    margin-bottom: var(--space-sm);
+    margin-bottom: var(--space-xs);
     color: var(--color-gold);
-    font-size: 0.78rem;
+    font-size: 0.76rem;
     font-weight: 700;
     letter-spacing: 0.08em;
     text-transform: uppercase;
   }
 
-  .benches-intro h1 {
-    max-width: 640px;
-    font-size: clamp(2rem, 4vw, 3.2rem);
+  .bench-header h1 {
+    font-size: clamp(2rem, 4.5vw, 3.6rem);
     line-height: 1;
   }
 
-  .benches-intro p {
+  .bench-header p {
     color: var(--color-slate);
-    font-size: 1rem;
-  }
-
-  .perf-dashboard {
-    padding: 1.25rem 0 var(--space-lg);
   }
 
-  .dashboard-stack {
+  .snapshot-grid,
+  .snapshot-columns {
     display: grid;
-    gap: 0.8rem;
+    grid-template-columns: repeat(3, minmax(0, 1fr));
+    gap: var(--space-md);
   }
 
-  .dashboard-stack > * {
-    min-width: 0;
+  .snapshot-columns {
+    grid-template-columns: repeat(2, minmax(0, 1fr));
+    margin-top: var(--space-md);
   }
 
-  .perf-snapshot {
-    display: grid;
-    grid-template-columns: repeat(3, minmax(0, 1fr));
-    gap: 0.65rem;
+  .metric-card,
+  .snapshot-panel,
+  .artifact-strip {
+    border: 1px solid #dddddd;
+    background: var(--color-white);
   }
 
-  .perf-snapshot a {
-    display: block;
-    border: 1px solid #d7d7d7;
-    background: var(--color-white);
+  .metric-card {
+    display: grid;
+    gap: 0.2rem;
+    min-height: 8rem;
+    padding: 1.1rem;
     color: var(--color-obsidian);
-    padding: 0.85rem 1rem;
     text-decoration: none;
     transition:
       border-color 0.12s ease,
       transform 0.12s ease;
   }
 
-  .perf-snapshot a:hover,
-  .bench-source-list a:hover {
+  .metric-card:hover,
+  .artifact-links a:hover,
+  .index-links a:hover {
     border-color: rgb(10 22 54 / 0.36);
     text-decoration: none;
     transform: translateY(-1px);
   }
 
-  .perf-snapshot span,
-  .perf-snapshot small {
-    display: block;
+  .metric-card span,
+  .metric-card small,
+  td small {
     color: var(--color-slate);
   }
 
-  .perf-snapshot strong {
-    display: block;
-    margin: 0.15rem 0;
-    color: var(--color-obsidian);
-    font-size: 1.45rem;
-    line-height: 1.1;
+  .metric-card strong {
+    font-size: 2rem;
+    line-height: 1;
   }
 
-  .bench-source-list {
+  .artifact-strip {
     display: grid;
-    grid-template-columns: auto repeat(4, minmax(0, 1fr));
-    gap: 0.65rem;
+    grid-template-columns: minmax(14rem, 0.32fr) minmax(0, 1fr);
+    gap: var(--space-md);
     align-items: stretch;
-    margin-top: 0.1rem;
+    margin-top: var(--space-md);
+    padding: 1.1rem;
   }
 
-  .bench-source-list .atlas-eyebrow {
-    align-items: center;
-    margin-bottom: 0;
-    padding-right: 0.35rem;
+  .artifact-strip h2,
+  .panel-head h2 {
+    font-size: 1.35rem;
   }
 
-  .bench-source-list a {
+  .artifact-links {
     display: grid;
-    gap: 0.15rem;
-    border: 1px solid #d7d7d7;
-    background: var(--color-white);
-    color: var(--color-obsidian);
-    min-height: 3.8rem;
-    padding: 0.65rem 0.75rem;
-    text-decoration: none;
-    transition:
-      border-color 0.12s ease,
-      transform 0.12s ease;
-  }
-
-  .bench-source-list small {
-    color: var(--color-slate);
-  }
-
-  .perf-toolbar {
-    display: flex;
-    justify-content: space-between;
-    gap: var(--space-md);
-    align-items: center;
-    margin-top: 0.35rem;
-  }
-
-  .segmented-control {
-    display: inline-grid;
     grid-template-columns: repeat(3, minmax(0, 1fr));
-    border: 1px solid #d7d7d7;
-    background: var(--color-white);
-  }
-
-  .segmented-control button {
-    min-width: 9.5rem;
-    min-height: 2.65rem;
-    border: 0;
-    border-right: 1px solid #d7d7d7;
-    background: transparent;
-    color: var(--color-slate);
-    cursor: pointer;
-    font: inherit;
-    font-weight: 600;
-  }
-
-  .segmented-control button:last-child {
-    border-right: 0;
-  }
-
-  .segmented-control button.is-active {
-    background: var(--color-obsidian);
-    color: var(--color-white);
-  }
-
-  .perf-select {
-    display: inline-flex;
-    align-items: center;
     gap: var(--space-sm);
-    color: var(--color-slate);
-    font-weight: 600;
-  }
-
-  .perf-select select {
-    min-width: 260px;
-    min-height: 2.65rem;
-    border: 1px solid #d7d7d7;
-    background: var(--color-white);
-    color: var(--color-obsidian);
-    padding: 0 var(--space-sm);
-    font: inherit;
   }
 
-  .perf-chart-shell,
-  .perf-table-card {
-    background: var(--color-white);
+  .artifact-links a,
+  .index-links a {
+    display: grid;
+    gap: 0.15rem;
     border: 1px solid #dddddd;
-    min-width: 0;
-  }
-
-  .perf-chart-head {
-    display: flex;
-    justify-content: space-between;
-    gap: var(--space-md);
-    padding: 1.2rem 1.2rem 0;
-  }
-
-  .perf-chart-head h2 {
-    font-size: clamp(1.45rem, 2.4vw, 2rem);
-  }
-
-  .perf-chart-head p {
-    max-width: 420px;
-    color: var(--color-slate);
-  }
-
-  .perf-chart-wrap {
-    position: relative;
-    padding: var(--space-sm) 1.2rem 1.2rem;
-    overflow-x: auto;
-  }
-
-  .perf-chart-wrap svg {
-    display: block;
-    width: 100%;
-    min-width: 720px;
-    height: auto;
-  }
-
-  .perf-chart-wrap :global(.grid-line) {
-    stroke: #e5e5e5;
-    stroke-width: 1;
-  }
-
-  .perf-chart-wrap :global(.axis-label) {
-    fill: #5d5d5d;
-    font-family: var(--font-mono);
-    font-size: 0.78rem;
-  }
-
-  .perf-chart-wrap :global(circle) {
-    cursor: pointer;
-    stroke: var(--color-white);
-    stroke-width: 2;
-  }
-
-  .perf-tooltip {
-    position: absolute;
-    z-index: 5;
-    width: min(280px, 80vw);
-    border: 1px solid #d7d7d7;
-    background: var(--color-white);
+    padding: 0.85rem;
     color: var(--color-obsidian);
-    padding: var(--space-sm);
-    box-shadow: 0 16px 36px rgba(0, 0, 0, 0.16);
-    pointer-events: none;
-  }
-
-  .perf-tooltip strong,
-  .perf-tooltip span,
-  .perf-tooltip b {
-    display: block;
-  }
-
-  .perf-tooltip span,
-  .perf-tooltip p {
-    color: var(--color-slate);
-    font-size: 0.9rem;
-  }
-
-  .perf-tooltip b {
-    margin-top: var(--space-xs);
-    color: var(--color-gold);
-  }
-
-  .atlas-section-heading {
-    display: grid;
-    grid-template-columns: minmax(0, 1fr) 22rem;
-    gap: var(--space-lg);
-    align-items: end;
-    margin-bottom: 0;
-  }
-
-  .atlas-section-heading p {
-    max-width: 520px;
-    color: var(--color-slate);
-  }
-
-  .perf-timeline {
-    display: grid;
-    gap: var(--space-sm);
-  }
-
-  .perf-milestones {
-    padding: var(--space-lg) 0;
-  }
-
-  .benches-grid--timeline {
-    align-items: start;
-  }
-
-  .perf-milestone {
-    display: grid;
-    grid-template-columns: 8.5rem minmax(0, 1fr);
-    gap: var(--space-md);
-    border-top: 1px solid #dddddd;
-    padding-top: var(--space-sm);
-  }
-
-  .perf-milestone time {
-    color: var(--color-slate);
-    font-family: var(--font-mono);
-    font-size: 0.9rem;
-  }
-
-  .perf-milestone span {
-    color: var(--color-gold);
-    font-size: 0.72rem;
-    font-weight: 800;
-    letter-spacing: 0.08em;
-    text-transform: uppercase;
-  }
-
-  .perf-milestone h3 {
-    margin-top: 0.1rem;
-    font-size: 1.12rem;
+    text-decoration: none;
+    transition:
+      border-color 0.12s ease,
+      transform 0.12s ease;
   }
 
-  .perf-milestone p {
+  .artifact-links small {
     color: var(--color-slate);
-    max-width: 760px;
-  }
-
-  .perf-milestone a {
-    display: inline-flex;
-    margin-top: var(--space-xs);
-    font-weight: 700;
   }
 
-  .perf-table-grid {
-    display: grid;
-    grid-template-columns: repeat(2, minmax(0, 1fr));
-    gap: var(--space-md);
-  }
-
-  .perf-table-card {
+  .snapshot-panel {
     overflow: hidden;
   }
 
-  .perf-table-card__head {
-    padding: var(--space-lg) var(--space-lg) var(--space-sm);
+  .panel-head {
+    padding: 1rem 1.1rem 0.7rem;
   }
 
-  .perf-table-card table {
+  table {
     width: 100%;
     border-collapse: collapse;
   }
 
-  .perf-table-card th,
-  .perf-table-card td {
+  th,
+  td {
     border-top: 1px solid #eeeeee;
-    padding: 0.85rem var(--space-lg);
+    padding: 0.75rem 1.1rem;
     text-align: left;
     vertical-align: top;
   }
 
-  .perf-table-card th {
+  th {
     color: var(--color-slate);
-    font-size: 0.78rem;
-    letter-spacing: 0.04em;
+    font-size: 0.76rem;
+    letter-spacing: 0.05em;
     text-transform: uppercase;
   }
 
-  .perf-table-card td small {
+  td a {
+    font-weight: 700;
+  }
+
+  td small {
     display: block;
-    color: var(--color-slate);
     font-family: var(--font-mono);
     font-size: 0.8rem;
   }
@@ -910,90 +411,42 @@ const pageDescription =
     font-weight: 800;
   }
 
-  .perf-definition-list {
+  .source-panel {
     display: grid;
-    gap: 0;
+    align-content: start;
   }
 
-  .perf-definition-list div {
+  .index-links {
     display: grid;
-    grid-template-columns: 9rem minmax(0, 1fr);
-    gap: var(--space-md);
-    border-top: 1px solid #eeeeee;
-    padding: 0.85rem var(--space-lg);
-  }
-
-  .perf-definition-list dt {
-    color: var(--color-slate);
-    font-weight: 700;
-  }
-
-  .perf-definition-list dd {
-    min-width: 0;
+    gap: var(--space-sm);
+    padding: 0 1.1rem 1.1rem;
   }
 
   @media (max-width: 900px) {
-    .benches-intro__grid,
-    .atlas-section-heading,
-    .perf-table-grid {
+    .bench-header,
+    .snapshot-grid,
+    .artifact-strip,
+    .artifact-links,
+    .snapshot-columns {
       grid-template-columns: 1fr;
     }
-
-    .perf-toolbar,
-    .perf-chart-head {
-      align-items: stretch;
-      flex-direction: column;
-    }
-
-    .bench-source-list {
-      grid-template-columns: repeat(2, minmax(0, 1fr));
-    }
-
-    .bench-source-list .atlas-eyebrow {
-      grid-column: 1 / -1;
-    }
-
-    .segmented-control {
-      width: 100%;
-    }
-
-    .segmented-control button {
-      min-width: 0;
-      padding: 0 var(--space-xs);
-    }
-
-    .perf-select {
-      align-items: stretch;
-      flex-direction: column;
-    }
-
-    .perf-select select {
-      min-width: 0;
-      width: 100%;
-    }
   }
 
   @media (max-width: 620px) {
-    .perf-snapshot {
-      grid-template-columns: 1fr;
+    .bench-page {
+      padding-top: 1.5rem;
     }
 
-    .bench-source-list {
-      grid-template-columns: 1fr;
-    }
-
-    .perf-milestone,
-    .perf-definition-list div {
-      grid-template-columns: 1fr;
-      gap: var(--space-xs);
+    .metric-card {
+      min-height: 0;
     }
 
-    .perf-table-card {
+    .snapshot-panel {
       overflow-x: auto;
     }
 
-    .perf-table-card table {
-      min-width: 520px;
+    table {
+      min-width: 420px;
     }
   }
 </style>

From c0980c2803cda5abe63e5e96ed6fac20745c776d Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mykhailo.chalyi-contractor@procore.com>
Date: Fri, 29 May 2026 22:01:38 -0500
Subject: [PATCH 5/5] feat(site): refine benches snapshot

---
 AGENTS.md                               |   1 +
 crates/bashkit-bench/README.md          |   4 +
 crates/bashkit-bench/src/main.rs        |  68 ++-
 justfile                                |  46 +-
 site/README.md                          |   4 +
 site/scripts/build-performance-data.mjs |  55 ++-
 site/src/data/performance-timeline.json | 579 ++++++++++++++++++------
 site/src/pages/benches.astro            | 189 ++++----
 specs/eval.md                           |   4 +
 specs/performance-results.md            |  64 +++
 10 files changed, 767 insertions(+), 247 deletions(-)
 create mode 100644 specs/performance-results.md

diff --git a/AGENTS.md b/AGENTS.md
index 371fac10..932fa4b7 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -48,6 +48,7 @@ Fix root cause. Unsure: read more code; if stuck, ask w/ short options. Unrecogn
 | sqlite-builtin | Embedded SQLite via Turso (MemoryIO + VfsIO backends, dot-commands) |
 | coreutils-args-port | Port uutils `uu_app()` clap definitions (args mode) and platform-clean uucore modules (module mode, manifest-driven) into bashkit via codegen |
 | credential-injection | Transparent per-host credential injection for outbound HTTP requests, without exposing secrets to sandboxed scripts |
+| performance-results | Benchmark/eval result locations and `/benches` site aggregation contract |
 
 ### Documentation
 
diff --git a/crates/bashkit-bench/README.md b/crates/bashkit-bench/README.md
index 4791722e..963c27fe 100644
--- a/crates/bashkit-bench/README.md
+++ b/crates/bashkit-bench/README.md
@@ -127,6 +127,10 @@ cargo run -p bashkit-bench --release -- --list
 | `--verbose` | Show per-benchmark timing details |
 | `--list` | List available benchmarks |
 
+Saved JSON/Markdown reports in `crates/bashkit-bench/results/` feed the site
+`/benches` page. See `specs/performance-results.md` for the aggregation
+contract.
+
 ## Prerequisites
 
 | Runner | Setup |
diff --git a/crates/bashkit-bench/src/main.rs b/crates/bashkit-bench/src/main.rs
index c93865ac..0c52826d 100644
--- a/crates/bashkit-bench/src/main.rs
+++ b/crates/bashkit-bench/src/main.rs
@@ -391,21 +391,15 @@ async fn main() -> Result<()> {
 
     // Save if requested
     if let Some(ref save_arg) = args.save {
-        let base_name = if save_arg.is_empty() {
-            // Auto-generate filename with moniker and timestamp
-            let timestamp = chrono_lite_now();
-            format!("bench-{}-{}", system_info.moniker, timestamp)
-        } else {
-            // Use provided name, strip extension if present
-            let path = PathBuf::from(save_arg);
-            path.file_stem()
-                .and_then(|s| s.to_str())
-                .unwrap_or("bench-results")
-                .to_string()
-        };
+        let timestamp = chrono_lite_now();
+        let base_path = save_base_path(save_arg, &system_info.moniker, &timestamp);
 
-        let json_path = format!("{}.json", base_name);
-        let md_path = format!("{}.md", base_name);
+        let json_path = base_path.with_extension("json");
+        let md_path = base_path.with_extension("md");
+
+        if let Some(parent) = json_path.parent() {
+            std::fs::create_dir_all(parent).context("Failed to create results directory")?;
+        }
 
         // Save JSON
         let json = serde_json::to_string_pretty(&report)?;
@@ -418,14 +412,30 @@ async fn main() -> Result<()> {
         println!(
             "\n{} results to:\n  - {}\n  - {}",
             "Saved".green(),
-            json_path,
-            md_path
+            json_path.display(),
+            md_path.display()
         );
     }
 
     Ok(())
 }
 
+fn save_base_path(save_arg: &str, moniker: &str, timestamp: &str) -> PathBuf {
+    if save_arg.is_empty() {
+        // Auto-generate inside the repo-tracked results folder so site builds
+        // pick up fresh runs. Relative path: run from the repository root.
+        return PathBuf::from("crates/bashkit-bench/results")
+            .join(format!("bench-{}-{}", moniker, timestamp));
+    }
+
+    let path = PathBuf::from(save_arg);
+    if path.extension().is_some() {
+        path.with_extension("")
+    } else {
+        path
+    }
+}
+
 async fn run_benchmark(
     runner: &mut Runner,
     case: &BenchCase,
@@ -780,3 +790,29 @@ fn print_summary(summary: &BenchSummary) {
         println!();
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::save_base_path;
+    use std::path::PathBuf;
+
+    #[test]
+    fn save_base_path_defaults_to_site_indexed_results_dir() {
+        assert_eq!(
+            save_base_path("", "vm-linux-x86_64", "1779764460"),
+            PathBuf::from("crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460")
+        );
+    }
+
+    #[test]
+    fn save_base_path_preserves_custom_directory_and_strips_extension() {
+        assert_eq!(
+            save_base_path(
+                "crates/bashkit-bench/results/manual-test.json",
+                "ignored",
+                "ignored"
+            ),
+            PathBuf::from("crates/bashkit-bench/results/manual-test")
+        );
+    }
+}
diff --git a/justfile b/justfile
index 7340285e..80405866 100644
--- a/justfile
+++ b/justfile
@@ -124,63 +124,75 @@ run-script file:
 
 # === Benchmarks ===
 
-# Run benchmarks comparing bashkit to bash
+# Run benchmarks comparing bashkit to bash and save site-indexed JSON/Markdown results
 bench:
-    cargo run -p bashkit-bench --release
+    cargo run -p bashkit-bench --release -- --save
+    pnpm --dir site run data:performance
 
-# Run benchmarks and save results to JSON
-bench-save file="bench-results.json":
+# Run benchmarks and save results to JSON/Markdown (an empty file name auto-generates one under crates/bashkit-bench/results)
+bench-save file="":
     cargo run -p bashkit-bench --release -- --save {{file}}
+    pnpm --dir site run data:performance
 
-# Run benchmarks with verbose output
+# Run benchmarks with verbose output and save site-indexed JSON/Markdown results
 bench-verbose:
-    cargo run -p bashkit-bench --release -- --verbose
+    cargo run -p bashkit-bench --release -- --verbose --save
+    pnpm --dir site run data:performance
 
-# Run specific benchmark category (startup, variables, arithmetic, control, strings, arrays, pipes, tools, complex)
+# Exploratory: run a specific benchmark category (startup, variables, arithmetic, control, strings, arrays, pipes, tools, complex) without updating site results
 bench-category cat:
     cargo run -p bashkit-bench --release -- --category {{cat}}
 
-# Run benchmarks with more iterations for accuracy
+# Run benchmarks with more iterations for accuracy and save site-indexed JSON/Markdown results
 bench-accurate:
-    cargo run -p bashkit-bench --release -- --iterations 50 --warmup 5
+    cargo run -p bashkit-bench --release -- --iterations 50 --warmup 5 --save
+    pnpm --dir site run data:performance
 
 # List available benchmarks
 bench-list:
     cargo run -p bashkit-bench --release -- --list
 
-# Run benchmarks with all runners (including just-bash if available)
+# Run benchmarks with all runners (including just-bash if available) and save site-indexed JSON/Markdown results
 bench-all:
-    cargo run -p bashkit-bench --release -- --runners bashkit,bash,just-bash
+    cargo run -p bashkit-bench --release -- --runners bashkit,bash,just-bash --save
+    pnpm --dir site run data:performance
 
 # Run Criterion parallel_execution benchmark and save results
 bench-parallel:
     ./scripts/bench-parallel.sh
+    pnpm --dir site run data:performance
 
 # Run Criterion sqlite builtin benchmark and save results
 bench-sqlite:
     ./scripts/bench-sqlite.sh
+    pnpm --dir site run data:performance
 
 # === Eval ===
 
-# Run LLM eval (requires ANTHROPIC_API_KEY or OPENAI_API_KEY)
+# Run LLM eval and save site-indexed JSON/Markdown results (requires ANTHROPIC_API_KEY or OPENAI_API_KEY)
 eval dataset="crates/bashkit-eval/data/eval-tasks.jsonl" provider="anthropic" model="claude-sonnet-4-20250514":
-    cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}}
+    cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
+    pnpm --dir site run data:performance
 
 # Run eval and save results
 eval-save dataset="crates/bashkit-eval/data/eval-tasks.jsonl" provider="anthropic" model="claude-sonnet-4-20250514":
     cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
+    pnpm --dir site run data:performance
 
-# Run scripting-tool eval (scripted mode)
+# Run scripting-tool eval (scripted mode) and save site-indexed JSON/Markdown results
 eval-scripting dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
-    cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}}
+    cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
+    pnpm --dir site run data:performance
 
-# Run scripting-tool eval (baseline mode — individual tools, no ScriptedTool)
+# Run scripting-tool eval (baseline mode — individual tools, no ScriptedTool) and save site-indexed JSON/Markdown results
 eval-scripting-baseline dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
-    cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --baseline --dataset {{dataset}} --provider {{provider}} --model {{model}}
+    cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --baseline --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
+    pnpm --dir site run data:performance
 
 # Run scripting-tool eval and save results
 eval-scripting-save dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
     cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
+    pnpm --dir site run data:performance
 
 # === Security ===
 
diff --git a/site/README.md b/site/README.md
index dac65514..d8f378c1 100644
--- a/site/README.md
+++ b/site/README.md
@@ -18,6 +18,10 @@ pnpm run build     # emits ./dist
 pnpm run preview   # serve dist/ via wrangler
 ```
 
+`pnpm run build` regenerates `src/data/performance-timeline.json` from saved
+benchmark and eval artifacts before Astro builds. The `/benches` page contract is
+specified in `../specs/performance-results.md`.
+
 ## Deploy
 
 Deployment is intended to run from CI against the Cloudflare account that owns
diff --git a/site/scripts/build-performance-data.mjs b/site/scripts/build-performance-data.mjs
index f6b016e4..fc7a7301 100644
--- a/site/scripts/build-performance-data.mjs
+++ b/site/scripts/build-performance-data.mjs
@@ -1,4 +1,4 @@
-import { mkdir, readFile, readdir, writeFile } from "node:fs/promises";
+import { access, mkdir, readFile, readdir, writeFile } from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 
@@ -13,6 +13,21 @@ const benchDir = path.join(repoRoot, "crates/bashkit-bench/results");
 const criterionDir = path.join(repoRoot, "crates/bashkit/benches/results");
 const evalDir = path.join(repoRoot, "crates/bashkit-eval/results");
 
+const benchmarkCategoryDescriptions = {
+  arithmetic: "Integer math, substitutions, and expression-heavy shell snippets.",
+  arrays: "Indexed array reads, writes, expansion, and iteration.",
+  complex: "Mixed shell workflows that combine multiple language features.",
+  control: "Conditionals, loops, case statements, and branching scripts.",
+  io: "File reads, writes, redirects, and filesystem-facing commands.",
+  large: "Bigger scripts and higher-volume data paths.",
+  pipes: "Pipeline construction, streaming, and command chaining.",
+  startup: "Small commands where interpreter startup dominates runtime.",
+  strings: "String expansion, pattern handling, and text manipulation.",
+  subshell: "Command substitution and nested shell execution paths.",
+  tools: "Builtin and external-tool style command workloads.",
+  variables: "Variable assignment, lookup, expansion, and environment handling.",
+};
+
 function round(value, digits = 2) {
   if (!Number.isFinite(value)) return null;
   const scale = 10 ** digits;
@@ -110,6 +125,18 @@ async function readJson(filePath) {
   return JSON.parse(await readFile(filePath, "utf8"));
 }
 
+async function existingMarkdownReport(relativeSource) {
+  if (relativeSource.endsWith(".md")) return relativeSource;
+
+  const reportSource = relativeSource.replace(/\.[^.]+$/, ".md");
+  try {
+    await access(path.join(repoRoot, reportSource));
+    return reportSource;
+  } catch {
+    return null;
+  }
+}
+
 async function listFiles(dir, extension) {
   return (await readdir(dir))
     .filter((file) => file.endsWith(extension))
@@ -143,7 +170,14 @@ async function buildBenchRuns() {
       if (!Number.isFinite(row.bashkit) || !Number.isFinite(row.bash) || row.bashkit <= 0) {
         continue;
       }
-      const bucket = byCategory.get(row.category) ?? { ratios: [], cases: 0 };
+      const bucket = byCategory.get(row.category) ?? {
+        bashkitMs: [],
+        bashMs: [],
+        ratios: [],
+        cases: 0,
+      };
+      bucket.bashkitMs.push(row.bashkit);
+      bucket.bashMs.push(row.bash);
       bucket.ratios.push(row.bash / row.bashkit);
       bucket.cases += 1;
       byCategory.set(row.category, bucket);
@@ -152,19 +186,24 @@ async function buildBenchRuns() {
     const categories = [...byCategory.entries()]
       .map(([category, bucket]) => ({
         category,
+        description: benchmarkCategoryDescriptions[category] ?? "Benchmarks grouped by harness category.",
         cases: bucket.cases,
+        bashkitMedianMs: round(percentile(bucket.bashkitMs, 0.5), 3),
+        bashMedianMs: round(percentile(bucket.bashMs, 0.5), 3),
         speedup: round(percentile(bucket.ratios, 0.5), 1),
       }))
-      .sort((a, b) => b.speedup - a.speedup);
+      .sort((a, b) => a.bashkitMedianMs - b.bashkitMedianMs);
 
     const speedup = bashkit.total_time_ms > 0 ? bash.total_time_ms / bashkit.total_time_ms : null;
+    const source = `crates/bashkit-bench/results/${file}`;
     runs.push({
       id: file.replace(/\.json$/, ""),
       kind: "bashkit-bench",
       label: data.moniker ?? data.system?.moniker ?? file,
       date: dateLabel(timestamp),
       timestamp,
-      source: `crates/bashkit-bench/results/${file}`,
+      source,
+      reportSource: await existingMarkdownReport(source),
       cases: data.summary?.total_cases ?? categories.reduce((sum, item) => sum + item.cases, 0),
       speedup: round(speedup, 1),
       bashkitMs: round(bashkit.total_time_ms, 2),
@@ -225,6 +264,7 @@ async function buildCriterionRuns() {
     const summaryMedianMatch = content.match(/median change:\s*\*\*(-?[0-9.]+)%\*\*/i);
     const summaryMeanMatch = content.match(/mean change:\s*\*\*(-?[0-9.]+)%\*\*/i);
 
+    const source = `crates/bashkit/benches/results/${file}`;
     runs.push({
       id: file.replace(/\.md$/, ""),
       kind: "criterion",
@@ -232,7 +272,8 @@ async function buildCriterionRuns() {
       label: title,
       date: dateLabel(timestamp),
       timestamp,
-      source: `crates/bashkit/benches/results/${file}`,
+      source,
+      reportSource: source,
       cases: Math.max(changes.length, timesUs.length),
       medianUs: round(percentile(timesUs, 0.5), 2),
       p95Us: round(percentile(timesUs, 0.95), 2),
@@ -273,6 +314,7 @@ async function buildEvalRuns() {
       }))
       .sort((a, b) => a.rate - b.rate || b.tasks - a.tasks);
 
+    const source = `crates/bashkit-eval/results/${file}`;
     runs.push({
       id: file.replace(/\.json$/, ""),
       kind: file.startsWith("scripting-eval") ? "scripting-eval" : "llm-eval",
@@ -282,7 +324,8 @@ async function buildEvalRuns() {
       label: `${data.provider ?? "unknown"}/${data.model ?? "unknown"}`,
       date: dateLabel(timestamp),
       timestamp,
-      source: `crates/bashkit-eval/results/${file}`,
+      source,
+      reportSource: await existingMarkdownReport(source),
       tasks: summary.total_tasks,
       passed: summary.total_passed,
       scorePct: round(summary.overall_rate * 100, 1),
diff --git a/site/src/data/performance-timeline.json b/site/src/data/performance-timeline.json
index 36552373..32182e9e 100644
--- a/site/src/data/performance-timeline.json
+++ b/site/src/data/performance-timeline.json
@@ -16,6 +16,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T03:01:00.000Z",
       "source": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.json",
+      "reportSource": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.md",
       "cases": 96,
       "speedup": 20.9,
       "bashkitMs": 42.95,
@@ -23,64 +24,100 @@
       "errorRate": 0,
       "matchRate": 100,
       "categories": [
-        {
-          "category": "subshell",
-          "cases": 6,
-          "speedup": 40.3
-        },
-        {
-          "category": "io",
-          "cases": 6,
-          "speedup": 37.7
-        },
-        {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 37
-        },
-        {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 36.3
-        },
         {
           "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
           "cases": 4,
+          "bashkitMedianMs": 0.053,
+          "bashMedianMs": 1.662,
           "speedup": 32.1
         },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.057,
+          "bashMedianMs": 1.791,
           "speedup": 31.2
         },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.058,
+          "bashMedianMs": 1.688,
           "speedup": 30.2
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.059,
+          "bashMedianMs": 1.713,
           "speedup": 29
         },
+        {
+          "category": "subshell",
+          "description": "Command substitution and nested shell execution paths.",
+          "cases": 6,
+          "bashkitMedianMs": 0.061,
+          "bashMedianMs": 3.143,
+          "speedup": 40.3
+        },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.062,
+          "bashMedianMs": 1.703,
           "speedup": 28.8
         },
+        {
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.065,
+          "bashMedianMs": 3.131,
+          "speedup": 36.3
+        },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.076,
+          "bashMedianMs": 1.711,
           "speedup": 26.6
         },
+        {
+          "category": "io",
+          "description": "File reads, writes, redirects, and filesystem-facing commands.",
+          "cases": 6,
+          "bashkitMedianMs": 0.08,
+          "bashMedianMs": 2.681,
+          "speedup": 37.7
+        },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.093,
+          "bashMedianMs": 3.537,
+          "speedup": 37
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.118,
+          "bashMedianMs": 3.207,
           "speedup": 16.5
         },
         {
           "category": "large",
+          "description": "Bigger scripts and higher-volume data paths.",
           "cases": 9,
+          "bashkitMedianMs": 1.789,
+          "bashMedianMs": 3.289,
           "speedup": 4.4
         }
       ]
@@ -95,6 +132,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T02:36:42Z",
       "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.md",
       "tasks": 58,
       "passed": 54,
       "scorePct": 93,
@@ -207,6 +245,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T04:38:56Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.md",
       "tasks": 23,
       "passed": 23,
       "scorePct": 100,
@@ -286,6 +325,7 @@
       "date": "2026-02-03",
       "timestamp": "2026-02-03T04:31:00.000Z",
       "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.json",
+      "reportSource": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.md",
       "cases": 75,
       "speedup": 200.9,
       "bashkitMs": 8.97,
@@ -293,49 +333,76 @@
       "errorRate": 0,
       "matchRate": 89.33,
       "categories": [
-        {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 367
-        },
-        {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 239.9
-        },
         {
           "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
           "cases": 4,
+          "bashkitMedianMs": 0.039,
+          "bashMedianMs": 8.474,
           "speedup": 216.4
         },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.047,
+          "bashMedianMs": 9.218,
           "speedup": 201.4
         },
+        {
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.047,
+          "bashMedianMs": 19.375,
+          "speedup": 367
+        },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.052,
+          "bashMedianMs": 8.68,
           "speedup": 177.6
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.053,
+          "bashMedianMs": 9.096,
           "speedup": 172.3
         },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.055,
+          "bashMedianMs": 8.665,
           "speedup": 162.2
         },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.065,
+          "bashMedianMs": 9.822,
           "speedup": 153.8
         },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.094,
+          "bashMedianMs": 23.62,
+          "speedup": 239.9
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.097,
+          "bashMedianMs": 22.169,
           "speedup": 131.5
         }
       ]
@@ -348,6 +415,7 @@
       "date": "2026-05-25",
       "timestamp": "2026-05-25T21:32:22.000Z",
       "source": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
       "cases": 23,
       "medianUs": null,
       "p95Us": null,
@@ -369,6 +437,7 @@
       "date": "2026-02-01",
       "timestamp": "2026-02-01T18:30:40.000Z",
       "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1769970640.json",
+      "reportSource": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1769970640.md",
       "cases": 75,
       "speedup": 0.4,
       "bashkitMs": 4004.73,
@@ -376,49 +445,76 @@
       "errorRate": 5.33,
       "matchRate": 80,
       "categories": [
-        {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 3767.5
-        },
         {
           "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
           "cases": 4,
+          "bashkitMedianMs": 0.004,
+          "bashMedianMs": 9.144,
           "speedup": 2401.9
         },
+        {
+          "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
+          "cases": 8,
+          "bashkitMedianMs": 0.006,
+          "bashMedianMs": 8.654,
+          "speedup": 1611.5
+        },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.006,
+          "bashMedianMs": 9.038,
           "speedup": 1652.9
         },
         {
-          "category": "variables",
-          "cases": 8,
-          "speedup": 1611.5
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.006,
+          "bashMedianMs": 17.435,
+          "speedup": 3767.5
         },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.007,
+          "bashMedianMs": 8.839,
           "speedup": 1307.2
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.009,
+          "bashMedianMs": 10.939,
           "speedup": 1260.4
         },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.011,
+          "bashMedianMs": 8.81,
           "speedup": 958.1
         },
         {
           "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
           "cases": 21,
+          "bashkitMedianMs": 0.028,
+          "bashMedianMs": 22.573,
           "speedup": 725.9
         },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.116,
+          "bashMedianMs": 17.902,
           "speedup": 408.7
         }
       ]
@@ -430,6 +526,7 @@
       "date": "2026-02-03",
       "timestamp": "2026-02-03T04:31:00.000Z",
       "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.json",
+      "reportSource": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1770093060.md",
       "cases": 75,
       "speedup": 200.9,
       "bashkitMs": 8.97,
@@ -437,49 +534,76 @@
       "errorRate": 0,
       "matchRate": 89.33,
       "categories": [
-        {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 367
-        },
-        {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 239.9
-        },
         {
           "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
           "cases": 4,
+          "bashkitMedianMs": 0.039,
+          "bashMedianMs": 8.474,
           "speedup": 216.4
         },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.047,
+          "bashMedianMs": 9.218,
           "speedup": 201.4
         },
+        {
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.047,
+          "bashMedianMs": 19.375,
+          "speedup": 367
+        },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.052,
+          "bashMedianMs": 8.68,
           "speedup": 177.6
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.053,
+          "bashMedianMs": 9.096,
           "speedup": 172.3
         },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.055,
+          "bashMedianMs": 8.665,
           "speedup": 162.2
         },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.065,
+          "bashMedianMs": 9.822,
           "speedup": 153.8
         },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.094,
+          "bashMedianMs": 23.62,
+          "speedup": 239.9
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.097,
+          "bashMedianMs": 22.169,
           "speedup": 131.5
         }
       ]
@@ -491,6 +615,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T05:02:28.000Z",
       "source": "crates/bashkit-bench/results/bench-none-linux-x86_64-1773464548.json",
+      "reportSource": "crates/bashkit-bench/results/bench-none-linux-x86_64-1773464548.md",
       "cases": 96,
       "speedup": 23.8,
       "bashkitMs": 33.11,
@@ -499,63 +624,99 @@
       "matchRate": 100,
       "categories": [
         {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 43.1
-        },
-        {
-          "category": "io",
-          "cases": 6,
-          "speedup": 31.8
+          "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
+          "cases": 4,
+          "bashkitMedianMs": 0.054,
+          "bashMedianMs": 1.436,
+          "speedup": 26.2
         },
         {
-          "category": "subshell",
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
           "cases": 6,
-          "speedup": 31.7
-        },
-        {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 28.1
+          "bashkitMedianMs": 0.06,
+          "bashMedianMs": 2.691,
+          "speedup": 43.1
         },
         {
-          "category": "startup",
-          "cases": 4,
-          "speedup": 26.2
+          "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
+          "cases": 8,
+          "bashkitMedianMs": 0.063,
+          "bashMedianMs": 1.343,
+          "speedup": 21.3
         },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.063,
+          "bashMedianMs": 1.442,
           "speedup": 22.4
         },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.066,
+          "bashMedianMs": 1.454,
           "speedup": 21.7
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.066,
+          "bashMedianMs": 1.408,
           "speedup": 21.5
         },
         {
-          "category": "variables",
-          "cases": 8,
-          "speedup": 21.3
+          "category": "io",
+          "description": "File reads, writes, redirects, and filesystem-facing commands.",
+          "cases": 6,
+          "bashkitMedianMs": 0.067,
+          "bashMedianMs": 2.172,
+          "speedup": 31.8
+        },
+        {
+          "category": "subshell",
+          "description": "Command substitution and nested shell execution paths.",
+          "cases": 6,
+          "bashkitMedianMs": 0.068,
+          "bashMedianMs": 2.716,
+          "speedup": 31.7
         },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.076,
+          "bashMedianMs": 1.481,
           "speedup": 20
         },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.106,
+          "bashMedianMs": 2.998,
+          "speedup": 28.1
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.144,
+          "bashMedianMs": 3.069,
           "speedup": 11.8
         },
         {
           "category": "large",
+          "description": "Bigger scripts and higher-volume data paths.",
           "cases": 9,
+          "bashkitMedianMs": 0.857,
+          "bashMedianMs": 2.667,
           "speedup": 3.7
         }
       ]
@@ -567,6 +728,7 @@
       "date": "2026-04-13",
       "timestamp": "2026-04-13T23:05:40.000Z",
       "source": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1776121540.json",
+      "reportSource": "crates/bashkit-bench/results/bench-runsc-linux-x86_64-1776121540.md",
       "cases": 96,
       "speedup": 107.2,
       "bashkitMs": 41.52,
@@ -574,64 +736,100 @@
       "errorRate": 0,
       "matchRate": 100,
       "categories": [
+        {
+          "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
+          "cases": 4,
+          "bashkitMedianMs": 0.07,
+          "bashMedianMs": 8.238,
+          "speedup": 116.7
+        },
         {
           "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
           "cases": 6,
+          "bashkitMedianMs": 0.083,
+          "bashMedianMs": 17.156,
           "speedup": 220.4
         },
         {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 167.5
+          "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
+          "cases": 8,
+          "bashkitMedianMs": 0.085,
+          "bashMedianMs": 8.256,
+          "speedup": 98.9
         },
         {
           "category": "io",
+          "description": "File reads, writes, redirects, and filesystem-facing commands.",
           "cases": 6,
+          "bashkitMedianMs": 0.087,
+          "bashMedianMs": 14.647,
           "speedup": 162.5
         },
-        {
-          "category": "subshell",
-          "cases": 6,
-          "speedup": 133.5
-        },
-        {
-          "category": "startup",
-          "cases": 4,
-          "speedup": 116.7
-        },
-        {
-          "category": "strings",
-          "cases": 8,
-          "speedup": 98.9
-        },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.089,
+          "bashMedianMs": 8.357,
           "speedup": 93.5
         },
+        {
+          "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
+          "cases": 6,
+          "bashkitMedianMs": 0.091,
+          "bashMedianMs": 8.119,
+          "speedup": 88.5
+        },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.094,
+          "bashMedianMs": 8.163,
           "speedup": 89.4
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.094,
+          "bashMedianMs": 8.109,
           "speedup": 89.3
         },
         {
-          "category": "arithmetic",
+          "category": "subshell",
+          "description": "Command substitution and nested shell execution paths.",
           "cases": 6,
-          "speedup": 88.5
+          "bashkitMedianMs": 0.1,
+          "bashMedianMs": 16.778,
+          "speedup": 133.5
+        },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.118,
+          "bashMedianMs": 21.723,
+          "speedup": 167.5
         },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.185,
+          "bashMedianMs": 17.53,
           "speedup": 72.3
         },
         {
           "category": "large",
+          "description": "Bigger scripts and higher-volume data paths.",
           "cases": 9,
+          "bashkitMedianMs": 0.84,
+          "bashMedianMs": 9.895,
           "speedup": 19.8
         }
       ]
@@ -643,6 +841,7 @@
       "date": "2026-05-25",
       "timestamp": "2026-05-25T21:35:05.000Z",
       "source": "crates/bashkit-bench/results/bench-after-perf-linux-x86_64.json",
+      "reportSource": "crates/bashkit-bench/results/bench-after-perf-linux-x86_64.md",
       "cases": 96,
       "speedup": 25.4,
       "bashkitMs": 43.16,
@@ -651,63 +850,99 @@
       "matchRate": 100,
       "categories": [
         {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 83.1
+          "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
+          "cases": 4,
+          "bashkitMedianMs": 0.044,
+          "bashMedianMs": 1.911,
+          "speedup": 43.5
         },
         {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 62.9
+          "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
+          "cases": 8,
+          "bashkitMedianMs": 0.044,
+          "bashMedianMs": 2.328,
+          "speedup": 51.3
         },
         {
-          "category": "subshell",
+          "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
-          "speedup": 60.6
+          "bashkitMedianMs": 0.045,
+          "bashMedianMs": 2.3,
+          "speedup": 49.7
         },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.046,
+          "bashMedianMs": 2.91,
           "speedup": 52.7
         },
-        {
-          "category": "strings",
-          "cases": 8,
-          "speedup": 51.3
-        },
-        {
-          "category": "io",
-          "cases": 6,
-          "speedup": 50.6
-        },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.049,
+          "bashMedianMs": 2.318,
           "speedup": 49.8
         },
         {
-          "category": "arrays",
+          "category": "subshell",
+          "description": "Command substitution and nested shell execution paths.",
           "cases": 6,
-          "speedup": 49.7
+          "bashkitMedianMs": 0.05,
+          "bashMedianMs": 3.232,
+          "speedup": 60.6
         },
         {
-          "category": "startup",
-          "cases": 4,
-          "speedup": 43.5
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.055,
+          "bashMedianMs": 4.574,
+          "speedup": 83.1
         },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.056,
+          "bashMedianMs": 2.362,
           "speedup": 36.8
         },
+        {
+          "category": "io",
+          "description": "File reads, writes, redirects, and filesystem-facing commands.",
+          "cases": 6,
+          "bashkitMedianMs": 0.071,
+          "bashMedianMs": 3.232,
+          "speedup": 50.6
+        },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.075,
+          "bashMedianMs": 5.143,
+          "speedup": 62.9
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.117,
+          "bashMedianMs": 3.226,
           "speedup": 22.3
         },
         {
           "category": "large",
+          "description": "Bigger scripts and higher-volume data paths.",
           "cases": 9,
+          "bashkitMedianMs": 1.554,
+          "bashMedianMs": 3.005,
           "speedup": 8.6
         }
       ]
@@ -719,6 +954,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T03:01:00.000Z",
       "source": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.json",
+      "reportSource": "crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460.md",
       "cases": 96,
       "speedup": 20.9,
       "bashkitMs": 42.95,
@@ -726,64 +962,100 @@
       "errorRate": 0,
       "matchRate": 100,
       "categories": [
-        {
-          "category": "subshell",
-          "cases": 6,
-          "speedup": 40.3
-        },
-        {
-          "category": "io",
-          "cases": 6,
-          "speedup": 37.7
-        },
-        {
-          "category": "tools",
-          "cases": 21,
-          "speedup": 37
-        },
-        {
-          "category": "pipes",
-          "cases": 6,
-          "speedup": 36.3
-        },
         {
           "category": "startup",
+          "description": "Small commands where interpreter startup dominates runtime.",
           "cases": 4,
+          "bashkitMedianMs": 0.053,
+          "bashMedianMs": 1.662,
           "speedup": 32.1
         },
         {
           "category": "strings",
+          "description": "String expansion, pattern handling, and text manipulation.",
           "cases": 8,
+          "bashkitMedianMs": 0.057,
+          "bashMedianMs": 1.791,
           "speedup": 31.2
         },
         {
           "category": "variables",
+          "description": "Variable assignment, lookup, expansion, and environment handling.",
           "cases": 8,
+          "bashkitMedianMs": 0.058,
+          "bashMedianMs": 1.688,
           "speedup": 30.2
         },
         {
           "category": "arrays",
+          "description": "Indexed array reads, writes, expansion, and iteration.",
           "cases": 6,
+          "bashkitMedianMs": 0.059,
+          "bashMedianMs": 1.713,
           "speedup": 29
         },
+        {
+          "category": "subshell",
+          "description": "Command substitution and nested shell execution paths.",
+          "cases": 6,
+          "bashkitMedianMs": 0.061,
+          "bashMedianMs": 3.143,
+          "speedup": 40.3
+        },
         {
           "category": "arithmetic",
+          "description": "Integer math, substitutions, and expression-heavy shell snippets.",
           "cases": 6,
+          "bashkitMedianMs": 0.062,
+          "bashMedianMs": 1.703,
           "speedup": 28.8
         },
+        {
+          "category": "pipes",
+          "description": "Pipeline construction, streaming, and command chaining.",
+          "cases": 6,
+          "bashkitMedianMs": 0.065,
+          "bashMedianMs": 3.131,
+          "speedup": 36.3
+        },
         {
           "category": "control",
+          "description": "Conditionals, loops, case statements, and branching scripts.",
           "cases": 9,
+          "bashkitMedianMs": 0.076,
+          "bashMedianMs": 1.711,
           "speedup": 26.6
         },
+        {
+          "category": "io",
+          "description": "File reads, writes, redirects, and filesystem-facing commands.",
+          "cases": 6,
+          "bashkitMedianMs": 0.08,
+          "bashMedianMs": 2.681,
+          "speedup": 37.7
+        },
+        {
+          "category": "tools",
+          "description": "Builtin and external-tool style command workloads.",
+          "cases": 21,
+          "bashkitMedianMs": 0.093,
+          "bashMedianMs": 3.537,
+          "speedup": 37
+        },
         {
           "category": "complex",
+          "description": "Mixed shell workflows that combine multiple language features.",
           "cases": 7,
+          "bashkitMedianMs": 0.118,
+          "bashMedianMs": 3.207,
           "speedup": 16.5
         },
         {
           "category": "large",
+          "description": "Bigger scripts and higher-volume data paths.",
           "cases": 9,
+          "bashkitMedianMs": 1.789,
+          "bashMedianMs": 3.289,
           "speedup": 4.4
         }
       ]
@@ -798,6 +1070,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T06:18:49.000Z",
       "source": "crates/bashkit/benches/results/criterion-parallel-(none)-linux-x86_64-1773469129.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-parallel-(none)-linux-x86_64-1773469129.md",
       "cases": 9,
       "medianUs": 160.05,
       "p95Us": 1122.24,
@@ -818,6 +1091,7 @@
       "date": "2026-05-04",
       "timestamp": "2026-05-04T03:27:48.000Z",
       "source": "crates/bashkit/benches/results/criterion-sqlite-vm-linux-x86_64-1777865268.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-sqlite-vm-linux-x86_64-1777865268.md",
       "cases": 44,
       "medianUs": 799.27,
       "p95Us": 7677.82,
@@ -838,6 +1112,7 @@
       "date": "2026-05-25",
       "timestamp": "2026-05-25T21:32:22.000Z",
       "source": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-hotpath-perf-linux-x86_64-1779744742.md",
       "cases": 23,
       "medianUs": null,
       "p95Us": null,
@@ -858,6 +1133,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T01:44:10.000Z",
       "source": "crates/bashkit/benches/results/criterion-file_ops-linux-x86_64-1779759850.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-file_ops-linux-x86_64-1779759850.md",
       "cases": 11,
       "medianUs": 2000,
       "p95Us": 3590,
@@ -878,6 +1154,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T01:44:10.000Z",
       "source": "crates/bashkit/benches/results/criterion-hotpath-attrs+shopt-linux-x86_64-1779759850.md",
+      "reportSource": "crates/bashkit/benches/results/criterion-hotpath-attrs+shopt-linux-x86_64-1779759850.md",
       "cases": 43,
       "medianUs": 624,
       "p95Us": 2713,
@@ -902,6 +1179,7 @@
       "date": "2026-02-07",
       "timestamp": "2026-02-07T05:20:23Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-07-052023.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-07-052023.md",
       "tasks": 25,
       "passed": 19,
       "scorePct": 91.5,
@@ -984,6 +1262,7 @@
       "date": "2026-02-07",
       "timestamp": "2026-02-07T05:20:37Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-07-052037.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-07-052037.md",
       "tasks": 25,
       "passed": 19,
       "scorePct": 86.8,
@@ -1066,6 +1345,7 @@
       "date": "2026-02-07",
       "timestamp": "2026-02-07T05:25:36Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-07-052536.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-07-052536.md",
       "tasks": 25,
       "passed": 17,
       "scorePct": 86.8,
@@ -1148,6 +1428,7 @@
       "date": "2026-02-08",
       "timestamp": "2026-02-08T06:14:14Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-08-061414.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-2026-02-08-061414.md",
       "tasks": 25,
       "passed": 23,
       "scorePct": 98.1,
@@ -1230,6 +1511,7 @@
       "date": "2026-02-08",
       "timestamp": "2026-02-08T06:14:45Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-08-061445.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-08-061445.md",
       "tasks": 25,
       "passed": 18,
       "scorePct": 81.1,
@@ -1312,6 +1594,7 @@
       "date": "2026-02-08",
       "timestamp": "2026-02-08T06:20:03Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-08-062003.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-08-062003.md",
       "tasks": 25,
       "passed": 21,
       "scorePct": 93.4,
@@ -1394,6 +1677,7 @@
       "date": "2026-02-09",
       "timestamp": "2026-02-09T05:44:24Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-09-054424.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-09-054424.md",
       "tasks": 37,
       "passed": 23,
       "scorePct": 79.9,
@@ -1476,6 +1760,7 @@
       "date": "2026-02-09",
       "timestamp": "2026-02-09T05:45:58Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-09-054558.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-09-054558.md",
       "tasks": 37,
       "passed": 32,
       "scorePct": 94.6,
@@ -1558,6 +1843,7 @@
       "date": "2026-02-09",
       "timestamp": "2026-02-09T14:27:36Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-09-142736.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-09-142736.md",
       "tasks": 37,
       "passed": 29,
       "scorePct": 87,
@@ -1640,6 +1926,7 @@
       "date": "2026-02-17",
       "timestamp": "2026-02-17T23:03:12Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-230312.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-230312.md",
       "tasks": 3,
       "passed": 3,
       "scorePct": 100,
@@ -1668,6 +1955,7 @@
       "date": "2026-02-17",
       "timestamp": "2026-02-17T23:13:36Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-231336.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-17-231336.md",
       "tasks": 37,
       "passed": 32,
       "scorePct": 92.9,
@@ -1750,6 +2038,7 @@
       "date": "2026-02-25",
       "timestamp": "2026-02-25T04:48:01Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-25-044801.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-25-044801.md",
       "tasks": 37,
       "passed": 35,
       "scorePct": 97.8,
@@ -1832,6 +2121,7 @@
       "date": "2026-02-25",
       "timestamp": "2026-02-25T04:49:04Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-25-044904.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-25-044904.md",
       "tasks": 37,
       "passed": 27,
       "scorePct": 86.4,
@@ -1914,6 +2204,7 @@
       "date": "2026-02-25",
       "timestamp": "2026-02-25T04:53:28Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-25-045328.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-20250514-2026-02-25-045328.md",
       "tasks": 37,
       "passed": 34,
       "scorePct": 97.3,
@@ -1996,6 +2287,7 @@
       "date": "2026-02-25",
       "timestamp": "2026-02-25T04:56:11Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-25-045611.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-25-045611.md",
       "tasks": 37,
       "passed": 33,
       "scorePct": 92.9,
@@ -2078,6 +2370,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T04:06:36Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-27-040636.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-27-040636.md",
       "tasks": 52,
       "passed": 43,
       "scorePct": 91.7,
@@ -2172,6 +2465,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T04:38:13Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-27-043813.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-27-043813.md",
       "tasks": 52,
       "passed": 32,
       "scorePct": 79.4,
@@ -2266,6 +2560,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T04:38:54Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-27-043854.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-27-043854.md",
       "tasks": 26,
       "passed": 23,
       "scorePct": 93.9,
@@ -2348,6 +2643,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T04:38:56Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-27-043856.md",
       "tasks": 23,
       "passed": 23,
       "scorePct": 100,
@@ -2430,6 +2726,7 @@
       "date": "2026-02-27",
       "timestamp": "2026-02-27T05:55:43Z",
       "source": "crates/bashkit-eval/results/eval-openai-responses-gpt-5.3-codex-2026-02-27-055543.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-responses-gpt-5.3-codex-2026-02-27-055543.md",
       "tasks": 37,
       "passed": 30,
       "scorePct": 93,
@@ -2512,6 +2809,7 @@
       "date": "2026-02-28",
       "timestamp": "2026-02-28T20:40:52Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-28-204052.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.2-2026-02-28-204052.md",
       "tasks": 58,
       "passed": 41,
       "scorePct": 77.3,
@@ -2624,6 +2922,7 @@
       "date": "2026-02-28",
       "timestamp": "2026-02-28T20:42:32Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-28-204232.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-02-28-204232.md",
       "tasks": 58,
       "passed": 54,
       "scorePct": 97.2,
@@ -2736,6 +3035,7 @@
       "date": "2026-02-28",
       "timestamp": "2026-02-28T20:53:31Z",
       "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-02-28-205331.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-02-28-205331.md",
       "tasks": 58,
       "passed": 51,
       "scorePct": 91,
@@ -2848,6 +3148,7 @@
       "date": "2026-02-28",
       "timestamp": "2026-02-28T20:53:58Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-28-205358.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-6-2026-02-28-205358.md",
       "tasks": 58,
       "passed": 50,
       "scorePct": 91,
@@ -2960,6 +3261,7 @@
       "date": "2026-02-28",
       "timestamp": "2026-02-28T21:11:20Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-28-211120.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-02-28-211120.md",
       "tasks": 58,
       "passed": 48,
       "scorePct": 92.5,
@@ -3072,6 +3374,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:44:22Z",
       "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174422.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174422.md",
       "tasks": 4,
       "passed": 3,
       "scorePct": 93.1,
@@ -3100,6 +3403,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:44:33Z",
       "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174433.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174433.md",
       "tasks": 3,
       "passed": 3,
       "scorePct": 100,
@@ -3128,6 +3432,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:44:46Z",
       "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174446.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174446.md",
       "tasks": 3,
       "passed": 2,
       "scorePct": 84.6,
@@ -3156,6 +3461,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:44:58Z",
       "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174458.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-5.4-2026-03-14-174458.md",
       "tasks": 4,
       "passed": 0,
       "scorePct": 75,
@@ -3184,6 +3490,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:45:21Z",
       "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174521.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174521.md",
       "tasks": 4,
       "passed": 3,
       "scorePct": 96.6,
@@ -3212,6 +3519,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:45:30Z",
       "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174530.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174530.md",
       "tasks": 3,
       "passed": 2,
       "scorePct": 90,
@@ -3240,6 +3548,7 @@
       "date": "2026-03-14",
       "timestamp": "2026-03-14T17:45:41Z",
       "source": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174541.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-baseline-openai-gpt-5.4-2026-03-14-174541.md",
       "tasks": 3,
       "passed": 3,
       "scorePct": 100,
@@ -3268,6 +3577,7 @@
       "date": "2026-03-24",
       "timestamp": "2026-03-24T00:36:10Z",
       "source": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-4o-2026-03-24-003610.json",
+      "reportSource": "crates/bashkit-eval/results/scripting-eval-scripted-openai-gpt-4o-2026-03-24-003610.md",
       "tasks": 4,
       "passed": 3,
       "scorePct": 91.7,
@@ -3296,6 +3606,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T01:25:23Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-05-26-012523.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-haiku-4-5-20251001-2026-05-26-012523.md",
       "tasks": 58,
       "passed": 54,
       "scorePct": 98.4,
@@ -3408,6 +3719,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T01:45:08Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-05-26-014508.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-sonnet-4-6-2026-05-26-014508.md",
       "tasks": 58,
       "passed": 49,
       "scorePct": 94,
@@ -3520,6 +3832,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T02:07:42Z",
       "source": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-7-2026-05-26-020742.json",
+      "reportSource": "crates/bashkit-eval/results/eval-anthropic-claude-opus-4-7-2026-05-26-020742.md",
       "tasks": 58,
       "passed": 56,
       "scorePct": 97.8,
@@ -3632,6 +3945,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T02:18:53Z",
       "source": "crates/bashkit-eval/results/eval-openai-gpt-5.5-2026-05-26-021853.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openai-gpt-5.5-2026-05-26-021853.md",
       "tasks": 58,
       "passed": 50,
       "scorePct": 92.7,
@@ -3744,6 +4058,7 @@
       "date": "2026-05-26",
       "timestamp": "2026-05-26T02:36:42Z",
       "source": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.json",
+      "reportSource": "crates/bashkit-eval/results/eval-openresponses-gpt-5.3-codex-2026-05-26-023642.md",
       "tasks": 58,
       "passed": 54,
       "scorePct": 93,
diff --git a/site/src/pages/benches.astro b/site/src/pages/benches.astro
index 1f996460..41d4d1a7 100644
--- a/site/src/pages/benches.astro
+++ b/site/src/pages/benches.astro
@@ -24,31 +24,34 @@ const repoUrl = (source: string) =>
 const compactNumber = (value: number | null | undefined, suffix = "") =>
   typeof value === "number" ? `${value.toLocaleString()}${suffix}` : "n/a";
 
-const latestArtifacts = [
+const formatMs = (value: number | null | undefined) =>
+  typeof value === "number" ? `${value.toLocaleString()} ms` : "n/a";
+
+const latestReports = [
   ...(latestBench
     ? [
         {
-          title: "Latest bashkit-bench run",
+          title: "bashkit-bench report",
           detail: `${latestBench.cases} cases on ${latestBench.label}`,
-          href: repoUrl(latestBench.source),
+          href: repoUrl(latestBench.reportSource ?? latestBench.source),
         },
       ]
     : []),
   ...(latestEval
     ? [
         {
-          title: "Latest LLM eval run",
+          title: "LLM eval report",
           detail: `${latestEval.model}, ${latestEval.passed}/${latestEval.tasks} tasks`,
-          href: repoUrl(latestEval.source),
+          href: repoUrl(latestEval.reportSource ?? latestEval.source),
         },
       ]
     : []),
   ...(latestCriterion
     ? [
         {
-          title: "Latest criterion bench",
+          title: "Criterion report",
           detail: `${latestCriterion.family}, ${latestCriterion.cases} cases`,
-          href: repoUrl(latestCriterion.source),
+          href: repoUrl(latestCriterion.reportSource ?? latestCriterion.source),
         },
       ]
     : []),
@@ -73,6 +76,13 @@ const resultIndexes = [
   },
 ];
 
+const benchmarkResultsUrl =
+  "https://github.com/everruns/bashkit/tree/main/crates/bashkit-bench/results";
+const criterionResultsUrl =
+  "https://github.com/everruns/bashkit/tree/main/crates/bashkit/benches/results";
+const evalResultsUrl =
+  "https://github.com/everruns/bashkit/tree/main/crates/bashkit-eval/results";
+
 const pageDescription =
   "Latest Bashkit benchmark, criterion bench, and LLM eval snapshot.";
 ---
@@ -91,57 +101,25 @@ const pageDescription =
         </p>
       </header>
 
-      <section class="snapshot-grid" aria-label="Latest snapshot">
-        <a
-          href={latestBench ? repoUrl(latestBench.source) : "#"}
-          class="metric-card"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          <span>bashkit-bench</span>
-          <strong>{compactNumber(latestBench?.speedup, "x")}</strong>
-          <small>
-            {latestBench
-              ? `${latestBench.cases} cases, ${latestBench.matchRate}% match`
-              : "No run"}
-          </small>
-        </a>
-        <a
-          href={latestEval ? repoUrl(latestEval.source) : "#"}
-          class="metric-card"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          <span>LLM eval</span>
-          <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
-          <small>
-            {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "No run"}
-          </small>
-        </a>
-        <a
-          href={latestCriterion ? repoUrl(latestCriterion.source) : "#"}
-          class="metric-card"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          <span>Criterion</span>
-          <strong>{compactNumber(latestCriterion?.medianUs, " us")}</strong>
-          <small>
-            {latestCriterion?.fastestCase
-              ? `fastest: ${latestCriterion.fastestCase.name}`
-              : `${latestCriterion?.cases ?? 0} cases`}
-          </small>
-        </a>
-      </section>
-
-      <section class="artifact-strip" aria-label="Result artifacts">
+      <section class="artifact-strip" aria-label="Latest reports">
         <div>
-          <span class="bench-eyebrow">Latest files</span>
-          <h2>Open the source results</h2>
+          <span class="bench-eyebrow">Latest reports</span>
+          <h2>Open Markdown reports</h2>
+          <div class="section-links" aria-label="Report folders">
+            <a href={benchmarkResultsUrl} target="_blank" rel="noopener noreferrer">
+              Benchmark folder
+            </a>
+            <a href={evalResultsUrl} target="_blank" rel="noopener noreferrer">
+              Eval folder
+            </a>
+            <a href={criterionResultsUrl} target="_blank" rel="noopener noreferrer">
+              Criterion folder
+            </a>
+          </div>
         </div>
         <div class="artifact-links">
           {
-            latestArtifacts.map((item) => (
+            latestReports.map((item) => (
               <a href={item.href} target="_blank" rel="noopener noreferrer">
                 <strong>{item.title}</strong>
                 <small>{item.detail}</small>
@@ -154,20 +132,34 @@ const pageDescription =
       <section class="snapshot-columns">
         <article class="snapshot-panel">
           <div class="panel-head">
-            <span class="bench-eyebrow">Runtime speed</span>
-            <h2>Fastest benchmark categories</h2>
+            <span class="bench-eyebrow">Runtime snapshot</span>
+            <h2>Latest benchmark categories</h2>
+            <a
+              class="section-link"
+              href={benchmarkResultsUrl}
+              target="_blank"
+              rel="noopener noreferrer"
+            >
+              Browse benchmark runs
+            </a>
           </div>
           <table>
             <thead>
-              <tr><th>Category</th><th>Cases</th><th>Speedup</th></tr>
+              <tr><th>Category</th><th>Cases</th><th>Last run</th></tr>
             </thead>
             <tbody>
               {
                 topBenchCategories.map((row) => (
                   <tr>
-                    <td>{row.category}</td>
+                    <td>
+                      <strong>{row.category}</strong>
+                      <small>{row.description}</small>
+                    </td>
                     <td>{row.cases}</td>
-                    <td class="score">{row.speedup}x</td>
+                    <td>
+                      <strong class="score">{formatMs(row.bashkitMedianMs)}</strong>
+                      <small>bash median: {formatMs(row.bashMedianMs)}</small>
+                    </td>
                   </tr>
                 ))
               }
@@ -179,10 +171,30 @@ const pageDescription =
           <div class="panel-head">
             <span class="bench-eyebrow">Eval pressure</span>
             <h2>Lowest eval categories</h2>
+            <a
+              class="section-link"
+              href={evalResultsUrl}
+              target="_blank"
+              rel="noopener noreferrer"
+            >
+              Browse eval runs
+            </a>
           </div>
+          <a
+            href={latestEval ? repoUrl(latestEval.source) : "#"}
+            class="panel-metric"
+            target="_blank"
+            rel="noopener noreferrer"
+          >
+            <span>Latest LLM eval</span>
+            <strong>{compactNumber(latestEval?.scorePct, "%")}</strong>
+            <small>
+              {latestEval ? `${latestEval.passed}/${latestEval.tasks} tasks` : "No run"}
+            </small>
+          </a>
           <table>
             <thead>
-              <tr><th>Category</th><th>Tasks</th><th>Rate</th></tr>
+              <tr><th>Category</th><th>Passed</th><th>Pass rate</th></tr>
             </thead>
             <tbody>
               {
@@ -190,7 +202,10 @@ const pageDescription =
                   <tr>
                     <td>{row.category}</td>
                     <td>{row.passed}/{row.tasks}</td>
-                    <td class="score">{row.rate}%</td>
+                    <td>
+                      <strong class="score">{row.rate}%</strong>
+                      <small>tasks passed</small>
+                    </td>
                   </tr>
                 ))
               }
@@ -204,6 +219,14 @@ const pageDescription =
           <div class="panel-head">
             <span class="bench-eyebrow">Latest full evals</span>
             <h2>Recent model snapshots</h2>
+            <a
+              class="section-link"
+              href={evalResultsUrl}
+              target="_blank"
+              rel="noopener noreferrer"
+            >
+              Browse eval runs
+            </a>
           </div>
           <table>
             <thead>
@@ -281,29 +304,27 @@ const pageDescription =
     color: var(--color-slate);
   }
 
-  .snapshot-grid,
   .snapshot-columns {
     display: grid;
-    grid-template-columns: repeat(3, minmax(0, 1fr));
+    grid-template-columns: repeat(2, minmax(0, 1fr));
     gap: var(--space-md);
   }
 
   .snapshot-columns {
-    grid-template-columns: repeat(2, minmax(0, 1fr));
     margin-top: var(--space-md);
   }
 
-  .metric-card,
   .snapshot-panel,
   .artifact-strip {
     border: 1px solid #dddddd;
     background: var(--color-white);
   }
 
-  .metric-card {
+  .panel-metric {
     display: grid;
     gap: 0.2rem;
-    min-height: 8rem;
+    margin: 0 1.1rem 1rem;
+    border: 1px solid #dddddd;
     padding: 1.1rem;
     color: var(--color-obsidian);
     text-decoration: none;
@@ -312,7 +333,7 @@ const pageDescription =
       transform 0.12s ease;
   }
 
-  .metric-card:hover,
+  .panel-metric:hover,
   .artifact-links a:hover,
   .index-links a:hover {
     border-color: rgb(10 22 54 / 0.36);
@@ -320,13 +341,13 @@ const pageDescription =
     transform: translateY(-1px);
   }
 
-  .metric-card span,
-  .metric-card small,
+  .panel-metric span,
+  .panel-metric small,
   td small {
     color: var(--color-slate);
   }
 
-  .metric-card strong {
+  .panel-metric strong {
     font-size: 2rem;
     line-height: 1;
   }
@@ -345,6 +366,27 @@ const pageDescription =
     font-size: 1.35rem;
   }
 
+  .section-links {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 0.35rem 0.65rem;
+    margin-top: var(--space-xs);
+  }
+
+  .section-link,
+  .section-links a {
+    color: var(--color-navy);
+    font-size: 0.9rem;
+    font-weight: 700;
+    text-decoration-thickness: 1px;
+    text-underline-offset: 0.16em;
+  }
+
+  .section-link {
+    display: inline-flex;
+    margin-top: var(--space-xs);
+  }
+
   .artifact-links {
     display: grid;
     grid-template-columns: repeat(3, minmax(0, 1fr));
@@ -424,7 +466,6 @@ const pageDescription =
 
   @media (max-width: 900px) {
     .bench-header,
-    .snapshot-grid,
     .artifact-strip,
     .artifact-links,
     .snapshot-columns {
@@ -437,10 +478,6 @@ const pageDescription =
       padding-top: 1.5rem;
     }
 
-    .metric-card {
-      min-height: 0;
-    }
-
     .snapshot-panel {
       overflow-x: auto;
     }
diff --git a/specs/eval.md b/specs/eval.md
index c7b4d956..f15a7af4 100644
--- a/specs/eval.md
+++ b/specs/eval.md
@@ -165,6 +165,10 @@ After running evals with `--save`, update `crates/bashkit-eval/README.md` with:
 
 Keep README highlights concise. Full per-task details live in the saved markdown reports under `crates/bashkit-eval/results/`.
 
+Saved eval JSON/Markdown reports are also consumed by the site `/benches` page.
+See `specs/performance-results.md` for the result-location and aggregation
+contract.
+
 ## Scripting Tool Eval Mode
 
 In addition to the default "bash" eval (testing direct bash tool usage), there is a
diff --git a/specs/performance-results.md b/specs/performance-results.md
new file mode 100644
index 00000000..f4a5d840
--- /dev/null
+++ b/specs/performance-results.md
@@ -0,0 +1,64 @@
+# Performance Results and Site Aggregation
+
+## Status
+Implemented
+
+## Abstract
+
+Benchmark, Criterion, and LLM evaluation runs are historical artifacts. The
+static site exposes the latest snapshot at `/benches` by aggregating those
+artifacts during site build.
+
+## Result Locations
+
+Saved runs MUST write machine-readable data and Markdown reports to these
+directories:
+
+| Harness | Result directory | Site input |
+|---------|------------------|------------|
+| `bashkit-bench` | `crates/bashkit-bench/results/` | `bench-*.json` plus matching `bench-*.md` |
+| Criterion benches | `crates/bashkit/benches/results/` | `criterion-*.md` |
+| `bashkit-eval` | `crates/bashkit-eval/results/` | `eval-*.json`, `scripting-eval-*.json`, plus matching `.md` reports |
+
+The Markdown files are the user-facing reports linked from `/benches`. The JSON
+files are the aggregation input for benchmark and eval summaries; Criterion summaries are read from the `criterion-*.md` reports.
+
+## Run Commands
+
+Default recipes that represent a real benchmark/eval run MUST save artifacts in
+the directories above:
+
+```bash
+just bench
+just eval
+just eval-scripting
+just bench-parallel
+just bench-sqlite
+```
+
+Non-saving exploratory commands MAY exist, but their names or comments MUST make
+clear that they do not update the site.
+
+After a successful saved run, the recipe MUST refresh the generated site data:
+
+```bash
+pnpm --dir site run data:performance
+```
+
+This makes the local `/benches` page update during development without waiting
+for a full site build.
+
+## Site Data Build
+
+`site/scripts/build-performance-data.mjs` is the only supported transformer for
+the `/benches` page. It reads the result directories above and writes:
+
+```text
+site/src/data/performance-timeline.json
+```
+
+`site/package.json` MUST run that transformer in `prebuild`, so every
+`pnpm run build` refreshes `/benches` from the latest committed result artifacts.
+
+When changing result schemas, update the transformer and this spec in the same
+PR. Never hand-edit `performance-timeline.json`; regenerate it with the script.