Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Fix root cause. Unsure: read more code; if stuck, ask w/ short options. Unrecogn
| sqlite-builtin | Embedded SQLite via Turso (MemoryIO + VfsIO backends, dot-commands) |
| coreutils-args-port | Port uutils `uu_app()` clap definitions (args mode) and platform-clean uucore modules (module mode, manifest-driven) into bashkit via codegen |
| credential-injection | Transparent per-host credential injection for outbound HTTP requests, without exposing secrets to sandboxed scripts |
| performance-results | Benchmark/eval result locations and `/benches` site aggregation contract |

### Documentation

Expand Down
4 changes: 4 additions & 0 deletions crates/bashkit-bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ cargo run -p bashkit-bench --release -- --list
| `--verbose` | Show per-benchmark timing details |
| `--list` | List available benchmarks |

Saved JSON/Markdown reports in `crates/bashkit-bench/results/` feed the site
`/benches` page. See `specs/performance-results.md` for the aggregation
contract.

## Prerequisites

| Runner | Setup |
Expand Down
68 changes: 52 additions & 16 deletions crates/bashkit-bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,21 +391,15 @@ async fn main() -> Result<()> {

// Save if requested
if let Some(ref save_arg) = args.save {
let base_name = if save_arg.is_empty() {
// Auto-generate filename with moniker and timestamp
let timestamp = chrono_lite_now();
format!("bench-{}-{}", system_info.moniker, timestamp)
} else {
// Use provided name, strip extension if present
let path = PathBuf::from(save_arg);
path.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("bench-results")
.to_string()
};
let timestamp = chrono_lite_now();
let base_path = save_base_path(save_arg, &system_info.moniker, &timestamp);

let json_path = format!("{}.json", base_name);
let md_path = format!("{}.md", base_name);
let json_path = base_path.with_extension("json");
let md_path = base_path.with_extension("md");

if let Some(parent) = json_path.parent() {
std::fs::create_dir_all(parent).context("Failed to create results directory")?;
}

// Save JSON
let json = serde_json::to_string_pretty(&report)?;
Expand All @@ -418,14 +412,30 @@ async fn main() -> Result<()> {
println!(
"\n{} results to:\n - {}\n - {}",
"Saved".green(),
json_path,
md_path
json_path.display(),
md_path.display()
);
}

Ok(())
}

/// Resolves the extension-less base path that benchmark reports are saved under.
///
/// * `save_arg` - value passed to `--save`; an empty string requests an
///   auto-generated name.
/// * `moniker` / `timestamp` - only used to build the auto-generated file name
///   `bench-<moniker>-<timestamp>`.
///
/// A non-empty `save_arg` keeps its directory components; only a trailing file
/// extension (if any) is dropped so callers can append their own.
fn save_base_path(save_arg: &str, moniker: &str, timestamp: &str) -> PathBuf {
    if !save_arg.is_empty() {
        let requested = PathBuf::from(save_arg);
        if requested.extension().is_none() {
            // Nothing to strip: use the caller-supplied path as-is.
            return requested;
        }
        return requested.with_extension("");
    }

    // Auto-generated reports land in the repo-tracked results folder so site
    // builds can pick up fresh benchmark runs.
    let file_name = format!("bench-{}-{}", moniker, timestamp);
    PathBuf::from("crates/bashkit-bench/results").join(file_name)
}

async fn run_benchmark(
runner: &mut Runner,
case: &BenchCase,
Expand Down Expand Up @@ -780,3 +790,29 @@ fn print_summary(summary: &BenchSummary) {
println!();
}
}

#[cfg(test)]
mod tests {
    use super::save_base_path;
    use std::path::PathBuf;

    /// An empty `--save` argument yields an auto-named file inside the
    /// results directory that the site build reads.
    #[test]
    fn save_base_path_defaults_to_site_indexed_results_dir() {
        let actual = save_base_path("", "vm-linux-x86_64", "1779764460");
        let expected =
            PathBuf::from("crates/bashkit-bench/results/bench-vm-linux-x86_64-1779764460");

        assert_eq!(actual, expected);
    }

    /// An explicit path keeps its directory and loses only the extension;
    /// moniker and timestamp play no part in the result.
    #[test]
    fn save_base_path_preserves_custom_directory_and_strips_extension() {
        let requested = "crates/bashkit-bench/results/manual-test.json";

        let actual = save_base_path(requested, "ignored", "ignored");
        let expected = PathBuf::from("crates/bashkit-bench/results/manual-test");

        assert_eq!(actual, expected);
    }
}
46 changes: 29 additions & 17 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -124,63 +124,75 @@ run-script file:

# === Benchmarks ===

# Run benchmarks comparing bashkit to bash
# Run benchmarks comparing bashkit to bash and save site-indexed JSON/Markdown results
bench:
cargo run -p bashkit-bench --release
cargo run -p bashkit-bench --release -- --save
pnpm --dir site run data:performance

# Run benchmarks and save results to JSON
bench-save file="bench-results.json":
# Run benchmarks and save results to JSON/Markdown
bench-save file="":
cargo run -p bashkit-bench --release -- --save {{file}}
pnpm --dir site run data:performance

# Run benchmarks with verbose output
# Run benchmarks with verbose output and save site-indexed JSON/Markdown results
bench-verbose:
cargo run -p bashkit-bench --release -- --verbose
cargo run -p bashkit-bench --release -- --verbose --save
pnpm --dir site run data:performance

# Run specific benchmark category (startup, variables, arithmetic, control, strings, arrays, pipes, tools, complex)
# Exploratory: run a specific benchmark category (startup, variables, arithmetic, control, strings, arrays, pipes, tools, complex) without updating site results
bench-category cat:
cargo run -p bashkit-bench --release -- --category {{cat}}

# Run benchmarks with more iterations for accuracy
# Run benchmarks with more iterations for accuracy and save site-indexed JSON/Markdown results
bench-accurate:
cargo run -p bashkit-bench --release -- --iterations 50 --warmup 5
cargo run -p bashkit-bench --release -- --iterations 50 --warmup 5 --save
pnpm --dir site run data:performance

# List available benchmarks
bench-list:
cargo run -p bashkit-bench --release -- --list

# Run benchmarks with all runners (including just-bash if available)
# Run benchmarks with all runners and save site-indexed JSON/Markdown results (including just-bash if available)
bench-all:
cargo run -p bashkit-bench --release -- --runners bashkit,bash,just-bash
cargo run -p bashkit-bench --release -- --runners bashkit,bash,just-bash --save
pnpm --dir site run data:performance

# Run Criterion parallel_execution benchmark and save results
bench-parallel:
./scripts/bench-parallel.sh
pnpm --dir site run data:performance

# Run Criterion sqlite builtin benchmark and save results
bench-sqlite:
./scripts/bench-sqlite.sh
pnpm --dir site run data:performance

# === Eval ===

# Run LLM eval (requires ANTHROPIC_API_KEY or OPENAI_API_KEY)
# Run LLM eval and save site-indexed JSON/Markdown results (requires ANTHROPIC_API_KEY or OPENAI_API_KEY)
eval dataset="crates/bashkit-eval/data/eval-tasks.jsonl" provider="anthropic" model="claude-sonnet-4-20250514":
cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}}
cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
pnpm --dir site run data:performance

# Run eval and save results
eval-save dataset="crates/bashkit-eval/data/eval-tasks.jsonl" provider="anthropic" model="claude-sonnet-4-20250514":
cargo run -p bashkit-eval --release -- run --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
pnpm --dir site run data:performance

# Run scripting-tool eval (scripted mode)
# Run scripting-tool eval (scripted mode) and save site-indexed JSON/Markdown results
eval-scripting dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}}
cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
pnpm --dir site run data:performance

# Run scripting-tool eval (baseline mode — individual tools, no ScriptedTool)
# Run scripting-tool eval (baseline mode — individual tools, no ScriptedTool) and save site-indexed JSON/Markdown results
eval-scripting-baseline dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --baseline --dataset {{dataset}} --provider {{provider}} --model {{model}}
cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --baseline --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
pnpm --dir site run data:performance

# Run scripting-tool eval and save results
eval-scripting-save dataset="crates/bashkit-eval/data/scripting-tool/many-tools.jsonl" provider="openai" model="gpt-5.4":
cargo run -p bashkit-eval --release -- run --eval-type scripting-tool --dataset {{dataset}} --provider {{provider}} --model {{model}} --save
pnpm --dir site run data:performance

# === Security ===

Expand Down
4 changes: 4 additions & 0 deletions site/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ pnpm run build # emits ./dist
pnpm run preview # serve dist/ via wrangler
```

`pnpm run build` regenerates `src/data/performance-timeline.json` from saved
benchmark and eval artifacts before Astro builds. The `/benches` page contract is
specified in `../specs/performance-results.md`.

## Deploy

Deployment is intended to run from CI against the Cloudflare account that owns
Expand Down
2 changes: 2 additions & 0 deletions site/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
},
"scripts": {
"dev": "astro dev",
"data:performance": "node scripts/build-performance-data.mjs",
"prebuild": "node scripts/build-performance-data.mjs",
"build": "astro build",
"postbuild": "node scripts/normalize-generated-html.mjs && node scripts/verify-doc-routes.mjs && node scripts/verify-doc-markdown-routes.mjs && node scripts/verify-public-links.mjs && node scripts/verify-sitemap.mjs && node scripts/verify-robots.mjs && node scripts/verify-agent-skills.mjs && node scripts/verify-link-headers.mjs",
"preview": "wrangler dev",
Expand Down
Loading
Loading