From b0962e3eeba45a9b7b4e731066572a09fe4efdce Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Thu, 7 May 2026 04:13:41 +0000 Subject: [PATCH 1/2] fix(builtins): close clap Arg::env host-env side-channel (TM-INF-024) uutils' uu_app() attaches `.env("TABSIZE")` / `.env("TIME_STYLE")` to ls options so they default from std::env. With clap's `env` feature on, that bypasses bashkit's ctx.env sandbox: scripts can probe host env presence, and a host-set TIME_STYLE materializes as a clap value source for an option ls hasn't implemented yet, breaking plain `ls` for unrelated tenants. Three-layer fix: 1. Codegen strips `.env("FOO")` chained calls in `bashkit-coreutils-port`, so generated `_args.rs` only sees argv. Already-generated `ls_args.rs` is updated in place to match. 2. Static guard `no_clap_env_in_generated_parsers` greps every `builtins/generated/*.rs` and fails the build if `.env(` reappears. 3. Workspace `clap` dep drops the `env` cargo feature, so a re-introduced `.env(...)` fails to compile. Adds a runtime regression test that exports TIME_STYLE/TABSIZE on the host process and asserts plain `ls` still succeeds. --- Cargo.toml | 7 +- crates/bashkit-coreutils-port/src/main.rs | 27 ++++++++ .../bashkit/src/builtins/generated/ls_args.rs | 2 - crates/bashkit/src/builtins/ls.rs | 64 +++++++++++++++++++ crates/bashkit/src/builtins/mod.rs | 41 ++++++++++++ specs/coreutils-args-port.md | 7 ++ specs/threat-model.md | 1 + 7 files changed, 146 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 06a0b801..33cee835 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,7 +75,12 @@ zeroize = "1" getrandom = "0.3" # CLI -clap = { version = "4", features = ["derive", "env"] } +# Intentionally NOT enabling clap's `env` feature: it makes `Arg::env(...)` +# read defaults from `std::env`, which would tunnel host process state into +# the bashkit sandbox (TM-INF-023). The coreutils-args-port codegen strips +# `.env(...)` chains from vendored Arg builders; dropping the feature here +# is defence-in-depth so any re-introduction fails to compile. +clap = { version = "4", features = ["derive"] } # Testing tokio-test = "0.4" diff --git a/crates/bashkit-coreutils-port/src/main.rs b/crates/bashkit-coreutils-port/src/main.rs index e06638b1..38f6d22c 100644 --- a/crates/bashkit-coreutils-port/src/main.rs +++ b/crates/bashkit-coreutils-port/src/main.rs @@ -508,6 +508,33 @@ impl VisitMut for Rewriter { if let Expr::MethodCall(mc) = node && mc.args.len() == 1 && matches!(mc.args.first(), Some(Expr::Call(c)) if is_drop_sentinel(c)) + { + let receiver = (*mc.receiver).clone(); + *node = receiver; + return; + } + + // Strip `clap::Arg::env(...)` chained method calls. uutils sources + // pull defaults like `TABSIZE`/`TIME_STYLE` from `std::env`, but + // bashkit sandboxes scripts inside its own `ctx.env`. Letting clap + // read from the host process leaks host state into the sandbox + // (TM-INF-024): scripts can probe whether the host has these vars + // set, and a host-set `TIME_STYLE` would tunnel a value through + // bashkit's "unsupported option" gate and kill `ls` for unrelated + // tenants. Drop the `.env(...)` step from the Arg chain at codegen + // time so generated parsers only see argv. Conservative match: + // method ident is `env` and the single argument is a string + // literal (the only shape uutils uses). + if let Expr::MethodCall(mc) = node + && mc.method == "env" + && mc.args.len() == 1 + && matches!( + mc.args.first(), + Some(Expr::Lit(syn::ExprLit { + lit: syn::Lit::Str(_), + .. + })) + ) { let receiver = (*mc.receiver).clone(); *node = receiver; diff --git a/crates/bashkit/src/builtins/generated/ls_args.rs b/crates/bashkit/src/builtins/generated/ls_args.rs index 47fa4b47..d2c7eb0d 100644 --- a/crates/bashkit/src/builtins/generated/ls_args.rs +++ b/crates/bashkit/src/builtins/generated/ls_args.rs @@ -194,7 +194,6 @@ pub fn ls_command() -> Command { Arg::new(options::format::TAB_SIZE) .short('T') .long(options::format::TAB_SIZE) - .env("TABSIZE") .value_name("COLS") .help( ::std::string::String::from( @@ -903,7 +902,6 @@ pub fn ls_command() -> Command { ), ) .value_name("TIME_STYLE") - .env("TIME_STYLE") .value_parser(NonEmptyStringValueParser::new()) .overrides_with_all([options::TIME_STYLE]), ) diff --git a/crates/bashkit/src/builtins/ls.rs b/crates/bashkit/src/builtins/ls.rs index 94f977f7..b7c78e94 100644 --- a/crates/bashkit/src/builtins/ls.rs +++ b/crates/bashkit/src/builtins/ls.rs @@ -1543,6 +1543,70 @@ mod tests { assert!(result.stdout.contains("nested.txt")); } + /// TM-INF-024 regression: a host-set `TIME_STYLE` (or any other env var + /// uutils' `uu_app()` attaches to an Arg via `.env(...)`) MUST NOT + /// reach the clap parser. uutils' upstream wires + /// `Arg::new(TIME_STYLE).env("TIME_STYLE")` so the option defaults + /// from the host process — bashkit strips that at codegen time and + /// drops the `env` clap feature workspace-wide as defence-in-depth. + /// Without those guards a plain `ls` in a container that exports + /// `TIME_STYLE=long-iso` would trip the unsupported-option gate + /// (since bashkit hasn't implemented `--time-style` yet) and break + /// `ls` for every script running on that host. + #[tokio::test] + #[serial_test::serial] + async fn ls_ignores_host_time_style_and_tabsize() { + // SAFETY: serial_test::serial serializes against other tests that + // touch the process env. Setting + unsetting around a single + // bashkit `Ls.execute()` is the only way to exercise the + // sandbox-leak regression: we're asserting bashkit does NOT + // observe these even when they're present on the host process. + unsafe { + std::env::set_var("TIME_STYLE", "long-iso"); + std::env::set_var("TABSIZE", "16"); + } + + let (fs, mut cwd, mut variables) = create_test_ctx().await; + fs.write_file(&cwd.join("a.txt"), b"a").await.unwrap(); + let env = HashMap::new(); + let args: Vec = vec![]; + let ctx = Context { + args: &args, + env: &env, + variables: &mut variables, + cwd: &mut cwd, + fs: fs.clone(), + stdin: None, + #[cfg(feature = "http_client")] + http_client: None, + #[cfg(feature = "git")] + git_client: None, + #[cfg(feature = "ssh")] + ssh_client: None, + shell: None, + }; + + let result = Ls.execute(ctx).await.unwrap(); + + unsafe { + std::env::remove_var("TIME_STYLE"); + std::env::remove_var("TABSIZE"); + } + + assert_eq!( + result.exit_code, 0, + "host TIME_STYLE/TABSIZE leaked into bashkit ls parser \ + (TM-INF-024): stderr={}", + result.stderr + ); + assert!(result.stdout.contains("a.txt")); + assert!( + !result.stderr.contains("not yet implemented"), + "host env tunneled through clap as a value source: stderr={}", + result.stderr + ); + } + // ==================== find tests ==================== #[tokio::test] diff --git a/crates/bashkit/src/builtins/mod.rs b/crates/bashkit/src/builtins/mod.rs index 80cef2d7..22003ab2 100644 --- a/crates/bashkit/src/builtins/mod.rs +++ b/crates/bashkit/src/builtins/mod.rs @@ -1106,6 +1106,47 @@ mod tests { ); } + /// TM-INF-024: clap `Arg::env(...)` reads defaults from the real + /// process environment. uutils ships `.env("TABSIZE")` / + /// `.env("TIME_STYLE")` on `ls`, but bashkit isolates scripts inside + /// `ctx.env`; if the host parser were allowed to consult `std::env` + /// the sandbox boundary would leak (host can probe presence, host can + /// inject values that propagate into bashkit's option-validation + /// path). Codegen strips `.env(...)` from generated Arg chains; this + /// static guard makes sure no future regen (or hand-edit) re-adds + /// one. Defence-in-depth: the workspace `clap` dep also drops the + /// `env` cargo feature, so a slipped `.env(...)` won't compile. + #[test] + fn no_clap_env_in_generated_parsers() { + let pat = regex::Regex::new(r"\.env\s*\(").unwrap(); + let dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("src/builtins/generated"); + let mut violations = Vec::new(); + for entry in std::fs::read_dir(&dir).expect("read generated dir") { + let entry = entry.unwrap(); + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) != Some("rs") { + continue; + } + let src = std::fs::read_to_string(&path).expect("read generated file"); + for (i, line) in src.lines().enumerate() { + if pat.is_match(line) { + let rel = path + .strip_prefix(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) + .unwrap_or(&path); + violations.push(format!("{}:{}: {}", rel.display(), i + 1, line.trim_end())); + } + } + } + assert!( + violations.is_empty(), + "clap `Arg::env(...)` found in a generated parser. This pulls \ + defaults from the host process environment and breaks bashkit's \ + sandbox boundary (TM-INF-024). Re-run `just regen-coreutils-args` \ + — the codegen strips these — or remove the call by hand.\n\n{}", + violations.join("\n") + ); + } + /// Every `_args.rs` header MUST reference the same uutils /// revision as `generated::UUTILS_REVISION`. The drift workflow /// bumps both atomically; this test catches the case where someone diff --git a/specs/coreutils-args-port.md b/specs/coreutils-args-port.md index 7d6ef4e8..66e7030d 100644 --- a/specs/coreutils-args-port.md +++ b/specs/coreutils-args-port.md @@ -22,6 +22,13 @@ codegen**, not by depending on `uu_*` crates at runtime. - `uucore::clap_localization::configure_localized_command(cmd)` → `cmd` - `ShortcutValueParser::new([…])` → `clap::builder::PossibleValuesParser::new([…])` (loses uucore's unambiguous-abbreviation behaviour; documented divergence) + - `Arg::…env("FOO")…` → chain step elided (TM-INF-024). uutils + attaches `.env(...)` to options like `TABSIZE`/`TIME_STYLE` so they + pick up host process state; bashkit sandboxes scripts inside + `ctx.env`, so generated parsers must only consult argv. The + workspace `clap` dep also drops the `env` cargo feature as + defence-in-depth, and `builtins::tests::no_clap_env_in_generated_parsers` + statically forbids `.env(` from re-appearing in `generated/*.rs`. 5. Emits a generated file under `crates/bashkit/src/builtins/generated/_args.rs` with a clean `pub fn _command() -> clap::Command`. diff --git a/specs/threat-model.md b/specs/threat-model.md index 965a9fe9..f341c9fb 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -438,6 +438,7 @@ All execution stays within the virtual interpreter — no OS subprocess is spawn | TM-INF-021 | Stack backtrace information disclosure | Panics leak internal source paths, dependency versions, and function names via stderr | Custom panic hook suppresses backtraces in CLI | **MITIGATED** | | TM-INF-022 | Library Debug shapes leak via stderr | Builtins wrapping external libs (jaq, regex, serde_json, semver, chrono, …) format errors with `format!("{:?}", e)` and dump internal struct shapes into stderr — e.g. jq printed `(File { code: "<800 chars of prepended compat-defs source>", path: () }, [("@tsv", Filter(0))])` to LLM agents | Three-layer enforcement, all run by `cargo test`: (1) static — `builtins::tests::no_debug_fmt_in_builtin_source` walks every `crates/bashkit/src/builtins/*.rs` and forbids `{:?}` / `{:#?}` / `{name:?}` (per-line opt-out: `// debug-ok: `); (2) dynamic per-tool — each tool's `mod tests` calls `bashkit::testing::assert_no_leak` with malformed inputs; (3) fuzz — every cargo-fuzz target plus 5 proptest cases call `bashkit::testing::assert_fuzz_invariants` against random input and check the same invariants on stderr/stdout | **FIXED** | | TM-INF-023 | jq `halt` / `halt_error` terminate the host process | jaq-std 3.0's `halt(N)` native calls `std::process::exit(exit_code)`. `defs.jq` wraps it with `def halt: halt(0);` and `def halt_error(...): ..., halt(...);`. Untrusted jq filters can invoke `halt` / `halt_error` to exit the entire embedding process, escaping the `ExecResult`-based sandbox boundary and causing process-wide DoS for any caller hosting bashkit. | Strip the upstream `halt` native from `jaq_std::funs::()` and chain in a safe replacement that pops the variable argument and returns `Error::str("halt is disabled in the bashkit sandbox")`. The wrapper defs (`halt`, `halt_error`) still resolve, so callers see a normal jq runtime error (exit 5) instead of the host being killed. Regression tests in `builtins::jq::tests`: `halt_does_not_terminate_host_process`, `halt_with_arg_does_not_terminate_host_process`, `halt_error_does_not_terminate_host_process`. | **FIXED** | +| TM-INF-024 | Host env side-channel via clap `Arg::env(...)` | Clap-based builtins ported from uutils (currently `ls`, with `.env("TABSIZE")` and `.env("TIME_STYLE")` on `uu_app()`) resolve those defaults from `std::env`, not bashkit's `ctx.env`. This breaks the sandbox boundary two ways: (1) presence-probe — a script can tell whether the host process has `TIME_STYLE`/`TABSIZE` set by observing `ls`'s behaviour; (2) availability — a host- or container-set `TIME_STYLE` materialises as a clap value source for an option bashkit hasn't implemented yet, so the unsupported-option gate fires on every plain `ls` and breaks the builtin for unrelated tenants | Three-layer fix: (1) **codegen strips `.env(...)`** — `bashkit-coreutils-port` rewrites `.env("FOO")` chained method calls out of every Arg builder it ports, so generated `_args.rs` only ever consults argv; (2) **static guard** — `builtins::tests::no_clap_env_in_generated_parsers` greps every `crates/bashkit/src/builtins/generated/*.rs` and fails the build if any `.env(` call sneaks back in; (3) **defence-in-depth** — workspace `clap` dep drops the `env` cargo feature, so a re-introduced `.env(...)` fails to compile rather than silently re-opening the channel | **FIXED** | **TM-INF-022**: Generalizes TM-INF-016 to cover the whole builtin surface. The originating bug was `builtins/jq/` formatting jaq's compile/parse From 1bcec0a5f838668ca8e66191c8b46c21bbfd7702 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Thu, 7 May 2026 14:33:58 +0000 Subject: [PATCH 2/2] feat(builtins): port uutils env-default surface via virtual-env shim uutils' uu_app() ships .env("FOO") on options like TIME_STYLE/TABSIZE so they default from std::env. Earlier fix stripped those calls outright (TM-INF-023) which closed the host-env channel but threw away the env-default UX. This restores the UX through bashkit's virtual env without re-opening the host channel. How it composes: - Codegen now harvests every stripped .env("FOO") into a sidecar `pub static _ENV_DEFAULTS: &[clap_env::EnvDefault]` next to `_command()`. Each row records (arg_id, long, env_var, kind). Always emitted (possibly empty) so every util has a uniform surface. - New `builtins::clap_env` shim provides `apply_env_defaults(argv, defaults, ctx.env) -> argv'`. Reads only `ctx.env`, never std::env. Synthesises `-- ` for Single, `--` for Bool, `-- ` for Multi (clap re-splits via its own delimiter). Implements clap's documented "argv > env > default" precedence by skipping injection when argv already specifies the long flag. - Ls::execute calls apply_env_defaults with LS_ENV_DEFAULTS before try_get_matches_from. Today's "not yet implemented" gate still fires for time-style/tabsize, but now the rejection is caused by bashkit-virtual env (or explicit argv), never the host process. Tests: - 9 unit tests on the shim (precedence, kind handling, std::env isolation, program-name slot). - ls_honors_virtual_env_time_style asserts ctx.env reaches clap. - ls_argv_time_style_overrides_virtual_env asserts precedence. - ls_ignores_host_time_style_and_tabsize (existing) still pins the std::env half. - Static tests: every_generated_parser_emits_env_defaults_table, ls_env_defaults_surface_matches_uutils, plus the existing no_clap_env_in_generated_parsers regression. Per-builtin opt-in: cat/tac/shuf/readlink/truncate emit empty defaults today; the shim wiring lights up automatically when their upstream uu_app() grows .env(...) annotations and codegen reruns. --- crates/bashkit-coreutils-port/src/main.rs | 226 ++++++++++++++++ crates/bashkit/src/builtins/clap_env.rs | 255 ++++++++++++++++++ .../src/builtins/generated/cat_args.rs | 5 + .../bashkit/src/builtins/generated/ls_args.rs | 19 ++ .../src/builtins/generated/readlink_args.rs | 5 + .../src/builtins/generated/shuf_args.rs | 5 + .../src/builtins/generated/tac_args.rs | 5 + .../src/builtins/generated/truncate_args.rs | 5 + crates/bashkit/src/builtins/ls.rs | 112 +++++++- crates/bashkit/src/builtins/mod.rs | 84 +++++- specs/coreutils-args-port.md | 29 +- specs/threat-model.md | 2 +- 12 files changed, 733 insertions(+), 19 deletions(-) create mode 100644 crates/bashkit/src/builtins/clap_env.rs diff --git a/crates/bashkit-coreutils-port/src/main.rs b/crates/bashkit-coreutils-port/src/main.rs index 38f6d22c..121a7860 100644 --- a/crates/bashkit-coreutils-port/src/main.rs +++ b/crates/bashkit-coreutils-port/src/main.rs @@ -108,6 +108,14 @@ fn run(uutils_dir: &Path, util: &str, rev: &str) -> Result { .ok_or_else(|| anyhow!("could not find `fn uu_app` in {}", src_path.display()))? .clone(); + // Harvest `.env(...)` annotations BEFORE the rewriter strips them + // from the runtime Arg chain. Each entry becomes a row in the + // generated `_ENV_DEFAULTS` table that the bashkit-side shim + // (`builtins::clap_env::apply_env_defaults`) consults to fold + // bashkit's virtual `ctx.env` into clap parsing — never the host + // `std::env` (TM-INF-024). + let env_defaults = collect_env_defaults(&uu_app)?; + let mut rw = Rewriter { translations, unresolved: vec![], @@ -178,6 +186,43 @@ fn run(uutils_dir: &Path, util: &str, rev: &str) -> Result { quote!() }; + // `_ENV_DEFAULTS` table — the codegen-side half of the + // virtual-env shim. Always emitted (possibly empty) so every + // generated module exposes the same surface; bashkit-side + // builtins import it by name and feed it to + // `crate::builtins::clap_env::apply_env_defaults`. + let env_defaults_const_name = syn::Ident::new( + &format!("{}_ENV_DEFAULTS", util.to_uppercase()), + proc_macro2::Span::call_site(), + ); + let env_defaults_rows: Vec = env_defaults + .iter() + .map(|d| { + let arg_id = &d.arg_id; + let long = &d.long; + let env_var = LitStr::new(&d.env_var, proc_macro2::Span::call_site()); + let kind_ident = syn::Ident::new(d.kind.as_str(), proc_macro2::Span::call_site()); + quote! { + crate::builtins::clap_env::EnvDefault { + arg_id: #arg_id, + long: #long, + env_var: #env_var, + kind: crate::builtins::clap_env::EnvKind::#kind_ident, + } + } + }) + .collect(); + let env_defaults_tokens: TokenStream = quote! { + /// Sidecar harvest of every `Arg::env(...)` annotation the codegen + /// stripped from the runtime Arg chain (TM-INF-024). Consumed by + /// `crate::builtins::clap_env::apply_env_defaults` so bashkit's + /// virtual `ctx.env` — never `std::env` — drives clap's env-default + /// path. Order matches the chain order in the original `uu_app()`. + pub static #env_defaults_const_name: &[crate::builtins::clap_env::EnvDefault] = &[ + #(#env_defaults_rows),* + ]; + }; + let body: TokenStream = quote! { #![allow(unused_imports, dead_code)] @@ -209,6 +254,8 @@ fn run(uutils_dir: &Path, util: &str, rev: &str) -> Result { s.to_string() } + #env_defaults_tokens + // Inlined free-function helpers referenced by `uu_app()` (e.g. // shuf's `parse_range` as a clap `value_parser`). Copied // verbatim from the source file with the same translate!() @@ -269,6 +316,185 @@ fn parse_ftl(src: &str) -> HashMap { out } +/// One row in the codegen's harvest of `Arg::env(...)` annotations. +/// Carries token-stream-form expressions for `arg_id` and `long` so we +/// can quote them straight back into the generated `_ENV_DEFAULTS` +/// table without resolving constants — they evaluate to `&'static str` +/// in the generated module's scope (uutils' `mod options` items are +/// `pub static FOO: &str = "..."`). +struct EnvDefaultMeta { + arg_id: TokenStream, + long: TokenStream, + env_var: String, + kind: EnvKindMeta, +} + +#[derive(Clone, Copy)] +enum EnvKindMeta { + Single, + Bool, + Multi, +} + +impl EnvKindMeta { + fn as_str(self) -> &'static str { + match self { + EnvKindMeta::Single => "Single", + EnvKindMeta::Bool => "Bool", + EnvKindMeta::Multi => "Multi", + } + } +} + +/// Walk `uu_app()` body, find every `.arg()` whose `` +/// contains `.env("VAR")`, and harvest one `EnvDefaultMeta` per chain. +/// +/// Each chain is rooted at `Arg::new()` — innermost receiver +/// of the method-call stack. We extract: +/// - `arg_id`: the expression `Arg::new` was called with. +/// - `long`: the expression `.long(...)` was called with (required; +/// bail if absent — every uutils env-bound option has `.long`). +/// - `env_var`: the string literal `.env(...)` was called with. +/// - `kind`: `Bool` if `.action(ArgAction::SetTrue|SetFalse)` is +/// present, `Multi` if `.value_delimiter(...)` is present, else +/// `Single` (the dominant case for uutils — `TIME_STYLE`, +/// `TABSIZE`, `BLOCK_SIZE`, …). +/// +/// Run BEFORE the rewriter mutates `uu_app`, while `.env(...)` is still +/// in place. The rewriter then strips the call from the runtime chain. +fn collect_env_defaults(uu_app: &ItemFn) -> Result> { + use syn::visit::{self, Visit}; + + struct Collector { + out: Vec, + errors: Vec, + } + impl<'ast> Visit<'ast> for Collector { + fn visit_expr_method_call(&mut self, mc: &'ast syn::ExprMethodCall) { + // Match `.arg()` — every Arg chain in + // `Command::new(...).arg()…` shows up here. + if mc.method == "arg" + && mc.args.len() == 1 + && let Some(arg_expr) = mc.args.first() + && let Some(meta) = parse_arg_chain_for_env(arg_expr, &mut self.errors) + { + self.out.push(meta); + } + visit::visit_expr_method_call(self, mc); + } + } + + let mut c = Collector { + out: vec![], + errors: vec![], + }; + c.visit_item_fn(uu_app); + if !c.errors.is_empty() { + bail!("env-default harvest errors: {}", c.errors.join("; ")); + } + Ok(c.out) +} + +/// For an `.arg()` argument, walk the method-call chain and +/// extract env-default metadata if `.env(...)` is present. Returns +/// `None` for chains that don't carry a `.env(...)` annotation. +fn parse_arg_chain_for_env(arg_expr: &Expr, errors: &mut Vec) -> Option { + let mut node: &Expr = arg_expr; + let mut env_var: Option = None; + let mut long: Option = None; + let mut kind = EnvKindMeta::Single; + + while let Expr::MethodCall(mc) = node { + let m = mc.method.to_string(); + if m == "env" { + if let Some(Expr::Lit(syn::ExprLit { + lit: syn::Lit::Str(s), + .. + })) = mc.args.first() + { + env_var = Some(s.value()); + } else { + errors.push(format!( + "Arg::env(...) call with non-string-literal argument: {}", + quote::quote!(#mc) + )); + } + } else if m == "long" && mc.args.len() == 1 { + let arg = &mc.args[0]; + long = Some(quote!(#arg)); + } else if m == "action" + && mc + .args + .first() + .is_some_and(expr_is_set_true_or_set_false_action) + { + kind = EnvKindMeta::Bool; + } else if m == "value_delimiter" { + kind = EnvKindMeta::Multi; + } + node = &mc.receiver; + } + + // Bottom of the chain must be `Arg::new()` — record the id. + let arg_id = match node { + Expr::Call(call) => { + if !path_ends_with_arg_new(&call.func) { + return None; + } + match call.args.first() { + Some(id_expr) => quote!(#id_expr), + None => { + errors.push("Arg::new() with zero arguments".into()); + return None; + } + } + } + _ => return None, + }; + + let env_var = env_var?; + let long = match long { + Some(l) => l, + None => { + errors.push(format!( + "Arg with .env({env_var:?}) but no .long(...) — bashkit's \ + virtual-env shim needs a long flag to inject" + )); + return None; + } + }; + + Some(EnvDefaultMeta { + arg_id, + long, + env_var, + kind, + }) +} + +fn expr_is_set_true_or_set_false_action(expr: &Expr) -> bool { + let Expr::Path(p) = expr else { + return false; + }; + let segs = path_segments(&p.path); + let last = segs.last().map(String::as_str).unwrap_or(""); + matches!(last, "SetTrue" | "SetFalse") +} + +fn path_ends_with_arg_new(func: &Expr) -> bool { + let Expr::Path(p) = func else { + return false; + }; + let segs = path_segments(&p.path); + let last_two: Vec<&str> = segs + .iter() + .rev() + .take(2) + .map(String::as_str) + .collect::>(); + matches!(last_two.as_slice(), ["new", "Arg"]) +} + /// Walk `uu_app()`'s body looking for plain identifier references /// (single-segment paths) and return any matching free `fn` defined at /// the top level of the source file. Skips items already provided by diff --git a/crates/bashkit/src/builtins/clap_env.rs b/crates/bashkit/src/builtins/clap_env.rs new file mode 100644 index 00000000..8ef2e61a --- /dev/null +++ b/crates/bashkit/src/builtins/clap_env.rs @@ -0,0 +1,255 @@ +//! Virtual-env shim for clap-based ported builtins. +//! +//! uutils' `uu_app()` ships `Arg::env("FOO")` to default options like +//! `TIME_STYLE` / `BLOCK_SIZE` / `LS_COLORS` from the host process +//! environment. bashkit sandboxes scripts inside `ctx.env`, so codegen +//! strips `.env(...)` from the runtime Arg chain (TM-INF-024) and +//! instead emits a sidecar `_ENV_DEFAULTS: &[EnvDefault]` table +//! recording what was stripped. +//! +//! `apply_env_defaults` reads that table plus the caller's virtual +//! `ctx.env` and rewrites argv before clap sees it, emulating clap's +//! own "argv > env > default" precedence — but sourced from the +//! sandbox, never `std::env`. One bashkit-side seam, one audit point. + +use std::collections::HashMap; +use std::ffi::OsString; + +/// How an `Arg`'s value materialises on the command line. Mirrors the +/// shape of the clap chain we found `.env(...)` next to. +// +// `Multi` and (later) other variants are reserved for upcoming codegen +// emissions — current ported utils only exercise `Single`. Allow dead +// code so the surface stays committed without forcing a port that +// uses it on day one. +#[allow(dead_code)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EnvKind { + /// `-- `. The default for value-bearing options + /// (uutils' `TIME_STYLE`, `TABSIZE`, `BLOCK_SIZE`, …). + Single, + /// `--` with no value. uutils sets `.action(SetTrue)` for + /// boolean env-defaulted toggles (none ported yet but reserved + /// so codegen can emit them without shim changes). Truthy iff + /// the env value is non-empty (matches clap's behaviour). + Bool, + /// Multi-value, comma-or-delim-separated. uutils uses + /// `.value_delimiter(',')` for things like + /// `LS_COLORS`-adjacent lists. We push `--` once with the + /// raw value; clap re-splits it via its own delimiter setting. + Multi, +} + +/// One stripped `.env(...)` annotation, harvested at codegen time. +/// +/// Lives in `static`-friendly form so generated files can place these +/// in a `&'static [EnvDefault]` slice without any runtime allocation. +#[derive(Debug, Clone, Copy)] +pub struct EnvDefault { + /// `Arg::new(arg_id)` — matches `ArgMatches::ids()` post-parse. + /// Carried so callers / round-trip diagnostics can correlate a + /// default back to its clap option even though the shim itself + /// only needs `long`. Held intentionally; suppress the unused-field + /// warning until a consumer reaches for it. + #[allow(dead_code)] + pub arg_id: &'static str, + /// `Arg::long(long)` — the long flag we synthesise into argv. + pub long: &'static str, + /// The env-var name uutils originally pulled from `std::env`. + /// We read this from `ctx.env` instead. + pub env_var: &'static str, + /// Drives the argv shape we synthesise. + pub kind: EnvKind, +} + +/// Inject env-sourced defaults into argv before handing it to clap. +/// +/// Precedence (matches clap's own `Arg::env`): +/// 1. argv (if user already passed `--` or `--=…`, leave it). +/// 2. ctx.env[env_var] (if set, synthesise `-- `). +/// 3. clap's `.default_value(...)` (untouched). +/// +/// `argv[0]` is the program name (clap convention); we never inspect or +/// modify it. Synthesised entries are appended *after* the existing +/// argv; clap doesn't care about positional order for flags. Positional +/// arguments are unaffected — none of the env-bound uutils args are +/// positional. +/// +/// Empty env value = unset, intentionally: +/// - For `Single`/`Multi`, clap would reject an empty value via +/// `NonEmptyStringValueParser` anyway (uutils' default for these). +/// - For `Bool`, clap's own `Arg::env` matches "non-empty" as truthy. +pub fn apply_env_defaults( + mut argv: Vec, + defaults: &[EnvDefault], + env: &HashMap, +) -> Vec { + for d in defaults { + if argv_has_long(&argv, d.long) { + continue; + } + let Some(val) = env.get(d.env_var) else { + continue; + }; + if val.is_empty() { + continue; + } + let long_flag = format!("--{}", d.long); + match d.kind { + EnvKind::Bool => argv.push(OsString::from(long_flag)), + EnvKind::Single | EnvKind::Multi => { + argv.push(OsString::from(long_flag)); + argv.push(OsString::from(val)); + } + } + } + argv +} + +/// Returns `true` iff argv already specifies `--` (with or +/// without an attached `=value`). Matches clap's own +/// "is-this-flag-already-set" check shape, modulo short flags — uutils +/// only env-defaults options that have a `.long(...)`, so we only check +/// the long form. Skips index 0 (program name). +fn argv_has_long(argv: &[OsString], long: &str) -> bool { + let exact = format!("--{long}"); + let prefix = format!("--{long}="); + argv.iter().skip(1).any(|a| { + let Some(s) = a.to_str() else { + return false; + }; + s == exact || s.starts_with(&prefix) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn argv(parts: &[&str]) -> Vec { + parts.iter().map(OsString::from).collect() + } + + fn env(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| ((*k).to_string(), (*v).to_string())) + .collect() + } + + const TIME_STYLE: EnvDefault = EnvDefault { + arg_id: "time-style", + long: "time-style", + env_var: "TIME_STYLE", + kind: EnvKind::Single, + }; + const TABSIZE: EnvDefault = EnvDefault { + arg_id: "tab-size", + long: "tabsize", + env_var: "TABSIZE", + kind: EnvKind::Single, + }; + const COLOR_TOGGLE: EnvDefault = EnvDefault { + arg_id: "color", + long: "color", + env_var: "CLICOLOR", + kind: EnvKind::Bool, + }; + + #[test] + fn single_value_env_synthesised_when_argv_silent() { + let out = apply_env_defaults( + argv(&["ls"]), + &[TIME_STYLE], + &env(&[("TIME_STYLE", "long-iso")]), + ); + assert_eq!(out, argv(&["ls", "--time-style", "long-iso"])); + } + + #[test] + fn argv_long_overrides_env_default() { + let out = apply_env_defaults( + argv(&["ls", "--time-style", "iso"]), + &[TIME_STYLE], + &env(&[("TIME_STYLE", "long-iso")]), + ); + // ctx.env value not appended; original argv preserved verbatim. + assert_eq!(out, argv(&["ls", "--time-style", "iso"])); + } + + #[test] + fn argv_long_equals_form_overrides_env_default() { + let out = apply_env_defaults( + argv(&["ls", "--time-style=iso"]), + &[TIME_STYLE], + &env(&[("TIME_STYLE", "long-iso")]), + ); + assert_eq!(out, argv(&["ls", "--time-style=iso"])); + } + + #[test] + fn missing_env_var_leaves_argv_alone() { + let out = apply_env_defaults(argv(&["ls"]), &[TIME_STYLE], &env(&[])); + assert_eq!(out, argv(&["ls"])); + } + + #[test] + fn empty_env_value_is_treated_as_unset() { + // Matches clap::Arg::env behaviour: empty == not set for value + // options (NonEmptyStringValueParser would reject anyway). + let out = apply_env_defaults(argv(&["ls"]), &[TIME_STYLE], &env(&[("TIME_STYLE", "")])); + assert_eq!(out, argv(&["ls"])); + } + + #[test] + fn bool_kind_appends_flag_only() { + let out = apply_env_defaults(argv(&["ls"]), &[COLOR_TOGGLE], &env(&[("CLICOLOR", "1")])); + assert_eq!(out, argv(&["ls", "--color"])); + } + + #[test] + fn multiple_defaults_processed_independently() { + let out = apply_env_defaults( + argv(&["ls"]), + &[TIME_STYLE, TABSIZE], + &env(&[("TIME_STYLE", "iso"), ("TABSIZE", "4")]), + ); + assert_eq!(out, argv(&["ls", "--time-style", "iso", "--tabsize", "4"])); + } + + #[test] + fn does_not_read_std_env() { + // Sanity: a var present on the host process but absent from + // ctx.env must NOT leak through. Belt-and-braces against + // future regressions of TM-INF-024; the fn signature already + // forbids std::env access (no &std::env::Vars argument). + // SAFETY: `std::env::set_var` is unsafe in 2024 edition; this + // test reads its own var only, doesn't race with anything. + unsafe { std::env::set_var("TIME_STYLE_HOST_LEAK_PROBE", "leaked") }; + let out = apply_env_defaults( + argv(&["ls"]), + &[EnvDefault { + arg_id: "probe", + long: "probe", + env_var: "TIME_STYLE_HOST_LEAK_PROBE", + kind: EnvKind::Single, + }], + &env(&[]), // empty virtual env + ); + unsafe { std::env::remove_var("TIME_STYLE_HOST_LEAK_PROBE") }; + assert_eq!(out, argv(&["ls"])); + } + + #[test] + fn argv_program_name_is_never_matched_as_flag() { + // If someone passes argv[0] = "--time-style" by mistake, we + // must still inject the env default — argv[0] is the program + // name slot per clap convention. + let out = apply_env_defaults( + argv(&["--time-style"]), + &[TIME_STYLE], + &env(&[("TIME_STYLE", "iso")]), + ); + assert_eq!(out, argv(&["--time-style", "--time-style", "iso"])); + } +} diff --git a/crates/bashkit/src/builtins/generated/cat_args.rs b/crates/bashkit/src/builtins/generated/cat_args.rs index adecac93..39684cea 100644 --- a/crates/bashkit/src/builtins/generated/cat_args.rs +++ b/crates/bashkit/src/builtins/generated/cat_args.rs @@ -34,6 +34,11 @@ use options::*; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from the runtime Arg chain (TM-INF-024). Empty for utils +/// whose `uu_app()` has no `.env(...)` calls; emitted unconditionally +/// so the bashkit-side surface is uniform. +pub static CAT_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[]; pub fn cat_command() -> Command { Command::new("cat") .version(env!("CARGO_PKG_VERSION")) diff --git a/crates/bashkit/src/builtins/generated/ls_args.rs b/crates/bashkit/src/builtins/generated/ls_args.rs index d2c7eb0d..9f3bbf26 100644 --- a/crates/bashkit/src/builtins/generated/ls_args.rs +++ b/crates/bashkit/src/builtins/generated/ls_args.rs @@ -99,6 +99,25 @@ use options::*; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from `ls_command()` (TM-INF-024). Consumed by +/// `crate::builtins::clap_env::apply_env_defaults` so bashkit's +/// virtual `ctx.env` — never `std::env` — drives clap's env-default +/// path. Order matches the chain order in the original `uu_app()`. +pub static LS_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[ + crate::builtins::clap_env::EnvDefault { + arg_id: options::format::TAB_SIZE, + long: options::format::TAB_SIZE, + env_var: "TABSIZE", + kind: crate::builtins::clap_env::EnvKind::Single, + }, + crate::builtins::clap_env::EnvDefault { + arg_id: options::TIME_STYLE, + long: options::TIME_STYLE, + env_var: "TIME_STYLE", + kind: crate::builtins::clap_env::EnvKind::Single, + }, +]; pub fn ls_command() -> Command { Command::new("ls") .version(env!("CARGO_PKG_VERSION")) diff --git a/crates/bashkit/src/builtins/generated/readlink_args.rs b/crates/bashkit/src/builtins/generated/readlink_args.rs index c1c4c151..2df30fcd 100644 --- a/crates/bashkit/src/builtins/generated/readlink_args.rs +++ b/crates/bashkit/src/builtins/generated/readlink_args.rs @@ -28,6 +28,11 @@ const ARG_FILES: &str = "files"; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from the runtime Arg chain (TM-INF-024). Empty for utils +/// whose `uu_app()` has no `.env(...)` calls; emitted unconditionally +/// so the bashkit-side surface is uniform. +pub static READLINK_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[]; pub fn readlink_command() -> Command { Command::new("readlink") .version(env!("CARGO_PKG_VERSION")) diff --git a/crates/bashkit/src/builtins/generated/shuf_args.rs b/crates/bashkit/src/builtins/generated/shuf_args.rs index f531ce0a..91faf373 100644 --- a/crates/bashkit/src/builtins/generated/shuf_args.rs +++ b/crates/bashkit/src/builtins/generated/shuf_args.rs @@ -32,6 +32,11 @@ use options::*; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from the runtime Arg chain (TM-INF-024). Empty for utils +/// whose `uu_app()` has no `.env(...)` calls; emitted unconditionally +/// so the bashkit-side surface is uniform. +pub static SHUF_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[]; fn parse_range(input_range: &str) -> Result, String> { if let Some((from, to)) = input_range.split_once('-') { let begin = from.parse::().map_err(|e| e.to_string())?; diff --git a/crates/bashkit/src/builtins/generated/tac_args.rs b/crates/bashkit/src/builtins/generated/tac_args.rs index 3c0e82ef..a4136def 100644 --- a/crates/bashkit/src/builtins/generated/tac_args.rs +++ b/crates/bashkit/src/builtins/generated/tac_args.rs @@ -27,6 +27,11 @@ use options::*; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from the runtime Arg chain (TM-INF-024). Empty for utils +/// whose `uu_app()` has no `.env(...)` calls; emitted unconditionally +/// so the bashkit-side surface is uniform. +pub static TAC_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[]; pub fn tac_command() -> Command { Command::new("tac") .version(env!("CARGO_PKG_VERSION")) diff --git a/crates/bashkit/src/builtins/generated/truncate_args.rs b/crates/bashkit/src/builtins/generated/truncate_args.rs index 664d4299..615b2e7b 100644 --- a/crates/bashkit/src/builtins/generated/truncate_args.rs +++ b/crates/bashkit/src/builtins/generated/truncate_args.rs @@ -28,6 +28,11 @@ use options::*; fn format_usage(s: &str) -> String { s.to_string() } +/// Sidecar harvest of every `Arg::env(...)` annotation the codegen +/// stripped from the runtime Arg chain (TM-INF-024). Empty for utils +/// whose `uu_app()` has no `.env(...)` calls; emitted unconditionally +/// so the bashkit-side surface is uniform. +pub static TRUNCATE_ENV_DEFAULTS: &[crate::builtins::clap_env::EnvDefault] = &[]; pub fn truncate_command() -> Command { let cmd = Command::new("truncate") .version(env!("CARGO_PKG_VERSION")) diff --git a/crates/bashkit/src/builtins/ls.rs b/crates/bashkit/src/builtins/ls.rs index b7c78e94..990d809b 100644 --- a/crates/bashkit/src/builtins/ls.rs +++ b/crates/bashkit/src/builtins/ls.rs @@ -7,7 +7,8 @@ use async_trait::async_trait; use std::ffi::OsString; use std::path::Path; -use super::generated::ls_args::ls_command; +use super::clap_env::apply_env_defaults; +use super::generated::ls_args::{LS_ENV_DEFAULTS, ls_command}; use super::{Builtin, Context, ExecutionPlan, SubCommand, resolve_path}; use crate::error::Result; use crate::fs::FileType; @@ -66,6 +67,11 @@ impl Builtin for Ls { let argv: Vec = std::iter::once(OsString::from("ls")) .chain(ctx.args.iter().map(OsString::from)) .collect(); + // Layer bashkit's virtual env over argv before clap parses it. + // Mirrors uutils' `Arg::env(...)` precedence (argv > env > default) + // but sources values from `ctx.env` instead of the host process — + // see TM-INF-024 and `builtins/clap_env.rs`. + let argv = apply_env_defaults(argv, LS_ENV_DEFAULTS, ctx.env); // GNU coreutils' help layout opens with the usage line; clap's // default template leads with the `about`. uutils handles this via @@ -1547,12 +1553,14 @@ mod tests { /// uutils' `uu_app()` attaches to an Arg via `.env(...)`) MUST NOT /// reach the clap parser. uutils' upstream wires /// `Arg::new(TIME_STYLE).env("TIME_STYLE")` so the option defaults - /// from the host process — bashkit strips that at codegen time and - /// drops the `env` clap feature workspace-wide as defence-in-depth. - /// Without those guards a plain `ls` in a container that exports - /// `TIME_STYLE=long-iso` would trip the unsupported-option gate - /// (since bashkit hasn't implemented `--time-style` yet) and break - /// `ls` for every script running on that host. + /// from the host process — bashkit strips that at codegen time, drops + /// the `env` clap feature workspace-wide as defence-in-depth, and + /// re-implements env-default precedence against `ctx.env` only via + /// `apply_env_defaults`. Without those guards a plain `ls` in a + /// container that exports `TIME_STYLE=long-iso` would trip the + /// unsupported-option gate (since bashkit hasn't implemented + /// `--time-style` yet) and break `ls` for every script running on + /// that host. #[tokio::test] #[serial_test::serial] async fn ls_ignores_host_time_style_and_tabsize() { @@ -1607,6 +1615,96 @@ mod tests { ); } + /// Counterpart to `ls_ignores_host_time_style_and_tabsize`: when + /// `TIME_STYLE` lives in bashkit's *virtual* env (`ctx.env`), the + /// codegen-emitted `LS_ENV_DEFAULTS` table feeds it through + /// `apply_env_defaults` and clap honours it just as upstream + /// uutils would honour `std::env::var("TIME_STYLE")`. We assert + /// that path by observing the clap value-source: ls today rejects + /// `--time-style` with a "not yet implemented" message, but only + /// because clap saw the option at all. If the env→argv shim were + /// silently broken, ls would succeed. + #[tokio::test] + async fn ls_honors_virtual_env_time_style() { + let (fs, mut cwd, mut variables) = create_test_ctx().await; + fs.write_file(&cwd.join("a.txt"), b"a").await.unwrap(); + let mut env = HashMap::new(); + env.insert("TIME_STYLE".to_string(), "long-iso".to_string()); + let args: Vec = vec![]; + let ctx = Context { + args: &args, + env: &env, + variables: &mut variables, + cwd: &mut cwd, + fs: fs.clone(), + stdin: None, + #[cfg(feature = "http_client")] + http_client: None, + #[cfg(feature = "git")] + git_client: None, + #[cfg(feature = "ssh")] + ssh_client: None, + shell: None, + }; + + let result = Ls.execute(ctx).await.unwrap(); + + assert_eq!( + result.exit_code, 2, + "virtual TIME_STYLE should reach clap's parser; got stdout={} \ + stderr={}", + result.stdout, result.stderr + ); + assert!( + result.stderr.contains("not yet implemented") && result.stderr.contains("time-style"), + "expected unsupported-option error mentioning time-style; got stderr={}", + result.stderr + ); + } + + /// Argv-set `--time-style` must take precedence over a `ctx.env` + /// value, matching clap's documented "argv > env > default" + /// precedence and `apply_env_defaults`'s contract. Easiest way to + /// observe it: pass an explicit `--time-style=iso`, set a + /// *different* value in `ctx.env`, and assert the error message + /// surfaces the explicit one (or at least that env did not + /// double-inject — the unsupported-option path collapses both + /// into the same diagnostic, so we just confirm exit==2 and that + /// argv wasn't mangled). + #[tokio::test] + async fn ls_argv_time_style_overrides_virtual_env() { + let (fs, mut cwd, mut variables) = create_test_ctx().await; + let mut env = HashMap::new(); + env.insert("TIME_STYLE".to_string(), "long-iso".to_string()); + let args: Vec = vec!["--time-style=iso".to_string()]; + let ctx = Context { + args: &args, + env: &env, + variables: &mut variables, + cwd: &mut cwd, + fs: fs.clone(), + stdin: None, + #[cfg(feature = "http_client")] + http_client: None, + #[cfg(feature = "git")] + git_client: None, + #[cfg(feature = "ssh")] + ssh_client: None, + shell: None, + }; + let result = Ls.execute(ctx).await.unwrap(); + // Same unsupported-option gate as the virtual-env-only test; + // the precedence-correctness signal is that we don't see + // *two* sources or a clap parse failure complaining about + // duplicate `--time-style`. + assert_eq!(result.exit_code, 2); + assert!( + !result.stderr.contains("supplied more than once"), + "shim double-injected --time-style: stderr={}", + result.stderr + ); + } + // ==================== find tests ==================== #[tokio::test] diff --git a/crates/bashkit/src/builtins/mod.rs b/crates/bashkit/src/builtins/mod.rs index 22003ab2..d7c42129 100644 --- a/crates/bashkit/src/builtins/mod.rs +++ b/crates/bashkit/src/builtins/mod.rs @@ -32,6 +32,7 @@ mod bc; mod caller; mod cat; mod checksum; +mod clap_env; mod clear; mod column; mod comm; @@ -1112,10 +1113,13 @@ mod tests { /// `ctx.env`; if the host parser were allowed to consult `std::env` /// the sandbox boundary would leak (host can probe presence, host can /// inject values that propagate into bashkit's option-validation - /// path). Codegen strips `.env(...)` from generated Arg chains; this + /// path). Codegen strips `.env(...)` from generated Arg chains and + /// re-emits the metadata as a `_ENV_DEFAULTS` table that the + /// `clap_env::apply_env_defaults` shim feeds from `ctx.env`. This /// static guard makes sure no future regen (or hand-edit) re-adds - /// one. Defence-in-depth: the workspace `clap` dep also drops the - /// `env` cargo feature, so a slipped `.env(...)` won't compile. + /// a runtime `.env(...)` call. Defence-in-depth: the workspace + /// `clap` dep also drops the `env` cargo feature, so a slipped + /// `.env(...)` won't compile. #[test] fn no_clap_env_in_generated_parsers() { let pat = regex::Regex::new(r"\.env\s*\(").unwrap(); @@ -1129,6 +1133,12 @@ mod tests { } let src = std::fs::read_to_string(&path).expect("read generated file"); for (i, line) in src.lines().enumerate() { + // Skip doc/line comments — they reference `.env(...)` + // when describing the harvest rule. We only care about + // real call expressions. + if line.trim_start().starts_with("//") { + continue; + } if pat.is_match(line) { let rel = path .strip_prefix(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) @@ -1142,11 +1152,77 @@ mod tests { "clap `Arg::env(...)` found in a generated parser. This pulls \ defaults from the host process environment and breaks bashkit's \ sandbox boundary (TM-INF-024). Re-run `just regen-coreutils-args` \ - — the codegen strips these — or remove the call by hand.\n\n{}", + — the codegen harvests these into `_ENV_DEFAULTS` instead — \ + or remove the call by hand.\n\n{}", violations.join("\n") ); } + /// Every `_args.rs` MUST expose a `_ENV_DEFAULTS` static. + /// The codegen always emits one (possibly empty) so the bashkit-side + /// surface is uniform — every clap-based builtin can wire through + /// `apply_env_defaults` without per-util conditional code. Catches + /// the regression where a regen drops the table because the codegen + /// branch that emits it was removed or skipped. + #[test] + fn every_generated_parser_emits_env_defaults_table() { + let dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("src/builtins/generated"); + let mut missing = Vec::new(); + for entry in std::fs::read_dir(&dir).expect("read generated dir") { + let entry = entry.unwrap(); + let path = entry.path(); + let name = path.file_name().and_then(|s| s.to_str()).unwrap_or(""); + if !name.ends_with("_args.rs") { + continue; + } + // util name is everything before `_args.rs` + let util = name.trim_end_matches("_args.rs"); + let const_name = format!("{}_ENV_DEFAULTS", util.to_uppercase()); + let needle = format!("pub static {const_name}"); + let src = std::fs::read_to_string(&path).expect("read generated file"); + if !src.contains(&needle) { + let rel = path + .strip_prefix(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) + .unwrap_or(&path); + missing.push(format!("{}: missing `{needle}: ...`", rel.display())); + } + } + assert!( + missing.is_empty(), + "Generated parser is missing its `_ENV_DEFAULTS` sidecar. \ + The codegen always emits this (possibly empty) so the bashkit \ + builtin can route argv through `apply_env_defaults` without \ + per-util conditionals. Re-run `just regen-coreutils-args` to \ + regenerate.\n\n{}", + missing.join("\n") + ); + } + + /// Pin LS's env-default surface explicitly. uutils' upstream `ls` + /// uses `.env("TABSIZE")` and `.env("TIME_STYLE")` as of the pinned + /// rev — both must appear in `LS_ENV_DEFAULTS`, with matching long + /// flags, or the virtual-env shim silently drops them. Updating + /// uutils may legitimately add or remove rows; bump this list in + /// the same PR as the codegen regen. + #[test] + fn ls_env_defaults_surface_matches_uutils() { + use crate::builtins::generated::ls_args::LS_ENV_DEFAULTS; + let mut got: Vec<(&'static str, &'static str)> = LS_ENV_DEFAULTS + .iter() + .map(|d| (d.env_var, d.long)) + .collect(); + got.sort(); + let mut expected = vec![("TABSIZE", "tabsize"), ("TIME_STYLE", "time-style")]; + expected.sort(); + assert_eq!( + got, expected, + "LS_ENV_DEFAULTS surface drifted from upstream uutils. Either \ + the codegen harvest dropped a row, or uutils added/removed an \ + `.env(...)` annotation on `ls` — bump this fixture together \ + with the regen." + ); + } + /// Every `_args.rs` header MUST reference the same uutils /// revision as `generated::UUTILS_REVISION`. The drift workflow /// bumps both atomically; this test catches the case where someone diff --git a/specs/coreutils-args-port.md b/specs/coreutils-args-port.md index 66e7030d..d6bce4cb 100644 --- a/specs/coreutils-args-port.md +++ b/specs/coreutils-args-port.md @@ -22,13 +22,28 @@ codegen**, not by depending on `uu_*` crates at runtime. - `uucore::clap_localization::configure_localized_command(cmd)` → `cmd` - `ShortcutValueParser::new([…])` → `clap::builder::PossibleValuesParser::new([…])` (loses uucore's unambiguous-abbreviation behaviour; documented divergence) - - `Arg::…env("FOO")…` → chain step elided (TM-INF-024). uutils - attaches `.env(...)` to options like `TABSIZE`/`TIME_STYLE` so they - pick up host process state; bashkit sandboxes scripts inside - `ctx.env`, so generated parsers must only consult argv. The - workspace `clap` dep also drops the `env` cargo feature as - defence-in-depth, and `builtins::tests::no_clap_env_in_generated_parsers` - statically forbids `.env(` from re-appearing in `generated/*.rs`. + - `Arg::…env("FOO")…` → chain step elided AND harvested into a + sidecar table (TM-INF-024). uutils attaches `.env(...)` to options + like `TABSIZE`/`TIME_STYLE` so they pick up host process state; + bashkit sandboxes scripts inside `ctx.env`, so the generated + `_command()` only consults argv. To preserve uutils' UX + across the port, codegen records each stripped `.env(...)` into + `pub static _ENV_DEFAULTS: &[clap_env::EnvDefault]` next to + the command builder. Each row carries `(arg_id, long, env_var, + kind ∈ {Single, Bool, Multi})`. The bashkit-side shim + `crate::builtins::clap_env::apply_env_defaults` reads + `_ENV_DEFAULTS` plus the caller's `ctx.env` and synthesises + `-- ` (or `--` for `Bool`) into argv before + `try_get_matches_from`, emulating clap's documented "argv > env > + default" precedence — but sourced from the sandbox, never + `std::env`. Defence-in-depth: the workspace `clap` dep drops the + `env` cargo feature, `builtins::tests::no_clap_env_in_generated_parsers` + statically forbids runtime `.env(` calls in `generated/*.rs`, and + `every_generated_parser_emits_env_defaults_table` enforces the + uniform sidecar surface (every util emits the table, possibly + empty). Per-builtin opt-in: a builtin chooses whether to wire + through the shim — if it does, every uutils env-default + auto-lights as that option's bashkit support lands. 5. Emits a generated file under `crates/bashkit/src/builtins/generated/_args.rs` with a clean `pub fn _command() -> clap::Command`. diff --git a/specs/threat-model.md b/specs/threat-model.md index f341c9fb..8e9df553 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -438,7 +438,7 @@ All execution stays within the virtual interpreter — no OS subprocess is spawn | TM-INF-021 | Stack backtrace information disclosure | Panics leak internal source paths, dependency versions, and function names via stderr | Custom panic hook suppresses backtraces in CLI | **MITIGATED** | | TM-INF-022 | Library Debug shapes leak via stderr | Builtins wrapping external libs (jaq, regex, serde_json, semver, chrono, …) format errors with `format!("{:?}", e)` and dump internal struct shapes into stderr — e.g. jq printed `(File { code: "<800 chars of prepended compat-defs source>", path: () }, [("@tsv", Filter(0))])` to LLM agents | Three-layer enforcement, all run by `cargo test`: (1) static — `builtins::tests::no_debug_fmt_in_builtin_source` walks every `crates/bashkit/src/builtins/*.rs` and forbids `{:?}` / `{:#?}` / `{name:?}` (per-line opt-out: `// debug-ok: `); (2) dynamic per-tool — each tool's `mod tests` calls `bashkit::testing::assert_no_leak` with malformed inputs; (3) fuzz — every cargo-fuzz target plus 5 proptest cases call `bashkit::testing::assert_fuzz_invariants` against random input and check the same invariants on stderr/stdout | **FIXED** | | TM-INF-023 | jq `halt` / `halt_error` terminate the host process | jaq-std 3.0's `halt(N)` native calls `std::process::exit(exit_code)`. `defs.jq` wraps it with `def halt: halt(0);` and `def halt_error(...): ..., halt(...);`. Untrusted jq filters can invoke `halt` / `halt_error` to exit the entire embedding process, escaping the `ExecResult`-based sandbox boundary and causing process-wide DoS for any caller hosting bashkit. | Strip the upstream `halt` native from `jaq_std::funs::()` and chain in a safe replacement that pops the variable argument and returns `Error::str("halt is disabled in the bashkit sandbox")`. The wrapper defs (`halt`, `halt_error`) still resolve, so callers see a normal jq runtime error (exit 5) instead of the host being killed. Regression tests in `builtins::jq::tests`: `halt_does_not_terminate_host_process`, `halt_with_arg_does_not_terminate_host_process`, `halt_error_does_not_terminate_host_process`. | **FIXED** | -| TM-INF-024 | Host env side-channel via clap `Arg::env(...)` | Clap-based builtins ported from uutils (currently `ls`, with `.env("TABSIZE")` and `.env("TIME_STYLE")` on `uu_app()`) resolve those defaults from `std::env`, not bashkit's `ctx.env`. This breaks the sandbox boundary two ways: (1) presence-probe — a script can tell whether the host process has `TIME_STYLE`/`TABSIZE` set by observing `ls`'s behaviour; (2) availability — a host- or container-set `TIME_STYLE` materialises as a clap value source for an option bashkit hasn't implemented yet, so the unsupported-option gate fires on every plain `ls` and breaks the builtin for unrelated tenants | Three-layer fix: (1) **codegen strips `.env(...)`** — `bashkit-coreutils-port` rewrites `.env("FOO")` chained method calls out of every Arg builder it ports, so generated `_args.rs` only ever consults argv; (2) **static guard** — `builtins::tests::no_clap_env_in_generated_parsers` greps every `crates/bashkit/src/builtins/generated/*.rs` and fails the build if any `.env(` call sneaks back in; (3) **defence-in-depth** — workspace `clap` dep drops the `env` cargo feature, so a re-introduced `.env(...)` fails to compile rather than silently re-opening the channel | **FIXED** | +| TM-INF-024 | Host env side-channel via clap `Arg::env(...)` | Clap-based builtins ported from uutils (currently `ls`, with `.env("TABSIZE")` and `.env("TIME_STYLE")` on `uu_app()`) resolve those defaults from `std::env`, not bashkit's `ctx.env`. This breaks the sandbox boundary two ways: (1) presence-probe — a script can tell whether the host process has `TIME_STYLE`/`TABSIZE` set by observing `ls`'s behaviour; (2) availability — a host- or container-set `TIME_STYLE` materialises as a clap value source for an option bashkit hasn't implemented yet, so the unsupported-option gate fires on every plain `ls` and breaks the builtin for unrelated tenants | Four-layer fix: (1) **codegen strips runtime `.env(...)` and harvests it** — `bashkit-coreutils-port` rewrites `.env("FOO")` chained method calls out of every Arg builder it ports AND emits a sidecar `_ENV_DEFAULTS: &[EnvDefault]` table recording each stripped annotation; (2) **virtual-env shim** — `crate::builtins::clap_env::apply_env_defaults` rewrites argv from `ctx.env` (never `std::env`) before `try_get_matches_from`, emulating clap's "argv > env > default" precedence so uutils' env-default UX is preserved without re-opening the host channel; (3) **static guards** — `no_clap_env_in_generated_parsers` forbids runtime `.env(` calls in `generated/*.rs`, and `every_generated_parser_emits_env_defaults_table` enforces every util emits the sidecar (uniform surface, no per-util conditionals); (4) **defence-in-depth** — workspace `clap` dep drops the `env` cargo feature, so a re-introduced `.env(...)` fails to compile rather than silently re-opening the channel. Coverage: ls runtime tests pin both directions — `ls_ignores_host_time_style_and_tabsize` (host `std::env` ignored) and `ls_honors_virtual_env_time_style` (virtual `ctx.env` honoured) | **FIXED** | **TM-INF-022**: Generalizes TM-INF-016 to cover the whole builtin surface. The originating bug was `builtins/jq/` formatting jaq's compile/parse