diff --git a/.gitignore b/.gitignore index 8c2df2fb1..cbe293ff4 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ coverage-isolated *.junit.xml test/skill-eval/results.json +# local bench baselines — machine-specific, not for version control +.bench/ + # logs logs _.log diff --git a/bun.lock b/bun.lock index 406cc30ea..41f9d1d01 100644 --- a/bun.lock +++ b/bun.lock @@ -17,6 +17,7 @@ "@types/bun": "latest", "@types/http-cache-semantics": "^4.2.0", "@types/node": "^22", + "@types/picomatch": "^4.0.3", "@types/qrcode-terminal": "^0.12.2", "@types/semver": "^7.7.1", "binpunch": "^1.0.0", @@ -226,6 +227,8 @@ "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], + "@types/picomatch": ["@types/picomatch@4.0.3", "", {}, "sha512-iG0T6+nYJ9FAPmx9SsUlnwcq1ZVRuCXcVEvWnntoPlrOpwtSTKNDC9uVAxTsC3PUvJ+99n4RpAcNgBbHX3JSnQ=="], + "@types/qrcode-terminal": ["@types/qrcode-terminal@0.12.2", "", {}, "sha512-v+RcIEJ+Uhd6ygSQ0u5YYY7ZM+la7GgPbs0V/7l/kFs2uO4S8BcIUEMoP7za4DNIqNnUD5npf0A/7kBhrCKG5Q=="], "@types/qs": ["@types/qs@6.14.0", "", {}, "sha512-eOunJqu0K1923aExK6y8p6fsihYEn/BYuQ4g0CxAAgFc4b/ZLN4CrsRZ55srTdqoiLzU2B2evC+apEIxprEzkQ=="], diff --git a/package.json b/package.json index de121753b..c5fad8711 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "@types/bun": "latest", "@types/http-cache-semantics": "^4.2.0", "@types/node": "^22", + "@types/picomatch": "^4.0.3", "@types/qrcode-terminal": "^0.12.2", "@types/semver": "^7.7.1", "binpunch": "^1.0.0", @@ -91,6 +92,10 @@ "generate:schema": "bun run script/generate-api-schema.ts", "generate:command-docs": "bun run script/generate-command-docs.ts", "eval:skill": "bun run script/eval-skill.ts", + "bench": "bun run script/bench.ts", + "bench:save": "bun run script/bench.ts --save-baseline", + "bench:compare": "bun run script/bench.ts --compare", + 
"bench:sweep": "bun run script/bench-sweep.ts", "check:fragments": "bun run script/check-fragments.ts", "check:deps": "bun run script/check-no-deps.ts", "check:errors": "bun run script/check-error-patterns.ts", diff --git a/script/bench-sweep.ts b/script/bench-sweep.ts new file mode 100644 index 000000000..c168b3438 --- /dev/null +++ b/script/bench-sweep.ts @@ -0,0 +1,375 @@ +#!/usr/bin/env bun +/** + * Concurrency sweep for `src/lib/scan/` hot paths. + * + * Goal: measure how the walker + grep scale with `concurrency` on + * the synthetic bench fixtures, so we can pick a data-driven default + * for `CONCURRENCY_LIMIT`. + * + * The main bench harness (`script/bench.ts`) uses a fixed concurrency + * inherited from the DSN scanner. This script is a one-shot + * diagnostic run by contributors when tuning perf — it's not wired + * into CI. + * + * Usage: + * bun run bench:sweep # full sweep on medium+large + * bun run bench:sweep --size small # one preset + * bun run bench:sweep --values 1,2,4,8,16,32 # custom concurrency grid + * bun run bench:sweep --runs 10 --warmup 3 # override run counts + * bun run bench:sweep --json > sweep.json # machine-readable + * + * Output: a per-(fixture, op) table of p50 times across the + * concurrency grid, plus a "knee" annotation flagging the value + * past which additional parallelism yields < 3% improvement. + */ + +import { existsSync, mkdirSync } from "node:fs"; +import { arch, availableParallelism, cpus, platform, tmpdir } from "node:os"; +import { join } from "node:path"; +import { + type FixtureSpec, + generateFixture, + hashSpec, +} from "../test/fixtures/bench/generate.js"; +import { + measure, + summarize, + withBenchDb, +} from "../test/fixtures/bench/helpers.js"; +import { + PRESET_NAMES, + PRESETS, + type PresetName, +} from "../test/fixtures/bench/presets.js"; + +/** Default concurrency values we sweep across. 
*/ +const DEFAULT_VALUES = [1, 2, 4, 8, 16, 32, 50, 100, 200] as const; + +/** Default fixture sizes we sweep on. `small` rarely shows signal. */ +const DEFAULT_SIZES: readonly PresetName[] = ["medium", "large"]; + +/** + * DSN scanner hot regex — reused by the `scan.grepFiles` op. Kept at + * module scope to satisfy Biome's `useTopLevelRegex` rule. + */ +const DSN_PATTERN = + /https?:\/\/[a-z0-9]+(?::[a-z0-9]+)?@[a-z0-9.-]+(?:\.[a-z]+|:[0-9]+)\/\d+/i; + +type SweepArgs = { + sizes: readonly PresetName[]; + values: readonly number[]; + runs: number; + warmup: number; + json: boolean; + kneeThresholdPct: number; +}; + +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI flag switch is inherently branchy +function parseArgs(argv: readonly string[]): SweepArgs { + const sizes: PresetName[] = [...DEFAULT_SIZES]; + let values: number[] = [...DEFAULT_VALUES]; + let runs = 5; + let warmup = 2; + let json = false; + const kneeThresholdPct = 0.03; // 3% + + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i] as string; + const next = argv[i + 1]; + switch (arg) { + case "--size": { + if (!next) { + throw new Error("--size requires a value"); + } + if (next === "all") { + sizes.length = 0; + sizes.push(...PRESET_NAMES); + } else if ((PRESET_NAMES as readonly string[]).includes(next)) { + sizes.length = 0; + sizes.push(next as PresetName); + } else { + throw new Error( + `Unknown size '${next}'. 
Valid: ${PRESET_NAMES.join(", ")}, all` + ); + } + i += 1; + break; + } + case "--values": { + if (!next) { + throw new Error("--values requires a comma-separated list"); + } + values = next + .split(",") + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0); + if (values.length === 0) { + throw new Error("--values must contain at least one positive number"); + } + i += 1; + break; + } + case "--runs": { + if (!next) { + throw new Error("--runs requires a number"); + } + runs = Number(next); + i += 1; + break; + } + case "--warmup": { + if (!next) { + throw new Error("--warmup requires a number"); + } + warmup = Number(next); + i += 1; + break; + } + case "--json": { + json = true; + break; + } + case "-h": + case "--help": { + printHelp(); + process.exit(0); + break; + } + default: + throw new Error(`Unknown flag: ${arg}`); + } + } + + return { sizes, values, runs, warmup, json, kneeThresholdPct }; +} + +function printHelp(): void { + console.log("Usage: bun run bench:sweep [--size small|medium|large|all]"); + console.log(" [--values 1,2,4,8,...]"); + console.log(" [--runs N] [--warmup N]"); + console.log(" [--json]"); +} + +/** Resolve (or create) a synthetic fixture, same as bench.ts. */ +function resolveFixture(name: PresetName): { label: string; rootDir: string } { + const preset = PRESETS[name]; + // biome-ignore lint/suspicious/noBitwiseOperators: deterministic seed mix + const seed = (0xde_ad_be_ef ^ hashStr(name)) >>> 0; + const spec: FixtureSpec = { + ...preset, + seed, + rootDir: "", + }; + const { rootDir: _unused, ...specNoRoot } = spec; + const hash = hashSpec(specNoRoot); + const rootDir = join(tmpdir(), "sentry-cli-bench", `fx-${name}-${hash}`); + mkdirSync(rootDir, { recursive: true }); + generateFixture({ ...spec, rootDir }); + return { label: `synthetic/${name}`, rootDir }; +} + +/** Cheap 32-bit FNV-1a over a short string — same as bench.ts. 
*/ +// biome-ignore-start lint/suspicious/noBitwiseOperators: FNV-1a is a bitwise hash +function hashStr(s: string): number { + let h = 0x81_1c_9d_c5; + for (let i = 0; i < s.length; i += 1) { + h = Math.imul(h ^ s.charCodeAt(i), 0x01_00_01_93); + } + return h >>> 0; +} +// biome-ignore-end lint/suspicious/noBitwiseOperators: FNV-1a is a bitwise hash + +/** + * Single op × concurrency → p50 in ms. Returns NaN when the op + * throws or the fixture doesn't exist. + */ +type SweepResult = { + fixture: string; + operation: string; + concurrency: number; + p50: number; + p95: number; + runs: number; +}; + +/** + * Sweepable ops. Only ops that accept a `concurrency` override are + * included — the walker itself is sequential, so sweeping + * `scan.walk` would produce identical numbers across the grid. + * + * `scan.grepFiles` is the closest shape to `scanCodeForDsns` (walker + * + per-file read + regex). The knee we find here should transfer to + * the DSN scanner once we update `CONCURRENCY_LIMIT`. + */ +async function buildOps(): Promise< + Array<{ + label: string; + run: (cwd: string, concurrency: number) => Promise; + setup?: (cwd: string) => Promise; + }> +> { + const { collectGrep } = await import("../src/lib/scan/index.js"); + const { dsnScanOptions } = await import("../src/lib/dsn/scan-options.js"); + + return [ + { + label: "scan.grepFiles", + run: async (cwd, concurrency) => { + await collectGrep({ + cwd, + pattern: DSN_PATTERN, + ...dsnScanOptions(), + concurrency, + }); + }, + }, + ]; +} + +async function runSweep(args: SweepArgs): Promise { + const fixtures = args.sizes.map(resolveFixture); + const ops = await buildOps(); + const results: SweepResult[] = []; + + // Silence CLI telemetry — we don't want Sentry events from bench runs. 
+ process.env.SENTRY_CLI_NO_TELEMETRY = "1"; + + for (const fx of fixtures) { + if (!existsSync(fx.rootDir)) { + if (!args.json) { + console.error(`✗ fixture missing: ${fx.rootDir}`); + } + continue; + } + if (!args.json) { + console.log(`\n${fx.label} (${fx.rootDir})`); + } + await withBenchDb(async () => { + await sweepFixture(fx, ops, args, results); + }); + } + + return results; +} + +/** Inner loop body extracted to keep `runSweep`'s arity + complexity low. */ +async function sweepFixture( + fx: { label: string; rootDir: string }, + ops: Awaited>, + args: SweepArgs, + results: SweepResult[] +): Promise { + for (const op of ops) { + for (const concurrency of args.values) { + const samples = await measure(() => op.run(fx.rootDir, concurrency), { + runs: args.runs, + warmup: args.warmup, + beforeEach: op.setup ? () => op.setup?.(fx.rootDir) : undefined, + }); + const stats = summarize(samples); + results.push({ + fixture: fx.label, + operation: op.label, + concurrency, + p50: stats.p50, + p95: stats.p95, + runs: stats.runs, + }); + if (!args.json) { + console.log( + ` ${op.label.padEnd(24)} conc=${String(concurrency).padStart(3)} p50 ${stats.p50.toFixed(1).padStart(6)}ms p95 ${stats.p95.toFixed(1).padStart(6)}ms` + ); + } + } + } +} + +/** + * Given sorted-by-concurrency results for one (fixture, op), return + * the smallest concurrency value past which increasing concurrency + * yields < `thresholdPct` improvement in p50. + */ +function findKnee( + entries: readonly SweepResult[], + thresholdPct: number +): number | null { + const sorted = [...entries].sort((a, b) => a.concurrency - b.concurrency); + let bestP50 = Number.POSITIVE_INFINITY; + let kneeAt: number | null = null; + for (const e of sorted) { + const improvementRatio = (bestP50 - e.p50) / bestP50; + if ( + !Number.isFinite(improvementRatio) || + improvementRatio >= thresholdPct + ) { + bestP50 = Math.min(bestP50, e.p50); + kneeAt = e.concurrency; + } else { + // Stop improving. 
Previous kneeAt is our answer. + break; + } + } + return kneeAt; +} + +/** Render the per-(fixture, op) knee table. */ +function printKnees( + results: readonly SweepResult[], + thresholdPct: number +): void { + const byKey = new Map(); + for (const r of results) { + const key = `${r.fixture}||${r.operation}`; + const list = byKey.get(key) ?? []; + list.push(r); + byKey.set(key, list); + } + console.log(""); + console.log( + `Knee analysis (smallest concurrency past which each additional step gains < ${(thresholdPct * 100).toFixed(0)}%)` + ); + console.log("─".repeat(72)); + for (const [key, entries] of byKey) { + const [fixture, operation] = key.split("||"); + const knee = findKnee(entries, thresholdPct); + const minP50 = Math.min(...entries.map((e) => e.p50)); + console.log( + ` ${String(fixture).padEnd(20)} ${String(operation).padEnd(24)} knee = ${knee ?? "?"} (best p50 ${minP50.toFixed(1)}ms)` + ); + } + console.log(""); +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + const results = await runSweep(args); + + if (args.json) { + const report = { + generatedAt: new Date().toISOString(), + runtime: { + platform: platform(), + arch: arch(), + cpus: cpus().length, + availableParallelism: availableParallelism(), + }, + kneeThresholdPct: args.kneeThresholdPct, + results, + }; + process.stdout.write(`${JSON.stringify(report, null, 2)}\n`); + } else { + console.log( + `\nsystem: ${platform()}/${arch()}, availableParallelism=${availableParallelism()}` + ); + printKnees(results, args.kneeThresholdPct); + } + + return 0; +} + +main() + .then((code) => process.exit(code)) + .catch((error) => { + console.error(error); + process.exit(1); + }); diff --git a/script/bench.ts b/script/bench.ts new file mode 100644 index 000000000..63fa2a4d6 --- /dev/null +++ b/script/bench.ts @@ -0,0 +1,650 @@ +#!/usr/bin/env bun +/** + * Local benchmark harness for DSN detection, project-root finding, and + * (after the `src/lib/scan/` refactor lands) 
the generic scanner module. + * + * Goals: + * 1. Capture objective baselines *before* the scanner refactor so we can + * verify that the new pure-TS implementation lands within ~1.2x. + * 2. Feed data-driven decisions for the worker-pool + caching follow-ups. + * 3. Use operation labels that match the Sentry spans already emitted in + * production (`findProjectRoot`, `scanCodeForDsns`, etc.) so local + * numbers correlate with prod telemetry. + * + * The harness is deliberately zero-dependency: it builds synthetic repos + * from `test/fixtures/bench/` (parameterized + deterministic) or, if you + * pass `--repo /path` or set `BENCH_REPO=`, benches against a real repo. + * Baselines go to `.bench/baseline.json` (gitignored) — they're machine- + * specific and intentionally not version-controlled. + * + * Usage: + * bun run bench # all ops, all preset sizes + * bun run bench --size small # only the 'small' preset + * bun run bench --op detectDsn.cold # filter by operation (substring) + * bun run bench --repo /path/to/repo # bench a real repo (disables --save-baseline) + * bun run bench --warmup 3 --runs 10 # override default run counts + * bun run bench --json > report.json # machine-readable stdout + * bun run bench --save-baseline # write .bench/baseline.json + * bun run bench --compare # diff current vs .bench/baseline.json + * # (exit 1 if any p50 regresses >20%) + * bun run bench --regen-fixtures # force fixture regeneration + * + * Environment variables: + * BENCH_REPO Path to a real repo (equivalent to --repo) + * BENCH_RUNS Default measured run count (default: 10) + * BENCH_WARMUP Default warmup run count (default: 3) + * BENCH_THRESHOLD Default regression threshold for --compare (default: 0.2) + * + * Exit codes: + * 0 - Bench completed; no regression on --compare + * 1 - Invalid args, or --compare detected a p50 regression over threshold + */ + +import { existsSync, mkdirSync, readFileSync, rmSync } from "node:fs"; +import { readdir } from 
"node:fs/promises"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +import { + type FixtureSpec, + generateFixture, + hashSpec, +} from "../test/fixtures/bench/generate.js"; +import { + type BenchEntry, + type BenchReport, + clearDsnDetectionCache, + compareReports, + measure, + printComparison, + printReport, + runtimeInfo, + summarize, + withBenchDb, + writeJsonReport, +} from "../test/fixtures/bench/helpers.js"; +import { + PRESET_NAMES, + PRESETS, + type PresetName, +} from "../test/fixtures/bench/presets.js"; + +/** + * The DSN scanner's hot regex. Pinned at module scope per Biome's + * `useTopLevelRegex` rule; reused by the `scan.grepFiles` op. + */ +const DSN_PATTERN = + /https?:\/\/[a-z0-9]+(?::[a-z0-9]+)?@[a-z0-9.-]+(?:\.[a-z]+|:[0-9]+)\/\d+/i; + +// -------- Arg parsing -------- + +type CliArgs = { + sizes: readonly PresetName[]; + opFilter: string | undefined; + repo: string | undefined; + runs: number; + warmup: number; + json: boolean; + saveBaseline: boolean; + compare: boolean; + regenFixtures: boolean; + thresholdPct: number; +}; + +type ParseState = { + sizes: PresetName[]; + opFilter: string | undefined; + repo: string | undefined; + runs: number; + warmup: number; + json: boolean; + saveBaseline: boolean; + compare: boolean; + regenFixtures: boolean; +}; + +/** Apply a single flag (and its optional value) to the mutable parse state. */ +function applyFlag( + state: ParseState, + arg: string, + next: string | undefined +): boolean { + switch (arg) { + case "--size": { + if (!next) { + throw new Error("--size requires a value"); + } + if (next === "all") { + state.sizes = [...PRESET_NAMES]; + } else if ((PRESET_NAMES as readonly string[]).includes(next)) { + state.sizes = [next as PresetName]; + } else { + throw new Error( + `Unknown size '${next}'. 
Valid: ${PRESET_NAMES.join(", ")}, all` + ); + } + return true; + } + case "--op": { + if (!next) { + throw new Error("--op requires a value"); + } + state.opFilter = next; + return true; + } + case "--repo": { + if (!next) { + throw new Error("--repo requires a path"); + } + state.repo = next; + return true; + } + case "--runs": { + if (!next) { + throw new Error("--runs requires a number"); + } + state.runs = Number(next); + return true; + } + case "--warmup": { + if (!next) { + throw new Error("--warmup requires a number"); + } + state.warmup = Number(next); + return true; + } + case "--json": + state.json = true; + return false; + case "--save-baseline": + state.saveBaseline = true; + return false; + case "--compare": + state.compare = true; + return false; + case "--regen-fixtures": + state.regenFixtures = true; + return false; + case "-h": { + printHelp(); + process.exit(0); + break; + } + case "--help": { + printHelp(); + process.exit(0); + break; + } + default: + throw new Error(`Unknown flag: ${arg}`); + } +} + +function parseArgs(argv: readonly string[]): CliArgs { + const state: ParseState = { + sizes: [...PRESET_NAMES], + opFilter: undefined, + repo: process.env.BENCH_REPO, + runs: Number(process.env.BENCH_RUNS ?? 10), + warmup: Number(process.env.BENCH_WARMUP ?? 3), + json: false, + saveBaseline: false, + compare: false, + regenFixtures: false, + }; + const thresholdPct = Number(process.env.BENCH_THRESHOLD ?? 
0.2); + + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i] as string; + const next = argv[i + 1]; + if (applyFlag(state, arg, next)) { + i += 1; + } + } + + if (!Number.isFinite(state.runs) || state.runs < 1) { + throw new Error("--runs must be >= 1"); + } + if (!Number.isFinite(state.warmup) || state.warmup < 0) { + throw new Error("--warmup must be >= 0"); + } + if (state.saveBaseline && state.repo) { + throw new Error( + "--save-baseline is only valid with synthetic fixtures; remove --repo/BENCH_REPO" + ); + } + + return { ...state, thresholdPct }; +} + +function printHelp(): void { + console.log( + "Usage: bun run bench [--size small|medium|large|all] [--op NAME] [--repo PATH]" + ); + console.log(" [--runs N] [--warmup N]"); + console.log( + " [--json] [--save-baseline] [--compare] [--regen-fixtures]" + ); +} + +// -------- Fixture resolution -------- + +type FixtureHandle = { + label: string; + rootDir: string; + /** True when the fixture is an ephemeral synthetic tree that we may discard. */ + synthetic: boolean; + /** Reported file count for the header / JSON. */ + fileCount: number; +}; + +// biome-ignore-start lint/suspicious/noBitwiseOperators: FNV-1a is a bitwise hash +/** 32-bit FNV-1a hash of a string — supplies a stable per-preset seed. */ +function hashToSeed(s: string): number { + let h = 0x81_1c_9d_c5; + for (let i = 0; i < s.length; i += 1) { + h = Math.imul(h ^ s.charCodeAt(i), 0x01_00_01_93); + } + // Keep the result in the 32-bit signed range for determinism across engines. + return h >>> 0; +} +// biome-ignore-end lint/suspicious/noBitwiseOperators: FNV-1a is a bitwise hash + +/** Build (or reuse) a synthetic fixture for the given preset. */ +function resolveSyntheticFixture( + name: PresetName, + forceRegen: boolean +): FixtureHandle { + const preset = PRESETS[name]; + // Deterministic seed per preset so every contributor lands on the same tree. 
+  // We XOR an anchor constant with the per-preset name hash so seeds are
+  // spread across the 32-bit space even when preset names are similar.
+  // biome-ignore lint/suspicious/noBitwiseOperators: deterministic 32-bit seed mix
+  const seed = (0xde_ad_be_ef ^ hashToSeed(name)) >>> 0;
+  // Hash excludes rootDir, so the tmpdir name is stable per (preset, seed).
+  const specNoRoot = { ...preset, seed };
+  const hash = hashSpec(specNoRoot);
+  const rootDir = join(tmpdir(), "sentry-cli-bench", `fx-${name}-${hash}`);
+  mkdirSync(rootDir, { recursive: true });
+  if (forceRegen) {
+    // Blow away any previous tree so generateFixture starts from scratch.
+    rmSync(rootDir, { recursive: true, force: true });
+    mkdirSync(rootDir, { recursive: true });
+  }
+  const spec: FixtureSpec = { ...preset, seed, rootDir };
+  // NOTE(review): assumes generateFixture is cheap/no-op when the hashed
+  // dir is already populated and `force` is false — confirm against
+  // test/fixtures/bench/generate.ts.
+  const meta = generateFixture(spec, { force: forceRegen });
+  return {
+    label: `synthetic/${name}`,
+    rootDir,
+    synthetic: true,
+    fileCount: meta.fileCount,
+  };
+}
+
+/** Count files under a path with a lightweight walk (used for real-repo headers). */
+async function roughFileCount(root: string): Promise<number> {
+  const skip = new Set([
+    ".git",
+    "node_modules",
+    "dist",
+    "build",
+    ".venv",
+    "venv",
+  ]);
+  let count = 0;
+  async function walk(dir: string): Promise<void> {
+    let entries: Awaited<ReturnType<typeof readdir>>;
+    try {
+      entries = await readdir(dir, { withFileTypes: true });
+    } catch {
+      // Unreadable dir (permissions, races) — skip; this is only a
+      // rough count for display headers, not a correctness input.
+      return;
+    }
+    for (const entry of entries) {
+      if (entry.isDirectory()) {
+        if (!skip.has(entry.name)) {
+          await walk(join(dir, entry.name));
+        }
+      } else if (entry.isFile()) {
+        // Symlinks satisfy neither isDirectory() nor isFile() here, so
+        // they are neither followed nor counted — acceptable for a
+        // rough count (also avoids symlink cycles).
+        count += 1;
+      }
+    }
+  }
+  await walk(root);
+  return count;
+}
+
+// -------- Operation registry --------
+
+/** A single benchmarkable operation body; cwd is the fixture root. */
+type OpRunner = (cwd: string) => Promise<void>;
+type OpEntry = {
+  label: string;
+  warm: boolean;
+  /** Called before every measured iteration — use for cold-cache resets. */
+  setup?: (cwd: string) => Promise<void>;
+  run: OpRunner;
+};
+
+/**
+ * Build the full operation registry. Labels deliberately match the
+ * Sentry span names emitted in production (see file header) so local
+ * numbers correlate with prod telemetry.
+ */
+async function buildOps(): Promise<OpEntry[]> {
+  // Lazy-import production code so unit tests can import the helpers/fixtures
+  // without loading all of @sentry/node-core.
+  const { detectDsn, detectAllDsns } = await import(
+    "../src/lib/dsn/detector.js"
+  );
+  const { findProjectRoot } = await import("../src/lib/dsn/project-root.js");
+  const { scanCodeForDsns, scanCodeForFirstDsn } = await import(
+    "../src/lib/dsn/code-scanner.js"
+  );
+  // Scan module — not yet wired into DSN detection (PR 3 will do that).
+  // These ops give us standalone baselines so PR 2/PR 3 can compare.
+  const { walkFiles, IgnoreStack, TEXT_EXTENSIONS, collectGrep } = await import(
+    "../src/lib/scan/index.js"
+  );
+  // DSN-parity preset — used by the `scan.walk.dsnParity` op below.
+  const { dsnScanOptions } = await import("../src/lib/dsn/scan-options.js");
+
+  const coldSetup = (cwd: string) => clearDsnDetectionCache(cwd);
+
+  return [
+    {
+      label: "findProjectRoot",
+      warm: false,
+      setup: coldSetup,
+      run: async (cwd) => {
+        await findProjectRoot(cwd);
+      },
+    },
+    {
+      label: "detectDsn.cold",
+      warm: false,
+      setup: coldSetup,
+      run: async (cwd) => {
+        await detectDsn(cwd);
+      },
+    },
+    {
+      label: "detectDsn.warm",
+      warm: true,
+      // No setup hook — leverage whatever the previous run cached.
+      run: async (cwd) => {
+        await detectDsn(cwd);
+      },
+    },
+    {
+      label: "detectAllDsns.cold",
+      warm: false,
+      setup: coldSetup,
+      run: async (cwd) => {
+        await detectAllDsns(cwd);
+      },
+    },
+    {
+      label: "detectAllDsns.warm",
+      warm: true,
+      run: async (cwd) => {
+        await detectAllDsns(cwd);
+      },
+    },
+    {
+      label: "scanCodeForDsns",
+      warm: false,
+      // scanCodeForDsns bypasses the cache entirely — no setup needed, but we
+      // still clear so any sibling-test leftover doesn't skew timings.
+      setup: coldSetup,
+      run: async (cwd) => {
+        await scanCodeForDsns(cwd);
+      },
+    },
+    {
+      label: "scanCodeForFirstDsn",
+      warm: false,
+      setup: coldSetup,
+      run: async (cwd) => {
+        await scanCodeForFirstDsn(cwd);
+      },
+    },
+    {
+      // scan.walk — iterate the walker with the DSN scanner's extension
+      // allowlist. This is the closest standalone comparison point with
+      // `scanCodeForDsns` (same file set, just no regex work).
+      label: "scan.walk",
+      warm: false,
+      run: async (cwd) => {
+        // We intentionally discard the entries — the bench measures
+        // the cost of iterating the generator, not anything downstream.
+        for await (const _ of walkFiles({
+          cwd,
+          extensions: TEXT_EXTENSIONS,
+        })) {
+          // body intentionally empty
+        }
+      },
+    },
+    {
+      // scan.walk.noExt — no extension filter, so every unknown-extension
+      // file is opened and sniffed for NUL. Tells us how expensive lazy
+      // binary detection is on mixed trees.
+      label: "scan.walk.noExt",
+      warm: false,
+      run: async (cwd) => {
+        for await (const _ of walkFiles({ cwd })) {
+          // body intentionally empty
+        }
+      },
+    },
+    {
+      // scan.walk.dsnParity — walker configured with the DSN scanner's
+      // exact options (TEXT_EXTENSIONS + full skip list + depth 3 with
+      // monorepo reset). This is the apples-to-apples comparison with
+      // `scanCodeForDsns`; the success bar for PR 1.5 is p50 ≤ 1.2x.
+      label: "scan.walk.dsnParity",
+      warm: false,
+      run: async (cwd) => {
+        for await (const _ of walkFiles({ cwd, ...dsnScanOptions() })) {
+          // body intentionally empty
+        }
+      },
+    },
+    {
+      // scan.grepFiles — walker + regex pass using the same DSN
+      // preset. Adds the per-file `readFile` + line-by-line
+      // `regex.test` cost on top of `scan.walk.dsnParity` so PR 3
+      // has a direct apples-to-apples comparison with
+      // `scanCodeForDsns` (which does the same work).
+      label: "scan.grepFiles",
+      warm: false,
+      run: async (cwd) => {
+        await collectGrep({
+          cwd,
+          pattern: DSN_PATTERN,
+          ...dsnScanOptions(),
+        });
+      },
+    },
+    {
+      // scan.ignore — micro-benchmark for IgnoreStack.isIgnored(). We
+      // build a stack once then hit it 10k times with synthetic paths
+      // so the reported timing is dominated by the query itself, not
+      // tree walking.
+      label: "scan.ignore",
+      warm: false,
+      run: async (cwd) => {
+        const stack = await IgnoreStack.create({
+          cwd,
+          alwaysSkipDirs: ["node_modules", ".git", "dist", "build"],
+          respectGitignore: true,
+          includeGitInfoExclude: true,
+        });
+        const queries = [
+          "src/index.ts",
+          "node_modules/foo/bar.js",
+          "packages/pkg/src/deep/file.tsx",
+          "dist/bundle.js",
+          "build/out.css",
+          "test/fixtures/secret.env",
+          "README.md",
+          ".git/HEAD",
+        ];
+        for (let i = 0; i < 10_000; i += 1) {
+          const q = queries[i % queries.length] as string;
+          stack.isIgnored(q, false);
+        }
+      },
+    },
+  ];
+}
+
+// -------- Main --------
+
+/**
+ * Resolve the fixture list for this invocation (synthetic or real repo).
+ * Real-repo mode is mutually exclusive with --save-baseline (enforced
+ * earlier in parseArgs).
+ */
+async function resolveFixtures(args: CliArgs): Promise<FixtureHandle[]> {
+  if (args.repo) {
+    const abs = resolve(args.repo);
+    if (!existsSync(abs)) {
+      throw new Error(`--repo ${abs} does not exist`);
+    }
+    return [
+      {
+        label: `real:${abs}`,
+        rootDir: abs,
+        synthetic: false,
+        fileCount: await roughFileCount(abs),
+      },
+    ];
+  }
+  return args.sizes.map((size) =>
+    resolveSyntheticFixture(size, args.regenFixtures)
+  );
+}
+
+/** Filter the available ops by substring. Throws when nothing matches. */
+function filterOps(ops: OpEntry[], opFilter: string | undefined): OpEntry[] {
+  if (!opFilter) {
+    return ops;
+  }
+  const filtered = ops.filter((op) => op.label.includes(opFilter));
+  if (filtered.length === 0) {
+    throw new Error(
+      `--op ${opFilter} matched no operations.\n  Available: ${ops.map((o) => o.label).join(", ")}`
+    );
+  }
+  return filtered;
+}
+
+/** Run every op on every fixture and return the flattened entry list.
*/ +async function runAll( + fixtures: readonly FixtureHandle[], + ops: readonly OpEntry[], + args: CliArgs +): Promise { + const entries: BenchEntry[] = []; + for (const fx of fixtures) { + if (!args.json) { + console.log(`${fx.label} (${fx.fileCount} files @ ${fx.rootDir})`); + } + await withBenchDb(async () => { + for (const op of ops) { + const samples = await measure(() => op.run(fx.rootDir), { + runs: args.runs, + warmup: args.warmup, + beforeEach: op.setup ? () => op.setup?.(fx.rootDir) : undefined, + }); + const stats = summarize(samples); + entries.push({ + fixture: fx.label, + operation: op.label, + warm: op.warm, + stats, + }); + if (!args.json) { + console.log( + ` ${op.label.padEnd(24)} p50 ${stats.p50.toFixed(2)}ms p95 ${stats.p95.toFixed(2)}ms (${stats.runs} runs)` + ); + } + } + }); + } + return entries; +} + +/** Perform the --compare step. Returns false on regression. */ +function compareAgainstBaseline( + report: BenchReport, + thresholdPct: number +): boolean { + const baselinePath = ".bench/baseline.json"; + if (!existsSync(baselinePath)) { + console.error( + `✗ No baseline found at ${baselinePath}. Run with --save-baseline first.` + ); + return false; + } + const baseline = JSON.parse( + readFileSync(baselinePath, "utf8") + ) as BenchReport; + const rows = compareReports(baseline, report, thresholdPct); + const ok = printComparison(rows, thresholdPct); + if (!ok) { + console.error("✗ One or more operations regressed beyond threshold"); + return false; + } + console.log("✓ No regressions beyond threshold"); + return true; +} + +async function main(): Promise { + let args: CliArgs; + try { + args = parseArgs(process.argv.slice(2)); + } catch (error) { + console.error(`✗ ${(error as Error).message}`); + printHelp(); + return 1; + } + + // Silence CLI telemetry — we don't want bench runs filing Sentry events. 
+ process.env.SENTRY_CLI_NO_TELEMETRY = "1"; + + let fixtures: FixtureHandle[]; + let ops: OpEntry[]; + try { + fixtures = await resolveFixtures(args); + const allOps = await buildOps(); + ops = filterOps(allOps, args.opFilter); + } catch (error) { + console.error(`✗ ${(error as Error).message}`); + return 1; + } + + const entries = await runAll(fixtures, ops, args); + + const report: BenchReport = { + version: 1, + generatedAt: new Date().toISOString(), + runtime: runtimeInfo(), + entries, + }; + + if (args.json) { + process.stdout.write(`${JSON.stringify(report, null, 2)}\n`); + } else { + printReport(report); + } + + if (args.saveBaseline) { + mkdirSync(".bench", { recursive: true }); + await writeJsonReport(report, ".bench/baseline.json"); + if (!args.json) { + console.log("✓ Baseline written to .bench/baseline.json"); + } + } + + if (args.compare && !compareAgainstBaseline(report, args.thresholdPct)) { + return 1; + } + + return 0; +} + +main() + .then((code) => process.exit(code)) + .catch((error) => { + console.error(error); + process.exit(1); + }); diff --git a/src/lib/dsn/code-scanner.ts b/src/lib/dsn/code-scanner.ts index 446765163..24b6f8516 100644 --- a/src/lib/dsn/code-scanner.ts +++ b/src/lib/dsn/code-scanner.ts @@ -1,210 +1,81 @@ /** - * Language-Agnostic Code Scanner + * Language-Agnostic DSN Code Scanner (policy layer). * - * Scans source code for Sentry DSNs using a simple grep-based approach. - * This replaces the language-specific detectors with a unified scanner that: + * This module owns the DSN-specific policy (URL regex, comment-line + * filtering, host validation, package-path inference, stop-on-first + * semantics). All file walking, `.gitignore` handling, extension + * filtering, and bounded concurrency are delegated to the shared + * `src/lib/scan/` module. * - * 1. Greps for DSN URL pattern directly: https://KEY@HOST/PROJECT_ID - * 2. Filters out DSNs appearing in commented lines - * 3. 
Respects .gitignore using the `ignore` package - * 4. Validates DSN hosts (SaaS when no SENTRY_URL, or self-hosted host when set) - * 5. Scans concurrently with p-limit for performance - * 6. Skips large files and known non-source directories + * Flow: + * 1. `scanDirectory(cwd, stopOnFirst)` calls `walkFiles` with the + * DSN preset (`dsnScanOptions()`), passing `recordMtimes` and an + * `onDirectoryVisit` hook so the cache-invalidation map is + * populated in one traversal. + * 2. Each yielded file is read + passed through `extractDsnsFromContent` + * via `mapFilesConcurrent`. Per-file `ConfigError` re-throws up + * to the caller; all other errors are logged at debug level and + * the file is skipped. + * 3. `onResult` in `mapFilesConcurrent` dedups into a shared Map + * and raises the early-exit flag on first unique DSN when + * `stopOnFirst: true`. + * + * Behavior change landed in PR 3: the walker's `nestedGitignore: true` + * default (via `dsnScanOptions()`) means nested `.gitignore` files are + * now honored. Pre-PR-3 code only read the project-root `.gitignore`. + * This is a correctness improvement matching git's cumulative semantics; + * DSNs in files covered by a subdir `.gitignore` are no longer detected. 
*/ -import type { Dirent } from "node:fs"; -import { readdir, stat } from "node:fs/promises"; import path from "node:path"; -import ignore, { type Ignore } from "ignore"; -import pLimit from "p-limit"; import { DEFAULT_SENTRY_HOST, getConfiguredSentryUrl } from "../constants.js"; import { ConfigError } from "../errors.js"; import { logger } from "../logger.js"; +import { + mapFilesConcurrent, + normalizePath, + type WalkEntry, + walkFiles, +} from "../scan/index.js"; import { withTracingSpan } from "../telemetry.js"; import { createDetectedDsn, inferPackagePath, parseDsn } from "./parser.js"; +import { DSN_MAX_DEPTH, dsnScanOptions } from "./scan-options.js"; import type { DetectedDsn } from "./types.js"; -import { MONOREPO_ROOTS } from "./types.js"; /** Scoped logger for DSN code scanning */ const log = logger.withTag("dsn-scan"); /** * Result of scanning code for DSNs, including mtimes for caching. + * + * Shape is stable — `src/lib/db/dsn-cache.ts` stores this via + * `setCachedDetection` and verifies `sourceMtimes` / `dirMtimes` + * against the filesystem. Do NOT change keys/values without also + * bumping the cache schema. */ export type CodeScanResult = { /** All detected DSNs */ dsns: DetectedDsn[]; - /** Map of source file paths to their mtimes (only files containing DSNs) */ + /** + * Map of source file paths (POSIX, relative to cwd) to their mtimes. + * Only files that contained at least one DSN are present — the cache + * verifier uses this to detect "source file touched since last scan". + */ sourceMtimes: Record<string, number>; - /** Mtimes of scanned directories (for detecting new files added to subdirs) */ + /** + * Map of scanned directories (POSIX, relative to cwd; `.` for the + * root) to their floored `stat.mtimeMs`. The verifier uses this to + * detect "files added to a scanned dir since last scan". + */ dirMtimes: Record<string, number>; }; -/** - * Maximum file size to scan (256KB). 
- * Files larger than this are skipped as they're unlikely to be source files - * with DSN configuration. - * - * Note: This check happens during file processing rather than collection to - * avoid extra stat() calls. Bun.file().size is a cheap operation once we - * have the file handle. - */ -const MAX_FILE_SIZE = 256 * 1024; - -/** - * Concurrency limit for file reads. - * Balances performance with file descriptor limits. - */ -const CONCURRENCY_LIMIT = 50; - -/** - * Maximum depth to scan from project root. - * Depth 0 = files in root directory - * Depth 3 = files in third-level subdirectories (e.g., src/lib/config/sentry.ts) - * - * In monorepos, depth resets to 0 when entering a package directory - * (e.g., packages/spotlight/), giving each package its own depth budget. - */ -const MAX_SCAN_DEPTH = 3; - -/** - * Directories that are always skipped regardless of .gitignore. - * These are common dependency/build/cache directories that should never contain DSNs. - * Added to the gitignore instance as built-in patterns. - */ -const ALWAYS_SKIP_DIRS = [ - // Version control - ".git", - ".hg", - ".svn", - // IDE/Editor - ".idea", - ".vscode", - ".cursor", - // Node.js - "node_modules", - // Test directories (contain fixture DSNs, not real configuration) - "test", - "tests", - "__mocks__", - "fixtures", - "__fixtures__", - // Python - "__pycache__", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - "venv", - ".venv", - // Java/Kotlin/Gradle - "build", - "target", - ".gradle", - // Go - "vendor", - // Ruby - ".bundle", - // General build outputs - "dist", - "out", - ".next", - ".nuxt", - ".output", - "coverage", -]; - -/** - * File extensions to scan for DSNs. - * Covers source code, config files, and data formats that might contain DSNs. 
- */ -const TEXT_EXTENSIONS = new Set([ - // JavaScript/TypeScript ecosystem - ".ts", - ".tsx", - ".js", - ".jsx", - ".mjs", - ".cjs", - ".astro", - ".vue", - ".svelte", - // Python - ".py", - // Go - ".go", - // Ruby - ".rb", - ".erb", - // PHP - ".php", - // JVM languages - ".java", - ".kt", - ".kts", - ".scala", - ".groovy", - // .NET languages - ".cs", - ".fs", - ".vb", - // Rust - ".rs", - // Swift/Objective-C - ".swift", - ".m", - ".mm", - // Dart/Flutter - ".dart", - // Elixir/Erlang - ".ex", - ".exs", - ".erl", - // Lua - ".lua", - // Config/data formats - ".json", - ".yaml", - ".yml", - ".toml", - ".xml", - ".properties", - ".config", -]); - /** * Common comment prefixes to detect commented-out DSNs. * Lines starting with these (after trimming whitespace) are ignored. */ const COMMENT_PREFIXES = ["//", "#", "--", "