34 changes: 24 additions & 10 deletions src/lib/scan/grep-worker.js
@@ -13,25 +13,30 @@
*
* Worker → Main:
* - `{ type: "ready" }` once on startup.
* - `{ type: "result", ints: Uint32Array, linePool: string }` per
* request. `ints.buffer` is transferred (zero-copy); `linePool`
* is cloned.
* - `{ type: "result", ints: Uint32Array, linePoolBytes: Uint8Array }`
* per request. Both buffers are transferred (zero-copy).
*
* ## Match encoding
*
* Each match is 4 consecutive `u32`s in `ints`:
* [0] pathIdx index into the input `paths` array
* [1] lineNum 1-based line number
* [2] lineOffset character offset into `linePool`
* [2] lineOffset character offset into the decoded line pool
* [3] lineLength character length of the line (post-truncation)
*
* Structured-clone of `GrepMatch[]` for 215k matches costs ~200ms.
* Binary-packed form + shared `linePool` string drops that to
* ~2–3ms.
* The line pool is built as a JS string on the worker, UTF-8 encoded
* just before `postMessage`, and decoded back on the main side.
* Offsets stay in UTF-16 code-unit space; the encode/decode round
* trip preserves `.length` for all valid code points. Shipping the
* pool as a transferable `Uint8Array` keeps both buffers on
* `postMessage`'s zero-copy path — mixing a string with a
* transferable falls back to the slow structured-clone path in Bun.
*/
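
A minimal sketch of the offset-preservation claim above; the pool string and offsets are illustrative, not taken from this patch:

```ts
// Offsets are stored in UTF-16 code units against the pre-encode pool;
// UTF-8 round-trips a well-formed string exactly, so they stay valid.
const pool = "héllo wörld\n🙂 emoji line\n"; // hypothetical worker-side pool
const offset = pool.indexOf("🙂");           // what the worker would store

const bytes = new TextEncoder().encode(pool);
const decoded = new TextDecoder("utf-8", { ignoreBOM: true }).decode(bytes);

console.assert(decoded.length === pool.length);             // lengths match
console.assert(decoded.slice(offset, offset + 2) === "🙂"); // offsets still index correctly
```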

const { readFileSync } = require("node:fs");

const textEncoder = new TextEncoder();

// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: hot regex loop with literal gate + line counter + line-bound extraction + per-file cap is inherently branchy
self.onmessage = (event) => {
const {
@@ -85,7 +90,15 @@ self.onmessage = (event) => {
const lineEnd = lineEndRaw === -1 ? content.length : lineEndRaw;
let line = content.slice(lineStart, lineEnd);
if (line.length > maxLineLength) {
line = `${line.slice(0, maxLineLength - 1)}\u2026`;
// Back off if the cut lands on a high surrogate — splitting
// a pair leaves a lone half that `TextEncoder.encode`
// replaces with U+FFFD on the wire.
let cut = maxLineLength - 1;
const lastCode = line.charCodeAt(cut - 1);
if (lastCode >= 0xd8_00 && lastCode <= 0xdb_ff) {
cut -= 1;
}
line = `${line.slice(0, cut)}\u2026`;
}

const lineOffset = linePool.length;
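
The failure mode this backoff guards against can be reproduced standalone; a sketch, with a made-up input line:

```ts
// Slicing at an arbitrary code-unit boundary can strand a high surrogate;
// TextEncoder replaces the lone half with U+FFFD on encode.
const line = "TARGET🙂trailing";
const naive = line.slice(0, 7); // 7 cuts U+1F642's surrogate pair in half
const wire = new TextDecoder().decode(new TextEncoder().encode(naive));
console.log(wire); // "TARGET\uFFFD": corruption reaches the main thread

// The backoff above retreats one code unit when the last kept unit
// is a high surrogate (0xD800..0xDBFF), keeping the slice well-formed.
```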
@@ -105,9 +118,10 @@ self.onmessage = (event) => {
}

const packed = new Uint32Array(ints);
const linePoolBytes = textEncoder.encode(linePool);
self.postMessage(
{ type: "result", ints: packed, linePool },
{ transfer: [packed.buffer] }
{ type: "result", ints: packed, linePoolBytes },
{ transfer: [packed.buffer, linePoolBytes.buffer] }
);
};
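
For context on the transfer list, a sketch of the detach semantics as it would run in a worker's global scope, mirroring the call shape above (payload values are made up):

```ts
// A transferred ArrayBuffer is moved, not cloned: after postMessage the
// sender's views are detached and report zero length.
const packed = new Uint32Array([0, 1, 0, 12]); // one fake packed match
const linePoolBytes = new TextEncoder().encode("TARGET line\n");

postMessage(
  { type: "result", ints: packed, linePoolBytes },
  { transfer: [packed.buffer, linePoolBytes.buffer] }
);

console.log(packed.buffer.byteLength);        // 0 (moved to the receiver)
console.log(linePoolBytes.buffer.byteLength); // 0
```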

24 changes: 19 additions & 5 deletions src/lib/scan/worker-pool.ts
@@ -34,8 +34,11 @@ export type WorkerGrepRequest = {
export type WorkerGrepResult = {
/** Packed 4-u32-per-match (pathIdx, lineNum, lineOffset, lineLength). */
ints: Uint32Array;
/** Concatenated line text, indexed by `ints[i*4 + 2]` and `+3`. */
linePool: string;
/**
* Concatenated line text as UTF-8 bytes. Decoded on the main side;
* `ints[i*4 + 2]` / `+3` index into the decoded string.
*/
linePoolBytes: Uint8Array;
};

/**
@@ -169,7 +172,7 @@ export function getWorkerPool(): WorkerPool {
if (pw.inflight === 0) {
unrefWorker(pw.worker);
}
next.resolve({ ints: data.ints, linePool: data.linePool });
next.resolve({ ints: data.ints, linePoolBytes: data.linePoolBytes });
});
w.addEventListener("error", (err) => {
pw.alive = false;
@@ -271,8 +274,18 @@ export function terminatePool(): void {
}
}

// `ignoreBOM: true` is load-bearing: without it the decoder silently
// strips a leading U+FEFF, which desynchronises every `lineOffset` /
// `lineLength` index the worker stored against the pre-encode pool
// length. A BOM-prefixed source file lands a U+FEFF at pool index 0,
// and with default (BOM-eating) decode the whole batch's lines would
// shift left by one code unit. `fatal: false` (default) keeps
// replacement-char behavior intact for any invalid sequences — the
// worker's round-trip can't produce them, but it's the safer default.
const LINE_POOL_DECODER = new TextDecoder("utf-8", { ignoreBOM: true });
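
A standalone sketch of the desync this prevents, with an illustrative input:

```ts
// U+FEFF encodes to EF BB BF; a default-configured decoder strips it,
// leaving the decoded pool one code unit shorter than the worker measured,
// so every stored lineOffset/lineLength pair would be off by one.
const bytes = new TextEncoder().encode("\uFEFFTARGET first");

console.log(new TextDecoder("utf-8").decode(bytes).length);                      // 12: BOM eaten
console.log(new TextDecoder("utf-8", { ignoreBOM: true }).decode(bytes).length); // 13: BOM kept
```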

/**
* Decode a worker's packed `{ints, linePool}` into `GrepMatch[]`,
* Decode a worker's packed `{ints, linePoolBytes}` into `GrepMatch[]`,
* reconstructing path fields from the caller's `paths` / `relPaths`.
*
* Optional `mtimes` is a parallel per-path array; when provided,
@@ -285,7 +298,8 @@ export function decodeWorkerMatches(
relPaths: readonly string[],
mtimes: readonly number[] | null = null
): GrepMatch[] {
const { ints, linePool } = result;
const { ints, linePoolBytes } = result;
const linePool = LINE_POOL_DECODER.decode(linePoolBytes);
const matches: GrepMatch[] = [];
const count = Math.floor(ints.length / 4);
for (let i = 0; i < count; i += 1) {
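
The remainder of the loop is collapsed in this view; a sketch of what one iteration plausibly does given the 4-u32 layout. The `GrepMatch` field names here are assumptions; only the indexing scheme is confirmed by this diff:

```ts
// Hypothetical shape of the loop body (field names assumed, not from the diff).
const base = i * 4;
const pathIdx = ints[base];
matches.push({
  path: paths[pathIdx],
  relPath: relPaths[pathIdx],
  lineNum: ints[base + 1],
  line: linePool.slice(ints[base + 2], ints[base + 2] + ints[base + 3]),
  ...(mtimes ? { mtime: mtimes[pathIdx] } : {}),
} as GrepMatch);
```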
72 changes: 72 additions & 0 deletions test/lib/scan/grep.test.ts
@@ -80,6 +80,78 @@ describe("collectGrep — basic matching", () => {
cleanup();
}
});

test("preserves non-ASCII / multi-byte UTF-8 in matched lines", async () => {
const { cwd, cleanup } = makeSandbox({
"a.txt": "héllo wörld TARGET here\npréfix TARGET 你好世界\n",
"b.txt": "emoji 🙂 TARGET 🎉 end\nmath ∑ ∞ TARGET ∂\n",
"c.txt": "astral 𝒜 TARGET 𝕏 𝔸\n",
});
try {
const { matches } = await collectGrep({ cwd, pattern: "TARGET" });
expect(matches.map((m) => m.line).sort()).toEqual([
"astral 𝒜 TARGET 𝕏 𝔸",
"emoji 🙂 TARGET 🎉 end",
"héllo wörld TARGET here",
"math ∑ ∞ TARGET ∂",
"préfix TARGET 你好世界",
]);
} finally {
cleanup();
}
});

test("truncation at a surrogate-pair boundary doesn't leak U+FFFD", async () => {
// Regression: `maxLineLength` truncation used to slice on a
// code-unit boundary, which could split a pair and leave a lone
// high surrogate that `TextEncoder` replaces with U+FFFD.
const { cwd, cleanup } = makeSandbox({
"a.txt": "TARGET🙂trailing content beyond the cutoff\n",
});
try {
const { matches } = await collectGrep({
cwd,
pattern: "TARGET",
maxLineLength: 8,
});
expect(matches).toHaveLength(1);
const line = matches[0]?.line ?? "";
expect(line.startsWith("TARGET")).toBe(true);
expect(line.endsWith("\u2026")).toBe(true);
expect(line.includes("\uFFFD")).toBe(false);
} finally {
cleanup();
}
});

test("UTF-8 BOM at the start of a file preserves line offsets", async () => {
// Regression: the decoder defaults to `ignoreBOM: false`, which
// silently strips a leading U+FEFF. A BOM-prefixed source file
// would put U+FEFF at pool index 0 on the worker side; without
// `ignoreBOM: true` on the main-side decoder, the decoded pool is
// one code unit shorter than the worker's offsets assume, and
// every line in that batch shifts left by one character.
const { cwd, cleanup } = makeSandbox({});
try {
const body = "TARGET first\nTARGET second\nTARGET third\n";
const bytes = new Uint8Array(3 + body.length);
bytes[0] = 0xef;
bytes[1] = 0xbb;
bytes[2] = 0xbf;
bytes.set(new TextEncoder().encode(body), 3);
writeFileSync(join(cwd, "bom.txt"), bytes);
const { matches } = await collectGrep({ cwd, pattern: "TARGET" });
// The first line keeps the BOM (that's what's in the source
// file); lines 2 and 3 are intact — no bleed-through.
expect(matches.map((m) => m.line)).toEqual([
"\uFEFFTARGET first",
"TARGET second",
"TARGET third",
]);
} finally {
cleanup();
}
});
});

describe("collectGrep — case sensitivity", () => {