34 changes: 24 additions & 10 deletions src/lib/scan/grep-worker.js
@@ -13,25 +13,30 @@
*
* Worker → Main:
* - `{ type: "ready" }` once on startup.
* - `{ type: "result", ints: Uint32Array, linePool: string }` per
* request. `ints.buffer` is transferred (zero-copy); `linePool`
* is cloned.
* - `{ type: "result", ints: Uint32Array, linePoolBytes: Uint8Array }`
* per request. Both buffers are transferred (zero-copy).
*
* ## Match encoding
*
* Each match is 4 consecutive `u32`s in `ints`:
* [0] pathIdx index into the input `paths` array
* [1] lineNum 1-based line number
* [2] lineOffset character offset into `linePool`
* [2] lineOffset character offset into the decoded line pool
* [3] lineLength character length of the line (post-truncation)
*
* Structured-clone of `GrepMatch[]` for 215k matches costs ~200ms.
* Binary-packed form + shared `linePool` string drops that to
* ~2–3ms.
* The line pool is built as a JS string on the worker, UTF-8 encoded
* just before `postMessage`, and decoded back on the main side.
* Offsets stay in UTF-16 code-unit space; the encode/decode round
* trip preserves `.length` for all valid code points. Shipping the
* pool as a transferable `Uint8Array` keeps both buffers on
* `postMessage`'s zero-copy path — mixing a string with a
* transferable falls back to the slow structured-clone path in Bun.
*/
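
A minimal sketch of the offset-preservation claim above; the pool string and offsets are illustrative, not taken from this patch:

```ts
// Offsets are stored in UTF-16 code units against the pre-encode pool;
// UTF-8 round-trips a well-formed string exactly, so they stay valid.
const pool = "héllo wörld\n🙂 emoji line\n"; // hypothetical worker-side pool
const offset = pool.indexOf("🙂");           // what the worker would store

const bytes = new TextEncoder().encode(pool);
const decoded = new TextDecoder("utf-8", { ignoreBOM: true }).decode(bytes);

console.assert(decoded.length === pool.length);             // lengths match
console.assert(decoded.slice(offset, offset + 2) === "🙂"); // offsets still index correctly
```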

const { readFileSync } = require("node:fs");

const textEncoder = new TextEncoder();

// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: hot regex loop with literal gate + line counter + line-bound extraction + per-file cap is inherently branchy
self.onmessage = (event) => {
const {
@@ -85,7 +90,15 @@ self.onmessage = (event) => {
const lineEnd = lineEndRaw === -1 ? content.length : lineEndRaw;
let line = content.slice(lineStart, lineEnd);
if (line.length > maxLineLength) {
line = `${line.slice(0, maxLineLength - 1)}\u2026`;
// Back off if the cut lands on a high surrogate — splitting
// a pair leaves a lone half that `TextEncoder.encode`
// replaces with U+FFFD on the wire.
let cut = maxLineLength - 1;
const lastCode = line.charCodeAt(cut - 1);
if (lastCode >= 0xd8_00 && lastCode <= 0xdb_ff) {
cut -= 1;
}
line = `${line.slice(0, cut)}\u2026`;
}

const lineOffset = linePool.length;
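
The failure mode this backoff guards against can be reproduced standalone; a sketch, with a made-up input line:

```ts
// Slicing at an arbitrary code-unit boundary can strand a high surrogate;
// TextEncoder replaces the lone half with U+FFFD on encode.
const line = "TARGET🙂trailing";
const naive = line.slice(0, 7); // 7 cuts U+1F642's surrogate pair in half
const wire = new TextDecoder().decode(new TextEncoder().encode(naive));
console.log(wire); // "TARGET\uFFFD": corruption reaches the main thread

// The backoff above retreats one code unit when the last kept unit
// is a high surrogate (0xD800..0xDBFF), keeping the slice well-formed.
```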
@@ -105,9 +118,10 @@ self.onmessage = (event) => {
}

const packed = new Uint32Array(ints);
const linePoolBytes = textEncoder.encode(linePool);
self.postMessage(
{ type: "result", ints: packed, linePool },
{ transfer: [packed.buffer] }
{ type: "result", ints: packed, linePoolBytes },
{ transfer: [packed.buffer, linePoolBytes.buffer] }
);
};
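
For context on the transfer list, a sketch of the detach semantics as it would run in a worker's global scope, mirroring the call shape above (payload values are made up):

```ts
// A transferred ArrayBuffer is moved, not cloned: after postMessage the
// sender's views are detached and report zero length.
const packed = new Uint32Array([0, 1, 0, 12]); // one fake packed match
const linePoolBytes = new TextEncoder().encode("TARGET line\n");

postMessage(
  { type: "result", ints: packed, linePoolBytes },
  { transfer: [packed.buffer, linePoolBytes.buffer] }
);

console.log(packed.buffer.byteLength);        // 0 (moved to the receiver)
console.log(linePoolBytes.buffer.byteLength); // 0
```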

24 changes: 19 additions & 5 deletions src/lib/scan/worker-pool.ts
@@ -34,8 +34,11 @@ export type WorkerGrepRequest = {
export type WorkerGrepResult = {
/** Packed 4-u32-per-match (pathIdx, lineNum, lineOffset, lineLength). */
ints: Uint32Array;
/** Concatenated line text, indexed by `ints[i*4 + 2]` and `+3`. */
linePool: string;
/**
* Concatenated line text as UTF-8 bytes. Decoded on the main side;
* `ints[i*4 + 2]` / `+3` index into the decoded string.
*/
linePoolBytes: Uint8Array;
};

/**
@@ -169,7 +172,7 @@ export function getWorkerPool(): WorkerPool {
if (pw.inflight === 0) {
unrefWorker(pw.worker);
}
next.resolve({ ints: data.ints, linePool: data.linePool });
next.resolve({ ints: data.ints, linePoolBytes: data.linePoolBytes });
});
w.addEventListener("error", (err) => {
pw.alive = false;
@@ -271,8 +274,18 @@ export function terminatePool(): void {
}
}

// `ignoreBOM: true` is load-bearing: without it the decoder silently
// strips a leading U+FEFF, which desynchronises every `lineOffset` /
// `lineLength` index the worker stored against the pre-encode pool
// length. A BOM-prefixed source file lands a U+FEFF at pool index 0,
// and with default (BOM-eating) decode the whole batch's lines would
// shift left by one code unit. `fatal: false` (default) keeps
// replacement-char behavior intact for any invalid sequences — the
// worker's round-trip can't produce them, but it's the safer default.
const LINE_POOL_DECODER = new TextDecoder("utf-8", { ignoreBOM: true });
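
A standalone sketch of the desync this prevents, with an illustrative input:

```ts
// U+FEFF encodes to EF BB BF; a default-configured decoder strips it,
// leaving the decoded pool one code unit shorter than the worker measured,
// so every stored lineOffset/lineLength pair would be off by one.
const bytes = new TextEncoder().encode("\uFEFFTARGET first");

console.log(new TextDecoder("utf-8").decode(bytes).length);                      // 12: BOM eaten
console.log(new TextDecoder("utf-8", { ignoreBOM: true }).decode(bytes).length); // 13: BOM kept
```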

/**
* Decode a worker's packed `{ints, linePool}` into `GrepMatch[]`,
* Decode a worker's packed `{ints, linePoolBytes}` into `GrepMatch[]`,
* reconstructing path fields from the caller's `paths` / `relPaths`.
*
* Optional `mtimes` is a parallel per-path array; when provided,
@@ -285,7 +298,8 @@ export function decodeWorkerMatches(
relPaths: readonly string[],
mtimes: readonly number[] | null = null
): GrepMatch[] {
const { ints, linePool } = result;
const { ints, linePoolBytes } = result;
const linePool = LINE_POOL_DECODER.decode(linePoolBytes);
const matches: GrepMatch[] = [];
const count = Math.floor(ints.length / 4);
for (let i = 0; i < count; i += 1) {
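
The remainder of the loop is collapsed in this view; a sketch of what one iteration plausibly does given the 4-u32 layout. The `GrepMatch` field names here are assumptions; only the indexing scheme is confirmed by this diff:

```ts
// Hypothetical shape of the loop body (field names assumed, not from the diff).
const base = i * 4;
const pathIdx = ints[base];
matches.push({
  path: paths[pathIdx],
  relPath: relPaths[pathIdx],
  lineNum: ints[base + 1],
  line: linePool.slice(ints[base + 2], ints[base + 2] + ints[base + 3]),
  ...(mtimes ? { mtime: mtimes[pathIdx] } : {}),
} as GrepMatch);
```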
72 changes: 72 additions & 0 deletions test/lib/scan/grep.test.ts
@@ -80,6 +80,78 @@ describe("collectGrep — basic matching", () => {
cleanup();
}
});

test("preserves non-ASCII / multi-byte UTF-8 in matched lines", async () => {
const { cwd, cleanup } = makeSandbox({
"a.txt": "héllo wörld TARGET here\npréfix TARGET 你好世界\n",
"b.txt": "emoji 🙂 TARGET 🎉 end\nmath ∑ ∞ TARGET ∂\n",
"c.txt": "astral 𝒜 TARGET 𝕏 𝔸\n",
});
try {
const { matches } = await collectGrep({ cwd, pattern: "TARGET" });
expect(matches.map((m) => m.line).sort()).toEqual([
"astral 𝒜 TARGET 𝕏 𝔸",
"emoji 🙂 TARGET 🎉 end",
"héllo wörld TARGET here",
"math ∑ ∞ TARGET ∂",
"préfix TARGET 你好世界",
]);
} finally {
cleanup();
}
});

test("truncation at a surrogate-pair boundary doesn't leak U+FFFD", async () => {
// Regression: `maxLineLength` truncation used to slice on a
// code-unit boundary, which could split a pair and leave a lone
// high surrogate that `TextEncoder` replaces with U+FFFD.
const { cwd, cleanup } = makeSandbox({
"a.txt": "TARGET🙂trailing content beyond the cutoff\n",
});
try {
const { matches } = await collectGrep({
cwd,
pattern: "TARGET",
maxLineLength: 8,
});
expect(matches).toHaveLength(1);
const line = matches[0]?.line ?? "";
expect(line.startsWith("TARGET")).toBe(true);
expect(line.endsWith("\u2026")).toBe(true);
expect(line.includes("\uFFFD")).toBe(false);
} finally {
cleanup();
}
});

test("UTF-8 BOM at the start of a file preserves line offsets", async () => {
// Regression: the decoder defaults to `ignoreBOM: false`, which
// silently strips a leading U+FEFF. A BOM-prefixed source file
// would put U+FEFF at pool index 0 on the worker side; without
// `ignoreBOM: true` on the main-side decoder, the decoded pool is
// one code unit shorter than the worker's offsets assume, and
// every line in that batch shifts left by one character.
const { cwd, cleanup } = makeSandbox({});
try {
const body = "TARGET first\nTARGET second\nTARGET third\n";
const bytes = new Uint8Array(3 + body.length);
bytes[0] = 0xef;
bytes[1] = 0xbb;
bytes[2] = 0xbf;
bytes.set(new TextEncoder().encode(body), 3);
writeFileSync(join(cwd, "bom.txt"), bytes);
const { matches } = await collectGrep({ cwd, pattern: "TARGET" });
// The first line keeps the BOM (that's what's in the source
// file); lines 2 and 3 are intact — no bleed-through.
expect(matches.map((m) => m.line)).toEqual([
"\uFEFFTARGET first",
"TARGET second",
"TARGET third",
]);
} finally {
cleanup();
}
});
});

describe("collectGrep — case sensitivity", () => {