diff --git a/src/lib/scan/binary.ts b/src/lib/scan/binary.ts index ab521c8a6..b6173cd2c 100644 --- a/src/lib/scan/binary.ts +++ b/src/lib/scan/binary.ts @@ -9,9 +9,10 @@ * * Two entry points: * - * - `classifyByExtension` — O(1) fast path. Returns `{ isBinary: false }` - * for known text extensions; returns null otherwise so the caller knows - * to fall through to the sniff path. + * - `classifyByExtension` — O(1) fast path. Classifies known text + * and known-binary extensions without a disk read. Returns `null` + * when the extension is ambiguous so the caller falls through to + * the NUL-sniff path. * - `readHeadAndSniff` — opens the file, reads the first 8 KB via * `fs.promises.open` + `handle.read`, runs the sniff, returns the head * buffer alongside the classification. @@ -21,6 +22,109 @@ import { open } from "node:fs/promises"; import { extname } from "node:path"; import { BINARY_SNIFF_BYTES } from "./constants.js"; +/** + * Extensions that are unambiguously binary. Listed extensions + * return `{ isBinary: true }` from `classifyByExtension` with no + * disk read — a 60-80ms win on fixtures rich in binary blobs + * (`.bin`, build outputs full of `.png`/`.woff2`/`.pdf`, etc.). + * + * Inclusion rule: only extensions whose file-format specification + * mandates non-text content. Ambiguous cases (`.log`, `.lock`, + * `.map`) fall through to the NUL-sniff — treating them as binary + * would silently drop text matches they may contain. `.svg` is XML + * text, NOT included. `.json` and `.yaml` are text, NOT included. + */ +export const BINARY_EXTENSIONS: ReadonlySet = new Set([ + // Images (raster/bitmap) + ".png", + ".jpg", + ".jpeg", + ".gif", + ".webp", + ".ico", + ".bmp", + ".tiff", + ".tif", + ".avif", + ".heic", + ".heif", + // Fonts + ".woff", + ".woff2", + ".ttf", + ".otf", + ".eot", + // Archives + ".zip", + ".gz", + ".bz2", + ".xz", + ".7z", + ".rar", + ".tar", + ".tgz", + ".tbz2", + ".txz", + // Media + ".mp3", + ".mp4", + ".wav", + ".ogg", + ".oga", + ".ogv", + ".webm", + ".flac", + ".m4a", + ".m4v", + ".avi", + ".mov", + ".mkv", + ".opus", + // Documents (binary office formats) + ".pdf", + ".doc", + ".docx", + ".xls", + ".xlsx", + ".ppt", + ".pptx", + ".odt", + ".ods", + ".odp", + // Executables and compiled artifacts + // NOTE: `.obj` is deliberately EXCLUDED — it's shared with the + // Wavefront OBJ 3D model format (plain-text ASCII), common in + // game-dev / AR / 3D-printing repos. MSVC `.obj` outputs land in + // `build/`/`target/` dirs which DEFAULT_SKIP_DIRS prunes anyway. + ".exe", + ".dll", + ".so", + ".dylib", + ".wasm", + ".class", + ".o", + ".a", + ".pyc", + ".pyo", + ".node", + // Databases (binary-format SQLite / Access) + ".db", + ".sqlite", + ".sqlite3", + ".mdb", + // Unambiguously-binary disk images. + // NOTE: generic blob extensions are deliberately EXCLUDED: + // - `.bin` — used for both firmware/raw data AND arbitrary + // text dumps; no format spec. + // - `.dat` — countless text data formats use this. + // - `.dump` — frequently plain-text SQL (`pg_dump`, `mysqldump` + // default to text). + // All three fall back to the NUL-sniff, which classifies them + // correctly by content. + ".iso", + ".img", +]); + /** * Inspect up to 8 KB of `head` for a NUL byte. * @@ -40,24 +144,30 @@ export function isLikelyBinary(head: Uint8Array): boolean { /** * Extension-based classification for the fast path. * - * Returns `{ isBinary: false }` when the path's extension is a known - * text extension — no disk read needed. Returns `null` when the - * extension is unknown and the caller must read the file head. + * Returns `{ isBinary: false }` when the extension is a known text + * type, `{ isBinary: true }` when it is unambiguously binary (see + * `BINARY_EXTENSIONS`). Returns `null` for the ambiguous middle + * ground — the caller falls through to `readHeadAndSniff`. * - * This intentionally does not try to classify "known binary" extensions - * (.png, .zip, .woff…). A NUL-byte sniff is fast and more reliable than - * maintaining a binary-extension allowlist; most text files without a - * TEXT_EXTENSIONS membership (e.g., `.sentryclirc`, `.editorconfig`, - * `Makefile`) would be misclassified by a naive binary-ext list. + * This is a performance hint, not a safety guarantee: even known- + * text extensions could in principle hold NUL bytes, and files + * without any extension (`.sentryclirc`, `Makefile`, `README`) or + * with unusual extensions always fall through to the NUL-sniff. */ export function classifyByExtension( absPath: string, textExtensions: ReadonlySet -): { isBinary: false } | null { +): { isBinary: boolean } | null { const ext = extname(absPath).toLowerCase(); - if (ext && textExtensions.has(ext)) { + if (!ext) { + return null; + } + if (textExtensions.has(ext)) { return { isBinary: false }; } + if (BINARY_EXTENSIONS.has(ext)) { + return { isBinary: true }; + } return null; } diff --git a/test/fixtures/bench/generate.ts b/test/fixtures/bench/generate.ts index 20466778e..5f13b0e86 100644 --- a/test/fixtures/bench/generate.ts +++ b/test/fixtures/bench/generate.ts @@ -28,6 +28,14 @@ export type FixtureSpec = { filesPerPackage: number; /** Text file extensions. Picked round-robin; DSN sprinkling works across all. */ fileExtensions: string[]; + /** + * Binary file extensions — picked for each binary blob. Defaults + * to `[".bin"]` for backward compat with pre-existing fixture + * hashes. Real-world shapes use a mix (`.png`, `.woff2`, `.mp3`, + * `.pdf`, etc.) to exercise the `BINARY_EXTENSIONS` fast-path in + * `src/lib/scan/binary.ts`. + */ + binaryExtensions?: string[]; /** Fraction [0,1] of files that are random binary blobs (NUL-byte-containing). */ binaryRatio: number; /** Fraction [0,1] of text files that include a DSN somewhere in the body. */ @@ -272,10 +280,17 @@ function populatePackage( } } + // `assets/` is intentionally a sibling of `src/` so the binary + // blobs stay inside the walked tree (outside `src/` means the + // walker still reaches them via the package root). Extension + // picked from `spec.binaryExtensions` (defaults to `.bin` when + // unspecified). + const binaryExts = spec.binaryExtensions ?? [".bin"]; for (let i = 0; i < binaryCount; i += 1) { const subdir = join(baseDir, "assets"); mkdirSync(subdir, { recursive: true }); - const filename = `blob-${i.toString().padStart(4, "0")}.bin`; + const ext = rng.pick(binaryExts); + const filename = `blob-${i.toString().padStart(4, "0")}${ext}`; writeFileSync(join(subdir, filename), buildBinaryBlob(spec.avgFileKB, rng)); fileCount += 1; } @@ -362,6 +377,9 @@ export function hashSpec(spec: FixtureMeta["spec"]): string { const canonical = JSON.stringify({ ...spec, fileExtensions: [...spec.fileExtensions].sort(), + binaryExtensions: spec.binaryExtensions + ? [...spec.binaryExtensions].sort() + : undefined, }); return createHash("sha256").update(canonical).digest("hex").slice(0, 16); } diff --git a/test/fixtures/bench/presets.ts b/test/fixtures/bench/presets.ts index 46c705923..54cc3bda0 100644 --- a/test/fixtures/bench/presets.ts +++ b/test/fixtures/bench/presets.ts @@ -21,6 +21,8 @@ export const SMALL: Omit = { packages: 0, filesPerPackage: 100, fileExtensions: [".ts", ".tsx", ".js", ".json", ".yml", ".md"], + // See `LARGE` for the rationale. + binaryExtensions: [".png", ".jpg", ".woff2", ".mp3", ".pdf", ".bin"], binaryRatio: 0.05, dsnRatio: 0.1, gitignoreDepth: "root", @@ -33,6 +35,8 @@ export const MEDIUM: Omit = { packages: 10, filesPerPackage: 100, fileExtensions: [".ts", ".tsx", ".js", ".json", ".yml", ".py", ".md"], + // See `LARGE` for the rationale. + binaryExtensions: [".png", ".jpg", ".woff2", ".mp3", ".pdf", ".bin"], binaryRatio: 0.08, dsnRatio: 0.12, gitignoreDepth: "nested", @@ -55,6 +59,12 @@ export const LARGE: Omit = { ".go", ".md", ], + // Mix of realistic webapp binary extensions that hit the + // `BINARY_EXTENSIONS` fast-path in `src/lib/scan/binary.ts`. `.bin` + // is NOT in BINARY_EXTENSIONS (generic — falls through to + // NUL-sniff), so including it lets the walker exercise both paths + // proportionally. + binaryExtensions: [".png", ".jpg", ".woff2", ".mp3", ".pdf", ".bin"], binaryRatio: 0.1, dsnRatio: 0.08, gitignoreDepth: "nested", diff --git a/test/lib/scan/binary.test.ts b/test/lib/scan/binary.test.ts index be25ddc65..d5d473aca 100644 --- a/test/lib/scan/binary.test.ts +++ b/test/lib/scan/binary.test.ts @@ -77,10 +77,41 @@ describe("classifyByExtension", () => { }); }); - test("unknown extensions return null (caller must sniff)", () => { - expect(classifyByExtension("/a/b/c.png", TEXT_EXTENSIONS)).toBeNull(); + test("known-binary extensions return isBinary:true (no sniff)", () => { + expect(classifyByExtension("/a/b/c.png", TEXT_EXTENSIONS)).toEqual({ + isBinary: true, + }); + expect(classifyByExtension("/a/b/c.woff", TEXT_EXTENSIONS)).toEqual({ + isBinary: true, + }); + expect(classifyByExtension("/a/b/c.pdf", TEXT_EXTENSIONS)).toEqual({ + isBinary: true, + }); + expect(classifyByExtension("/a/b/c.wasm", TEXT_EXTENSIONS)).toEqual({ + isBinary: true, + }); + // Case-insensitive — common for screenshot exports etc. + expect(classifyByExtension("/a/b/c.PNG", TEXT_EXTENSIONS)).toEqual({ + isBinary: true, + }); + }); + + test("ambiguous extensions return null (caller must sniff)", () => { + // `.svg` is XML text, NOT in BINARY_EXTENSIONS. + expect(classifyByExtension("/a/b/c.svg", TEXT_EXTENSIONS)).toBeNull(); + // `.log` / `.lock` / `.map` — usually text, unsafe to presume. + expect(classifyByExtension("/a/b/c.log", TEXT_EXTENSIONS)).toBeNull(); + expect(classifyByExtension("/a/b/c.lock", TEXT_EXTENSIONS)).toBeNull(); + expect(classifyByExtension("/a/b/c.map", TEXT_EXTENSIONS)).toBeNull(); + // Generic binary-ish extensions that are often text — we rely + // on the NUL-sniff for these. expect(classifyByExtension("/a/b/c.bin", TEXT_EXTENSIONS)).toBeNull(); - expect(classifyByExtension("/a/b/c.woff", TEXT_EXTENSIONS)).toBeNull(); + expect(classifyByExtension("/a/b/c.dat", TEXT_EXTENSIONS)).toBeNull(); + expect(classifyByExtension("/a/b/c.dump", TEXT_EXTENSIONS)).toBeNull(); + // `.obj` is shared with Wavefront OBJ (text 3D model format). + expect(classifyByExtension("/a/b/c.obj", TEXT_EXTENSIONS)).toBeNull(); + // Wholly unknown extension. + expect(classifyByExtension("/a/b/c.xyz", TEXT_EXTENSIONS)).toBeNull(); }); test("no-extension files return null", () => {