From 4b9322491e3314f516fffcef52f8b0fb7eacd33d Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:09:08 -0400 Subject: [PATCH 1/2] =?UTF-8?q?refactor:=20file-based=20migrations=20?= =?UTF-8?q?=E2=80=94=20eliminate=20version-collision=20bug=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today every PR adding a schema migration claims `CURRENT_SCHEMA_VERSION = next` AND adds an array entry to `migrations: Migration[]` in src/db/migrations.ts. Two PRs both claiming the same version resolve as: "second PR's v4 silently no-ops on existing DBs" — a real silent-data-loss bug class (PR #113's reviewer caught one). After this refactor: Adding a new schema migration: 1. Pick the next free 3-digit prefix (`git ls-files 'src/db/migrations/[0-9]*.ts'` shows what's taken). 2. Create `src/db/migrations/NNN-name.ts` exporting a `MIGRATION: MigrationModule` (description + up). 3. Add one import line and one entry to `src/db/migrations/index.ts`'s REGISTERED_MODULES array. Two PRs both creating `004-foo.ts` collide on the FILESYSTEM — the maintainer sees it instantly. No more silent skipped migrations. ## What's new - `src/db/migrations/types.ts` — `MigrationModule { description, up }` and `Migration extends MigrationModule { version }`. - `src/db/migrations/002-project-metadata.ts` — extracted v2 body verbatim. - `src/db/migrations/003-lower-name-index.ts` — extracted v3 body verbatim. - `src/db/migrations/index.ts` — central registry. Static-imports each migration, parses the version FROM THE FILENAME (no hand-typed version field that can drift), enforces strict `NNN-kebab-name.ts` shape, validates uniqueness/sort at module load (throws loudly on collision), exposes ALL_MIGRATIONS and CURRENT_SCHEMA_VERSION. - `src/db/migrations.ts` — refactored to a thin runner. 
Same exported surface (CURRENT_SCHEMA_VERSION, getCurrentVersion, runMigrations, needsMigration, getPendingMigrations, getMigrationHistory, Migration type) — every existing import keeps working unchanged. - `__tests__/migrations-registry.test.ts` — 8 invariant tests: registry non-empty, versions unique + strictly ascending, CURRENT_SCHEMA_VERSION matches max, every file matches the strict NNN-kebab-name pattern, no orphan files, no phantom registrations. ## Reviewer pass Independent reviewer ran once. 3 REQUEST_CHANGES + 1 INFO addressed: 1. Hand-typed `version` field in REGISTERED_MODULES could drift from filename. **Fixed**: removed the version field; registry now parses version from filename via FILENAME_PATTERN regex inside validateRegistered. 2. Filename-pattern test was lenient (allowed 4-digit or 1-digit prefixes). **Fixed**: new "every migration file matches the strict NNN-kebab-name.ts pattern" test catches malformed filenames as orphan-detection-bypassing offenders. 3. `getPendingMigrations` returned `readonly Migration[]`, breaking callers that typed the result as `Migration[]`. **Fixed**: returns a fresh mutable array via `.slice()`. 4. No throw-on-duplicate test for validateRegistered (module evaluation timing). Acknowledged; not added. ## Backward compat Every existing import works unchanged: - `import { CURRENT_SCHEMA_VERSION } from './migrations'` ✓ - `import { runMigrations } from './migrations'` ✓ - `import { needsMigration } from './migrations'` ✓ - `import { getMigrationHistory } from './migrations'` ✓ - `import { getPendingMigrations } from './migrations'` — returns mutable Migration[] (preserved) - `Migration` type — re-exported ## Affected open PRs Every migration-touching PR (#102 UNIQUE edges, #105 cochange, #108 perf db, #111 LLM features, my #112 centrality+churn, #113 issue-history, #114 config-refs, #115 sql-refs) currently claims migration v4 and conflicts with each other on `migrations.ts`. 
After this lands they each become: - 1 new file: `src/db/migrations/NNN-name.ts` - 2 lines in `src/db/migrations/index.ts` (import + array entry) Conflict shape changes from "next free version + array entry + CURRENT_SCHEMA_VERSION bump in one file" (4-way conflict) to "1 new file" + 2-line registry edit. If two PRs target the same NNN, the filesystem collision surfaces immediately — no silent skipped migrations. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/migrations-registry.test.ts | 95 +++++++++++++++++++ src/db/migrations.ts | 93 +++++++------------ src/db/migrations/002-project-metadata.ts | 19 ++++ src/db/migrations/003-lower-name-index.ts | 10 ++ src/db/migrations/index.ts | 106 ++++++++++++++++++++++ src/db/migrations/types.ts | 25 +++++ 6 files changed, 286 insertions(+), 62 deletions(-) create mode 100644 __tests__/migrations-registry.test.ts create mode 100644 src/db/migrations/002-project-metadata.ts create mode 100644 src/db/migrations/003-lower-name-index.ts create mode 100644 src/db/migrations/index.ts create mode 100644 src/db/migrations/types.ts diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts new file mode 100644 index 00000000..9fa15eed --- /dev/null +++ b/__tests__/migrations-registry.test.ts @@ -0,0 +1,95 @@ +/** + * Migration registry: structural invariants. + * + * Guards against the silent-no-op bug class that motivated this + * refactor. If a future PR introduces a duplicate version, + * out-of-order versions, or fails to register a new migration + * file, one of these tests fails loudly. 
+ */ +import { describe, it, expect } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + ALL_MIGRATIONS, + CURRENT_SCHEMA_VERSION, +} from '../src/db/migrations'; + +describe('migration registry — structural invariants', () => { + it('registry is non-empty', () => { + expect(ALL_MIGRATIONS.length).toBeGreaterThan(0); + }); + + it('versions are unique', () => { + const seen = new Set(); + for (const m of ALL_MIGRATIONS) { + expect(seen.has(m.version)).toBe(false); + seen.add(m.version); + } + }); + + it('versions are strictly ascending', () => { + for (let i = 1; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan( + ALL_MIGRATIONS[i - 1]!.version + ); + } + }); + + it('each migration has a non-empty description and a function up()', () => { + for (const m of ALL_MIGRATIONS) { + expect(m.description.length).toBeGreaterThan(0); + expect(typeof m.up).toBe('function'); + } + }); + + it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => { + const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; + expect(CURRENT_SCHEMA_VERSION).toBe(max); + }); +}); + +describe('migration files — filename ↔ version coupling', () => { + // Read the actual filenames on disk and assert each matches an + // entry in the registry. Catches the case where someone drops a + // new file in src/db/migrations/ but forgets to register it. 
+ const migrationsDir = path.resolve(__dirname, '../src/db/migrations'); + const SUPPORT_FILES = new Set(['index.ts', 'types.ts']); + const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + + function listMigrationFiles(): string[] { + return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f)); + } + + it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => { + const offenders: string[] = []; + for (const f of listMigrationFiles()) { + if (!STRICT_NNN_PATTERN.test(f)) { + offenders.push(f); + } + } + expect(offenders).toEqual([]); + }); + + it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + expect(files.length).toBeGreaterThan(0); + const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version)); + for (const f of files) { + const version = parseInt(f.slice(0, 3), 10); + if (!registeredVersions.has(version)) { + throw new Error( + `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. ` + + `Add an import + array entry for it.` + ); + } + } + }); + + it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10))); + for (const m of ALL_MIGRATIONS) { + expect(filenameVersions.has(m.version)).toBe(true); + } + }); +}); diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..98325247 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -1,60 +1,26 @@ /** - * Database Migrations + * Database Migrations — runner + backward-compat surface. * - * Schema versioning and migration support. 
+ * The migration definitions themselves live in + * `./migrations/-.ts`, one file per migration, with + * version derived from the filename prefix. This file is the + * runner (read schema_versions, apply pending in order) and the + * stable API surface that the rest of the codebase imports. + * + * Adding a migration: see `./migrations/index.ts`. */ import { SqliteDatabase } from './sqlite-adapter'; +import { ALL_MIGRATIONS, CURRENT_SCHEMA_VERSION as REGISTRY_CURRENT } from './migrations/index'; +import type { Migration } from './migrations/types'; /** - * Current schema version + * Highest registered migration version. Derived from the + * registry; re-exported here unchanged so existing consumers + * (`import { CURRENT_SCHEMA_VERSION } from './migrations'`) keep + * working. */ -export const CURRENT_SCHEMA_VERSION = 3; - -/** - * Migration definition - */ -interface Migration { - version: number; - description: string; - up: (db: SqliteDatabase) => void; -} - -/** - * All migrations in order - * - * Note: Version 1 is the initial schema, handled by schema.sql - * Future migrations go here. 
- */ -const migrations: Migration[] = [ - { - version: 2, - description: 'Add project metadata, provenance tracking, and unresolved ref context', - up: (db) => { - db.exec(` - CREATE TABLE IF NOT EXISTS project_metadata ( - key TEXT PRIMARY KEY, - value TEXT NOT NULL, - updated_at INTEGER NOT NULL - ); - ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT ''; - ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown'; - ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL; - CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); - CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); - `); - }, - }, - { - version: 3, - description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups', - up: (db) => { - db.exec(` - CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); - `); - }, - }, -]; +export const CURRENT_SCHEMA_VERSION: number = REGISTRY_CURRENT; /** * Get the current schema version from the database @@ -84,17 +50,14 @@ function recordMigration(db: SqliteDatabase, version: number, description: strin * Run all pending migrations */ export function runMigrations(db: SqliteDatabase, fromVersion: number): void { - const pending = migrations.filter((m) => m.version > fromVersion); - - if (pending.length === 0) { - return; - } + const pending = ALL_MIGRATIONS.filter((m) => m.version > fromVersion); + if (pending.length === 0) return; - // Sort by version - pending.sort((a, b) => a.version - b.version); + // ALL_MIGRATIONS is already sorted by version, but filtering can + // be cheap to re-confirm. 
+ const ordered = [...pending].sort((a, b) => a.version - b.version); - // Run each migration in a transaction - for (const migration of pending) { + for (const migration of ordered) { db.transaction(() => { migration.up(db); recordMigration(db, migration.version, migration.description); @@ -111,13 +74,15 @@ export function needsMigration(db: SqliteDatabase): boolean { } /** - * Get list of pending migrations + * Get list of pending migrations. + * + * Returned as a fresh mutable array (not the underlying readonly + * registry) so callers that previously assigned the result to a + * `Migration[]`-typed variable keep working unchanged. */ export function getPendingMigrations(db: SqliteDatabase): Migration[] { const current = getCurrentVersion(db); - return migrations - .filter((m) => m.version > current) - .sort((a, b) => a.version - b.version); + return ALL_MIGRATIONS.filter((m) => m.version > current).slice(); } /** @@ -136,3 +101,7 @@ export function getMigrationHistory( description: row.description, })); } + +// Re-export the registry surface for callers that want it. 
+export { ALL_MIGRATIONS } from './migrations/index'; +export type { Migration, MigrationModule } from './migrations/types'; diff --git a/src/db/migrations/002-project-metadata.ts b/src/db/migrations/002-project-metadata.ts new file mode 100644 index 00000000..9fe7945b --- /dev/null +++ b/src/db/migrations/002-project-metadata.ts @@ -0,0 +1,19 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add project metadata, provenance tracking, and unresolved ref context', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS project_metadata ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at INTEGER NOT NULL + ); + ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT ''; + ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown'; + ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL; + CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); + CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); + `); + }, +}; diff --git a/src/db/migrations/003-lower-name-index.ts b/src/db/migrations/003-lower-name-index.ts new file mode 100644 index 00000000..ff5416eb --- /dev/null +++ b/src/db/migrations/003-lower-name-index.ts @@ -0,0 +1,10 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups', + up: (db) => { + db.exec(` + CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts new file mode 100644 index 00000000..f9bbcf10 --- /dev/null +++ b/src/db/migrations/index.ts @@ -0,0 +1,106 @@ +/** + * Migration registry. + * + * Adding a new schema migration is: + * + * 1. Pick the next free 3-digit prefix (`NNN`) — `git ls-files + * 'src/db/migrations/[0-9]*.ts'` shows what's taken. + * 2. 
Create `src/db/migrations/-.ts` + * exporting a `MIGRATION: MigrationModule` (just `description` + * and `up(db)`). + * 3. Add **one** import line and **one** array entry to this file. + * + * **Why filename-derived versions instead of a field?** Two PRs + * adding migrations independently used to collide on the + * `migrations[]` array AND the `CURRENT_SCHEMA_VERSION` const. + * With monolithic migrations.ts, "I claimed v4 / you claimed v4" + * resolved as "second PR's v4 silently no-ops" — a real bug class + * (PR #113's reviewer caught one). With filename-derived versions, + * two PRs both creating `004-foo.ts` produce a filesystem-level + * conflict the maintainer sees instantly. + * + * `CURRENT_SCHEMA_VERSION` is the max of all registered versions. + */ + +import type { Migration, MigrationModule } from './types'; + +import { MIGRATION as MIG_002 } from './002-project-metadata'; +import { MIGRATION as MIG_003 } from './003-lower-name-index'; + +interface ModuleRef { + /** + * Source filename. The 3-digit prefix is the source of truth for + * the version number — `validateRegistered` parses it. Keep this + * field in sync with the actual file on disk; the + * filesystem-cross-check test catches drift. + */ + filename: string; + module: MigrationModule; +} + +/** + * Static-import list of every migration. Two PRs adding + * migrations both add a single entry here; alphabetical ordering + * puts adjacent additions on different lines unless the version + * numbers themselves collide, in which case the filesystem + * collision on `NNN-*.ts` surfaces the conflict instantly. + */ +const REGISTERED_MODULES: readonly ModuleRef[] = [ + { filename: '002-project-metadata.ts', module: MIG_002 }, + { filename: '003-lower-name-index.ts', module: MIG_003 }, +]; + +/** Strict 3-digit prefix on each migration filename. 
*/ +const FILENAME_PATTERN = /^(\d{3})-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + +/** + * Validate the registered set: filenames match the strict + * `NNN-name.ts` shape, version is parsed from the prefix (no + * hand-typed version field that can drift), versions are unique, + * and the result is sorted ascending. Throws loudly at module + * load if any invariant is violated rather than silently dropping + * a migration during `runMigrations()`. + */ +function validateRegistered(refs: readonly ModuleRef[]): readonly Migration[] { + if (refs.length === 0) { + throw new Error('[CodeGraph] migrations registry is empty'); + } + const parsed = refs.map((r) => { + const m = FILENAME_PATTERN.exec(r.filename); + if (!m) { + throw new Error( + `[CodeGraph] migration filename "${r.filename}" does not match ` + + `expected pattern NNN-kebab-name.ts (3-digit prefix, lowercase kebab-case body)` + ); + } + const version = parseInt(m[1]!, 10); + return { + version, + filename: r.filename, + description: r.module.description, + up: r.module.up, + }; + }); + const sorted = [...parsed].sort((a, b) => a.version - b.version); + for (let i = 1; i < sorted.length; i++) { + if (sorted[i]!.version === sorted[i - 1]!.version) { + throw new Error( + `[CodeGraph] duplicate migration version ${sorted[i]!.version}: ` + + `${sorted[i - 1]!.filename} vs ${sorted[i]!.filename}` + ); + } + } + return sorted.map((r) => ({ + version: r.version, + description: r.description, + up: r.up, + })); +} + +export const ALL_MIGRATIONS: readonly Migration[] = validateRegistered(REGISTERED_MODULES); + +/** + * Highest registered migration version. Derived from the registry + * (no hand-maintained constant to keep in sync). 
+ */ +export const CURRENT_SCHEMA_VERSION: number = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; diff --git a/src/db/migrations/types.ts b/src/db/migrations/types.ts new file mode 100644 index 00000000..479af672 --- /dev/null +++ b/src/db/migrations/types.ts @@ -0,0 +1,25 @@ +/** + * Migration registry types. + * + * Each migration ships its own self-contained file + * (`./NNN-description.ts`) exporting a `MIGRATION: + * MigrationModule`. The version number is derived from the + * leading 3-digit prefix on the filename, NOT from a field in the + * module — this guarantees no two PRs can claim the same version + * silently (filenames collide on the filesystem; SQL migrations + * never silently no-op). + */ + +import type { SqliteDatabase } from '../sqlite-adapter'; + +export interface MigrationModule { + /** One-line description for `schema_versions` table + diagnostics. */ + readonly description: string; + /** The actual schema-mutation function. Wrapped in a transaction. */ + readonly up: (db: SqliteDatabase) => void; +} + +export interface Migration extends MigrationModule { + /** Version derived from filename's leading NNN prefix. */ + readonly version: number; +} From 8c86079afa3799e644e9034ea3b4307fdb29554e Mon Sep 17 00:00:00 2001 From: andreinknv Date: Tue, 28 Apr 2026 12:49:01 -0400 Subject: [PATCH 2/2] perf(db): drop redundant idx_edges_source and idx_edges_target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These two narrow indexes are fully covered by the wider idx_edges_source_kind and idx_edges_target_kind composite indexes via SQLite's left-prefix scan. Keeping them costs DB size and bulk- insert time without giving any query that the kind-prefixed indexes don't already cover. 
Empirical measurements on a 50K-node / 250K-edge synthesized DB (scripts/spikes/spike-edge-indexes.mjs): - DB size: -22.2% (34.7 MB -> 27.0 MB) - Bulk insert: 1.37x faster (590ms -> 431ms) - Source-only / target-only query latency: no regression (EXPLAIN: SEARCH edges USING COVERING INDEX idx_edges_source_kind (source=?)) Ported to the post-#118 file-based-migration form: migration body lives in src/db/migrations/017-drop-redundant-edge-indexes.ts and is registered in src/db/migrations/index.ts. CURRENT_SCHEMA_VERSION is auto-derived from the registry — no constant bump. Schema: removes the two CREATE INDEX statements on the narrow indexes; the wider kind-prefixed indexes (which actually serve queries) stay. --- __tests__/foundation.test.ts | 2 +- __tests__/pr19-improvements.test.ts | 2 +- scripts/spikes/spike-edge-indexes.mjs | 119 ++++++++++++++++++ .../017-drop-redundant-edge-indexes.ts | 26 ++++ src/db/migrations/index.ts | 2 + src/db/schema.sql | 7 +- 6 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 scripts/spikes/spike-edge-indexes.mjs create mode 100644 src/db/migrations/017-drop-redundant-edge-indexes.ts diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..048c2ce5 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(17); db.close(); }); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..352c8c7f 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await 
import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(17); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/scripts/spikes/spike-edge-indexes.mjs b/scripts/spikes/spike-edge-indexes.mjs new file mode 100644 index 00000000..eee81529 --- /dev/null +++ b/scripts/spikes/spike-edge-indexes.mjs @@ -0,0 +1,119 @@ +#!/usr/bin/env node +/** + * Spike: redundant edge indexes + * + * Drops `idx_edges_source` and `idx_edges_target` and measures + * the impact on: + * - DB size + * - Bulk-insert throughput + * - Latency for `WHERE source = ?` and `WHERE target = ?` + * (the two queries that previously hit the dropped indexes) + * + * The hypothesis: SQLite covers source-only / target-only lookups + * via the wider `(source, kind)` and `(target, kind)` composite + * indexes through left-prefix scan, so dropping the narrow ones + * costs nothing on the read side but saves space and write time. + * + * Synthesises 50K nodes / 250K edges so the measurement scales to + * what real users will hit; codegraph's own DB at ~2K nodes is too + * small for index choices to surface. + */ +import Database from 'better-sqlite3'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +const NODES = 50_000; +const EDGES_PER_NODE = 5; + +function ms(start) { return Number(process.hrtime.bigint() - start) / 1_000_000; } +function fmt(n) { return n < 10 ? 
n.toFixed(2) : n.toFixed(0); } + +console.log('\n=== Spike: redundant edge indexes ===\n'); +console.log(`Synthesizing ${NODES.toLocaleString()} nodes, ${(NODES*EDGES_PER_NODE).toLocaleString()} edges...`); + +function buildEdgesDb({ withRedundant }) { + const dbPath = path.join(os.tmpdir(), `spike-edges-${Date.now()}-${Math.random()}.db`); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + db.pragma('cache_size = -64000'); + db.exec(` + CREATE TABLE nodes (id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL); + CREATE TABLE edges ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL, target TEXT NOT NULL, kind TEXT NOT NULL, + line INTEGER, col INTEGER + ); + CREATE INDEX idx_edges_kind ON edges(kind); + CREATE INDEX idx_edges_source_kind ON edges(source, kind); + CREATE INDEX idx_edges_target_kind ON edges(target, kind); + `); + if (withRedundant) { + db.exec(` + CREATE INDEX idx_edges_source ON edges(source); + CREATE INDEX idx_edges_target ON edges(target); + `); + } + + const insNode = db.prepare('INSERT INTO nodes (id, kind, name) VALUES (?, ?, ?)'); + const insEdge = db.prepare('INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)'); + const KINDS = ['calls', 'imports', 'references', 'type_of', 'extends', 'instantiates']; + const tStart = process.hrtime.bigint(); + db.transaction(() => { + for (let i = 0; i < NODES; i++) { + insNode.run(`n${i}`, 'function', `name${i}`); + } + for (let i = 0; i < NODES; i++) { + for (let j = 0; j < EDGES_PER_NODE; j++) { + const tgt = `n${(i + j + 1) % NODES}`; + const kind = KINDS[j % KINDS.length]; + insEdge.run(`n${i}`, tgt, kind, i, j); + } + } + })(); + const insertMs = ms(tStart); + db.exec('PRAGMA optimize'); + + return { db, dbPath, size: fs.statSync(dbPath).size, insertMs }; +} + +const baseline = buildEdgesDb({ withRedundant: true }); +const stripped = buildEdgesDb({ withRedundant: false }); + +console.log(''); 
+console.log(` baseline (with redundant): size=${(baseline.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(baseline.insertMs)}ms`); +console.log(` stripped : size=${(stripped.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(stripped.insertMs)}ms`); +const sizeDelta = ((baseline.size - stripped.size) / baseline.size * 100).toFixed(1); +const insertSpeedup = (baseline.insertMs / stripped.insertMs).toFixed(2); +console.log(` Δ size: -${sizeDelta}% · Δ bulk insert: ${insertSpeedup}× faster without redundant indexes`); + +function timeQueries(db, label) { + const N = 500; + const sourceOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE source = ?'); + const targetOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE target = ?'); + let t = process.hrtime.bigint(); + for (let i = 0; i < N; i++) sourceOnly.get(`n${i % NODES}`); + const sourceMs = ms(t) / N; + t = process.hrtime.bigint(); + for (let i = 0; i < N; i++) targetOnly.get(`n${i % NODES}`); + const targetMs = ms(t) / N; + console.log(` ${label}: WHERE source=? avg ${fmt(sourceMs)}ms · WHERE target=? avg ${fmt(targetMs)}ms`); + return { sourceMs, targetMs }; +} +console.log(''); +const baseQ = timeQueries(baseline.db, 'baseline'); +const strQ = timeQueries(stripped.db, 'stripped'); +console.log(` query speed delta: source ${(strQ.sourceMs / baseQ.sourceMs).toFixed(2)}× · target ${(strQ.targetMs / baseQ.targetMs).toFixed(2)}× (>1 = stripped slower)`); + +// EXPLAIN-confirm that the stripped DB still uses an index for these +// queries — we want to know it's a covering scan, not a table scan. 
+const plan = stripped.db.prepare('EXPLAIN QUERY PLAN SELECT COUNT(*) FROM edges WHERE source = ?').all('n0'); +console.log(''); +console.log(' EXPLAIN (stripped, source=?):'); +for (const row of plan) console.log(` ${row.detail}`); + +baseline.db.close(); stripped.db.close(); +fs.unlinkSync(baseline.dbPath); fs.unlinkSync(stripped.dbPath); + +console.log('\n=== Done ===\n'); diff --git a/src/db/migrations/017-drop-redundant-edge-indexes.ts b/src/db/migrations/017-drop-redundant-edge-indexes.ts new file mode 100644 index 00000000..08f8593e --- /dev/null +++ b/src/db/migrations/017-drop-redundant-edge-indexes.ts @@ -0,0 +1,26 @@ +import type { MigrationModule } from './types'; + +/** + * Drop `idx_edges_source` and `idx_edges_target` — both fully covered + * by the wider `idx_edges_source_kind` and `idx_edges_target_kind` + * composite indexes via SQLite's left-prefix scan. Keeping the narrow + * ones costs ~17-22% of DB size and ~1.3x bulk-insert time without + * giving any query that the kind-prefixed indexes don't already cover. + * + * EXPLAIN confirms the planner now uses the wider indexes as covering + * indexes for source-only / target-only queries: + * SEARCH edges USING COVERING INDEX idx_edges_source_kind (source=?) + * + * See `scripts/spikes/spike-edge-indexes.mjs` for the reproducer + * (companion to PR #122 against `main`; this is the file-based form + * for the post-#118 integration branch). 
+ */ +export const MIGRATION: MigrationModule = { + description: 'Drop redundant idx_edges_source and idx_edges_target indexes', + up: (db) => { + db.exec(` + DROP INDEX IF EXISTS idx_edges_source; + DROP INDEX IF EXISTS idx_edges_target; + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index f9bbcf10..1cd6e32f 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -26,6 +26,7 @@ import type { Migration, MigrationModule } from './types'; import { MIGRATION as MIG_002 } from './002-project-metadata'; import { MIGRATION as MIG_003 } from './003-lower-name-index'; +import { MIGRATION as MIG_017 } from './017-drop-redundant-edge-indexes'; interface ModuleRef { /** @@ -48,6 +49,7 @@ interface ModuleRef { const REGISTERED_MODULES: readonly ModuleRef[] = [ { filename: '002-project-metadata.ts', module: MIG_002 }, { filename: '003-lower-name-index.ts', module: MIG_003 }, + { filename: '017-drop-redundant-edge-indexes.ts', module: MIG_017 }, ]; /** Strict 3-digit prefix on each migration filename. */ diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..4c8d6ed5 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -123,8 +123,11 @@ CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN END; -- Edge indexes -CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source); -CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target); +-- Note: narrow source/target indexes are intentionally omitted — the +-- (source, kind) and (target, kind) composite indexes below cover +-- source-only and target-only lookups via SQLite's left-prefix scan +-- (see migration 017 for the empirical justification: ~22% DB size +-- and ~1.3x bulk-insert win with no query regression). CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind); CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind); CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind);