diff --git a/CLAUDE.md b/CLAUDE.md index 71a50c73..4e7c46aa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -98,7 +98,7 @@ SQLite database with: ### Supported Languages -TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal +TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal, HCL / Terraform ### Node and Edge Types diff --git a/__tests__/centrality.test.ts b/__tests__/centrality.test.ts new file mode 100644 index 00000000..e45dc858 --- /dev/null +++ b/__tests__/centrality.test.ts @@ -0,0 +1,134 @@ +import { describe, it, expect } from 'vitest'; +import { computePageRank, PR_DAMPING, PR_ITERATIONS } from '../src/centrality'; + +function asNodes(ids: string[]) { + return ids.map((id) => ({ id })); +} + +describe('computePageRank', () => { + it('returns empty result for an empty graph', () => { + const r = computePageRank([], []); + expect(r.scores.size).toBe(0); + expect(r.iterations).toBe(0); + }); + + it('assigns uniform rank to N isolated nodes', () => { + const r = computePageRank(asNodes(['a', 'b', 'c', 'd']), []); + expect(r.scores.size).toBe(4); + // 4 isolated nodes — all dangling — should each end up with 1/N. + for (const v of r.scores.values()) { + expect(v).toBeCloseTo(0.25, 6); + } + }); + + it('rewards being reached (sinks accumulate rank)', () => { + // a -> b -> c. c has no outgoing, so it accumulates the most. + const r = computePageRank( + asNodes(['a', 'b', 'c']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + ] + ); + const a = r.scores.get('a')!; + const b = r.scores.get('b')!; + const c = r.scores.get('c')!; + expect(c).toBeGreaterThan(b); + expect(b).toBeGreaterThan(a); + }); + + it('star: hub ranks above all leaves; leaves are equal', () => { + const leaves = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l7', 'l8', 'l9']; + const edges = leaves.map((l) => ({ source: l, target: 'hub' })); + const r = computePageRank(asNodes([...leaves, 'hub']), edges); + const hub = r.scores.get('hub')!; + for (const l of leaves) { + const lv = r.scores.get(l)!; + expect(hub).toBeGreaterThan(lv); + } + // Leaves are symmetric — should be within 1e-9. + const first = r.scores.get(leaves[0])!; + for (const l of leaves.slice(1)) { + expect(r.scores.get(l)!).toBeCloseTo(first, 9); + } + }); + + it('cycle: all nodes have approximately equal rank', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + { source: 'c', target: 'a' }, + ] + ); + const a = r.scores.get('a')!; + const b = r.scores.get('b')!; + const c = r.scores.get('c')!; + // Symmetric → all equal at convergence. 
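+ // With the textbook update rule at damping d = 0.85 (see the
+ // sentinel test below), PR(v) = (1 - d)/N + d * PR(u) for the single
+ // in-neighbor u, and the symmetric fixed point is PR = 1/N = 1/3.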
+ expect(a).toBeCloseTo(b, 6); + expect(b).toBeCloseTo(c, 6); + }); + + it('total rank sums to ~1 (mass is conserved)', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c', 'd', 'e']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + { source: 'd', target: 'c' }, + { source: 'e', target: 'd' }, + { source: 'a', target: 'e' }, + ] + ); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + }); + + it('preserves mass across two disconnected components', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c', 'd']), + [ + { source: 'a', target: 'b' }, + { source: 'c', target: 'd' }, + ] + ); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + // Within each component, the sink ranks above the source. + expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!); + expect(r.scores.get('d')!).toBeGreaterThan(r.scores.get('c')!); + }); + + it('drops edges referencing unknown nodes', () => { + // 'ghost' is not in the node set — that edge should be ignored, + // not crash and not pollute scores. + const r = computePageRank( + asNodes(['a', 'b']), + [ + { source: 'a', target: 'b' }, + { source: 'a', target: 'ghost' }, + { source: 'ghost', target: 'b' }, + ] + ); + expect(r.scores.size).toBe(2); + expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + }); + + it('reports iteration count and duration', () => { + const r = computePageRank(asNodes(['a', 'b']), [{ source: 'a', target: 'b' }]); + expect(r.iterations).toBe(PR_ITERATIONS); + expect(r.durationMs).toBeGreaterThanOrEqual(0); + }); + + it('damping constant is the textbook 0.85', () => { + // Sentinel — protects against accidental tuning that would invalidate + // the spike findings the PR was justified on. + expect(PR_DAMPING).toBe(0.85); + }); +}); diff --git a/__tests__/churn.test.ts b/__tests__/churn.test.ts new file mode 100644 index 00000000..fbe279f6 --- /dev/null +++ b/__tests__/churn.test.ts @@ -0,0 +1,208 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { execFileSync } from 'child_process'; +import { + mineChurn, + getGitHead, + readFileLoc, + MAX_FILES_PER_COMMIT, + LAST_MINED_CHURN_HEAD_KEY, +} from '../src/churn'; + +let HAS_GIT = true; +try { + execFileSync('git', ['--version'], { stdio: 'ignore' }); +} catch { + HAS_GIT = false; +} + +let tempDir: string; + +function git(...args: string[]): string { + return execFileSync('git', args, { + cwd: tempDir, + encoding: 'utf-8', + env: { + ...process.env, + GIT_AUTHOR_NAME: 'Test', + GIT_AUTHOR_EMAIL: 'test@example.com', + GIT_COMMITTER_NAME: 'Test', + GIT_COMMITTER_EMAIL: 'test@example.com', + GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE, + GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim(); +} + +function commitAt(date: string, paths: string[], content?: string) { + for (const p of paths) { + const abs = path.join(tempDir, p); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content ?? `data for ${p} at ${date}\n`); + } + git('add', ...paths); + // Pin both author and committer dates so timestamps are deterministic. 
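+ // (GIT_AUTHOR_DATE / GIT_COMMITTER_DATE are standard git environment
+ // overrides; `git log --format=%at/%ct` then reports the pinned values.)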
+ process.env.GIT_AUTHOR_DATE = date; + process.env.GIT_COMMITTER_DATE = date; + git('commit', '-m', `commit at ${date}`); + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; +} + +beforeEach(() => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-churn-')); + if (HAS_GIT) { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + } +}); + +afterEach(() => { + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; + fs.rmSync(tempDir, { recursive: true, force: true }); +}); + +describe.skipIf(!HAS_GIT)('mineChurn', () => { + it('returns empty + null head when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + const r = mineChurn(nonGit, new Set(['foo.ts']), null); + expect(r.currentHead).toBeNull(); + expect(r.deltas.size).toBe(0); + expect(r.needsFullRescan).toBe(false); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); + + it('counts commits per indexed file, ignores files not in index', () => { + commitAt('2025-01-01T00:00:00', ['a.ts', 'b.ts']); + commitAt('2025-01-02T00:00:00', ['a.ts']); + commitAt('2025-01-03T00:00:00', ['a.ts', 'b.ts', 'c.ts']); + + const r = mineChurn(tempDir, new Set(['a.ts', 'b.ts']), null); + expect(r.deltas.get('a.ts')?.commitCountDelta).toBe(3); + expect(r.deltas.get('b.ts')?.commitCountDelta).toBe(2); + expect(r.deltas.has('c.ts')).toBe(false); + }); + + it('records first-seen / last-touched as min/max of commit timestamps', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + commitAt('2025-06-01T00:00:00Z', ['a.ts']); + commitAt('2025-12-01T00:00:00Z', ['a.ts']); + + const r = mineChurn(tempDir, new Set(['a.ts']), null); + const d = r.deltas.get('a.ts')!; + // 2025-01-01 UTC = 1735689600 + expect(d.firstSeenTs).toBe(1735689600); + // 2025-12-01 UTC = 1764547200 + expect(d.lastTouchedTs).toBe(1764547200); + }); + + it('skips commits touching more than MAX_FILES_PER_COMMIT files', () => { + const bigBatch: string[] = []; + for (let i = 0; i < MAX_FILES_PER_COMMIT + 1; i++) bigBatch.push(`f${i}.ts`); + commitAt('2025-01-01T00:00:00Z', bigBatch); + // Then a normal commit on one of the same files. + commitAt('2025-02-01T00:00:00Z', ['f0.ts']); + + const r = mineChurn(tempDir, new Set(bigBatch), null); + // First commit was skipped; only the second one should count. + expect(r.deltas.get('f0.ts')?.commitCountDelta).toBe(1); + // Files only seen in the skipped commit produce no delta at all. + expect(r.deltas.has('f50.ts')).toBe(false); + }); + + it('incremental mining returns only commits since the given sha', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const sha1 = getGitHead(tempDir)!; + commitAt('2025-01-02T00:00:00Z', ['a.ts']); + commitAt('2025-01-03T00:00:00Z', ['a.ts']); + + const incr = mineChurn(tempDir, new Set(['a.ts']), sha1); + // Only the two commits *after* sha1 should be counted. 
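+ // (In git terms, this is the `sha1..HEAD` commit range.)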
+ expect(incr.deltas.get('a.ts')?.commitCountDelta).toBe(2); + expect(incr.needsFullRescan).toBe(false); + }); + + it('returns needsFullRescan=true when sinceSha is unreachable', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const fakeSha = '0'.repeat(40); + const r = mineChurn(tempDir, new Set(['a.ts']), fakeSha); + expect(r.needsFullRescan).toBe(true); + expect(r.deltas.size).toBe(0); + expect(r.currentHead).not.toBeNull(); + }); + + it('returns empty deltas when sinceSha equals current head (no-op)', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const head = getGitHead(tempDir)!; + const r = mineChurn(tempDir, new Set(['a.ts']), head); + expect(r.currentHead).toBe(head); + expect(r.deltas.size).toBe(0); + expect(r.needsFullRescan).toBe(false); + }); + + it('handles paths with spaces and unicode safely (NUL-delimited)', () => { + commitAt('2025-01-01T00:00:00Z', ['name with space.ts']); + commitAt('2025-01-02T00:00:00Z', ['ünïcødë.ts']); + + const r = mineChurn( + tempDir, + new Set(['name with space.ts', 'ünïcødë.ts']), + null + ); + expect(r.deltas.get('name with space.ts')?.commitCountDelta).toBe(1); + expect(r.deltas.get('ünïcødë.ts')?.commitCountDelta).toBe(1); + }); + + it('LAST_MINED_CHURN_HEAD_KEY is stable (used as project_metadata key)', () => { + expect(LAST_MINED_CHURN_HEAD_KEY).toBe('last_mined_churn_head'); + }); +}); + +describe('readFileLoc', () => { + it('returns 0 for an empty file', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + const f = path.join(dir, 'empty.txt'); + fs.writeFileSync(f, ''); + expect(readFileLoc(dir, 'empty.txt')).toBe(0); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('counts newline-terminated lines', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc\n'); + expect(readFileLoc(dir, 'x.txt')).toBe(3); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('counts a final no-newline chunk as one extra line', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc'); + expect(readFileLoc(dir, 'x.txt')).toBe(3); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('returns 0 for a missing file (does not throw)', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + expect(readFileLoc(dir, 'no-such-file.txt')).toBe(0); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/__tests__/codegraphignore.test.ts b/__tests__/codegraphignore.test.ts new file mode 100644 index 00000000..4d7e58c5 --- /dev/null +++ b/__tests__/codegraphignore.test.ts @@ -0,0 +1,168 @@ +/** + * .codegraphignore Tests + * + * Regression test for the bug where the .codegraphignore marker file was + * honored by the filesystem-walk fallback (`scanDirectoryWalk`) but + * silently ignored by the git fast path (`getGitVisibleFiles` and + * `getGitChangedFiles`). Same project gave different file sets depending + * on whether `.git` existed. 
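+ *
+ * Layout exercised below (the marker itself is an empty file):
+ *
+ *   src/app.ts               -> indexed
+ *   vendor/.codegraphignore  -> marks vendor/** as ignored
+ *   vendor/pkg.ts            -> must not be indexed
+ *   vendor/lib/sub.ts        -> must not be indexed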
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { execFileSync } from 'child_process'; +import { scanDirectory } from '../src/extraction'; +import { DEFAULT_CONFIG, CodeGraphConfig } from '../src/types'; +import CodeGraph from '../src/index'; + +function tempDir(prefix: string): string { + return fs.mkdtempSync(path.join(os.tmpdir(), prefix)); +} + +function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); +} + +const config: CodeGraphConfig = { + ...DEFAULT_CONFIG, + include: ['**/*.ts'], + exclude: [], +}; + +describe('.codegraphignore marker (bug #3)', () => { + describe('git fast path', () => { + let dir: string; + + beforeEach(() => { + dir = tempDir('codegraph-ignore-git-'); + git(dir, 'init'); + git(dir, 'config', 'user.email', 'test@test.com'); + git(dir, 'config', 'user.name', 'Test'); + // Pin branch name for determinism across git defaults + git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main'); + + fs.mkdirSync(path.join(dir, 'src')); + fs.mkdirSync(path.join(dir, 'vendor')); + fs.mkdirSync(path.join(dir, 'vendor', 'lib')); + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'lib', 'sub.ts'), 'export const s = 1;'); + // Mark vendor/ as ignored + fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), ''); + + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'initial'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('scanDirectory honors .codegraphignore on the git fast path', () => { + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + expect(files).not.toContain('vendor/lib/sub.ts'); + }); + + it('marker at project root excludes everything', () => { + fs.writeFileSync(path.join(dir, '.codegraphignore'), ''); + // Need to add it to git so ls-files sees it (or rely on -o) + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add root marker'); + const files = scanDirectory(dir, config); + expect(files).toEqual([]); + }); + + it('marker in nested subdir does not affect siblings', () => { + // Add another sibling subdir without a marker + fs.mkdirSync(path.join(dir, 'libs')); + fs.writeFileSync(path.join(dir, 'libs', 'util.ts'), 'export const u = 1;'); + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add libs'); + + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).toContain('libs/util.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + }); + + it('respects marker added after initial commit (untracked marker)', () => { + // The marker file itself need not be committed — it can be a local + // override. Add marker AFTER commit, do not commit it. + fs.mkdirSync(path.join(dir, 'generated')); + fs.writeFileSync(path.join(dir, 'generated', 'gen.ts'), 'export const g = 1;'); + fs.writeFileSync(path.join(dir, 'generated', '.codegraphignore'), ''); + // The .ts file is untracked but visible via `git ls-files -o`. + // The marker is also untracked — we still detect it via fs check. 
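+ // (`-o` is short for `--others`, i.e. list untracked files.)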
+ + const files = scanDirectory(dir, config); + expect(files).not.toContain('generated/gen.ts'); + }); + }); + + describe('parity with non-git fallback (filesystem walk)', () => { + let dir: string; + + beforeEach(() => { + dir = tempDir('codegraph-ignore-walk-'); + fs.mkdirSync(path.join(dir, 'src')); + fs.mkdirSync(path.join(dir, 'vendor')); + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), ''); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('non-git project also honors the marker (sanity / pre-existing behavior)', () => { + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + }); + }); + + describe('sync git path (getGitChangedFiles)', () => { + let dir: string; + let cg: CodeGraph; + + beforeEach(async () => { + dir = tempDir('codegraph-ignore-sync-'); + git(dir, 'init'); + git(dir, 'config', 'user.email', 'test@test.com'); + git(dir, 'config', 'user.name', 'Test'); + git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main'); + + fs.mkdirSync(path.join(dir, 'src')); + fs.mkdirSync(path.join(dir, 'vendor')); + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), ''); + + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'initial'); + + cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('sync ignores changes inside marker dirs', async () => { + // Add a new file under vendor/ — should NOT be picked up by sync. + fs.writeFileSync(path.join(dir, 'vendor', 'leaked.ts'), 'export const x = 1;'); + // Also add a real change to confirm sync still runs. + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 2;'); + + const result = await cg.sync(); + expect(result.changedFilePaths).toContain('src/app.ts'); + expect(result.changedFilePaths ?? []).not.toContain('vendor/leaked.ts'); + }); + }); +}); diff --git a/__tests__/config-refs.test.ts b/__tests__/config-refs.test.ts new file mode 100644 index 00000000..ab1a63e4 --- /dev/null +++ b/__tests__/config-refs.test.ts @@ -0,0 +1,288 @@ +/** + * Config-refs tests: parser unit tests + end-to-end through CodeGraph. 
+ */
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractConfigRefs } from '../src/config-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests (no CodeGraph) +// ============================================================================ + +describe('extractConfigRefs', () => { + it('extracts process.env.X from TS', () => { + write('a.ts', `const port = process.env.OBSIDIAN_PORT;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.configKey).toBe('OBSIDIAN_PORT'); + expect(refs[0]!.line).toBe(1); + }); + + it('extracts process.env["X"] from JS', () => { + write('a.js', `module.exports = { port: process.env["MY_KEY"] };\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.js', language: 'javascript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['MY_KEY']); + }); + + it('extracts os.getenv / os.environ from Python', () => { + write( + 'a.py', + [ + `import os`, + `port = os.getenv("PYTHON_PORT")`, + `host = os.environ.get("PYTHON_HOST")`, + `path = os.environ["PYTHON_PATH"]`, + `name = getenv("PYTHON_NAME")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual( + new Set(['PYTHON_PORT', 'PYTHON_HOST', 'PYTHON_PATH', 'PYTHON_NAME']) + ); + }); + + it('extracts os.Getenv / os.LookupEnv from Go', () => { + write( + 'a.go', + [ + `package main`, + `import "os"`, + `var Port = os.Getenv("GO_PORT")`, + `var Host, _ = os.LookupEnv("GO_HOST")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.go', language: 'go' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['GO_PORT', 'GO_HOST'])); + }); + + it('extracts ENV[...] 
/ ENV.fetch from Ruby', () => { + write('a.rb', `port = ENV["RUBY_PORT"]\nhost = ENV.fetch("RUBY_HOST")\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.rb', language: 'ruby' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUBY_PORT', 'RUBY_HOST'])); + }); + + it('extracts env!/std::env::var from Rust', () => { + write( + 'a.rs', + [ + `let port = env!("RUST_PORT");`, + `let host = std::env::var("RUST_HOST").unwrap();`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.rs', language: 'rust' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUST_PORT', 'RUST_HOST'])); + }); + + it('extracts System.getenv from Java/Kotlin', () => { + write('A.java', `String port = System.getenv("JAVA_PORT");\n`); + const refs = extractConfigRefs(testDir, [{ path: 'A.java', language: 'java' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['JAVA_PORT']); + }); + + it('only matches UPPER_CASE keys (skips lower-case identifiers)', () => { + write('a.ts', `const x = process.env.somethingDynamic;\nconst y = process.env.GOOD_KEY;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['GOOD_KEY']); + }); + + it('skips files in unsupported languages without crashing', () => { + write('a.swift', `let port = ProcessInfo.processInfo.environment["SWIFT_PORT"]\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + // Swift not in PATTERNS for v1. + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [ + `// line 1`, + `// line 2`, + `const x = process.env.LINE_THREE_KEY;`, + `// line 4`, + `const y = process.env.LINE_FIVE_KEY;`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([ + expect.objectContaining({ configKey: 'LINE_THREE_KEY', line: 3 }), + expect.objectContaining({ configKey: 'LINE_FIVE_KEY', line: 5 }), + ]); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `const x = process.env.FOO;\n`); + const calls: Array<[string, number]> = []; + extractConfigRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-node-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractConfigRefs( + testDir, + [{ path: 'does-not-exist.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph config refs', () => { + it('persists env reads after indexAll and resolves enclosing function', async () => { + write( + 'src/server.ts', + [ + `export function start() {`, + ` const port = process.env.OBSIDIAN_PORT ?? 
8080;`, + ` return port;`, + `}`, + ``, + `export function getApiKey() {`, + ` return process.env.OBSIDIAN_API_KEY;`, + `}`, + ``, + `// top-level read`, + `export const HOST = process.env.OBSIDIAN_HOST;`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + + // All three keys should be visible. + const keys = cg.getConfigKeys({ configKind: 'env' }); + expect(keys.map((k) => k.configKey).sort()).toEqual([ + 'OBSIDIAN_API_KEY', + 'OBSIDIAN_HOST', + 'OBSIDIAN_PORT', + ]); + + // The OBSIDIAN_PORT read should be attributed to `start`. + const portSites = cg.getConfigRefsByKey('OBSIDIAN_PORT'); + expect(portSites.length).toBe(1); + expect(portSites[0]!.sourceName).toBe('start'); + + // The HOST read is at the top level — sourceName should be null. + const hostSites = cg.getConfigRefsByKey('OBSIDIAN_HOST'); + expect(hostSites[0]!.sourceName).toBeNull(); + }); + + it('reverse view: getConfigKeysForNode returns keys read by a function', async () => { + write( + 'src/a.ts', + [ + `export function loadConfig() {`, + ` const a = process.env.KEY_A;`, + ` const b = process.env.KEY_B;`, + ` return { a, b };`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'loadConfig')!; + const keys = cg.getConfigKeysForNode(node.id).map((r) => r.configKey).sort(); + expect(keys).toEqual(['KEY_A', 'KEY_B']); + }); + + it('respects enableConfigRefs=false', async () => { + write('src/a.ts', `export const PORT = process.env.PORT;\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableConfigRefs: false }, + }); + await cg.indexAll(); + expect(cg.getConfigKeys()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `export const A = process.env.OLD_KEY;\n`); + write('src/b.ts', `export const B = process.env.UNCHANGED_KEY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().map((k) => k.configKey).sort()).toEqual([ + 'OLD_KEY', + 'UNCHANGED_KEY', + ]); + + // Edit only a.ts — UNCHANGED_KEY should still be there. + write('src/a.ts', `export const A = process.env.NEW_KEY;\n`); + await cg.sync(); + + const keys = cg.getConfigKeys().map((k) => k.configKey).sort(); + expect(keys).toContain('NEW_KEY'); + expect(keys).toContain('UNCHANGED_KEY'); + expect(keys).not.toContain('OLD_KEY'); + }); + + it('drops refs when a file is edited to remove its last env read', async () => { + // Regression for the empty-rows early-return data-corruption bug: + // applyConfigRefs([]) used to short-circuit without deleting the + // stale rows for the file. The sync path now explicitly invalidates + // rows for every changed file *before* extracting, regardless of + // whether the new content has any reads. + write('src/a.ts', `export const PORT = process.env.REMOVED_KEY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(true); + + // Edit a.ts to remove the env read entirely (no remaining reads). 
+ write('src/a.ts', `export const PORT = 8080; // no env read here\n`); + await cg.sync(); + + expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `export const A = process.env.GOING_AWAY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + + expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. + // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) +}); diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts index 52dae1fe..9a0614aa 100644 --- a/__tests__/context.test.ts +++ b/__tests__/context.test.ts @@ -210,6 +210,19 @@ export function validateEmail(email: string): boolean { expect(result.nodes.size).toBeLessThanOrEqual(5); }); + + it('should clamp absurd searchLimit/maxNodes values to safe upper bounds', async () => { + // Without clamping, the internal `findNodesByExactName` query would + // request `searchLimit * 5` rows — passing 1e9 here would blow out + // memory. The call should complete in normal time and not return more + // than the hard cap on maxNodes (1000). + const result = await cg.findRelevantContext('function', { + searchLimit: 1_000_000_000, + maxNodes: 1_000_000_000, + traversalDepth: 1_000, + }); + expect(result.nodes.size).toBeLessThanOrEqual(1000); + }); }); describe('buildContext()', () => { diff --git a/__tests__/db-perf.test.ts b/__tests__/db-perf.test.ts new file mode 100644 index 00000000..256cf92c --- /dev/null +++ b/__tests__/db-perf.test.ts @@ -0,0 +1,161 @@ +/** + * DB Performance / Correctness Tests + * + * Regression tests for three changes: + * 1. Batch `getNodesByIds` collapses graph-traversal N+1 reads. + * 2. `insertNode` invalidates the LRU cache so INSERT OR REPLACE + * doesn't serve a stale cached row on next `getNodeById`. + * 3. `runMaintenance` runs `PRAGMA optimize` + `wal_checkpoint(PASSIVE)` + * after indexAll/sync without throwing. 
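+ *
+ * For reference, the maintenance pass described in (3) amounts to two
+ * best-effort statements (sketch; the wiring lives in runMaintenance):
+ *
+ *   PRAGMA optimize;
+ *   PRAGMA wal_checkpoint(PASSIVE);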
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; + +function makeNode(id: string, name = id): Node { + return { + id, + kind: 'function', + name, + qualifiedName: name, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +describe('getNodesByIds (batch lookup)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-batch-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('returns a Map keyed by id, with one entry per existing node', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + const out = q.getNodesByIds(['n1', 'n2', 'n3']); + expect(out.size).toBe(3); + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n3')!.name).toBe('n3'); + }); + + it('omits missing IDs from the result map (no nulls, no exceptions)', () => { + q.insertNodes([makeNode('n1'), makeNode('n2')]); + const out = q.getNodesByIds(['n1', 'missing', 'n2']); + expect(out.size).toBe(2); + expect(out.has('missing')).toBe(false); + expect(out.has('n1')).toBe(true); + expect(out.has('n2')).toBe(true); + }); + + it('handles an empty input array', () => { + expect(q.getNodesByIds([]).size).toBe(0); + }); + + it('handles batches over the SQLite parameter limit (chunking)', () => { + // Insert 1500 nodes; the helper chunks at 500 internally. + const nodes = Array.from({ length: 1500 }, (_, i) => makeNode(`n${i}`)); + q.insertNodes(nodes); + const ids = nodes.map((n) => n.id); + const out = q.getNodesByIds(ids); + expect(out.size).toBe(1500); + // Spot-check a few from the first / middle / last chunk. + expect(out.has('n0')).toBe(true); + expect(out.has('n750')).toBe(true); + expect(out.has('n1499')).toBe(true); + }); + + it('serves cache hits from memory and queries only the misses', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + // Warm the cache for n1 only. + q.getNodeById('n1'); + // Replace the underlying row to make a miss-vs-cache-hit detectable. + db.getDb().prepare('UPDATE nodes SET name = ? WHERE id = ?').run('changed', 'n1'); + const out = q.getNodesByIds(['n1', 'n2']); + // The cached n1 (still 'n1', not 'changed') must be returned. + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n2')!.name).toBe('n2'); + }); +}); + +describe('insertNode cache invalidation', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-cache-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('does not serve a stale cached node after INSERT OR REPLACE', () => { + // Regression: insertNode (which uses INSERT OR REPLACE) used to skip + // cache invalidation, so the next getNodeById returned the pre-replace + // version until LRU eviction. 
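+ // The fix is to evict the row from the LRU inside insertNode, e.g.
+ // `this.nodeCache.delete(node.id)` (cache field name hypothetical).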
+ const original = makeNode('n1', 'oldName'); + q.insertNode(original); + const beforeReplace = q.getNodeById('n1'); + expect(beforeReplace!.name).toBe('oldName'); + + // Replace via insertNode (the bug path). + q.insertNode({ ...original, name: 'newName', updatedAt: Date.now() }); + const afterReplace = q.getNodeById('n1'); + expect(afterReplace!.name).toBe('newName'); + }); +}); + +describe('runMaintenance', () => { + let dir: string; + let db: DatabaseConnection; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-maint-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('runs without throwing on a fresh database', () => { + expect(() => db.runMaintenance()).not.toThrow(); + }); + + it('runs without throwing after writes', () => { + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1'), makeNode('n2')]); + expect(() => db.runMaintenance()).not.toThrow(); + }); + + it('swallows failures rather than propagating (best-effort)', () => { + // Close the DB so the underlying handle would normally throw on any + // exec(). runMaintenance must still not propagate. + db.close(); + expect(() => db.runMaintenance()).not.toThrow(); + }); +}); diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts new file mode 100644 index 00000000..181ee9c5 --- /dev/null +++ b/__tests__/diversify.test.ts @@ -0,0 +1,200 @@ +/** + * Result Diversification Tests + * + * Verifies the per-file cap on search results: queries that match many + * symbols in one file (the methods of a class) no longer return 10 hits + * from one file, but instead surface representative breadth across files. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { diversifyByFile } from '../src/search/query-utils'; +import { Node } from '../src/types'; + +describe('diversifyByFile (unit)', () => { + function r(score: number, name: string, filePath: string) { + return { node: { id: name, name, filePath } as Node, score }; + } + + it('caps consecutive results from the same file at perFileCap', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + // First two from a.ts (cap), then b.ts (different file), then backfill. + }); + + it('preserves overall ranking when no file dominates', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'b1', 'b.ts'), + r(8, 'c1', 'c.ts'), + r(7, 'a2', 'a.ts'), + ]; + const out = diversifyByFile(results, 4, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']); + }); + + it('does not lose results — backfills from skipped when limit not yet filled', () => { + // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3. 
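+ // Expected shape: n0, n1 admitted under the cap, then n2, n3, n4
+ // backfilled in score order.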
+ const results = Array.from({ length: 10 }, (_, i) => + r(10 - i, `n${i}`, 'a.ts') + ); + const out = diversifyByFile(results, 5, 2); + expect(out).toHaveLength(5); + expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true); + }); + + it('returns the input slice unchanged when perFileCap=0', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + ]; + expect(diversifyByFile(results, 3, 0)).toEqual(results); + }); + + it('returns input unchanged when results.length <= limit and no reordering needed', () => { + const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')]; + expect(diversifyByFile(results, 5, 2)).toEqual(results); + }); + + it('still reorders within limit when results.length === limit but cap rearranges', () => { + // Same total count as limit, but the cap reorders to surface peer files + // earlier in the list. + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + // First 2 from a.ts (cap), then b.ts, then backfill a.ts. + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + }); + + it('respects the limit even when picked + skipped exceed it', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 2, 2); + expect(out).toHaveLength(2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']); + }); + + it('always preserves the top-scoring result at position 0', () => { + const results = [ + r(100, 'top', 'big.ts'), + r(50, 'big2', 'big.ts'), + r(40, 'big3', 'big.ts'), + r(30, 'big4', 'big.ts'), + r(20, 'other', 'other.ts'), + ]; + const out = diversifyByFile(results, 3, 2); + expect(out[0].node.name).toBe('top'); + }); +}); + +describe('searchNodes per-file diversification (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + // Simulate the "10 methods of one class" scenario: a class plus many + // methods all sharing a common token, all in one file. Plus a peer + // file with a sibling implementation. 
+ const nodes: Node[] = [ + makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'), + makeNode('m1', 'connect', 'method', 'src/db.ts'), + makeNode('m2', 'disconnect', 'method', 'src/db.ts'), + makeNode('m3', 'reconnect', 'method', 'src/db.ts'), + makeNode('m4', 'isConnected', 'method', 'src/db.ts'), + makeNode('m5', 'connectionString', 'property', 'src/db.ts'), + makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'), + makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'), + ]; + q.insertNodes(nodes); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('caps results per file at the default (3) so peer files surface', () => { + const results = q.searchNodes('connect', { limit: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(3); // cap + expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented + }); + + it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => { + // Insert a heavy imbalance so dominance is unambiguous: 10 matching + // methods in db.ts, only the existing pool.ts entries elsewhere. + const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts') + ); + q.insertNodes(heavyDb); + const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeGreaterThan(3); + }); + + it('honors a higher perFileCap', () => { + const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(5); + }); + + it('preserves the top-scoring hit even with diversification', () => { + // Class node with the most direct name match is the most relevant — + // diversification must never displace it from #1. + const results = q.searchNodes('DatabaseConnection', { limit: 3 }); + expect(results[0].node.name).toBe('DatabaseConnection'); + }); + + it('does not lose results — fills limit by backfilling skipped same-file hits', () => { + // If only one file has matches, all results legitimately come from it. + // The cap should not cause us to return fewer than `limit` results. + const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts') + ); + q.insertNodes(onlyOneFileNodes); + const results = q.searchNodes('solo', { limit: 5 }); + expect(results.length).toBe(5); + }); +}); diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts new file mode 100644 index 00000000..49eced53 --- /dev/null +++ b/__tests__/edges-unique.test.ts @@ -0,0 +1,166 @@ +/** + * Edge Uniqueness Tests + * + * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was + * silently a no-op: the only candidate key was the AUTOINCREMENT id (which + * never conflicts), so duplicate edges accumulated on every re-emission / + * re-resolution. + * + * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1), + * COALESCE(col, -1)) backs a fresh-install schema and is also applied via + * migration v4 (with a dedup pass over existing rows). 
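+ *
+ * DDL sketch of the backing index (the name idx_edges_unique matches
+ * the migration test below):
+ *
+ *   CREATE UNIQUE INDEX idx_edges_unique
+ *     ON edges (source, target, kind, COALESCE(line, -1), COALESCE(col, -1));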
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Edge, Node } from '../src/types'; +import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations'; + +function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-')); + const db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + const q = new QueryBuilder(db.getDb()); + return { dir, db, q }; +} + +function cleanup(dir: string, db: DatabaseConnection) { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +function makeNode(id: string, name: string): Node { + return { + id, + kind: 'function', + name, + qualifiedName: `f::${name}`, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +function edgesCount(db: DatabaseConnection): number { + const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number }; + return row.c; +} + +describe('Edge UNIQUE constraint (bug #2)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + ({ dir, db, q } = tempDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + }); + + afterEach(() => cleanup(dir, db)); + + it('rejects duplicate (source, target, kind, line, col)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }; + q.insertEdge(e); + q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now + expect(edgesCount(db)).toBe(1); + }); + + it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' }; + q.insertEdge(e); + q.insertEdge(e); + expect(edgesCount(db)).toBe(1); + }); + + it('allows same source/target/kind on different lines', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 }); + expect(edgesCount(db)).toBe(2); + }); + + it('allows same source/target/line on different kinds', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 }); + expect(edgesCount(db)).toBe(2); + }); + + it('insertEdges (batch) dedupes within the same call', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 }; + q.insertEdges([e, e, e]); + expect(edgesCount(db)).toBe(1); + }); + + it('survives the same edge being re-emitted across many cycles', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 }; + for (let i = 0; i < 100; i++) { + q.insertEdge(e); + } + expect(edgesCount(db)).toBe(1); + }); +}); + +describe('Migration v4: dedup existing edges', () => { + let dir: string; + let dbPath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-')); + dbPath = path.join(dir, 'test.db'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('collapses pre-existing duplicates and adds the UNIQUE index', () => { + // Build a v3-shaped database manually: schema, but 
simulate a stale + // version row + insert duplicates that the missing UNIQUE index let + // through. We use the real initialize() path then drop the index + + // version row to back-date the DB. + const db = DatabaseConnection.initialize(dbPath); + db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`); + db.getDb().exec(`DELETE FROM schema_versions;`); + db.getDb().prepare( + 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)' + ).run(Date.now(), 'simulated v3'); + + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + // Force-insert duplicates via raw SQL (bypassing the constraint that + // is now absent). Three rows that should collapse to one. + const stmt = db.getDb().prepare( + 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)' + ); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + // And one with NULL line/col, also duplicated + stmt.run('n1', 'n2', 'references', null, null); + stmt.run('n1', 'n2', 'references', null, null); + + expect(edgesCount(db)).toBe(5); + expect(getCurrentVersion(db.getDb())).toBe(3); + + // Run migrations forward + runMigrations(db.getDb(), 3); + + expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION); + expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4); + // 3 calls dups → 1, 2 references dups → 1 + expect(edgesCount(db)).toBe(2); + + // Now the constraint is enforced: another duplicate insert is a no-op. + const q2 = new QueryBuilder(db.getDb()); + q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }); + expect(edgesCount(db)).toBe(2); + + db.close(); + }); +}); diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts new file mode 100644 index 00000000..f78f3d76 --- /dev/null +++ b/__tests__/extraction-resolution-accuracy.test.ts @@ -0,0 +1,266 @@ +/** + * Extraction & Resolution Accuracy Tests + * + * Regression tests for three accuracy bugs fixed in one PR: + * 1. Parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc. + * 2. Framework route extractors ran regex over raw file content, matching + * examples in docstrings/comments as real routes. + * 3. UTF-8 BOM caused spurious "modified" hash mismatches between editors. 
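+ *
+ * (The BOM is U+FEFF, bytes EF BB BF in UTF-8; stripBom is expected to
+ * drop exactly one leading occurrence and nothing else.)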
+ */
+
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+ it('stripBom removes leading U+FEFF', () => {
+ expect(stripBom('\uFEFFhello')).toBe('hello');
+ expect(stripBom('hello')).toBe('hello');
+ expect(stripBom('')).toBe('');
+ });
+
+ it('stripBom only removes leading BOM, not embedded ones', () => {
+ expect(stripBom('a\uFEFFb')).toBe('a\uFEFFb');
+ });
+
+ it('hashContent treats BOM and no-BOM as identical', () => {
+ const withBom = '\uFEFFexport function hello() { return 42; }';
+ const withoutBom = 'export function hello() { return 42; }';
+ expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+ });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+ it('strips `#` lines for Python', () => {
+ const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'python');
+ expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
+ });
+
+ it('strips `#` lines for Ruby', () => {
+ const input = ['# top comment', 'def x; end'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'ruby');
+ expect(out.split('\n')).toEqual(['', 'def x; end']);
+ });
+
+ it('strips `//` lines for TypeScript', () => {
+ const input = ['// header', 'function x() {}'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'typescript');
+ expect(out.split('\n')).toEqual(['', 'function x() {}']);
+ });
+
+ it('strips both `//` and `#` lines for PHP', () => {
+ const input = ['// js-style', '# perl-style', '<?php echo 1;'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'php');
+ expect(out.split('\n')).toEqual(['', '', '<?php echo 1;']);
+ });
+
+ it('leaves unknown languages untouched', () => {
+ const input = '// looks like a comment\ncode';
+ expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+ });
+
+ it('preserves line count so node positions stay correct', () => {
+ const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'python');
+ expect(out.split('\n').length).toBe(input.split('\n').length);
+ });
+
+ it('does NOT strip indented `#` inside Python (still recognized as line comment)', () => {
+ // The marker matches optional leading whitespace + `#`, so an indented
+ // pure comment line is correctly stripped. Non-comment code on the same
+ // line as `#` (mid-line comment) is intentionally not stripped here.
+ const input = [' # indented comment', ' pass # trailing'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n')).toEqual(['', ' pass # trailing']); + }); +}); + +describe('Framework regex no longer matches docstrings/comments (bug #4)', () => { + describe('Flask', () => { + it('skips routes inside `#` comments', () => { + const content = [ + 'from flask import Flask', + 'app = Flask(__name__)', + '# Example: @app.route("/fake")', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + + it('skips routes inside triple-quoted docstrings', () => { + const content = [ + 'def example():', + ' """', + ' Usage: @app.route("/fake")', + ' """', + ' pass', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + }); + + describe('FastAPI', () => { + it('skips routes inside `#` comments and triple-quoted docstrings', () => { + const content = [ + '"""', + 'Module docs — example: @app.get("/docfake")', + '"""', + '# @app.post("/commentfake")', + '@app.get("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + expect(names.some((n) => n.includes('/commentfake'))).toBe(false); + }); + + it('preserves correct line numbers for real routes after stripping', () => { + const content = [ + '"""', // line 1 + '@app.get("/fake")', // line 2 — inside docstring + '"""', // line 3 + '', // line 4 + '@app.get("/real")', // line 5 — real + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const real = nodes.find((n) => n.name.includes('/real')); + expect(real).toBeDefined(); + expect(real!.startLine).toBe(5); + }); + }); + + describe('Django URL patterns', () => { + it('skips path() inside `#` comments', () => { + const content = [ + 'from django.urls import path', + '# example: path("fake/", fake_view)', + 'urlpatterns = [path("real/", real_view)]', + ].join('\n'); + const nodes = djangoResolver.extractNodes!('urls.py', content); + const names = nodes.map((n) => n.name); + expect(names).toContain('real/'); + expect(names).not.toContain('fake/'); + }); + }); + + describe('Express', () => { + it('skips routes inside `//` comments', () => { + const content = [ + 'const app = express();', + '// app.get("/fake", fakeHandler);', + 'app.get("/real", realHandler);', + ].join('\n'); + const nodes = expressResolver.extractNodes!('server.js', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/fake'))).toBe(false); + }); + + it('skips routes inside `/* ... 
*/` block comments', () => {
+ const content = [
+ '/*',
+ ' * app.post("/blockfake", h);',
+ ' */',
+ 'app.get("/real", h);',
+ ].join('\n');
+ const nodes = expressResolver.extractNodes!('server.js', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+ });
+ });
+
+ describe('Laravel', () => {
+ it('skips routes inside PHP `//` and `#` comments', () => {
+ const content = [
+ '<?php',
+ '// Route::get("/jsfake", fn () => 1);',
+ '# Route::get("/perlfake", fn () => 1);',
+ 'Route::get("/real", fn () => 1);',
+ ].join('\n');
+ const nodes = laravelResolver.extractNodes!('routes/web.php', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+ expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+ });
+ });
+
+ describe('Rust', () => {
+ it('skips actix/rocket routes inside `///` doc comments', () => {
+ const content = [
+ '/// Example route: #[get("/docfake")]',
+ '#[get("/real")]',
+ 'fn real() {}',
+ ].join('\n');
+ const nodes = rustResolver.extractNodes!('main.rs', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+ });
+ });
+
+ describe('ASP.NET (C#)', () => {
+ it('skips route attributes inside `///` XML doc comments', () => {
+ const content = [
+ '/// <summary>',
+ '/// Example: [HttpGet("/docfake")]',
+ '/// </summary>',
+ '[HttpGet("/real")]',
+ 'public class C {}',
+ ].join('\n');
+ const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+ });
+
+ it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+ // Regression: the minimalApiPattern loop below the routePatterns
+ // loop was initially missed when applying the strip helper, leaving
+ // commented-out `app.MapGet("/x")` calls extracted as real routes.
+ const content = [ + '// app.MapGet("/linefake", h);', + '/*', + ' * app.MapPost("/blockfake", h);', + ' */', + 'app.MapGet("/real", h);', + ].join('\n'); + const nodes = aspnetResolver.extractNodes!('Program.cs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/linefake'))).toBe(false); + expect(names.some((n) => n.includes('/blockfake'))).toBe(false); + }); + }); +}); + +describe('stripCommentsForRegex preserves line offsets', () => { + it('keeps newlines so match.index → original line number', () => { + const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")'; + const out = stripCommentsForRegex(input, 'python'); + // Newlines preserved + expect(out.split('\n').length).toBe(input.split('\n').length); + // The /y route survives + expect(out).toContain('/y'); + // The docstring contents are blanked + expect(out).not.toContain('/x'); + }); +}); diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 8a70ffed..16611f68 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -3079,3 +3079,245 @@ describe('Directory Exclusion', () => { expect(files.every((f) => !f.includes('vendor'))).toBe(true); }); }); + +// ============================================================================= +// HCL / Terraform Extraction +// ============================================================================= + +describe('HCL / Terraform Extraction', () => { + describe('Language detection', () => { + it('should detect HCL/Terraform files', () => { + expect(detectLanguage('main.tf')).toBe('hcl'); + expect(detectLanguage('terraform.tfvars')).toBe('hcl'); + expect(detectLanguage('config.hcl')).toBe('hcl'); + }); + + it('should report HCL as supported', () => { + expect(isLanguageSupported('hcl')).toBe(true); + expect(getSupportedLanguages()).toContain('hcl'); + }); + }); + + describe('Block extraction', () => { + it('should extract a resource block as a class node', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_s3_bucket.logs'); + expect(node?.language).toBe('hcl'); + expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"'); + }); + + it('should extract a data block with `data.` prefix', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_caller_identity.current'); + }); + + it('should extract a variable block', () => { + const code = `variable "environment" { type = string }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'var.environment'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('variable'); + expect(node?.name).toBe('environment'); + }); + + it('should extract an output block as an export', () => { + const code = `output "vpc_id" { value = "abc" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id'); + expect(node).toBeDefined(); + 
expect(node?.kind).toBe('export'); + expect(node?.name).toBe('vpc_id'); + }); + + it('should extract a module block', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('module'); + expect(node?.name).toBe('vpc'); + }); + + it('should extract a provider block as namespace', () => { + const code = `provider "aws" { region = "us-east-1" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('namespace'); + }); + + it('should split a locals block into one constant per attribute', () => { + const code = `locals { + bucket_name = "my-bucket" + retention = 30 +}`; + const result = extractFromSource('main.tf', code); + + const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name'); + const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention'); + expect(bucketName?.kind).toBe('constant'); + expect(retention?.kind).toBe('constant'); + }); + + it('should connect blocks to the file via contains edges', () => { + const code = `resource "aws_s3_bucket" "logs" {}`; + const result = extractFromSource('main.tf', code); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(fileNode).toBeDefined(); + expect(resourceNode).toBeDefined(); + const containsEdge = result.edges.find( + (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains' + ); + expect(containsEdge).toBeDefined(); + }); + }); + + describe('Reference extraction', () => { + it('should extract var.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name'); + expect(ref).toBeDefined(); + expect(ref?.referenceKind).toBe('references'); + }); + + it('should extract local.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags'); + expect(ref).toBeDefined(); + }); + + it('should extract module.X references and stop at the module name', () => { + const code = `output "vpc_id" { value = module.vpc.vpc_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc'); + expect(ref).toBeDefined(); + // Should NOT emit a reference for the trailing attribute + expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined(); + }); + + it('should extract data.T.N references with both labels', () => { + const code = `output "x" { value = data.aws_caller_identity.current.account_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find( + (r) => r.referenceName === 'data.aws_caller_identity.current' + ); + expect(ref).toBeDefined(); + }); + + it('should extract resource references as TYPE.NAME', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { bucket = 
aws_s3_bucket.logs.id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs'); + expect(ref).toBeDefined(); + }); + + it('should extract references inside string interpolations', () => { + const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }'; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.environment'); + expect(names).toContain('random_id.suffix'); + }); + + it('should ignore references to count, each, self, and path', () => { + const code = `resource "aws_instance" "web" { + count = 3 + tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module } +}`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names.find((n) => n.startsWith('count.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('each.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('self.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('path.'))).toBeUndefined(); + }); + + it('should ignore for-loop iteration variables', () => { + const code = `output "ids" { value = [for s in var.subnets : s.id] }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + // var.subnets reference comes through, but `s.id` does NOT + expect(names).toContain('var.subnets'); + expect(names.find((n) => n.startsWith('s.'))).toBeUndefined(); + }); + + it('should ignore key/value bindings in for-object expressions', () => { + const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.input'); + expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined(); + expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined(); + }); + + it('should emit an imports edge for module source', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws' + ); + expect(importRef).toBeDefined(); + }); + }); + + describe('Robustness', () => { + it('should handle empty files', () => { + const result = extractFromSource('main.tf', ''); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toBeDefined(); + }); + + it('should handle blocks with no body', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined(); + }); + + it('should walk nested blocks for references without emitting child nodes', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { + bucket = aws_s3_bucket.logs.id + versioning_configuration { + status = var.versioning_status + } +}`; + const result = extractFromSource('main.tf', code); + + // Only one block-level node, plus the file + const blockNodes = result.nodes.filter((n) => n.kind === 'class'); + expect(blockNodes.length).toBe(1); + + // References from the nested 
block should still be captured + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('aws_s3_bucket.logs'); + expect(names).toContain('var.versioning_status'); + }); + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..97c04dcb 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(9); db.close(); }); diff --git a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts new file mode 100644 index 00000000..639587f9 --- /dev/null +++ b/__tests__/index-hooks.test.ts @@ -0,0 +1,130 @@ +/** + * Index-hook framework: register a fake hook at runtime, run an + * indexAll/sync against a synthetic project, assert the hook ran + * with the expected context shape and that errors are caught. + * + * The registry's static-import list (`REGISTERED_HOOKS`) is empty + * on main today; tests poke at the runner directly through + * `runAfterIndexAll`/`runAfterSync` rather than mutating that + * list. + */ +import { describe, it, expect } from 'vitest'; +import { + runAfterIndexAll, + runAfterSync, + getRegisteredHooks, + type IndexHook, + type IndexHookContext, +} from '../src/index-hooks/registry'; +import type { SyncResult } from '../src/extraction'; + +function makeFakeContext(): IndexHookContext { + // Hooks should not mutate the context; for the runner-shape + // tests we hand them stubs typed `as any` — the runner doesn't + // touch any of these fields itself. + return { + projectRoot: '/tmp/fake-project', + /* eslint-disable @typescript-eslint/no-explicit-any */ + config: {} as any, + queries: {} as any, + db: {} as any, + /* eslint-enable */ + }; +} + +const fakeSyncResult: SyncResult = { + filesChecked: 0, + filesAdded: 0, + filesModified: 0, + filesRemoved: 0, + nodesUpdated: 0, + durationMs: 0, +}; + +describe('index-hooks registry — runner', () => { + it('registered hooks expose stable {name, afterIndexAll|afterSync} shape', () => { + const hooks = getRegisteredHooks(); + expect(hooks.length).toBeGreaterThanOrEqual(0); + for (const h of hooks) { + expect(typeof h.name).toBe('string'); + expect(h.afterIndexAll === undefined || typeof h.afterIndexAll === 'function').toBe(true); + expect(h.afterSync === undefined || typeof h.afterSync === 'function').toBe(true); + } + }); + + it('runAfterIndexAll returns one outcome per registered hook, swallowing per-hook errors', async () => { + // Registered hooks will throw on the fake `{} as any` ctx; the + // runner contract is to catch + report each error so one bad + // hook never fails the whole pass. 
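+    //
+    // Roughly the loop being asserted against (a sketch, not the real
+    // source; the `error` field name is an assumption):
+    //
+    //   for (const hook of getRegisteredHooks()) {
+    //     if (!hook.afterIndexAll) continue;
+    //     const t0 = Date.now();
+    //     try {
+    //       await hook.afterIndexAll(ctx);
+    //       outcomes.push({ name: hook.name, phase: 'indexAll', durationMs: Date.now() - t0 });
+    //     } catch (err) {
+    //       outcomes.push({ name: hook.name, phase: 'indexAll', durationMs: Date.now() - t0, error: String(err) });
+    //     }
+    //   }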
+ const outcomes = await runAfterIndexAll(makeFakeContext()); + const expectedCount = getRegisteredHooks().filter((h) => h.afterIndexAll).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('indexAll'); + expect(typeof o.durationMs).toBe('number'); + } + }); + + it('runAfterSync returns one outcome per registered hook, swallowing per-hook errors', async () => { + const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult); + const expectedCount = getRegisteredHooks().filter((h) => h.afterSync).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('sync'); + expect(typeof o.durationMs).toBe('number'); + } + }); +}); + +describe('index-hooks runner — fake-hook injection', () => { + // Helper: temporarily inject a fake hook by wrapping the runner + // directly. The runner accepts no array argument today; this + // suite exercises the public surface (runAfterIndexAll / + // runAfterSync) by simulating what a registered hook would do. + // When real hooks land, REGISTERED_HOOKS in registry.ts will + // contain them and this fixture-style approach disappears. + + it('a hook with afterIndexAll receives the context and is awaited', async () => { + // Build a one-off hook and call it directly — the runner's + // contract is "for each registered hook, await afterIndexAll + // if defined." We exercise that contract by calling the hook + // ourselves to confirm the IndexHookContext shape stays usable + // by hook implementations. + let captured: IndexHookContext | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterIndexAll(ctx) { + captured = ctx; + }, + }; + const ctx = makeFakeContext(); + await hook.afterIndexAll!(ctx); + expect(captured).toBe(ctx); + }); + + it('a hook with afterSync receives both ctx and result', async () => { + let capturedCtx: IndexHookContext | null = null; + let capturedResult: SyncResult | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterSync(ctx, result) { + capturedCtx = ctx; + capturedResult = result; + }, + }; + const ctx = makeFakeContext(); + await hook.afterSync!(ctx, fakeSyncResult); + expect(capturedCtx).toBe(ctx); + expect(capturedResult).toBe(fakeSyncResult); + }); + + it('a hook missing afterIndexAll is silently skipped', () => { + // Just a typing assertion: an IndexHook without afterIndexAll + // is allowed (both methods are optional). + const hook: IndexHook = { name: 'sync-only' }; + expect(hook.afterIndexAll).toBeUndefined(); + expect(hook.afterSync).toBeUndefined(); + }); +}); diff --git a/__tests__/issue-history.test.ts b/__tests__/issue-history.test.ts new file mode 100644 index 00000000..7c281771 --- /dev/null +++ b/__tests__/issue-history.test.ts @@ -0,0 +1,390 @@ +/** + * Issue → symbol attribution: parser unit tests + end-to-end mining + * against synthetic git repos. 
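+ *
+ * The flow under test, roughly (names illustrative, not the real
+ * pipeline API):
+ *
+ *   commit "fix: bug. Fixes #42"
+ *     -> patch for that commit
+ *     -> hunk context/declaration lines name a symbol (`foo`)
+ *     -> attribution row linking node(foo) to issue 42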
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { execFileSync } from 'child_process'; +import { + extractSymbolFromContext, + extractDeclaration, +} from '../src/issue-history/parse-diff'; +import { + mineIssueCommits, + mineIssueHistory, + ISSUE_REGEX, + LAST_MINED_ISSUES_HEAD_KEY, +} from '../src/issue-history'; +import CodeGraph from '../src/index'; + +let HAS_GIT = true; +try { + execFileSync('git', ['--version'], { stdio: 'ignore' }); +} catch { + HAS_GIT = false; +} + +let testDir: string; +let cg: CodeGraph | null = null; + +function git(...args: string[]): string { + return execFileSync('git', args, { + cwd: testDir, + encoding: 'utf-8', + env: { + ...process.env, + GIT_AUTHOR_NAME: 'Test', + GIT_AUTHOR_EMAIL: 'test@example.com', + GIT_COMMITTER_NAME: 'Test', + GIT_COMMITTER_EMAIL: 'test@example.com', + GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE, + GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim(); +} + +function commitAt(date: string, files: Record, message: string) { + for (const [rel, content] of Object.entries(files)) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); + } + git('add', '-A'); + process.env.GIT_AUTHOR_DATE = date; + process.env.GIT_COMMITTER_DATE = date; + git('commit', '-m', message); + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-issues-')); +}); + +afterEach(() => { + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser unit tests +// ============================================================================ + +describe('ISSUE_REGEX', () => { + it('matches all canonical Fixes/Closes/Resolves verbs', () => { + const cases = [ + 'Fix #1', 'Fixes #2', 'Fixed #3', + 'Close #4', 'Closes #5', 'Closed #6', + 'Resolve #7', 'Resolves #8', 'Resolved #9', + ]; + for (const s of cases) { + ISSUE_REGEX.lastIndex = 0; + expect(ISSUE_REGEX.test(s)).toBe(true); + } + }); + + it('matches multiple issues in a single body', () => { + ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, closes #2 and resolves #3'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1', '2', '3']); + }); + + it('is case-insensitive', () => { + ISSUE_REGEX.lastIndex = 0; + expect(ISSUE_REGEX.test('FIXES #42')).toBe(true); + }); + + it('does NOT match `#N` without a verb', () => { + ISSUE_REGEX.lastIndex = 0; + // Match in body of message that mentions #99 but with no verb prefix. + expect(ISSUE_REGEX.test('See #99 for context')).toBe(false); + }); + + it('v1 limitation: `Fixes #1, #2` only captures #1', () => { + // Documented behavior — the second issue lacks a verb prefix and + // is silently dropped. Authors who care can write `Fixes #1, fixes #2`. 
+ ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, #2'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1']); + }); +}); + +describe('extractSymbolFromContext', () => { + it('pulls function name from a TS function context', () => { + expect(extractSymbolFromContext('function processOrder(order: Order) {')).toBe('processOrder'); + }); + it('pulls class name', () => { + expect(extractSymbolFromContext('class UserService {')).toBe('UserService'); + }); + it('pulls Python def', () => { + expect(extractSymbolFromContext('def compute_score(items):')).toBe('compute_score'); + }); + it('pulls Go func', () => { + expect(extractSymbolFromContext('func ProcessOrder(o *Order) error {')).toBe('ProcessOrder'); + }); + it('pulls method-style ` async foo(`', () => { + expect(extractSymbolFromContext(' async foo(args: string) {')).toBe('foo'); + }); + it('rejects keyword-only contexts', () => { + expect(extractSymbolFromContext(' if (x) {')).toBeNull(); + }); + it('returns null on empty input', () => { + expect(extractSymbolFromContext('')).toBeNull(); + }); +}); + +describe('extractDeclaration', () => { + it('captures + function decl', () => { + expect(extractDeclaration('+function helper() {')).toEqual({ name: 'helper', sign: '+' }); + }); + it('captures - class decl', () => { + expect(extractDeclaration('-export class Old {')).toEqual({ name: 'Old', sign: '-' }); + }); + it('captures Python def', () => { + expect(extractDeclaration('+def my_helper(x):')).toEqual({ name: 'my_helper', sign: '+' }); + }); + it('captures Go func with receiver', () => { + expect(extractDeclaration('+func (s *Service) DoThing() error {')).toEqual({ + name: 'DoThing', + sign: '+', + }); + }); + it('skips file-marker `+++` and `---` lines', () => { + expect(extractDeclaration('+++ b/src/foo.ts')).toBeNull(); + expect(extractDeclaration('--- a/src/foo.ts')).toBeNull(); + }); + it('skips keywords like `+if`', () => { + expect(extractDeclaration('+ if (x) return;')).toBeNull(); + }); + it('returns null on context lines (no +/-)', () => { + expect(extractDeclaration(' some body line')).toBeNull(); + }); +}); + +// ============================================================================ +// Git mining: synthetic repo +// ============================================================================ + +describe.skipIf(!HAS_GIT)('mineIssueCommits', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('finds commits with `Fixes #N` in the subject', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'feat: add a (no issue)'); + commitAt('2025-01-02T00:00:00Z', { 'a.ts': 'a2' }, 'fix: bug. Fixes #42'); + const commits = mineIssueCommits(testDir, null); + expect(commits.length).toBe(1); + expect(commits[0]!.issues).toEqual([42]); + }); + + it('parses multi-issue subjects', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'fix: triple. 
Fixes #1, closes #2, resolves #3'); + const [c] = mineIssueCommits(testDir, null); + expect(c?.issues).toEqual([1, 2, 3]); + }); + + it('ignores commits with no issue ref', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'plain message'); + expect(mineIssueCommits(testDir, null).length).toBe(0); + }); + + it('returns [] when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + expect(mineIssueCommits(nonGit, null)).toEqual([]); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe.skipIf(!HAS_GIT)('CodeGraph issue history', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('attributes a Fixes #N commit to the modified function', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'feat: add foo'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() {\n // changed\n return 2;\n}\n`, + }, 'fix: bug. Fixes #42'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + expect(issues.length).toBeGreaterThan(0); + expect(issues.some((i) => i.issueNumber === 42)).toBe(true); +}); + + it('tracks the agent-usable multi-issue signal', async () => { + // Simulate the codegraph history pattern: `loadGrammarsForLanguages` + // touched by every language-add issue (#54, #82, #83, #85). + commitAt('2025-01-01T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() { return []; }\n`, + }, 'feat: add grammar loader'); + + commitAt('2025-01-02T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R support\n return [];\n}\n`, + }, 'feat: add R support. Fixes #82'); + + commitAt('2025-01-03T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL support\n return [];\n}\n`, + }, 'feat: add HCL. Fixes #83'); + + commitAt('2025-01-04T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL + SQL\n return [];\n}\n`, + }, 'feat: add SQL. Fixes #85'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'loadGrammarsForLanguages')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + const issueNumbers = [...new Set(issues.map((i) => i.issueNumber))].sort((a, b) => a - b); + expect(issueNumbers).toEqual([82, 83, 85]); + }); + + it('records `added` kind for symbols introduced in a Fixes commit', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\n`, + }, 'init'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\nexport function brandNew() { return 2; }\n`, + }, 'feat: add brandNew. 
Fixes #100'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'brandNew')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 100 && i.kind === 'added')).toBe(true); + }); + + it('drops attributions for symbols that no longer exist', async () => { + // Symbol added then removed in two separate `Fixes` commits. The + // current index has no node for it, so attributions for the removed + // symbol must not appear (FK + drop-on-resolve). + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\nexport function temporary() { return 99; }\n`, + }, 'feat: add. Fixes #1'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\n`, + }, 'fix: drop temporary. Fixes #2'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + // staysHere should have at least the #1 attribution (added). + const node = cg.getNodesByKind("function").find((n) => n.name === 'staysHere')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 1)).toBe(true); + + // No node should exist named `temporary`, and no attribution to + // issue #2 should reference a node that doesn't exist. + expect(cg.getNodesByKind("function").find((n) => n.name === 'temporary')).toBeUndefined(); + }); + + it('survives indexAll outside a git repo (table empty, no errors)', async () => { + fs.rmSync(path.join(testDir, '.git'), { recursive: true, force: true }); + fs.writeFileSync(path.join(testDir, 'a.ts'), `export function x() { return 1; }\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const nodes = cg.getNodesInFile('a.ts'); + expect(nodes.length).toBeGreaterThan(0); + for (const n of nodes) expect(cg.getIssuesForNode(n.id)).toEqual([]); + }); + + it('respects enableIssueHistory=false', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-01-02T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableIssueHistory: false }, + }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id)).toEqual([]); + }); + + it('incrementally picks up new Fixes commits on sync', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id).length).toBe(0); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #50'); + await cg.sync(); + + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 50)).toBe(true); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two migrations claiming + // the same version produces a filesystem-level conflict — the silent + // skip the defensive guard protected against can no longer happen.) + + it('recovers from an unreachable last_mined_issues_head', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect( + [...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber))] + ).toEqual([1]); + + // Simulate force-push / gc by storing an unreachable SHA. + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (cg as any).queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, '0'.repeat(40)); + + commitAt('2025-03-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 3; }\n`, + }, 'fix: foo again. Fixes #2'); + await cg.sync(); + + const issueNums = [ + ...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber)), + ].sort((a, b) => a - b); + expect(issueNums).toEqual([1, 2]); + }); +}); diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts new file mode 100644 index 00000000..9afdd59a --- /dev/null +++ b/__tests__/language-registry.test.ts @@ -0,0 +1,157 @@ +/** + * Language registry: structural invariants. + * + * These tests guard against the "parallel list" failure mode that + * the registry refactor exists to prevent. If a future PR adds a + * grammar-backed language but forgets to wire it through one of + * the derived consumers, one of these tests should catch it. + */ +import { describe, it, expect } from 'vitest'; +import { + getLanguageDefs, + getLanguageDefByExtension, + getLanguageDefByName, +} from '../src/extraction/languages/registry'; +import { EXTRACTORS } from '../src/extraction/languages'; +import { + detectLanguage, + isLanguageSupported, + getSupportedLanguages, + getLanguageDisplayName, + EXTENSION_MAP, +} from '../src/extraction/grammars'; + +describe('language registry — single source of truth', () => { + it('has at least the original 19 languages', () => { + const defs = getLanguageDefs(); + expect(defs.length).toBeGreaterThanOrEqual(19); + }); + + it('every def has unique non-empty name', () => { + const names = new Set(); + for (const def of getLanguageDefs()) { + expect(def.name).toBeTruthy(); + expect(names.has(def.name)).toBe(false); + names.add(def.name); + } + }); + + it('extensions are unique across registry (one ext maps to one language)', () => { + const seen = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + const lower = ext.toLowerCase(); + if (seen.has(lower)) { + // The .h ambiguity (C vs C++) is intentionally pinned to C + // by the registry; tree-sitter.ts has a content-sniff + // override. Anything else duplicating extensions is a bug. 
+ throw new Error( + `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}` + ); + } + seen.set(lower, def.name); + } + } + }); + + it('grammar-backed defs have wasmFile + extractor', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/); + expect(def.grammar.extractor).toBeDefined(); + } + }); + + it('custom-extractor defs have a customExtractor function', () => { + for (const def of getLanguageDefs()) { + if (def.grammar) continue; // grammar-backed + expect(def.customExtractor).toBeInstanceOf(Function); + } + }); +}); + +describe('derived consumers stay in sync with the registry', () => { + // Catch the "parallel list drift" bug that motivated this refactor. + // If a new language gets added to registry but a derived consumer + // still hard-codes the old set, one of these will fail. + + it('EXTRACTORS contains exactly the grammar-backed languages', () => { + const grammarBacked = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name) + .sort(); + const extractorKeys = Object.keys(EXTRACTORS).sort(); + expect(extractorKeys).toEqual(grammarBacked); + }); + + it('every grammar-backed extractor matches def.grammar.extractor exactly', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor); + } + }); + + it('EXTENSION_MAP entries exactly mirror registry extensions', () => { + const expected = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + expected.set(ext.toLowerCase(), def.name); + } + } + for (const [ext, lang] of expected) { + expect(EXTENSION_MAP[ext]).toBe(lang); + } + // Reverse: no extra keys in EXTENSION_MAP. + expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort()); + }); + + it('detectLanguage returns the expected name for every registered extension', () => { + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + // .h is pinned to C by the registry; the C++ heuristic only + // applies when source is provided AND looks like C++. 
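+        // e.g. detectLanguage('x.h') is 'c' here, while a C++-looking
+        // source argument (say 'class Foo {};') can flip it to 'cpp'
+        // via the tree-sitter.ts sniff. (Example inputs illustrative.)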
+ expect(detectLanguage(`x${ext}`)).toBe(def.name); + } + } + }); + + it('isLanguageSupported returns true for every registered language and false for unknown', () => { + for (const def of getLanguageDefs()) { + expect(isLanguageSupported(def.name as never)).toBe(true); + } + expect(isLanguageSupported('unknown' as never)).toBe(false); + }); + + it('getSupportedLanguages returns exactly the registry names', () => { + const fromRegistry = getLanguageDefs().map((d) => d.name).sort(); + const supported = (getSupportedLanguages() as string[]).sort(); + expect(supported).toEqual(fromRegistry); + }); + + it('getLanguageDisplayName uses each defs displayName', () => { + for (const def of getLanguageDefs()) { + expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName); + } + }); +}); + +describe('lookup helpers', () => { + it('getLanguageDefByName returns the def for a registered name', () => { + expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript'); + }); + + it('getLanguageDefByName returns undefined for unknown names', () => { + expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined(); + }); + + it('getLanguageDefByExtension is case-insensitive', () => { + expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript'); + expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript'); + }); + + it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => { + const def = getLanguageDefByName('pascal'); + expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function); + expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function); + }); +}); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts new file mode 100644 index 00000000..2da0efc5 --- /dev/null +++ b/__tests__/mcp-tool-registry.test.ts @@ -0,0 +1,82 @@ +/** + * MCP tool registry: structural invariants. + * + * Guards against the failure mode where a future PR adds a + * ToolModule but forgets to implement the matching `handle` + * method on ToolHandler (or vice versa). 
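+ *
+ * A module entry is shaped roughly like this (values illustrative;
+ * the real type lives in registry.ts):
+ *
+ *   {
+ *     definition: { name: 'codegraph_search', description: '...' },
+ *     handlerKey: 'handleSearch',
+ *   }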
+ */ +import { describe, it, expect } from 'vitest'; +import { getToolModules, tools as registryTools } from '../src/mcp/tools/registry'; +import { ToolHandler, tools } from '../src/mcp/tools'; + +describe('MCP tool registry — single source of truth', () => { + it('every tool module has a non-empty name and description', () => { + for (const m of getToolModules()) { + expect(m.definition.name).toMatch(/^codegraph_[a-z_]+$/); + expect(m.definition.description.length).toBeGreaterThan(20); + } + }); + + it('handlerKey is a string starting with "handle"', () => { + for (const m of getToolModules()) { + expect(m.handlerKey).toMatch(/^handle[A-Z][A-Za-z]+$/); + } + }); + + it('every registered tool has a corresponding ToolHandler method', () => { + const handler = new ToolHandler(null); + for (const m of getToolModules()) { + const fn = (handler as unknown as Record)[m.handlerKey]; + expect(typeof fn).toBe('function'); + } + }); + + it('exported `tools` array exactly mirrors the registry', () => { + const fromRegistry = registryTools.map((t) => t.name).sort(); + const fromExport = tools.map((t) => t.name).sort(); + expect(fromExport).toEqual(fromRegistry); + }); + + it('all main-line tools are registered (regression guard)', () => { + const expected = [ + 'codegraph_callees', + 'codegraph_callers', + 'codegraph_config', + 'codegraph_context', + 'codegraph_explore', + 'codegraph_files', + 'codegraph_hotspots', + 'codegraph_impact', + 'codegraph_node', + 'codegraph_search', + 'codegraph_sql', + 'codegraph_status', + ]; + const actual = getToolModules() + .map((m) => m.definition.name) + .sort(); + expect(actual).toEqual(expected); + }); + + it('execute() reports unknown-tool errors', async () => { + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_does_not_exist', {}); + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toMatch(/Unknown tool/); + }); + + it('execute() actually dispatches to the registered handler (no broken `this` binding)', async () => { + // No CodeGraph instance is bound, so handlers that call + // `getCodeGraph()` will throw — the dispatch should catch it + // and return an error result. The point of this test is to + // confirm the registry lookup + `this[handlerKey](args)` chain + // reaches an actual method body, not that the body succeeds. + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_status', {}); + expect(result.isError).toBe(true); + // Generic tool-execution-failed envelope from execute()'s catch block. + expect(result.content[0]?.text).toMatch(/Tool execution failed/); + // Specifically because no CodeGraph was bound: + expect(result.content[0]?.text).toMatch(/CodeGraph not initialized/); + }); +}); diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts new file mode 100644 index 00000000..9fa15eed --- /dev/null +++ b/__tests__/migrations-registry.test.ts @@ -0,0 +1,95 @@ +/** + * Migration registry: structural invariants. + * + * Guards against the silent-no-op bug class that motivated this + * refactor. If a future PR introduces a duplicate version, + * out-of-order versions, or fails to register a new migration + * file, one of these tests fails loudly. 
+ */ +import { describe, it, expect } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + ALL_MIGRATIONS, + CURRENT_SCHEMA_VERSION, +} from '../src/db/migrations'; + +describe('migration registry — structural invariants', () => { + it('registry is non-empty', () => { + expect(ALL_MIGRATIONS.length).toBeGreaterThan(0); + }); + + it('versions are unique', () => { + const seen = new Set(); + for (const m of ALL_MIGRATIONS) { + expect(seen.has(m.version)).toBe(false); + seen.add(m.version); + } + }); + + it('versions are strictly ascending', () => { + for (let i = 1; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan( + ALL_MIGRATIONS[i - 1]!.version + ); + } + }); + + it('each migration has a non-empty description and a function up()', () => { + for (const m of ALL_MIGRATIONS) { + expect(m.description.length).toBeGreaterThan(0); + expect(typeof m.up).toBe('function'); + } + }); + + it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => { + const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; + expect(CURRENT_SCHEMA_VERSION).toBe(max); + }); +}); + +describe('migration files — filename ↔ version coupling', () => { + // Read the actual filenames on disk and assert each matches an + // entry in the registry. Catches the case where someone drops a + // new file in src/db/migrations/ but forgets to register it. + const migrationsDir = path.resolve(__dirname, '../src/db/migrations'); + const SUPPORT_FILES = new Set(['index.ts', 'types.ts']); + const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + + function listMigrationFiles(): string[] { + return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f)); + } + + it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => { + const offenders: string[] = []; + for (const f of listMigrationFiles()) { + if (!STRICT_NNN_PATTERN.test(f)) { + offenders.push(f); + } + } + expect(offenders).toEqual([]); + }); + + it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + expect(files.length).toBeGreaterThan(0); + const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version)); + for (const f of files) { + const version = parseInt(f.slice(0, 3), 10); + if (!registeredVersions.has(version)) { + throw new Error( + `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. 
` + + `Add an import + array entry for it.` + ); + } + } + }); + + it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10))); + for (const m of ALL_MIGRATIONS) { + expect(filenameVersions.has(m.version)).toBe(true); + } + }); +}); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..b69d9068 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(9); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/__tests__/search-quality.test.ts b/__tests__/search-quality.test.ts new file mode 100644 index 00000000..8e069776 --- /dev/null +++ b/__tests__/search-quality.test.ts @@ -0,0 +1,302 @@ +/** + * Search Quality Tests + * + * Regression tests for the FTS improvements that bring natural-language + * and partial-identifier queries into the top of the result set: + * - Subword tokens (camel/snake split) so `parser` finds `getParser`. + * - Porter stemmer so `parsing` matches `parser`/`parses`. + * - Stopword stripping so `"how"` / `"the"` don't crowd out the + * real terms via docstring matches. + * + * All measurements were captured against codegraph's own src/ during + * development. Targets that previously ranked #18, #19, or weren't in + * the top 20 jump to the top 5. 
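+ *
+ * Concretely: `getParser` is indexed as "getParser get parser"
+ * (buildNameSubwords below), so the FTS query `parser` hits it
+ * directly and `parsing` reaches it through the Porter stemmer.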
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; +import { splitIdentifierTokens, buildNameSubwords } from '../src/utils'; +import { filterStopwords, STOP_WORDS } from '../src/search/query-utils'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; + +describe('splitIdentifierTokens', () => { + it('splits camelCase', () => { + expect(splitIdentifierTokens('getParser')).toEqual(['get', 'parser']); + }); + + it('splits PascalCase', () => { + expect(splitIdentifierTokens('DatabaseConnection')).toEqual(['database', 'connection']); + }); + + it('splits XMLHttpRequest-style runs of capitals', () => { + expect(splitIdentifierTokens('XMLHttpRequest')).toEqual(['xml', 'http', 'request']); + }); + + it('splits snake_case', () => { + expect(splitIdentifierTokens('database_connection')).toEqual(['database', 'connection']); + }); + + it('splits kebab-case and dots and slashes', () => { + expect(splitIdentifierTokens('foo-bar.baz/qux')).toEqual(['foo', 'bar', 'baz', 'qux']); + }); + + it('keeps single-word identifiers as-is', () => { + expect(splitIdentifierTokens('parse')).toEqual(['parse']); + }); + + it('handles trailing/leading underscores', () => { + expect(splitIdentifierTokens('__init__')).toEqual(['init']); + }); + + it('preserves numbers as part of the surrounding token', () => { + expect(splitIdentifierTokens('parseV2')).toEqual(['parse', 'v2']); + }); +}); + +describe('buildNameSubwords', () => { + it('preserves the original identifier so direct queries still hit', () => { + const out = buildNameSubwords('getParser'); + expect(out.split(' ')).toContain('getParser'); + }); + + it('appends split tokens', () => { + const out = buildNameSubwords('getParser').split(' '); + expect(out).toContain('get'); + expect(out).toContain('parser'); + }); + + it('dedupes single-word identifiers (no "parse parse")', () => { + expect(buildNameSubwords('parse')).toBe('parse'); + }); + + it('dedupes when split produces a single token equal to the original', () => { + // 'foo' has no boundary, so splitIdentifierTokens returns ['foo']; + // without dedup we would store 'foo foo'. + const out = buildNameSubwords('foo').split(' '); + expect(out).toEqual(['foo']); + }); + + it('handles empty string without crashing', () => { + expect(buildNameSubwords('')).toBe(''); + }); +}); + +describe('filterStopwords (shared with query-utils.ts)', () => { + it('drops common English stopwords', () => { + expect(filterStopwords(['how', 'does', 'parsing', 'work'])) + // 'work' is also in STOP_WORDS, so the result is just 'parsing' + .toEqual(['parsing']); + }); + + it('returns the original list when every term is a stopword', () => { + // Otherwise we would produce an empty FTS query. + const allStopwords = ['the', 'a', 'an']; + expect(filterStopwords(allStopwords)).toEqual(allStopwords); + }); + + it('does not strip common identifier-like words', () => { + // `get` / `set` / `find` could be method names; never treated as stopwords. 
+ expect(filterStopwords(['get', 'set', 'find', 'name'])) + .toEqual(['get', 'set', 'find', 'name']); + expect(STOP_WORDS.has('get')).toBe(false); + }); +}); + +describe('FTS5 search quality (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], docstring?: string): Node { + return { + id, + kind, + name, + qualifiedName: name, + filePath: `src/${name}.ts`, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + docstring, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-search-quality-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('finds getParser for a `parser` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parser', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('finds DatabaseConnection for a `connection` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'DatabaseConnection', 'class'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('connection', { limit: 10 }); + expect(results.find((r) => r.node.name === 'DatabaseConnection')).toBeDefined(); + }); + + it('matches `parsing` against `getParser` via Porter stemmer', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parsing', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('matches `resolves references` against resolveOne', () => { + q.insertNodes([ + makeNode('n1', 'resolveOne', 'method'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('resolves references', { limit: 10 }); + expect(results.find((r) => r.node.name === 'resolveOne')).toBeDefined(); + }); + + it('strips stopwords so `how does parser work` finds getParser', () => { + // Without stopword stripping the docstring of `unrelated` (containing + // "how" and "does") would BM25-flood the result list. + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode( + 'n2', + 'unrelated', + 'function', + 'How does this work? It does many things — does, does, does.' 
+ ), + ]); + const results = q.searchNodes('how does parser work', { limit: 10 }); + const ranks = new Map(results.map((r, i) => [r.node.name, i + 1])); + const parserRank = ranks.get('getParser'); + const unrelatedRank = ranks.get('unrelated'); + expect(parserRank).toBeDefined(); + if (unrelatedRank !== undefined) { + expect(parserRank).toBeLessThan(unrelatedRank); + } + }); + + it('exact identifier search still works (no regression on direct queries)', () => { + q.insertNodes([ + makeNode('n1', 'ExtractionOrchestrator', 'class'), + makeNode('n2', 'extraction', 'variable'), + makeNode('n3', 'orchestrator', 'variable'), + ]); + const results = q.searchNodes('ExtractionOrchestrator', { limit: 10 }); + expect(results[0].node.name).toBe('ExtractionOrchestrator'); + }); +}); + +describe('Migration v4: backfill name_subwords + rebuild FTS', () => { + let dir: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-fts-')); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('rebuilds FTS so subword search works on previously-indexed nodes', () => { + // Build a v3-shape database from explicit SQL — the pre-PR schema — + // then run forward migrations and verify search works end-to-end. + // This is a faithful simulation of an upgrade from a real v3 install. + const Database = require('better-sqlite3'); + const dbHandle = new Database(path.join(dir, 'test.db')); + dbHandle.pragma('foreign_keys = ON'); + dbHandle.exec(` + CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT); + INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3'); + CREATE TABLE nodes ( + id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL, + qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL, + start_line INTEGER NOT NULL, end_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, end_column INTEGER NOT NULL, + docstring TEXT, signature TEXT, visibility TEXT, + is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0, + is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0, + decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL + ); + CREATE VIRTUAL TABLE nodes_fts USING fts5( + id, name, qualified_name, docstring, signature, + content='nodes', content_rowid='rowid' + ); + CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN + INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature) + VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature); + END; + INSERT INTO nodes (id, kind, name, qualified_name, file_path, language, + start_line, end_line, start_column, end_column, updated_at) + VALUES ('n1', 'function', 'getParser', 'getParser', 'a.ts', 'typescript', 1, 1, 0, 0, 0); + `); + + expect(getCurrentVersion(dbHandle)).toBe(3); + + // Apply forward migrations (4..N including the FTS-subwords pass). + runMigrations(dbHandle, 3); + expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9); + + // The new column was backfilled with the split subwords. + const row = dbHandle.prepare('SELECT name_subwords FROM nodes WHERE id = ?').get('n1') as { + name_subwords: string; + }; + expect(row.name_subwords).toContain('parser'); + + // Search end-to-end via QueryBuilder works against the migrated DB. 
+ const q2 = new QueryBuilder(dbHandle); + const results = q2.searchNodes('parser', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + + dbHandle.close(); + }); + + it('migration is idempotent if name_subwords column already exists', () => { + // Simulate a partial-failure scenario: the ALTER TABLE landed + // (DDL is auto-committed in SQLite even inside a transaction) but + // the rest didn't, so the column is present but the FTS hasn't been + // recreated and the schema_versions row hasn't been bumped. + const Database = require('better-sqlite3'); + const dbHandle = new Database(path.join(dir, 'test.db')); + dbHandle.exec(` + CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT); + INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3'); + CREATE TABLE nodes ( + id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL, + qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL, + start_line INTEGER NOT NULL, end_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, end_column INTEGER NOT NULL, + docstring TEXT, signature TEXT, visibility TEXT, + is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0, + is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0, + decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL, + name_subwords TEXT -- partial pre-existing state + ); + `); + expect(() => runMigrations(dbHandle, 3)).not.toThrow(); + expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9); + dbHandle.close(); + }); +}); diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts index 53441d58..1c62e648 100644 --- a/__tests__/security.test.ts +++ b/__tests__/security.test.ts @@ -533,3 +533,36 @@ describe('Symlink Cycle Detection', () => { expect(files).toContain('src/valid.ts'); }); }); + +describe('ReDoS-safe glob matching', () => { + it('coalesces runs of `*` so hostile inputs do not produce nested quantifiers', async () => { + const { globToSafeRegex } = await import('../src/utils'); + // Two or more stars collapse to a single recursive wildcard. This is the + // ReDoS protection: `*****` doesn't expand to `[^/]*[^/]*[^/]*[^/]*[^/]*`, + // which on a long input could catastrophically backtrack. + expect(globToSafeRegex('*****')).toBe('.*'); + expect(globToSafeRegex('**')).toBe('.*'); + + // Even a constructed-from-hostile-input regex matches in linear time. + const regex = new RegExp(`^${globToSafeRegex('*****')}foo$`); + const start = Date.now(); + // 100k 'a's followed by something that doesn't end in 'foo'. 
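+    // A lone `.*` is a single greedy token: the engine scans once with
+    // no nested quantifiers to backtrack through, so matching stays
+    // effectively linear even on hostile input.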
+ expect(regex.test('a'.repeat(100000) + 'bar')).toBe(false); + expect(Date.now() - start).toBeLessThan(500); + }); + + it('rejects pathologically long glob inputs', async () => { + const { globToSafeRegex } = await import('../src/utils'); + expect(globToSafeRegex('*'.repeat(2000))).toBeNull(); + }); + + it('preserves the standard glob semantics for common patterns', async () => { + const { globToSafeRegex } = await import('../src/utils'); + const body = globToSafeRegex('src/**/*.test.ts'); + expect(body).toBeDefined(); + const regex = new RegExp(`^${body}$`); + expect(regex.test('src/lib/foo.test.ts')).toBe(true); + expect(regex.test('src/lib/foo.ts')).toBe(false); + expect(regex.test('other/src/foo.test.ts')).toBe(false); + }); +}); diff --git a/__tests__/sql-refs.test.ts b/__tests__/sql-refs.test.ts new file mode 100644 index 00000000..7fb201c7 --- /dev/null +++ b/__tests__/sql-refs.test.ts @@ -0,0 +1,339 @@ +/** + * SQL call-site tests: parser unit tests + end-to-end through CodeGraph. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractSqlRefs } from '../src/sql-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-sql-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests +// ============================================================================ + +describe('extractSqlRefs', () => { + it('captures FROM as a read', () => { + write('a.ts', `db.prepare('SELECT id FROM users WHERE id = ?');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toHaveLength(1); + expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'read' }); + }); + + it('captures INSERT INTO as a write', () => { + write('a.ts', `db.prepare('INSERT INTO logs (msg) VALUES (?)');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toHaveLength(1); + expect(refs[0]!).toMatchObject({ tableName: 'logs', op: 'write' }); + }); + + it('captures UPDATE ... SET as a write', () => { + write('a.ts', `db.run('UPDATE users SET name = ? WHERE id = ?', ['x', 1]);\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toHaveLength(1); + expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'write' }); + }); + + it('captures DELETE FROM as a write (and not as a read)', () => { + write('a.ts', `db.run('DELETE FROM sessions WHERE expired_at < ?');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + // Both regexes (DELETE FROM as write, FROM as read) hit, so we expect + // two refs for the same table but different ops. 
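+    // (Patterns shaped like /DELETE\s+FROM\s+(\w+)/i for writes and
+    // /\bFROM\s+(\w+)/i for reads would both fire here; these are
+    // assumed shapes, the real set lives in src/sql-refs.)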
+ expect(refs.map((r) => r.op).sort()).toEqual(['read', 'write']); + expect(new Set(refs.map((r) => r.tableName))).toEqual(new Set(['sessions'])); + }); + + it('captures CREATE TABLE / ALTER / DROP as ddl', () => { + write( + 'a.ts', + [ + `db.exec('CREATE TABLE IF NOT EXISTS audit (id INTEGER)');`, + `db.exec('ALTER TABLE audit ADD COLUMN ts INTEGER');`, + `db.exec('DROP TABLE IF EXISTS audit_old');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ddls = refs.filter((r) => r.op === 'ddl'); + expect(new Set(ddls.map((r) => r.tableName))).toEqual(new Set(['audit', 'audit_old'])); + }); + + it('captures JOIN as a read', () => { + write( + 'a.ts', + `db.prepare('SELECT u.name, p.title FROM users u JOIN posts p ON p.user_id = u.id');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const tables = new Set(refs.map((r) => r.tableName)); + expect(tables).toEqual(new Set(['users', 'posts'])); + }); + + it('handles backtick (MySQL) and double-quoted (Postgres) identifiers', () => { + write( + 'a.ts', + [ + "db.prepare('SELECT id FROM `mysql_table`');", + `db.prepare('SELECT id FROM "pg_table"');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(new Set(refs.map((r) => r.tableName))).toEqual( + new Set(['mysql_table', 'pg_table']) + ); + }); + + it('handles schema-qualified identifiers (drops the schema, keeps the table)', () => { + write('a.ts', `db.prepare('SELECT * FROM public.users');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('does NOT match a JS variable named like a SQL keyword', () => { + // Without the FROM/INTO/etc. prefix, a bare identifier `users` is + // not caught — that's the whole point vs. plain grep. + write('a.ts', `const users = await loadUsers();\nfor (const user of users) {}\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('skips unsupported languages (e.g. swift) without error', () => { + write('a.swift', `let q = "SELECT id FROM users"\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [`// blah`, `// blah`, `db.prepare('SELECT * FROM line_three');`, `// blah`].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]).toEqual(expect.objectContaining({ tableName: 'line_three', line: 3 })); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `db.prepare('SELECT * FROM t');\n`); + const calls: Array<[string, number]> = []; + extractSqlRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('drops reserved-word "table names" (WHERE/ON/AS/SELECT)', () => { + // Common over-match: `JOIN ... ON x = y` would otherwise pick up + // `ON` as the table name. The reserved set blocks that. 
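+    // (Assumed guard shape:
+    //   const RESERVED = new Set(['where', 'on', 'as', 'select', ...]);
+    //   if (RESERVED.has(name.toLowerCase())) continue;
+    // the test only pins the observable behavior.)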
+ write('a.ts', `db.prepare('SELECT * FROM users JOIN posts ON posts.uid = users.id');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const names = new Set(refs.map((r) => r.tableName)); + expect(names).toEqual(new Set(['users', 'posts'])); + }); + + it('handles multiple SQL operations on a single line', () => { + write( + 'a.ts', + `db.exec('CREATE TABLE foo (id INTEGER); INSERT INTO foo VALUES (1)');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ops = new Set(refs.map((r) => `${r.tableName}|${r.op}`)); + expect(ops).toEqual(new Set(['foo|ddl', 'foo|write'])); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractSqlRefs( + testDir, + [{ path: 'missing.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); + + it('rejects prose comments containing a quoted SQL example', () => { + // Reviewer-flagged regression: a comment like + // // example: db.prepare('SELECT name FROM the docs') + // used to falsely match `the` as a table because the quote inside + // the comment passed isInsideString(). The comment-stripper now + // removes everything after `//` before the regex sees the line. + write( + 'a.ts', + [ + `// example: db.prepare('SELECT name FROM the docs')`, + `// "SELECT id FROM the comment"`, + `function ok() {`, + ` // sample SELECT FROM users in a comment — should be ignored`, + ` return 1;`, + `}`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('rejects same-line block comments containing a quoted SQL example', () => { + write( + 'a.ts', + `/* "SELECT * FROM ghost" */ const x = 1;\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('still keeps a real SQL call when there is a trailing comment', () => { + write('a.ts', `db.prepare('SELECT * FROM users'); // good doc\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('strips Python `#` comments', () => { + write( + 'a.py', + `# example: db.execute('SELECT * FROM the_docs')\nrows = db.execute('SELECT * FROM real_table')\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(refs.map((r) => r.tableName)).toEqual(['real_table']); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph SQL refs', () => { + it('persists call sites and resolves enclosing function', async () => { + write( + 'src/db.ts', + [ + `export function getUser(id: number) {`, + ` return db.prepare('SELECT * FROM users WHERE id = ?').get(id);`, + `}`, + ``, + `export function logEvent(msg: string) {`, + ` db.prepare('INSERT INTO events (msg) VALUES (?)').run(msg);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const tables = cg.getSqlTables(); + expect(new Set(tables.map((t) => t.tableName))).toEqual(new Set(['users', 'events'])); + + const userSites = cg.getSqlRefsByTable('users'); + expect(userSites[0]!.sourceName).toBe('getUser'); 
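+    // SELECT maps to a read — mirrors the op check on the write below.
+    expect(userSites[0]!.op).toBe('read');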
+ + const eventSites = cg.getSqlRefsByTable('events'); + expect(eventSites[0]!.sourceName).toBe('logEvent'); + expect(eventSites[0]!.op).toBe('write'); + }); + + it('reverse view: getSqlTablesForNode returns tables touched by a function', async () => { + write( + 'src/a.ts', + [ + `export function multiTouch() {`, + ` db.prepare('SELECT * FROM a').all();`, + ` db.prepare('INSERT INTO b VALUES (?)').run(1);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'multiTouch')!; + const touched = cg.getSqlTablesForNode(node.id); + const summary = touched.map((r) => `${r.tableName}|${r.op}`).sort(); + expect(summary).toEqual(['a|read', 'b|write']); + }); + + it('case-insensitive table lookup', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM Users');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlRefsByTable('users').length).toBe(1); + expect(cg.getSqlRefsByTable('USERS').length).toBe(1); + }); + + it('respects enableSqlRefs=false', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM users');\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableSqlRefs: false }, + }); + await cg.indexAll(); + expect(cg.getSqlTables()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM old_table');\n`); + write('src/b.ts', `db.prepare('SELECT * FROM stable_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(new Set(cg.getSqlTables().map((t) => t.tableName))).toEqual( + new Set(['old_table', 'stable_table']) + ); + + write('src/a.ts', `db.prepare('SELECT * FROM new_table');\n`); + await cg.sync(); + + const tables = new Set(cg.getSqlTables().map((t) => t.tableName)); + expect(tables).toContain('new_table'); + expect(tables).toContain('stable_table'); + expect(tables).not.toContain('old_table'); + }); + + it('drops refs when a file is edited to remove its last SQL ref', async () => { + // Same regression as PR C — applySqlRefs([]) shouldn't leave + // stale rows. Pre-deleting the changed paths in runSqlRefsPass + // is the fix. + write('src/a.ts', `db.prepare('SELECT * FROM going_away');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(true); + + write('src/a.ts', `// no sql here anymore\nexport const x = 1;\n`); + await cg.sync(); + + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM gone_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) +}); diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..cb657274 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,140 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + // Regression tests for the "stale index after HEAD-moving git operation" + // bug. `git status` only reports working-tree dirtiness vs HEAD, so a + // merge / pull / checkout / rebase / reset (and even post-commit) leaves + // a clean tree and used to trick sync into reporting "up to date" while + // the DB still held pre-operation content hashes. The fix detects HEAD + // movement by comparing current HEAD against a stored last-synced HEAD + // and unioning `git diff` output into the changed-file set. + describe('HEAD-moving git operations', () => { + let testDir: string; + let cg: CodeGraph; + + function git(...args: string[]) { + execFileSync('git', args, { cwd: testDir, stdio: 'pipe' }); + } + + beforeEach(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-')); + + git('init'); + git('config', 'user.email', 'test@test.com'); + git('config', 'user.name', 'Test'); + // Pin initial branch name so subsequent checkouts are deterministic + // across git versions that default to master vs main. + git('symbolic-ref', 'HEAD', 'refs/heads/main'); + + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + fs.writeFileSync( + path.join(srcDir, 'index.ts'), + `export function hello() { return 'world'; }` + ); + + git('add', '-A'); + git('commit', '-m', 'initial'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('should detect changes brought in by `git merge`', async () => { + // Branch off, modify on the branch, commit, switch back, merge. + git('checkout', '-b', 'feature'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function merged() { return 'from-branch'; }` + ); + fs.writeFileSync( + path.join(testDir, 'src', 'added.ts'), + `export function fromBranch() { return 1; }` + ); + git('add', '-A'); + git('commit', '-m', 'feature work'); + git('checkout', 'main'); + git('merge', '--no-ff', 'feature', '-m', 'merge feature'); + + // Working tree is clean post-merge — `git status` shows nothing. + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2); + expect(cg.searchNodes('merged').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect changes after `git checkout` to a different branch', async () => { + git('checkout', '-b', 'other'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function onOther() { return 'other'; }` + ); + git('add', '-A'); + git('commit', '-m', 'other work'); + git('checkout', 'main'); + // We're back on main, where `hello` exists. Before the fix, sync + // here would no-op because the working tree matches HEAD (= main). 
+      // That much is fine — the index genuinely matches main here. The
+      // bug shows after the next checkout: back on `other` the working
+      // tree is again clean against HEAD (= other), so `git status`
+      // reports nothing, yet the index still reflects main. We expect
+      // the main..other diff to flow through and bring the index in
+      // line with the current branch.
+      git('checkout', 'other');
+
+      const result = await cg.sync();
+
+      expect(result.filesModified).toBeGreaterThanOrEqual(1);
+      expect(cg.searchNodes('onOther').length).toBeGreaterThan(0);
+      expect(cg.searchNodes('hello').length).toBe(0);
+    });
+
+    it('should detect file deletion brought in by a committed change', async () => {
+      git('rm', path.join('src', 'index.ts'));
+      git('commit', '-m', 'remove index');
+
+      const result = await cg.sync();
+
+      expect(result.filesRemoved).toBe(1);
+      expect(cg.searchNodes('hello').length).toBe(0);
+    });
+
+    it('should fall back to full scan when last-synced HEAD is unreachable', async () => {
+      // Modify and commit, then rewrite history so the previously-synced
+      // HEAD (recorded by indexAll in beforeEach) is no longer reachable.
+      fs.writeFileSync(
+        path.join(testDir, 'src', 'index.ts'),
+        `export function rewritten() { return 'rewritten'; }`
+      );
+      git('add', '-A');
+      git('commit', '--amend', '-m', 'rewritten');
+      // `git gc --prune=now` would sever the orphaned commit, but amending
+      // already moves HEAD to a new SHA the index has never seen and the
+      // OLD SHA may or may not be reachable. We verify behavior is correct
+      // either way: sync brings the index in line with current state.
+      const result = await cg.sync();
+
+      expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1);
+      expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0);
+      expect(cg.searchNodes('hello').length).toBe(0);
+    });
+
+    it('should still no-op when HEAD has not moved and tree is clean', async () => {
+      // Sanity: the new HEAD-tracking code must not introduce spurious work.
+      const result = await cg.sync();
+
+      expect(result.filesAdded).toBe(0);
+      expect(result.filesModified).toBe(0);
+      expect(result.filesRemoved).toBe(0);
+    });
+  });
 });
diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts
index f3638e6d..a546494d 100644
--- a/__tests__/watcher.test.ts
+++ b/__tests__/watcher.test.ts
@@ -31,6 +31,19 @@ function waitFor(
   });
 }
 
+/**
+ * fs.watch on macOS (FSEvents) and Linux (inotify) has a small but real
+ * latency between `fs.watch()` returning and the kernel actually
+ * delivering events. Writing a file in that window — particularly under
+ * parallel test load when the host CPU is busy — drops the event and
+ * causes a 5s timeout for "should trigger sync after file change" style
+ * tests. This helper standardizes the settle delay to match the pattern
+ * already used by the filtering tests in this file.
+ */
+async function letWatcherSettle(): Promise<void> {
+  await new Promise((r) => setTimeout(r, 400));
+}
+
 describe('FileWatcher', () => {
   let testDir: string;
 
@@ -101,6 +114,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 200 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Create a new file
     fs.writeFileSync(path.join(testDir, 'src', 'new.ts'), 'export const y = 2;');
@@ -117,6 +131,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 500 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Rapid-fire changes
     for (let i = 0; i < 5; i++) {
@@ -145,7 +160,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Create a file that doesn't match include patterns
@@ -165,7 +180,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Simulate a .codegraph directory change
@@ -191,6 +206,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -209,6 +225,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -218,6 +235,36 @@ describe('FileWatcher', () => {
 
     watcher.stop();
   });
+
+  it('should retry pending changes after a sync failure (no events lost)', async () => {
+    // First call rejects, subsequent calls resolve. After the initial
+    // failure, the watcher should retry the same batch on its own — without
+    // this, transient sync failures (DB locked etc.) would silently drop the
+    // changes until a new file event happened.
+    let calls = 0;
+    const syncFn = vi.fn().mockImplementation(() => {
+      calls++;
+      if (calls === 1) return Promise.reject(new Error('transient'));
+      return Promise.resolve({ filesChanged: 1, durationMs: 5 });
+    });
+    const onSyncError = vi.fn();
+    const onSyncComplete = vi.fn();
+    const watcher = new FileWatcher(testDir, baseConfig, syncFn, {
+      debounceMs: 100,
+      onSyncError,
+      onSyncComplete,
+    });
+
+    watcher.start();
+    fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
+
+    await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000);
+    expect(onSyncError).toHaveBeenCalledTimes(1);
+    expect(syncFn).toHaveBeenCalledTimes(2);
+    expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 });
+
+    watcher.stop();
+  });
 });
 
 describe('CodeGraph integration', () => {
@@ -268,6 +315,7 @@ describe('FileWatcher', () => {
     const initialNodes = initialStats.nodeCount;
 
     cg.watch({ debounceMs: 300 });
+    await letWatcherSettle();
 
     // Add a new file with a function
     fs.writeFileSync(
diff --git a/scripts/battle-test.mjs b/scripts/battle-test.mjs
new file mode 100644
index 00000000..071ec3a4
--- /dev/null
+++ b/scripts/battle-test.mjs
@@ -0,0 +1,150 @@
+#!/usr/bin/env node
+/**
+ * Battle test: drive every feature shipped on `battle-test/all-shipped`
+ * against a real repo and print a comprehensive report.
+ *
+ * Validates:
+ * - migrations: schema is at v9 with all 8 registered migrations applied
+ * - extraction: nodes/edges/files indexed
+ * - centrality: PageRank scores populated, top-N nonempty
+ * - churn: per-file commit counts, LOC, last-touched timestamps
+ * - hotspots: risk scoring (centrality × churn) returns ranked rows
+ * - issue-history: Fixes/Closes/Resolves attribution
+ * - config-refs: env var read sites
+ * - sql-refs: table read/write/DDL call sites
+ * - MCP tool registry: 11 tools registered + dispatch works
+ * - Index-hook registry: 5 hooks registered + outcomes populated
+ *
+ * Usage: node scripts/battle-test.mjs [path-to-repo] (defaults to cwd)
+ */
+
+import path from 'node:path';
+import fs from 'node:fs';
+import process from 'node:process';
+
+const targetPath = path.resolve(process.argv[2] ?? process.cwd());
+if (!fs.existsSync(targetPath)) {
+  console.error(`battle-test: target path does not exist: ${targetPath}`);
+  process.exit(1);
+}
+
+console.log(`\n=== Battle test: ${targetPath} ===\n`);
+
+const { CodeGraph } = await import('../dist/index.js');
+
+// Reset .codegraph if present so we exercise the fresh-init path
+const cgDir = path.join(targetPath, '.codegraph');
+if (fs.existsSync(cgDir)) {
+  fs.rmSync(cgDir, { recursive: true, force: true });
+}
+
+const cg = await CodeGraph.init(targetPath);
+
+const t0 = Date.now();
+const result = await cg.indexAll();
+const indexMs = Date.now() - t0;
+console.log(`✓ indexAll completed in ${indexMs}ms — files=${result.filesIndexed} nodes=${result.nodesCreated} edges=${result.edgesCreated}`);
+
+const stats = cg.getStats();
+console.log(`  stats: ${stats.fileCount} files, ${stats.nodeCount} nodes, ${stats.edgeCount} edges`);
+
+// ----- migrations -----
+const { CURRENT_SCHEMA_VERSION, ALL_MIGRATIONS } = await import('../dist/db/migrations.js');
+const versions = ALL_MIGRATIONS.map((m) => m.version).join(',');
+console.log(`✓ schema v${CURRENT_SCHEMA_VERSION}, registered migrations: ${versions}`);
+
+// ----- index-hook registry -----
+const { getRegisteredHooks } = await import('../dist/index-hooks/registry.js');
+const hooks = getRegisteredHooks();
+console.log(`✓ ${hooks.length} index-hooks registered: ${hooks.map((h) => h.name).join(', ')}`);
+
+// ----- mcp tool registry -----
+const { getToolModules } = await import('../dist/mcp/tools/registry.js');
+const tools = getToolModules();
+console.log(`✓ ${tools.length} MCP tools registered: ${tools.map((t) => t.definition.name).join(', ')}`);
+
+// ----- centrality -----
+const top = cg.getTopCentralNodes({ limit: 5 });
+console.log(`\n--- centrality ---`);
+if (top.length === 0) {
+  console.log(`  ✗ no centrality scores computed`);
+} else {
+  console.log(`  ✓ top 5 by centrality:`);
+  for (const n of top) {
+    console.log(`    ${n.centrality?.toFixed(5)} ${n.kind} ${n.name} (${n.filePath}:${n.startLine})`);
+  }
+}
+
+// ----- churn -----
+console.log(`\n--- churn ---`);
+const sample = cg.getStats().fileCount > 0
+  ? cg.getHotspots({ limit: 1, minCommits: 0 })[0]
+  : null;
+if (sample) {
+  const churn = cg.getFileChurn(sample.filePath);
+  console.log(`  ✓ sample file ${sample.filePath}: commits=${churn?.commitCount} loc=${churn?.loc} lastTouched=${churn?.lastTouchedTs}`);
+} else {
+  console.log(`  (no churn data — likely not in a git repo)`);
+}
+
+// ----- hotspots -----
+console.log(`\n--- hotspots ---`);
+const hot = cg.getHotspots({ limit: 5, minCommits: 0 });
+if (hot.length === 0) {
+  console.log(`  (no hotspots)`);
+} else {
+  console.log(`  ✓ top 5 by risk:`);
+  for (const r of hot) {
+    console.log(`    risk=${r.riskScore.toFixed(4)} commits=${r.commitCount} loc=${r.loc} ${r.filePath}`);
+  }
+}
+
+// ----- issue history -----
+console.log(`\n--- issue history ---`);
+let issueCount = 0;
+let nodesWithIssues = 0;
+const allNodes = cg.getStats().nodeCount;
+// Sample up to 200 of the most central nodes; count how many have issue history
+const sampleNodes = cg.getTopCentralNodes({ limit: 200 });
+for (const n of sampleNodes) {
+  const issues = cg.getIssuesForNode(n.id);
+  if (issues.length > 0) {
+    nodesWithIssues++;
+    issueCount += issues.length;
+  }
+}
+console.log(`  sampled ${sampleNodes.length} of ${allNodes} nodes: ${nodesWithIssues} have issue refs (${issueCount} attributions)`);
+
+// ----- config refs -----
+console.log(`\n--- config refs ---`);
+const envKeys = cg.getConfigKeys({ configKind: 'env', limit: 10 });
+if (envKeys.length === 0) {
+  console.log(`  (no env-var read sites)`);
+} else {
+  console.log(`  ✓ top 10 env vars (${envKeys.length}/${cg.getConfigKeys({ configKind: 'env', limit: 9999 }).length}):`);
+  for (const k of envKeys) {
+    console.log(`    ${k.reads.toString().padStart(4)} reads ${k.distinctFiles} files ${k.configKey}`);
+  }
+}
+
+// ----- sql refs -----
+console.log(`\n--- sql refs ---`);
+const tables = cg.getSqlTables({ limit: 10 });
+if (tables.length === 0) {
+  console.log(`  (no SQL string-literal call sites)`);
+} else {
+  console.log(`  ✓ top 10 tables:`);
+  for (const t of tables) {
+    console.log(`    r=${t.reads} w=${t.writes} d=${t.ddl} ${t.tableName}`);
+  }
+}
+
+// ----- sync regression -----
+console.log(`\n--- sync round-trip ---`);
+const t1 = Date.now();
+const syncResult = await cg.sync();
+const syncMs = Date.now() - t1;
+console.log(`  ✓ sync no-op in ${syncMs}ms — added=${syncResult.filesAdded} modified=${syncResult.filesModified} removed=${syncResult.filesRemoved}`);
+
+cg.close();
+console.log(`\n=== battle test PASS ===\n`);
diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts
index d118a1fd..44ccc873 100644
--- a/src/bin/codegraph.ts
+++ b/src/bin/codegraph.ts
@@ -23,6 +23,7 @@ import * as path from 'path';
 import * as fs from 'fs';
 import { getCodeGraphDir, isInitialized } from '../directory';
 import { createShimmerProgress } from '../ui/shimmer-progress';
+import { globToSafeRegex } from '../utils';
 
 // Lazy-load heavy modules (CodeGraph, runInstaller) to keep CLI startup fast.
 async function loadCodeGraph(): Promise {
@@ -1158,16 +1159,15 @@ program
       /\/spec\//,
     ];
 
-    // Custom filter pattern
+    // Custom filter pattern (ReDoS-safe — globToSafeRegex coalesces
+    // consecutive wildcards so hostile inputs can't produce nested
+    // quantifiers like `.+.+.+`).
     let customFilter: RegExp | null = null;
     if (options.filter) {
-      // Convert glob to regex: ** → .+, * → [^/]*, . → \.
- const regex = options.filter - .replace(/[+[\]{}()^$|\\]/g, '\\$&') - .replace(/\./g, '\\.') - .replace(/\*\*/g, '.+') - .replace(/\*/g, '[^/]*'); - customFilter = new RegExp(regex); + const regexBody = globToSafeRegex(options.filter); + if (regexBody !== null) { + customFilter = new RegExp(regexBody); + } } function isTestFile(filePath: string): boolean { diff --git a/src/centrality/index.ts b/src/centrality/index.ts new file mode 100644 index 00000000..d03f2206 --- /dev/null +++ b/src/centrality/index.ts @@ -0,0 +1,126 @@ +/** + * Centrality computation + * + * Computes PageRank over the `calls` + `references` subgraph and + * persists each node's score on the `nodes.centrality` column. Pure + * compute — no I/O — so the caller owns reading edges, writing scores, + * and deciding when to re-run. + * + * PageRank is the right shape for "what is structurally important?" + * because it rewards being reached (weighted by the importance of who + * reaches you), not just raw in-degree. A method called once from a + * central interface ranks above a method called many times from a + * leaf script. + * + * Edges of kind `contains` are deliberately excluded — they encode + * lexical containment (file → class → method), which would dominate + * the rank and hide actual reference flow. + * + * Side benefit observed in spike data: PageRank accidentally surfaces + * resolver false-positives. Generic short names (`trim`, `run`) that + * the resolver over-merges across files accumulate edges from many + * sources and float to the top alongside genuine hubs. Useful as a + * diagnostic; not a goal of this module. + */ + +/** Damping factor — fraction of rank propagated through edges each step. */ +export const PR_DAMPING = 0.85; + +/** + * Iteration count. PageRank converges geometrically; 40 iterations puts + * us well below 1e-6 residual on graphs we've seen, with no per-graph + * tuning needed. + */ +export const PR_ITERATIONS = 40; + +/** Edge kinds that contribute to centrality. */ +export const PR_EDGE_KINDS = ['calls', 'references'] as const; + +export type PrEdgeKind = (typeof PR_EDGE_KINDS)[number]; + +export interface CentralityResult { + /** nodeId → PageRank score in (0, 1). Sums to ~1.0 across all nodes. */ + scores: Map; + /** Iterations actually run (currently always PR_ITERATIONS — kept for forward compat). */ + iterations: number; + /** Wall-clock duration in milliseconds. */ + durationMs: number; +} + +interface NodeRef { + id: string; +} + +interface EdgeRef { + source: string; + target: string; +} + +/** + * Compute PageRank scores for the supplied nodes/edges. + * + * @param nodes All graph nodes (only `id` is read). + * @param edges Edges that contribute to centrality. Caller is + * responsible for filtering to `PR_EDGE_KINDS`. + * + * Edges referencing unknown node ids are silently dropped — the + * underlying graph has FK cascades, so dangling references can only + * occur mid-write and are not our problem to fix here. + */ +export function computePageRank(nodes: NodeRef[], edges: EdgeRef[]): CentralityResult { + const start = Date.now(); + const N = nodes.length; + const scores = new Map(); + if (N === 0) { + return { scores, iterations: 0, durationMs: Date.now() - start }; + } + + // Index nodes for tight numeric loops. Float64Array gives ~3× speedup + // over Array(N).fill on million-edge graphs and costs nothing on + // smaller ones. 
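+  //
+  // For reference, each iteration below implements the standard
+  // power-iteration update with uniform dangling redistribution:
+  //
+  //   next[t] = (1 - d)/N + d * (danglingSum/N + Σ_{s→t} pr[s]/outDeg[s])
+  //
+  // where d = PR_DAMPING and danglingSum sums pr[i] over nodes with
+  // outDeg[i] == 0, so total mass stays ~1 every iteration.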
+ const idx = new Map(); + for (let i = 0; i < N; i++) { + const n = nodes[i]!; + idx.set(n.id, i); + } + + const inEdges: number[][] = Array.from({ length: N }, () => []); + const outDeg = new Int32Array(N); + for (const e of edges) { + const s = idx.get(e.source); + const t = idx.get(e.target); + if (s === undefined || t === undefined) continue; + inEdges[t]!.push(s); + outDeg[s]! += 1; + } + + let pr = new Float64Array(N).fill(1 / N); + const baseline = (1 - PR_DAMPING) / N; + + for (let it = 0; it < PR_ITERATIONS; it++) { + const next = new Float64Array(N).fill(baseline); + + // Distribute the rank of dangling nodes (no outgoing edges) uniformly. + // Without this the total rank decays each iteration. + let danglingSum = 0; + for (let i = 0; i < N; i++) { + if (outDeg[i] === 0) danglingSum += pr[i]!; + } + const danglingShare = (PR_DAMPING * danglingSum) / N; + for (let i = 0; i < N; i++) next[i]! += danglingShare; + + for (let t = 0; t < N; t++) { + const sources = inEdges[t]!; + let s = 0; + for (let k = 0; k < sources.length; k++) { + const src = sources[k]!; + s += pr[src]! / outDeg[src]!; + } + next[t]! += PR_DAMPING * s; + } + pr = next; + } + + for (let i = 0; i < N; i++) scores.set(nodes[i]!.id, pr[i]!); + return { scores, iterations: PR_ITERATIONS, durationMs: Date.now() - start }; +} diff --git a/src/churn/index.ts b/src/churn/index.ts new file mode 100644 index 00000000..1c332886 --- /dev/null +++ b/src/churn/index.ts @@ -0,0 +1,259 @@ +/** + * Per-file churn mining + * + * Reads `git log` to compute four signals per indexed file: + * - commit_count (how often the file gets touched) + * - first_seen_ts (when it entered the codebase) + * - last_touched_ts (how recently it was modified) + * - loc (line count of the current on-disk content) + * + * Combined with PageRank centrality (see ../centrality), these answer + * "where do bugs hide?" — central files that change often are the + * highest-expected-value review targets, validated empirically against + * codegraph's own history (e.g. `src/extraction/tree-sitter.ts`). + * + * Storage strategy: scalar columns on `files` (one row already exists + * per indexed path; adding columns avoids a JOIN on every read). + * + * Incremental update: persist `last_mined_churn_head` in + * project_metadata; on subsequent mines, only enumerate commits in + * `..HEAD`. This keeps `sync` fast on long histories. If the + * stored sha is unreachable (force-push, gc), the caller gets + * `needsFullRescan: true` and re-mines from scratch after `clearChurn`. + * + * Rename note: `git log --name-only` (without `--follow`) reports + * post-rename paths only. The pre-rename history is therefore not + * counted toward the new path's `commit_count`. `--follow` would fix + * this but is documented as O(N) per file and shells out individually, + * so v1 accepts the under-count and surfaces it in the doc-comment on + * `commitCount` in types.ts. + */ + +import { execFileSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import { logDebug } from '../errors'; + +/** + * Skip commits that touch more than this many indexed files. Merge + * commits and mass refactors otherwise inflate every file's + * commit_count without any real coupling signal. + */ +export const MAX_FILES_PER_COMMIT = 50; + +/** Sentinel for `git log --pretty=tformat:`; cannot collide with a path. */ +const COMMIT_HEADER_PREFIX = 'CGCMT-'; + +/** Project-metadata key holding the HEAD SHA of the last mined commit. 
*/ +export const LAST_MINED_CHURN_HEAD_KEY = 'last_mined_churn_head'; + +/** Hard cap on git output we'll buffer (bytes). Matches cochange. */ +const MAX_GIT_BUFFER = 200 * 1024 * 1024; + +/** Wall-clock cap on a single git invocation (ms). */ +const GIT_TIMEOUT_MS = 60_000; + +export interface FileChurnDelta { + path: string; + /** Commits to add to the existing commit_count. */ + commitCountDelta: number; + /** + * Most recent commit timestamp (unix seconds) seen in this delta. + * Caller takes max() with the existing value. + */ + lastTouchedTs: number; + /** + * Earliest commit timestamp (unix seconds) in this delta. Caller + * applies `COALESCE(existing, this)` so the first-seen column only + * gets written once. + */ + firstSeenTs: number; +} + +export interface ChurnMineResult { + deltas: Map; + /** HEAD SHA reached by this run; null when not in a git repo. */ + currentHead: string | null; + /** + * True when the caller's `sinceSha` was unreachable (force-push, gc). + * Caller should `clearChurn()` and re-mine with `sinceSha=null`. + */ + needsFullRescan: boolean; +} + +/** + * Get the current HEAD commit SHA, or null when not in a git repo or + * the repo has no commits yet. + */ +export function getGitHead(rootDir: string): string | null { + try { + return ( + execFileSync('git', ['rev-parse', 'HEAD'], { + cwd: rootDir, + encoding: 'utf-8', + timeout: 5000, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim() || null + ); + } catch { + return null; + } +} + +/** + * Verify that a stored SHA is still reachable from HEAD. After + * force-push or `git gc` it can disappear, in which case incremental + * mining would silently miss commits. + */ +function isShaReachable(rootDir: string, sha: string): boolean { + try { + execFileSync('git', ['cat-file', '-e', `${sha}^{commit}`], { + cwd: rootDir, + timeout: 5000, + stdio: ['pipe', 'pipe', 'pipe'], + }); + return true; + } catch { + return false; + } +} + +/** + * Read the LOC of a file as currently on disk. Cheap; always fresh. + * + * Counts newline-delimited lines: a file with content `"a\nb\n"` + * reports 2; an empty file reports 0; a file ending without a newline + * still reports the visible-line count. + */ +export function readFileLoc(rootDir: string, relPath: string): number { + try { + const abs = path.join(rootDir, relPath); + const content = fs.readFileSync(abs, 'utf8'); + if (content.length === 0) return 0; + let lines = 0; + for (let i = 0; i < content.length; i++) if (content.charCodeAt(i) === 10) lines++; + // Trailing chunk without final newline still counts as a line. + if (content.charCodeAt(content.length - 1) !== 10) lines++; + return lines; + } catch { + return 0; + } +} + +/** + * Mine git log for per-file commit metrics. + * + * @param rootDir Project root. + * @param indexedFiles Paths we care about (deltas only emitted for + * these). Files outside this set are ignored + * per-commit so churn doesn't accumulate for + * paths the index has no other knowledge of. + * @param sinceSha `null` for full scan; otherwise mine only + * `..HEAD`. Unreachable shas trigger + * `needsFullRescan: true`. 
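+ *
+ * Usage sketch (hedged: `metadata` stands in for whatever store the
+ * caller uses to persist LAST_MINED_CHURN_HEAD_KEY, and `clearChurn`
+ * is the caller-owned reset mentioned in the module doc):
+ *
+ *   const prev = metadata.get(LAST_MINED_CHURN_HEAD_KEY) ?? null;
+ *   let res = mineChurn(rootDir, indexedFiles, prev);
+ *   if (res.needsFullRescan) {
+ *     clearChurn();
+ *     res = mineChurn(rootDir, indexedFiles, null);
+ *   }
+ *   if (res.currentHead) metadata.set(LAST_MINED_CHURN_HEAD_KEY, res.currentHead);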
+ */
+export function mineChurn(
+  rootDir: string,
+  indexedFiles: Set<string>,
+  sinceSha: string | null
+): ChurnMineResult {
+  const empty: ChurnMineResult = {
+    deltas: new Map(),
+    currentHead: null,
+    needsFullRescan: false,
+  };
+
+  const head = getGitHead(rootDir);
+  if (!head) return empty;
+
+  if (sinceSha && !isShaReachable(rootDir, sinceSha)) {
+    return { deltas: new Map(), currentHead: head, needsFullRescan: true };
+  }
+
+  // No-op: nothing has happened since last mine.
+  if (sinceSha === head) {
+    return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+  }
+
+  // tformat puts a literal trailing record-separator after each
+  // commit's name list; -z then NUL-delimits within the format too,
+  // so we get a clean stream of NUL-separated tokens.
+  const args = [
+    'log',
+    '--no-merges',
+    '--name-only',
+    `--pretty=tformat:${COMMIT_HEADER_PREFIX}%H|%ct`,
+    '-z',
+  ];
+  if (sinceSha) args.push(`${sinceSha}..HEAD`);
+
+  let raw: string;
+  try {
+    raw = execFileSync('git', args, {
+      cwd: rootDir,
+      encoding: 'utf-8',
+      timeout: GIT_TIMEOUT_MS,
+      maxBuffer: MAX_GIT_BUFFER,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+  } catch (err) {
+    logDebug(`mineChurn: git log failed: ${err instanceof Error ? err.message : String(err)}`);
+    return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+  }
+
+  // Parse: tformat emits `CGCMT-<sha>|<ct>\0\n<path>\0<path>\0 ...
+  // CGCMT-<sha>|<ct>\0\n<path>\0`. Each token between NULs is either
+  // a commit header or a path; paths arrive with a leading '\n' on the
+  // first one of each commit (the tformat record-separator). We walk
+  // tokens linearly, switching commit context on each header.
+  const tokens = raw.split('\0');
+  const headerRe = /^CGCMT-([0-9a-f]{40})\|(\d+)$/;
+  const deltas = new Map<string, FileChurnDelta>();
+
+  let curTs = 0;
+  let curPaths: string[] = [];
+  let curActive = false;
+
+  function flush() {
+    if (!curActive) return;
+    if (curPaths.length > 0 && curPaths.length <= MAX_FILES_PER_COMMIT) {
+      for (const p of curPaths) {
+        if (!indexedFiles.has(p)) continue;
+        const cur = deltas.get(p);
+        if (cur) {
+          cur.commitCountDelta += 1;
+          if (curTs > cur.lastTouchedTs) cur.lastTouchedTs = curTs;
+          if (curTs < cur.firstSeenTs) cur.firstSeenTs = curTs;
+        } else {
+          deltas.set(p, {
+            path: p,
+            commitCountDelta: 1,
+            lastTouchedTs: curTs,
+            firstSeenTs: curTs,
+          });
+        }
+      }
+    }
+    curPaths = [];
+    curActive = false;
+  }
+
+  for (const rawTok of tokens) {
+    if (rawTok === '') continue;
+    // Strip a single leading \n introduced by tformat's record separator.
+    const tok = rawTok.startsWith('\n') ? rawTok.slice(1) : rawTok;
+    if (tok === '') continue;
+    const m = headerRe.exec(tok);
+    if (m) {
+      flush();
+      curTs = parseInt(m[2]!, 10);
+      curActive = true;
+    } else if (curActive) {
+      curPaths.push(tok);
+    }
+    // Tokens before the first header (shouldn't happen) are ignored.
+  }
+  flush();
+
+  return { deltas, currentHead: head, needsFullRescan: false };
+}
diff --git a/src/config-refs/index.ts b/src/config-refs/index.ts
new file mode 100644
index 00000000..1ef47ae9
--- /dev/null
+++ b/src/config-refs/index.ts
@@ -0,0 +1,188 @@
+/**
+ * Config-reference extraction
+ *
+ * Scans indexed source files for known config-read patterns
+ * (`process.env.X`, `os.getenv("X")`, etc.) and records each read
+ * site as a row in `config_refs`. Each row links to its enclosing
+ * function via a line-range lookup against the existing nodes table,
+ * so an agent asking "what reads OBSIDIAN_PORT?" gets a list of real
+ * functions, not a grep wall.
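+ *
+ * Illustrative row (file, line, and ids here are hypothetical):
+ * `process.env.OBSIDIAN_PORT` read on line 12 of `src/server.ts`
+ * inside `startServer()` persists roughly as { configKind: 'env',
+ * configKey: 'OBSIDIAN_PORT', sourceNodeId: <startServer's node id>,
+ * filePath: 'src/server.ts', line: 12 }.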
+ * + * Why a separate table, not graph nodes/edges: env vars don't have a + * single source-of-truth file (they're a global namespace), so giving + * them a synthetic file_path would pollute the main graph. The table + * is queried via a dedicated MCP tool (`codegraph_config`) and via + * augmented `codegraph_node` output (per-function "reads:" line). + * + * Spike validation (mcp-obsidian-extended): 71 reads, 19 distinct + * keys; 8× OBSIDIAN_PORT, 8× TOOL_PRESET surface as central + * config knobs. Codegraph-itself is sparse (4 reads) — this feature + * shines on service-style codebases. + * + * V1 scope: env-only, regex-based per-language. YAML key reads, + * LaunchDarkly flags, etc. are deliberately out of scope; the schema + * already supports them via `config_kind` so adding them later is a + * pattern addition, not a redesign. + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { logDebug } from '../errors'; + +export type ConfigKind = 'env'; + +export interface ConfigRef { + configKind: ConfigKind; + configKey: string; + /** Indexed-symbol id for the enclosing function/method. NULL = top-level. */ + sourceNodeId: string | null; + filePath: string; + line: number; +} + +interface PatternDef { + /** Languages this pattern applies to (matches `Language` in types.ts). */ + languages: string[]; + /** Regex with capture group 1 = config key. */ + re: RegExp; +} + +/** + * Per-language read-pattern catalogue. + * + * Patterns intentionally err on the side of including only + * UPPER_CASE_KEYS — the convention every framework follows for env + * vars. This avoids false positives like `process.env.foo` (a Node + * variable) or `os.getenv(some_var)` (dynamic). + */ +const PATTERNS: PatternDef[] = [ + // process.env.FOO / process.env["FOO"] (TS, JS, TSX, JSX) + { + languages: ['typescript', 'javascript', 'tsx', 'jsx'], + re: /process\.env\.([A-Z_][A-Z0-9_]*)/g, + }, + { + languages: ['typescript', 'javascript', 'tsx', 'jsx'], + re: /process\.env\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + // os.getenv("FOO") / os.environ.get("FOO") / os.environ["FOO"] + { + languages: ['python'], + re: /\bos\.getenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + { + languages: ['python'], + re: /\bos\.environ\.get\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + { + languages: ['python'], + re: /\bos\.environ\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + // Bare getenv("FOO") (Python convention with `from os import getenv`) + { + languages: ['python'], + re: /\bgetenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + // os.Getenv("FOO") / os.LookupEnv("FOO") (Go) + { + languages: ['go'], + re: /\bos\.(?:Getenv|LookupEnv)\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + // System.getenv("FOO") (Java/Kotlin) + { + languages: ['java', 'kotlin'], + re: /\bSystem\.getenv\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + // ENV["FOO"] / ENV.fetch("FOO") (Ruby) + { + languages: ['ruby'], + re: /\bENV\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + { + languages: ['ruby'], + re: /\bENV\.fetch\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + // Rust: env!("FOO") / std::env::var("FOO") + { + languages: ['rust'], + re: /\benv!\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + { + languages: ['rust'], + re: /\bstd::env::var\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, +]; + +/** A file's languages-of-interest. Skip everything not in PATTERNS. */ +const SUPPORTED_LANGUAGES = new Set( + PATTERNS.flatMap((p) => p.languages) +); + +/** + * Resolver supplied by caller: (filePath, line) → enclosing nodeId + * (function/method/class). 
Returns null when the read is at the file's + * top level — the row still gets persisted with NULL source_node_id. + */ +export type EnclosingNodeResolver = (filePath: string, line: number) => string | null; + +export interface FileTarget { + path: string; + language: string; +} + +/** + * Scan a list of (path, language) targets and return all read sites. + * Pure I/O + regex; the caller owns DB writes via `applyConfigRefs`. + * + * Files we can't read (deleted, permission, binary) are silently + * skipped — extraction has already validated readability for the rest. + */ +export function extractConfigRefs( + rootDir: string, + targets: Iterable, + resolveEnclosing: EnclosingNodeResolver +): ConfigRef[] { + const refs: ConfigRef[] = []; + for (const t of targets) { + if (!SUPPORTED_LANGUAGES.has(t.language)) continue; + let src: string; + try { + src = fs.readFileSync(path.join(rootDir, t.path), 'utf8'); + } catch (err) { + logDebug(`extractConfigRefs: read failed for ${t.path}: ${err instanceof Error ? err.message : String(err)}`); + continue; + } + // Iterate lines so we can attribute each match to a 1-indexed line. + const lines = src.split('\n'); + for (let i = 0; i < lines.length; i++) { + const line = lines[i]!; + // Cheap pre-filter to skip the 99% of lines that obviously + // contain no env reference. Cuts per-file cost dramatically on + // big repos. + if ( + !line.includes('env') && + !line.includes('Env') && + !line.includes('ENV') + ) { + continue; + } + for (const pat of PATTERNS) { + if (!pat.languages.includes(t.language)) continue; + pat.re.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = pat.re.exec(line)) !== null) { + const key = m[1]!; + const lineNo = i + 1; + refs.push({ + configKind: 'env', + configKey: key, + sourceNodeId: resolveEnclosing(t.path, lineNo), + filePath: t.path, + line: lineNo, + }); + } + } + } + } + return refs; +} diff --git a/src/config.ts b/src/config.ts index 9ab1032a..f1d70250 100644 --- a/src/config.ts +++ b/src/config.ts @@ -128,6 +128,11 @@ function mergeConfig( extractDocstrings: overrides.extractDocstrings ?? defaults.extractDocstrings, trackCallSites: overrides.trackCallSites ?? defaults.trackCallSites, customPatterns: overrides.customPatterns ?? defaults.customPatterns, + enableCentrality: overrides.enableCentrality ?? defaults.enableCentrality, + enableChurn: overrides.enableChurn ?? defaults.enableChurn, + enableIssueHistory: overrides.enableIssueHistory ?? defaults.enableIssueHistory, + enableConfigRefs: overrides.enableConfigRefs ?? defaults.enableConfigRefs, + enableSqlRefs: overrides.enableSqlRefs ?? defaults.enableSqlRefs, }; } diff --git a/src/context/index.ts b/src/context/index.ts index 94192377..08f25657 100644 --- a/src/context/index.ts +++ b/src/context/index.ts @@ -286,6 +286,14 @@ export class ContextBuilder { options: FindRelevantContextOptions = {} ): Promise { const opts = { ...DEFAULT_FIND_OPTIONS, ...options }; + // Bound user-supplied limits — `searchLimit` is multiplied by 5 in + // findNodesByExactName (line 312) and feeds several other unbounded + // operations below, so a request with `searchLimit: 1_000_000` would + // pull millions of rows before any filtering. 100 is well above the + // largest legitimate use we've seen. 
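+    // For example: { searchLimit: 1_000_000 } clamps to 100,
+    // { maxNodes: 0 } floors to 1, and { traversalDepth: -2 } floors to 0.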
+ opts.searchLimit = Math.min(Math.max(1, opts.searchLimit), 100); + opts.maxNodes = Math.min(Math.max(1, opts.maxNodes), 1000); + opts.traversalDepth = Math.min(Math.max(0, opts.traversalDepth), 10); // Start with empty subgraph const nodes = new Map(); diff --git a/src/db/index.ts b/src/db/index.ts index 34e99338..da85caea 100644 --- a/src/db/index.ts +++ b/src/db/index.ts @@ -152,6 +152,36 @@ export class DatabaseConnection { this.db.exec('ANALYZE'); } + /** + * Lightweight, non-blocking maintenance to run after bulk writes + * (indexAll, sync). Two operations: + * + * - `PRAGMA optimize` — incremental ANALYZE; SQLite only re-analyzes + * tables whose row counts changed materially since the last + * ANALYZE. Without it, the query planner has no statistics on the + * freshly-bulk-loaded tables and can pick suboptimal indexes. + * + * - `PRAGMA wal_checkpoint(PASSIVE)` — fold pending WAL pages back + * into the main database file so the WAL file doesn't grow + * unboundedly between automatic checkpoints (auto-fires at 1000 + * pages by default; large indexAll runs blow past that). + * + * Both operations are silently swallowed on failure — they're a + * best-effort optimization, never load-bearing for correctness. + */ + runMaintenance(): void { + try { + this.db.exec('PRAGMA optimize'); + } catch { + // ignore + } + try { + this.db.exec('PRAGMA wal_checkpoint(PASSIVE)'); + } catch { + // ignore (e.g., not in WAL mode) + } + } + /** * Close the database connection */ diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..98325247 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -1,60 +1,26 @@ /** - * Database Migrations + * Database Migrations — runner + backward-compat surface. * - * Schema versioning and migration support. + * The migration definitions themselves live in + * `./migrations/-.ts`, one file per migration, with + * version derived from the filename prefix. This file is the + * runner (read schema_versions, apply pending in order) and the + * stable API surface that the rest of the codebase imports. + * + * Adding a migration: see `./migrations/index.ts`. */ import { SqliteDatabase } from './sqlite-adapter'; +import { ALL_MIGRATIONS, CURRENT_SCHEMA_VERSION as REGISTRY_CURRENT } from './migrations/index'; +import type { Migration } from './migrations/types'; /** - * Current schema version + * Highest registered migration version. Derived from the + * registry; re-exported here unchanged so existing consumers + * (`import { CURRENT_SCHEMA_VERSION } from './migrations'`) keep + * working. */ -export const CURRENT_SCHEMA_VERSION = 3; - -/** - * Migration definition - */ -interface Migration { - version: number; - description: string; - up: (db: SqliteDatabase) => void; -} - -/** - * All migrations in order - * - * Note: Version 1 is the initial schema, handled by schema.sql - * Future migrations go here. 
- */
-const migrations: Migration[] = [
-  {
-    version: 2,
-    description: 'Add project metadata, provenance tracking, and unresolved ref context',
-    up: (db) => {
-      db.exec(`
-        CREATE TABLE IF NOT EXISTS project_metadata (
-          key TEXT PRIMARY KEY,
-          value TEXT NOT NULL,
-          updated_at INTEGER NOT NULL
-        );
-        ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT '';
-        ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown';
-        ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL;
-        CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path);
-        CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance);
-      `);
-    },
-  },
-  {
-    version: 3,
-    description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups',
-    up: (db) => {
-      db.exec(`
-        CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
-      `);
-    },
-  },
-];
+export const CURRENT_SCHEMA_VERSION: number = REGISTRY_CURRENT;
 
 /**
  * Get the current schema version from the database
@@ -84,17 +50,14 @@ function recordMigration(db: SqliteDatabase, version: number, description: strin
  * Run all pending migrations
  */
 export function runMigrations(db: SqliteDatabase, fromVersion: number): void {
-  const pending = migrations.filter((m) => m.version > fromVersion);
-
-  if (pending.length === 0) {
-    return;
-  }
+  const pending = ALL_MIGRATIONS.filter((m) => m.version > fromVersion);
+  if (pending.length === 0) return;
 
-  // Sort by version
-  pending.sort((a, b) => a.version - b.version);
+  // ALL_MIGRATIONS is already sorted by version, but it's cheap to
+  // re-confirm the order after filtering.
+  const ordered = [...pending].sort((a, b) => a.version - b.version);
 
-  // Run each migration in a transaction
-  for (const migration of pending) {
+  for (const migration of ordered) {
     db.transaction(() => {
       migration.up(db);
       recordMigration(db, migration.version, migration.description);
@@ -111,13 +74,15 @@ export function needsMigration(db: SqliteDatabase): boolean {
 }
 
 /**
- * Get list of pending migrations
+ * Get list of pending migrations.
+ *
+ * Returned as a fresh mutable array (not the underlying readonly
+ * registry) so callers that previously assigned the result to a
+ * `Migration[]`-typed variable keep working unchanged.
  */
 export function getPendingMigrations(db: SqliteDatabase): Migration[] {
   const current = getCurrentVersion(db);
-  return migrations
-    .filter((m) => m.version > current)
-    .sort((a, b) => a.version - b.version);
+  return ALL_MIGRATIONS.filter((m) => m.version > current).slice();
 }
 
 /**
@@ -136,3 +101,7 @@ export function getMigrationHistory(
     description: row.description,
   }));
 }
+
+// Re-export the registry surface for callers that want it.
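+// (`export type` rather than a value re-export keeps the type names
+// usable under isolatedModules/transpile-only builds.)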
+export { ALL_MIGRATIONS } from './migrations/index'; +export type { Migration, MigrationModule } from './migrations/types'; diff --git a/src/db/migrations/002-project-metadata.ts b/src/db/migrations/002-project-metadata.ts new file mode 100644 index 00000000..9fe7945b --- /dev/null +++ b/src/db/migrations/002-project-metadata.ts @@ -0,0 +1,19 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add project metadata, provenance tracking, and unresolved ref context', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS project_metadata ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at INTEGER NOT NULL + ); + ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT ''; + ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown'; + ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL; + CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); + CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); + `); + }, +}; diff --git a/src/db/migrations/003-lower-name-index.ts b/src/db/migrations/003-lower-name-index.ts new file mode 100644 index 00000000..ff5416eb --- /dev/null +++ b/src/db/migrations/003-lower-name-index.ts @@ -0,0 +1,10 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups', + up: (db) => { + db.exec(` + CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); + `); + }, +}; diff --git a/src/db/migrations/004-centrality-churn.ts b/src/db/migrations/004-centrality-churn.ts new file mode 100644 index 00000000..82d30ffe --- /dev/null +++ b/src/db/migrations/004-centrality-churn.ts @@ -0,0 +1,42 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add centrality on nodes; per-file churn metrics on files', + up: (db) => { + // ALTER TABLE ADD COLUMN is not idempotent on SQLite — guard with + // PRAGMA table_info so re-running after a partial DDL failure (or + // landing alongside another migration that touches the same files + // columns) does not throw "duplicate column name". 
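+    // (Unguarded, a re-run would fail with SQLite's
+    // "duplicate column name: centrality" — each DDL statement
+    // auto-commits, so a partial failure leaves some columns added.)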
+ const tableExists = (name: string): boolean => + (db.prepare(`SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name=?`) + .get(name) as { c: number }).c > 0; + + if (tableExists('nodes')) { + const nodeCols = db.prepare(`PRAGMA table_info(nodes);`).all() as Array<{ name: string }>; + if (!nodeCols.some((c) => c.name === 'centrality')) { + db.exec(`ALTER TABLE nodes ADD COLUMN centrality REAL DEFAULT NULL;`); + } + db.exec(`CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC);`); + } + + if (tableExists('files')) { + const fileCols = db.prepare(`PRAGMA table_info(files);`).all() as Array<{ name: string }>; + if (!fileCols.some((c) => c.name === 'commit_count')) { + db.exec(`ALTER TABLE files ADD COLUMN commit_count INTEGER NOT NULL DEFAULT 0;`); + } + if (!fileCols.some((c) => c.name === 'loc')) { + db.exec(`ALTER TABLE files ADD COLUMN loc INTEGER NOT NULL DEFAULT 0;`); + } + if (!fileCols.some((c) => c.name === 'first_seen_ts')) { + db.exec(`ALTER TABLE files ADD COLUMN first_seen_ts INTEGER DEFAULT NULL;`); + } + if (!fileCols.some((c) => c.name === 'last_touched_ts')) { + db.exec(`ALTER TABLE files ADD COLUMN last_touched_ts INTEGER DEFAULT NULL;`); + } + db.exec(` + CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC); + CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC); + `); + } + }, +}; diff --git a/src/db/migrations/005-symbol-issues.ts b/src/db/migrations/005-symbol-issues.ts new file mode 100644 index 00000000..7af13795 --- /dev/null +++ b/src/db/migrations/005-symbol-issues.ts @@ -0,0 +1,19 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add symbol_issues table for issue→symbol attribution from git history', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS symbol_issues ( + node_id TEXT NOT NULL, + issue_number INTEGER NOT NULL, + commit_sha TEXT NOT NULL, + kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')), + PRIMARY KEY (node_id, issue_number, commit_sha, kind), + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id); + CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number); + `); + }, +}; diff --git a/src/db/migrations/006-config-refs.ts b/src/db/migrations/006-config-refs.ts new file mode 100644 index 00000000..8fed1a91 --- /dev/null +++ b/src/db/migrations/006-config-refs.ts @@ -0,0 +1,24 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add config_refs table for env var / feature flag read sites', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS config_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_kind TEXT NOT NULL, + config_key TEXT NOT NULL, + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_config_refs_key + ON config_refs(config_kind, config_key); + CREATE INDEX IF NOT EXISTS idx_config_refs_node + ON config_refs(source_node_id); + CREATE INDEX IF NOT EXISTS idx_config_refs_file + ON config_refs(file_path); + `); + }, +}; diff --git a/src/db/migrations/007-sql-refs.ts b/src/db/migrations/007-sql-refs.ts new file mode 100644 index 00000000..629d070f --- /dev/null +++ b/src/db/migrations/007-sql-refs.ts @@ -0,0 +1,24 @@ +import type { MigrationModule } from 
'./types'; + +export const MIGRATION: MigrationModule = { + description: 'Add sql_refs table for SQL string-literal references to tables', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS sql_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + table_name TEXT NOT NULL, + op TEXT NOT NULL CHECK (op IN ('read','write','ddl')), + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_sql_refs_table + ON sql_refs(lower(table_name)); + CREATE INDEX IF NOT EXISTS idx_sql_refs_node + ON sql_refs(source_node_id); + CREATE INDEX IF NOT EXISTS idx_sql_refs_file + ON sql_refs(file_path); + `); + }, +}; diff --git a/src/db/migrations/008-edges-unique.ts b/src/db/migrations/008-edges-unique.ts new file mode 100644 index 00000000..ed7e5372 --- /dev/null +++ b/src/db/migrations/008-edges-unique.ts @@ -0,0 +1,29 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: + 'Dedup edges and enforce UNIQUE(source, target, kind, line, col) so INSERT OR IGNORE actually dedupes', + up: (db) => { + // Tolerate edges-table-missing (synthetic test DBs that only need + // the FTS / nodes side of the schema): if there's no edges table, + // there are no duplicates to dedup or unique constraint to add. + const hasEdges = (db + .prepare(`SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name='edges'`) + .get() as { c: number }).c > 0; + if (!hasEdges) return; + + // Without a UNIQUE constraint the existing `INSERT OR IGNORE INTO + // edges` was a no-op for dedup purposes. Collapse accumulated + // duplicates first, then add the UNIQUE index. COALESCE keeps + // NULL line/col values comparable. + db.exec(` + DELETE FROM edges + WHERE id NOT IN ( + SELECT MIN(id) FROM edges + GROUP BY source, target, kind, COALESCE(line, -1), COALESCE(col, -1) + ); + CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + `); + }, +}; diff --git a/src/db/migrations/009-fts-subwords-porter.ts b/src/db/migrations/009-fts-subwords-porter.ts new file mode 100644 index 00000000..032058cc --- /dev/null +++ b/src/db/migrations/009-fts-subwords-porter.ts @@ -0,0 +1,68 @@ +import type { MigrationModule } from './types'; +import { buildNameSubwords } from '../../utils'; + +export const MIGRATION: MigrationModule = { + description: + 'Add name_subwords + Porter stemmer to FTS so natural-language and partial-identifier queries work', + up: (db) => { + // 1. Add the synthetic subwords column to nodes — idempotent so a + // re-run after a partial DDL failure (SQLite auto-commits DDL, + // so only some of these statements may have landed) doesn't fail + // with "duplicate column name". + const cols = db.prepare(`PRAGMA table_info(nodes);`).all() as Array<{ name: string }>; + if (!cols.some((c) => c.name === 'name_subwords')) { + db.exec(`ALTER TABLE nodes ADD COLUMN name_subwords TEXT;`); + } + + // 2. Drop the existing FTS table + triggers. We can't ALTER the + // FTS5 tokenizer in place; recreating is the supported path. + db.exec(` + DROP TRIGGER IF EXISTS nodes_ai; + DROP TRIGGER IF EXISTS nodes_ad; + DROP TRIGGER IF EXISTS nodes_au; + DROP TABLE IF EXISTS nodes_fts; + `); + + // 3. Recreate the FTS table — but DO NOT recreate the triggers yet. 
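+    //    Why wait: with the triggers live, the backfill UPDATEs in
+    //    step 4 would fire nodes_au against the still-empty
+    //    external-content index — at best duplicate work that step 5's
+    //    'rebuild' throws away anyway.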
+ db.exec(` + CREATE VIRTUAL TABLE nodes_fts USING fts5( + id, name, qualified_name, docstring, signature, name_subwords, + content='nodes', + content_rowid='rowid', + tokenize="porter unicode61" + ); + `); + + // 4. Backfill name_subwords. + const rows = db + .prepare('SELECT id, name FROM nodes') + .all() as Array<{ id: string; name: string }>; + const update = db.prepare('UPDATE nodes SET name_subwords = ? WHERE id = ?'); + for (const row of rows) { + update.run(buildNameSubwords(row.name), row.id); + } + + // 5. Rebuild the FTS index from the content table. + db.exec(`INSERT INTO nodes_fts(nodes_fts) VALUES('rebuild');`); + + // 6. Re-attach the triggers so they fire on subsequent application writes. + db.exec(` + CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN + INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords); + END; + + CREATE TRIGGER nodes_ad AFTER DELETE ON nodes BEGIN + INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords); + END; + + CREATE TRIGGER nodes_au AFTER UPDATE ON nodes BEGIN + INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords); + INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords); + END; + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts new file mode 100644 index 00000000..b1d7b9a6 --- /dev/null +++ b/src/db/migrations/index.ts @@ -0,0 +1,118 @@ +/** + * Migration registry. + * + * Adding a new schema migration takes three steps: + * + * 1. Pick the next free 3-digit prefix (`NNN`) — `git ls-files + * 'src/db/migrations/[0-9]*.ts'` shows what's taken. + * 2. Create `src/db/migrations/NNN-description.ts` + * exporting a `MIGRATION: MigrationModule` (just `description` + * and `up(db)`). + * 3. Add **one** import line and **one** array entry to this file. + * + * **Why filename-derived versions instead of a field?** Two PRs + * adding migrations independently used to collide on the + * `migrations[]` array AND the `CURRENT_SCHEMA_VERSION` const. + * With monolithic migrations.ts, "I claimed v4 / you claimed v4" + * resolved as "second PR's v4 silently no-ops" — a real bug class + * (PR #113's reviewer caught one). With filename-derived versions, + * two PRs both creating `004-foo.ts` produce a filesystem-level + * conflict the maintainer sees instantly. + * + * `CURRENT_SCHEMA_VERSION` is the max of all registered versions. + */ + +import type { Migration, MigrationModule } from './types'; + +import { MIGRATION as MIG_002 } from './002-project-metadata'; +import { MIGRATION as MIG_003 } from './003-lower-name-index'; +import { MIGRATION as MIG_004 } from './004-centrality-churn'; +import { MIGRATION as MIG_005 } from './005-symbol-issues'; +import { MIGRATION as MIG_006 } from './006-config-refs'; +import { MIGRATION as MIG_007 } from './007-sql-refs'; +import { MIGRATION as MIG_008 } from './008-edges-unique'; +import { MIGRATION as MIG_009 } from './009-fts-subwords-porter'; + +interface ModuleRef { + /** + * Source filename.
The 3-digit prefix is the source of truth for + * the version number — `validateRegistered` parses it. Keep this + * field in sync with the actual file on disk; the + * filesystem-cross-check test catches drift. + */ + filename: string; + module: MigrationModule; +} + +/** + * Static-import list of every migration. Two PRs adding + * migrations both add a single entry here; alphabetical ordering + * puts adjacent additions on different lines unless the version + * numbers themselves collide, in which case the filesystem + * collision on `NNN-*.ts` surfaces the conflict instantly. + */ +const REGISTERED_MODULES: readonly ModuleRef[] = [ + { filename: '002-project-metadata.ts', module: MIG_002 }, + { filename: '003-lower-name-index.ts', module: MIG_003 }, + { filename: '004-centrality-churn.ts', module: MIG_004 }, + { filename: '005-symbol-issues.ts', module: MIG_005 }, + { filename: '006-config-refs.ts', module: MIG_006 }, + { filename: '007-sql-refs.ts', module: MIG_007 }, + { filename: '008-edges-unique.ts', module: MIG_008 }, + { filename: '009-fts-subwords-porter.ts', module: MIG_009 }, +]; + +/** Strict 3-digit prefix on each migration filename. */ +const FILENAME_PATTERN = /^(\d{3})-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + +/** + * Validate the registered set: filenames match the strict + * `NNN-name.ts` shape, version is parsed from the prefix (no + * hand-typed version field that can drift), versions are unique, + * and the result is sorted ascending. Throws loudly at module + * load if any invariant is violated rather than silently dropping + * a migration during `runMigrations()`. + */ +function validateRegistered(refs: readonly ModuleRef[]): readonly Migration[] { + if (refs.length === 0) { + throw new Error('[CodeGraph] migrations registry is empty'); + } + const parsed = refs.map((r) => { + const m = FILENAME_PATTERN.exec(r.filename); + if (!m) { + throw new Error( + `[CodeGraph] migration filename "${r.filename}" does not match ` + + `expected pattern NNN-kebab-name.ts (3-digit prefix, lowercase kebab-case body)` + ); + } + const version = parseInt(m[1]!, 10); + return { + version, + filename: r.filename, + description: r.module.description, + up: r.module.up, + }; + }); + const sorted = [...parsed].sort((a, b) => a.version - b.version); + for (let i = 1; i < sorted.length; i++) { + if (sorted[i]!.version === sorted[i - 1]!.version) { + throw new Error( + `[CodeGraph] duplicate migration version ${sorted[i]!.version}: ` + + `${sorted[i - 1]!.filename} vs ${sorted[i]!.filename}` + ); + } + } + return sorted.map((r) => ({ + version: r.version, + description: r.description, + up: r.up, + })); +} + +export const ALL_MIGRATIONS: readonly Migration[] = validateRegistered(REGISTERED_MODULES); + +/** + * Highest registered migration version. Derived from the registry + * (no hand-maintained constant to keep in sync). + */ +export const CURRENT_SCHEMA_VERSION: number = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; diff --git a/src/db/migrations/types.ts b/src/db/migrations/types.ts new file mode 100644 index 00000000..479af672 --- /dev/null +++ b/src/db/migrations/types.ts @@ -0,0 +1,25 @@ +/** + * Migration registry types. + * + * Each migration ships its own self-contained file + * (`./NNN-description.ts`) exporting a `MIGRATION: + * MigrationModule`. 
The version number is derived from the + * leading 3-digit prefix on the filename, NOT from a field in the + * module — this guarantees no two PRs can claim the same version + * silently (filenames collide on the filesystem; SQL migrations + * never silently no-op). + */ + +import type { SqliteDatabase } from '../sqlite-adapter'; + +export interface MigrationModule { + /** One-line description for `schema_versions` table + diagnostics. */ + readonly description: string; + /** The actual schema-mutation function. Wrapped in a transaction. */ + readonly up: (db: SqliteDatabase) => void; +} + +export interface Migration extends MigrationModule { + /** Version derived from filename's leading NNN prefix. */ + readonly version: number; +} diff --git a/src/db/queries.ts b/src/db/queries.ts index 51f1a1ad..4a3edb90 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -17,8 +17,8 @@ import { SearchOptions, SearchResult, } from '../types'; -import { safeJsonParse } from '../utils'; -import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils'; +import { safeJsonParse, buildNameSubwords } from '../utils'; +import { kindBonus, nameMatchBonus, scorePathRelevance, filterStopwords, diversifyByFile } from '../search/query-utils'; /** * Database row types (snake_case from SQLite) @@ -44,6 +44,7 @@ interface NodeRow { decorators: string | null; type_parameters: string | null; updated_at: number; + centrality: number | null; } interface EdgeRow { @@ -66,6 +67,10 @@ interface FileRow { indexed_at: number; node_count: number; errors: string | null; + commit_count: number | null; + loc: number | null; + first_seen_ts: number | null; + last_touched_ts: number | null; } interface UnresolvedRefRow { @@ -105,6 +110,7 @@ function rowToNode(row: NodeRow): Node { decorators: row.decorators ? safeJsonParse(row.decorators, undefined) : undefined, typeParameters: row.type_parameters ? safeJsonParse(row.type_parameters, undefined) : undefined, updatedAt: row.updated_at, + centrality: row.centrality ?? undefined, }; } @@ -136,6 +142,10 @@ function rowToFileRecord(row: FileRow): FileRecord { indexedAt: row.indexed_at, nodeCount: row.node_count, errors: row.errors ? safeJsonParse(row.errors, undefined) : undefined, + commitCount: row.commit_count ?? 0, + loc: row.loc ?? 0, + firstSeenTs: row.first_seen_ts ?? null, + lastTouchedTs: row.last_touched_ts ?? null, }; } @@ -170,7 +180,6 @@ export class QueryBuilder { getFileByPath?: SqliteStatement; getAllFiles?: SqliteStatement; insertUnresolved?: SqliteStatement; - deleteUnresolvedByNode?: SqliteStatement; getUnresolvedByName?: SqliteStatement; getNodesByName?: SqliteStatement; getNodesByQualifiedNameExact?: SqliteStatement; @@ -185,6 +194,14 @@ export class QueryBuilder { this.db = db; } + /** + * Execute a callback inside a single SQLite transaction. Useful when a + * caller needs several `QueryBuilder` operations to commit atomically. 
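+ * + * Hypothetical usage (caller names illustrative, not from this PR): + * qb.transaction(() => { qb.insertNode(fnNode); qb.insertNode(classNode); }); + * commits both rows, or neither.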
+ */ + transaction<T>(fn: () => T): T { + return this.db.transaction(fn)(); + } + + // =========================================================================== + // Node Operations + // =========================================================================== @@ -200,13 +217,13 @@ export class QueryBuilder { start_line, end_line, start_column, end_column, docstring, signature, visibility, is_exported, is_async, is_static, is_abstract, - decorators, type_parameters, updated_at + decorators, type_parameters, updated_at, name_subwords ) VALUES ( @id, @kind, @name, @qualifiedName, @filePath, @language, @startLine, @endLine, @startColumn, @endColumn, @docstring, @signature, @visibility, @isExported, @isAsync, @isStatic, @isAbstract, - @decorators, @typeParameters, @updatedAt + @decorators, @typeParameters, @updatedAt, @nameSubwords ) `); } @@ -223,6 +240,12 @@ export class QueryBuilder { return; } + + // INSERT OR REPLACE may overwrite a node we have cached. Drop the + // stale entry so the next getNodeById sees the new row, not the old + // one (matches the cache-invalidation pattern used by updateNode and + // deleteNode below). + this.nodeCache.delete(node.id); + try { this.stmts.insertNode.run({ id: node.id, @@ -245,6 +268,7 @@ export class QueryBuilder { decorators: node.decorators ? JSON.stringify(node.decorators) : null, typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null, updatedAt: node.updatedAt ?? Date.now(), + nameSubwords: buildNameSubwords(node.name), }); } catch (error) { throw error; @@ -287,7 +311,8 @@ export class QueryBuilder { is_abstract = @isAbstract, decorators = @decorators, type_parameters = @typeParameters, - updated_at = @updatedAt + updated_at = @updatedAt, + name_subwords = @nameSubwords WHERE id = @id `); } @@ -322,6 +347,7 @@ export class QueryBuilder { decorators: node.decorators ? JSON.stringify(node.decorators) : null, typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null, updatedAt: node.updatedAt ?? Date.now(), + nameSubwords: buildNameSubwords(node.name), }); } @@ -379,6 +405,59 @@ export class QueryBuilder { return node; } + + /** + * Batch lookup: fetch many nodes by ID in a single SQL round-trip. + * + * Replaces the N+1 pattern in graph traversal where every edge would + * trigger its own `getNodeById` call. For a function with 50 callers + * this collapses 50 point reads into one IN-list query (~10-50x + * faster end-to-end). + * + * Returns a Map keyed by id so callers can preserve their own ordering + * (typically the order edges were returned from the graph). Missing IDs + * are simply absent from the map. + * + * Cache-aware: ids already in the LRU cache are served from memory and + * the SQL query only touches the misses. + */ + getNodesByIds(ids: readonly string[]): Map<string, Node> { + const out = new Map<string, Node>(); + if (ids.length === 0) return out; + + // Serve cache hits first; build the miss list for SQL. + const misses: string[] = []; + for (const id of ids) { + const cached = this.nodeCache.get(id); + if (cached !== undefined) { + // LRU touch + this.nodeCache.delete(id); + this.nodeCache.set(id, cached); + out.set(id, cached); + } else { + misses.push(id); + } + } + if (misses.length === 0) return out; + + // Chunk under SQLite's parameter limit (default 999, raised to 32766 + // in better-sqlite3 builds — chunk at 500 for safety across both + // backends and to keep the query plan simple).
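+ // e.g. 1,200 cache misses become three IN-list queries: 500 + 500 + 200 + // placeholders, each comfortably under either backend's limit.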
+ const CHUNK = 500; + for (let i = 0; i < misses.length; i += CHUNK) { + const chunk = misses.slice(i, i + CHUNK); + const placeholders = chunk.map(() => '?').join(','); + const rows = this.db + .prepare(`SELECT * FROM nodes WHERE id IN (${placeholders})`) + .all(...chunk) as NodeRow[]; + for (const row of rows) { + const node = rowToNode(row); + out.set(node.id, node); + this.cacheNode(node); + } + } + return out; + } + /** * Add a node to the cache, evicting oldest if needed */ @@ -478,7 +557,13 @@ export class QueryBuilder { * 3. Score results based on match quality */ searchNodes(query: string, options: SearchOptions = {}): SearchResult[] { - const { kinds, languages, limit = 100, offset = 0 } = options; + const { kinds, languages, limit = 100, offset = 0, perFileCap = 3 } = options; + + // Note on over-fetching: searchNodesFTS already over-fetches by 5x + // internally (Math.max(limit*5, 100)) so its own rescoring pass has + // headroom. That same headroom feeds the per-file diversification + // below — no additional outer multiplier needed. Keeping this comment + // here so future readers don't reintroduce a multiplier-on-multiplier. // First try FTS5 with prefix matching let results = this.searchNodesFTS(query, { kinds, languages, limit, offset }); @@ -530,10 +615,23 @@ export class QueryBuilder { + nameMatchBonus(r.node.name, query), })); results.sort((a, b) => b.score - a.score); - // Trim to requested limit after rescoring - if (results.length > limit) { - results = results.slice(0, limit); - } + } + + // Diversification: cap per-file results so the top-K isn't dominated + // by the methods of a single class. Top-scoring hit per file is always + // included; the cap only kicks in for the second-and-onward members + // of the same file. perFileCap=0 disables. + // + // Guard `results.length > limit`: when results <= limit there's + // nothing to drop, so the existing score order is already what the + // caller will see. (`diversifyByFile` is also safe to call here and + // would reorder within the same set, but the existing rescore order + // is already meaningful and we don't want to perturb it without + // benefit.) + if (perFileCap > 0 && results.length > limit) { + results = diversifyByFile(results, limit, perFileCap); + } else if (results.length > limit) { + results = results.slice(0, limit); } return results; @@ -545,30 +643,38 @@ export class QueryBuilder { private searchNodesFTS(query: string, options: SearchOptions): SearchResult[] { const { kinds, languages, limit = 100, offset = 0 } = options; - // Add prefix wildcard for better matching (e.g., "auth" matches "AuthService", "authenticate") - // Escape special FTS5 characters and add prefix wildcard - const ftsQuery = query - .replace(/['"*():^]/g, '') // Remove FTS5 special chars + // Build the FTS query in three steps: + // 1. Strip characters with special meaning to FTS5 and split on whitespace. + // 2. Drop FTS5 boolean operators (AND/OR/NOT/NEAR) — prevents user input + // from injecting boolean structure into the OR-join below. + // 3. Drop English stopwords for natural-language queries — words like + // "how" / "the" otherwise become OR'd hits against any prose-bearing + // docstring and crowd out the actually-relevant identifier tokens. 
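+ // Worked example, assuming filterStopwords drops the "how"/"the" named above: + // "how the parser works" tokenises to ["how", "the", "parser", "works"], the + // stopword pass leaves ["parser", "works"], and the final ftsQuery is + // '"parser"* OR "works"*'.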
+ const rawTerms = query + .replace(/['"*():^]/g, '') .split(/\s+/) - .filter(term => term.length > 0) - // Strip FTS5 boolean operators to prevent query manipulation - .filter(term => !/^(AND|OR|NOT|NEAR)$/i.test(term)) - .map(term => `"${term}"*`) // Prefix match each term + .filter((term) => term.length > 0) + .filter((term) => !/^(AND|OR|NOT|NEAR)$/i.test(term)); + + const filteredTerms = filterStopwords(rawTerms); + + const ftsQuery = filteredTerms + .map((term) => `"${term}"*`) // Prefix match each term .join(' OR '); if (!ftsQuery) { return []; } - // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1, signature=2 - // Heavy name weight ensures exact/prefix name matches rank above incidental - // mentions in long docstrings or qualified names of nested symbols. - // Fetch 5x requested limit so post-hoc rescoring (kindBonus, pathRelevance, - // nameMatchBonus) can promote results that BM25 alone undervalues. + // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1, + // signature=2, name_subwords=10. Heavy name weight keeps exact and prefix + // name matches above incidental mentions in long docstrings; the new + // name_subwords column at 10× lets queries hit subword tokens like + // `parser` against `getParser` without burying full-name matches. const ftsLimit = Math.max(limit * 5, 100); let sql = ` - SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2) as score + SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2, 10) as score FROM nodes_fts JOIN nodes ON nodes_fts.id = nodes.id WHERE nodes_fts MATCH ? @@ -916,7 +1022,12 @@ export class QueryBuilder { // =========================================================================== /** - * Insert or update a file record + * Insert or update a file record. + * + * Churn columns (commit_count, loc, first_seen_ts, last_touched_ts) + * are deliberately omitted from the ON CONFLICT update list — they + * are managed exclusively by `applyChurnDeltas` / `applyLocUpdates`. + * Adding them here would clobber mined git history on every re-index. */ upsertFile(file: FileRecord): void { if (!this.stmts.upsertFile) { @@ -1032,17 +1143,8 @@ export class QueryBuilder { insert(); } - /** - * Delete unresolved references from a node - */ - deleteUnresolvedByNode(nodeId: string): void { - if (!this.stmts.deleteUnresolvedByNode) { - this.stmts.deleteUnresolvedByNode = this.db.prepare( - 'DELETE FROM unresolved_refs WHERE from_node_id = ?' - ); - } - this.stmts.deleteUnresolvedByNode.run(nodeId); - } + // (deleteUnresolvedByNode removed — never called; FK cascade on + // nodes(id) → unresolved_refs.from_node_id handles cleanup automatically.) /** * Get unresolved references by name (for resolution) @@ -1295,4 +1397,526 @@ export class QueryBuilder { this.db.exec('DELETE FROM files'); })(); } + + // =========================================================================== + // Centrality (PageRank scores on nodes) + // =========================================================================== + + /** + * Apply PageRank scores to the nodes table in a single transaction. + * Existing scores for ids not in the map are NOT cleared — call + * `clearCentrality()` first for a from-scratch recompute. + */ + applyCentralityScores(scores: Map): void { + if (scores.size === 0) return; + const stmt = this.db.prepare('UPDATE nodes SET centrality = ? WHERE id = ?'); + this.db.transaction(() => { + for (const [id, score] of scores) { + stmt.run(score, id); + } + })(); + // Cached node objects now have stale centrality. 
Drop the cache; + // subsequent reads pull the fresh value. + this.nodeCache.clear(); + } + + /** Reset all centrality values to NULL (fresh-recompute path). */ + clearCentrality(): void { + this.db.exec('UPDATE nodes SET centrality = NULL'); + this.nodeCache.clear(); + } + + /** + * Get top-N nodes by centrality, descending. Filters out NULL + * centrality (= not yet computed). Optional `kind` filter narrows + * to one node kind; optional `minCentrality` filters out the long + * tail of essentially-zero ranks. + */ + getTopNodesByCentrality(opts: { + limit?: number; + kind?: NodeKind; + minCentrality?: number; + } = {}): Node[] { + const limit = opts.limit ?? 25; + const minCentrality = opts.minCentrality ?? 0; + const where: string[] = ['centrality IS NOT NULL', 'centrality >= ?']; + const params: (string | number)[] = [minCentrality]; + if (opts.kind) { + where.push('kind = ?'); + params.push(opts.kind); + } + const sql = `SELECT * FROM nodes WHERE ${where.join(' AND ')} + ORDER BY centrality DESC LIMIT ?`; + params.push(limit); + const rows = this.db.prepare(sql).all(...params) as NodeRow[]; + return rows.map(rowToNode); + } + + /** + * Compute the rank (1-based) of a single node by centrality. + * Returns null if the node has no centrality yet. + */ + getCentralityRank(nodeId: string): { rank: number; total: number } | null { + const row = this.db + .prepare('SELECT centrality FROM nodes WHERE id = ?') + .get(nodeId) as { centrality: number | null } | undefined; + if (!row || row.centrality === null) return null; + const above = this.db + .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality > ?') + .get(row.centrality) as { c: number }; + const total = this.db + .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality IS NOT NULL') + .get() as { c: number }; + return { rank: above.c + 1, total: total.c }; + } + + // =========================================================================== + // Per-file churn (mined from git log) + // =========================================================================== + + /** + * Apply churn deltas to the files table. For each delta: + * commit_count += commitCountDelta + * last_touched_ts = MAX(existing, lastTouchedTs) + * first_seen_ts = COALESCE(existing, firstSeenTs) // sticky + * + * Files in the delta map but not in the files table (uncommon — + * they'd have to be mined-but-never-indexed) are silently skipped. + */ + applyChurnDeltas( + deltas: Iterable<{ + path: string; + commitCountDelta: number; + lastTouchedTs: number; + firstSeenTs: number; + }> + ): void { + const stmt = this.db.prepare( + `UPDATE files + SET commit_count = commit_count + ?, + last_touched_ts = MAX(COALESCE(last_touched_ts, 0), ?), + first_seen_ts = COALESCE(first_seen_ts, ?) + WHERE path = ?` + ); + this.db.transaction(() => { + for (const d of deltas) { + stmt.run(d.commitCountDelta, d.lastTouchedTs, d.firstSeenTs, d.path); + } + })(); + } + + /** Reset all churn columns; used before a full re-mine. Does not touch `loc`. */ + clearChurn(): void { + this.db.exec( + `UPDATE files SET commit_count = 0, last_touched_ts = NULL, first_seen_ts = NULL` + ); + } + + /** Update the on-disk LOC for a single file. Cheap; called per changed file. */ + updateFileLoc(filePath: string, loc: number): void { + this.db.prepare('UPDATE files SET loc = ? WHERE path = ?').run(loc, filePath); + } + + /** Bulk LOC update — used during indexAll to refresh LOC for every indexed file. 
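A hypothetical call site (readFileLoc is the real helper exported from src/churn.ts; the wiring shown is illustrative): applyLocUpdates(paths.map((p) => ({ path: p, loc: readFileLoc(p) }))).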
*/ + applyLocUpdates(entries: Iterable<{ path: string; loc: number }>): void { + const stmt = this.db.prepare('UPDATE files SET loc = ? WHERE path = ?'); + this.db.transaction(() => { + for (const e of entries) stmt.run(e.loc, e.path); + })(); + } + + getTopFilesByChurn(opts: { limit?: number; minCommits?: number } = {}): FileRecord[] { + const limit = opts.limit ?? 25; + const minCommits = opts.minCommits ?? 1; + const rows = this.db + .prepare( + `SELECT * FROM files WHERE commit_count >= ? + ORDER BY commit_count DESC LIMIT ?` + ) + .all(minCommits, limit) as FileRow[]; + return rows.map(rowToFileRecord); + } + + /** + * Hotspots: files ranked by `risk = (Σ centrality of nodes in file) × commit_count`. + * + * Both inputs are optional in their own right; with neither computed, + * this returns []. Sorting modes: + * - 'risk' : the combined score (default; what "hotspot" means) + * - 'centrality' : pure structural importance + * - 'churn' : pure change frequency + */ + getHotspots(opts: { + limit?: number; + minCommits?: number; + minCentrality?: number; + sortBy?: 'risk' | 'centrality' | 'churn'; + } = {}): Array<{ + filePath: string; + fileCentrality: number; + commitCount: number; + loc: number; + lastTouchedTs: number | null; + riskScore: number; + }> { + const limit = opts.limit ?? 15; + const minCommits = opts.minCommits ?? 0; + const minCentrality = opts.minCentrality ?? 0; + const sortBy = opts.sortBy ?? 'risk'; + + const orderBy = + sortBy === 'centrality' + ? 'fileCentrality DESC' + : sortBy === 'churn' + ? 'commitCount DESC' + : 'riskScore DESC'; + + // Aggregate centrality at file level. LEFT JOIN so files without any + // indexed nodes (rare — schema-only files) still surface if they have churn. + const sql = ` + SELECT + f.path AS filePath, + COALESCE(n_agg.fc, 0.0) AS fileCentrality, + f.commit_count AS commitCount, + f.loc AS loc, + f.last_touched_ts AS lastTouchedTs, + COALESCE(n_agg.fc, 0.0) * f.commit_count AS riskScore + FROM files f + LEFT JOIN ( + SELECT file_path, SUM(centrality) AS fc + FROM nodes WHERE centrality IS NOT NULL + GROUP BY file_path + ) n_agg ON n_agg.file_path = f.path + WHERE f.commit_count >= ? AND COALESCE(n_agg.fc, 0.0) >= ? + ORDER BY ${orderBy} + LIMIT ? + `; + const rows = this.db.prepare(sql).all(minCommits, minCentrality, limit) as Array<{ + filePath: string; + fileCentrality: number; + commitCount: number; + loc: number; + lastTouchedTs: number | null; + riskScore: number; + }>; + return rows; + } + + // =========================================================================== + // Symbol-issue attributions (mined from git history) + // =========================================================================== + + applyIssueAttributions( + rows: Iterable<{ + nodeId: string; + issueNumber: number; + commitSha: string; + kind: 'modified' | 'added' | 'removed'; + }> + ): void { + const stmt = this.db.prepare( + `INSERT OR IGNORE INTO symbol_issues (node_id, issue_number, commit_sha, kind) + VALUES (?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const r of rows) { + stmt.run(r.nodeId, r.issueNumber, r.commitSha, r.kind); + } + })(); + } + + clearIssueAttributions(): void { + this.db.exec('DELETE FROM symbol_issues'); + } + + getIssuesForNode(nodeId: string): Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> { + return this.db + .prepare( + `SELECT issue_number AS issueNumber, kind, commit_sha AS commitSha + FROM symbol_issues + WHERE node_id = ? 
+ ORDER BY issue_number ASC, kind ASC` + ) + .all(nodeId) as Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }>; + } + + getNodesForIssue(issueNumber: number): Array<{ + nodeId: string; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> { + return this.db + .prepare( + `SELECT node_id AS nodeId, kind, commit_sha AS commitSha + FROM symbol_issues + WHERE issue_number = ? + ORDER BY node_id ASC` + ) + .all(issueNumber) as Array<{ + nodeId: string; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }>; + } + + // =========================================================================== + // Config references (env vars / feature flags read sites) + // =========================================================================== + + applyConfigRefs( + rows: Array<{ + configKind: 'env'; + configKey: string; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + if (rows.length === 0) return; + const distinctFiles = new Set(rows.map((r) => r.filePath)); + const deleteStmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?'); + const insertStmt = this.db.prepare( + `INSERT INTO config_refs (config_kind, config_key, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const f of distinctFiles) deleteStmt.run(f); + for (const r of rows) { + insertStmt.run(r.configKind, r.configKey, r.sourceNodeId, r.filePath, r.line); + } + })(); + } + + clearConfigRefs(): void { + this.db.exec('DELETE FROM config_refs'); + } + + deleteConfigRefsForPaths(filePaths: Iterable): void { + const stmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?'); + this.db.transaction(() => { + for (const p of filePaths) stmt.run(p); + })(); + } + + pruneOrphanedConfigRefs(): void { + this.db.exec( + `DELETE FROM config_refs WHERE file_path NOT IN (SELECT path FROM files)` + ); + } + + getConfigKeys(opts: { configKind?: 'env'; limit?: number } = {}): Array<{ + configKey: string; + reads: number; + distinctFiles: number; + }> { + const limit = opts.limit ?? 200; + const where = opts.configKind ? 'WHERE config_kind = ?' : ''; + const params = opts.configKind ? [opts.configKind, limit] : [limit]; + return this.db + .prepare( + `SELECT config_key AS configKey, + COUNT(*) AS reads, + COUNT(DISTINCT file_path) AS distinctFiles + FROM config_refs + ${where} + GROUP BY config_key + ORDER BY reads DESC, config_key ASC + LIMIT ?` + ) + .all(...params) as Array<{ configKey: string; reads: number; distinctFiles: number }>; + } + + getConfigRefsByKey( + configKey: string, + opts: { configKind?: 'env' } = {} + ): Array<{ + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }> { + const kind = opts.configKind ?? 'env'; + return this.db + .prepare( + `SELECT cr.file_path AS filePath, + cr.line AS line, + cr.source_node_id AS sourceNodeId, + n.name AS sourceName, + n.kind AS sourceKind + FROM config_refs cr + LEFT JOIN nodes n ON n.id = cr.source_node_id + WHERE cr.config_kind = ? AND cr.config_key = ? 
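+ -- (Rows with a NULL source_node_id, i.e. top-level reads outside any + -- function, still appear: the LEFT JOIN just yields NULL sourceName/sourceKind.)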
+ ORDER BY cr.file_path ASC, cr.line ASC` + ) + .all(kind, configKey) as Array<{ + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }>; + } + + getConfigKeysForNode(nodeId: string): Array<{ configKey: string; line: number }> { + return this.db + .prepare( + `SELECT config_key AS configKey, line + FROM config_refs + WHERE source_node_id = ? + ORDER BY config_key ASC, line ASC` + ) + .all(nodeId) as Array<{ configKey: string; line: number }>; + } + + // =========================================================================== + // SQL references (table-name string-literal refs from app code) + // =========================================================================== + + applySqlRefs( + rows: Array<{ + tableName: string; + op: 'read' | 'write' | 'ddl'; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + if (rows.length === 0) return; + const stmt = this.db.prepare( + `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const r of rows) { + stmt.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line); + } + })(); + } + + replaceAllSqlRefs( + rows: Array<{ + tableName: string; + op: 'read' | 'write' | 'ddl'; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + const insert = this.db.prepare( + `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + this.db.exec('DELETE FROM sql_refs'); + for (const r of rows) { + insert.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line); + } + })(); + } + + deleteSqlRefsForPaths(filePaths: Iterable): void { + const stmt = this.db.prepare('DELETE FROM sql_refs WHERE file_path = ?'); + this.db.transaction(() => { + for (const p of filePaths) stmt.run(p); + })(); + } + + clearSqlRefs(): void { + this.db.exec('DELETE FROM sql_refs'); + } + + pruneOrphanedSqlRefs(): void { + this.db.exec( + `DELETE FROM sql_refs WHERE file_path NOT IN (SELECT path FROM files)` + ); + } + + getSqlTables(opts: { limit?: number } = {}): Array<{ + tableName: string; + reads: number; + writes: number; + ddl: number; + total: number; + }> { + const limit = opts.limit ?? 
100; + return this.db + .prepare( + `SELECT lower(table_name) AS tableName, + SUM(CASE WHEN op = 'read' THEN 1 ELSE 0 END) AS reads, + SUM(CASE WHEN op = 'write' THEN 1 ELSE 0 END) AS writes, + SUM(CASE WHEN op = 'ddl' THEN 1 ELSE 0 END) AS ddl, + COUNT(*) AS total + FROM sql_refs + GROUP BY lower(table_name) + ORDER BY total DESC, tableName ASC + LIMIT ?` + ) + .all(limit) as Array<{ + tableName: string; + reads: number; + writes: number; + ddl: number; + total: number; + }>; + } + + getSqlRefsByTable( + tableName: string, + opts: { op?: 'read' | 'write' | 'ddl' } = {} + ): Array<{ + op: 'read' | 'write' | 'ddl'; + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }> { + const params: Array = [tableName.toLowerCase()]; + let opFilter = ''; + if (opts.op) { + opFilter = ' AND sr.op = ?'; + params.push(opts.op); + } + return this.db + .prepare( + `SELECT sr.op AS op, + sr.file_path AS filePath, + sr.line AS line, + sr.source_node_id AS sourceNodeId, + n.name AS sourceName, + n.kind AS sourceKind + FROM sql_refs sr + LEFT JOIN nodes n ON n.id = sr.source_node_id + WHERE lower(sr.table_name) = ?${opFilter} + ORDER BY sr.file_path ASC, sr.line ASC` + ) + .all(...params) as Array<{ + op: 'read' | 'write' | 'ddl'; + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }>; + } + + getSqlTablesForNode(nodeId: string): Array<{ tableName: string; op: string }> { + return this.db + .prepare( + `SELECT DISTINCT lower(table_name) AS tableName, op + FROM sql_refs + WHERE source_node_id = ? + ORDER BY tableName ASC, op ASC` + ) + .all(nodeId) as Array<{ tableName: string; op: string }>; + } } diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..be75f5de 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -37,7 +37,13 @@ CREATE TABLE IF NOT EXISTS nodes ( is_abstract INTEGER DEFAULT 0, decorators TEXT, -- JSON array type_parameters TEXT, -- JSON array - updated_at INTEGER NOT NULL + updated_at INTEGER NOT NULL, + centrality REAL DEFAULT NULL, -- PageRank over calls+references; NULL until first compute + -- Camel/snake-split tokens of `name`, joined by spaces. The default + -- FTS5 tokenizer indexes each as a separate term, so a query for + -- `parser` finds `getParser` etc. Populated by buildNameSubwords() + -- in src/utils.ts on every insert/update. 
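+ -- e.g. name 'getUserById' gets subwords 'get user by id', and 'parse_config' + -- gets 'parse config' (illustrative values, not mined from a real repo).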
+ name_subwords TEXT ); -- Edges: Relationships between nodes @@ -63,7 +69,12 @@ CREATE TABLE IF NOT EXISTS files ( modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL, node_count INTEGER DEFAULT 0, - errors TEXT -- JSON array + errors TEXT, -- JSON array + -- Churn signals (mined from git log) + commit_count INTEGER NOT NULL DEFAULT 0, + loc INTEGER NOT NULL DEFAULT 0, + first_seen_ts INTEGER DEFAULT NULL, -- unix seconds + last_touched_ts INTEGER DEFAULT NULL -- unix seconds ); -- Unresolved References: References that need resolution after full indexing @@ -92,34 +103,42 @@ CREATE INDEX IF NOT EXISTS idx_nodes_file_path ON nodes(file_path); CREATE INDEX IF NOT EXISTS idx_nodes_language ON nodes(language); CREATE INDEX IF NOT EXISTS idx_nodes_file_line ON nodes(file_path, start_line); CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); +CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC); -- Full-text search index on node names, docstrings, and signatures +-- The Porter stemmer collapses morphological variants so a query for +-- `parsing` matches a docstring or subword containing `parser`/`parse`. +-- This is the largest single quality lift for natural-language queries +-- (verified empirically: targets that ranked #18-#19 or weren't in the +-- top 20 jump to the top 5 — see __tests__/search-quality.test.ts). CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5( id, name, qualified_name, docstring, signature, + name_subwords, content='nodes', - content_rowid='rowid' + content_rowid='rowid', + tokenize="porter unicode61" ); -- Triggers to keep FTS index in sync CREATE TRIGGER IF NOT EXISTS nodes_ai AFTER INSERT ON nodes BEGIN - INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature) - VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature); + INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords); END; CREATE TRIGGER IF NOT EXISTS nodes_ad AFTER DELETE ON nodes BEGIN - INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature) - VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature); + INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords); END; CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN - INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature) - VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature); - INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature) - VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature); + INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords); + INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords) + VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords); END; -- Edge indexes @@ -129,9 +148,20 @@ CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind); CREATE INDEX IF NOT EXISTS 
idx_edges_source_kind ON edges(source, kind); CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind); +-- Uniqueness for (source, target, kind, line, col). The id column is an +-- AUTOINCREMENT primary key, so without this index `INSERT OR IGNORE` +-- would never see a conflict — duplicate edges would silently accumulate +-- on every re-resolution / re-emission. COALESCE keeps two NULL line/col +-- values comparable as equal (SQLite treats raw NULLs in a UNIQUE index +-- as distinct). +CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + -- File indexes CREATE INDEX IF NOT EXISTS idx_files_language ON files(language); CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at); +CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC); +CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC); -- Unresolved refs indexes CREATE INDEX IF NOT EXISTS idx_unresolved_from_node ON unresolved_refs(from_node_id); @@ -146,3 +176,61 @@ CREATE TABLE IF NOT EXISTS project_metadata ( value TEXT NOT NULL, updated_at INTEGER NOT NULL ); + +-- Issue → symbol attribution mined from git history. +-- One row per (node, issue, commit, kind) tuple; kind is 'modified' +-- (enclosing function changed by hunk), 'added' (declaration on a + +-- line), or 'removed' (declaration on a - line, dropped at lookup +-- time when no current node matches). +CREATE TABLE IF NOT EXISTS symbol_issues ( + node_id TEXT NOT NULL, + issue_number INTEGER NOT NULL, + commit_sha TEXT NOT NULL, + kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')), + PRIMARY KEY (node_id, issue_number, commit_sha, kind), + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id); +CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number); + +-- Config references: read sites for env vars / feature flags / etc. +-- One row per syntactic occurrence in source. config_kind narrows to +-- 'env' (process.env, os.getenv, ...) for v1; future kinds add YAML +-- keys, LaunchDarkly flags, etc. source_node_id may be NULL for +-- top-level reads that aren't inside a function/method. +CREATE TABLE IF NOT EXISTS config_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_kind TEXT NOT NULL, + config_key TEXT NOT NULL, + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_config_refs_key + ON config_refs(config_kind, config_key); +CREATE INDEX IF NOT EXISTS idx_config_refs_node + ON config_refs(source_node_id); +CREATE INDEX IF NOT EXISTS idx_config_refs_file + ON config_refs(file_path); + +-- SQL references: per-call-site links from app code to a table name. +-- One row per syntactic occurrence in source. op is 'read' (SELECT, +-- FROM in non-DDL), 'write' (INSERT/UPDATE/DELETE), or 'ddl' +-- (CREATE TABLE / ALTER TABLE / DROP TABLE -- rare in app code but +-- catches migration scripts). 
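+-- e.g. a db.prepare('SELECT * FROM nodes WHERE id = ?') call site would yield +-- one row here: (table_name='nodes', op='read') plus its file/line (illustrative).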
+CREATE TABLE IF NOT EXISTS sql_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + table_name TEXT NOT NULL, + op TEXT NOT NULL CHECK (op IN ('read','write','ddl')), + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_sql_refs_table + ON sql_refs(lower(table_name)); +CREATE INDEX IF NOT EXISTS idx_sql_refs_node + ON sql_refs(source_node_id); +CREATE INDEX IF NOT EXISTS idx_sql_refs_file + ON sql_refs(file_path); diff --git a/src/default-config.ts b/src/default-config.ts new file mode 100644 index 00000000..34769609 --- /dev/null +++ b/src/default-config.ts @@ -0,0 +1,199 @@ +/** + * Default project configuration. + * + * Lives in its own file (separate from `types.ts`) because the + * `include` glob list is derived from the language registry — and + * the registry transitively imports `types.ts` via per-language + * files, which would create an evaluation cycle if `default-config` + * were itself imported by `types.ts` eagerly. + * + * **Lazy include resolution.** The `include` array is built on + * first access via a property getter, not at module load. By the + * time anything reads `DEFAULT_CONFIG.include`, the registry has + * fully evaluated, so all language definitions are available. + */ + +import type { CodeGraphConfig } from './types'; +import { getLanguageDefs } from './extraction/languages/registry'; + +let _includeCache: string[] | null = null; +function buildIncludeGlobs(): string[] { + if (_includeCache) return _includeCache; + const seen = new Set(); + const out: string[] = []; + for (const def of getLanguageDefs()) { + for (const glob of def.includeGlobs) { + if (seen.has(glob)) continue; + seen.add(glob); + out.push(glob); + } + } + _includeCache = out; + return out; +} + +const baseConfig: CodeGraphConfig = { + version: 1, + rootDir: '.', + include: [], // populated lazily via the getter below + exclude: [ + // Version control + '**/.git/**', + + // Dependencies + '**/node_modules/**', + '**/vendor/**', + '**/Pods/**', + + // Generic build outputs + '**/dist/**', + '**/build/**', + '**/out/**', + '**/bin/**', + '**/obj/**', + '**/target/**', + + // JavaScript/TypeScript + '**/*.min.js', + '**/*.bundle.js', + '**/.next/**', + '**/.nuxt/**', + '**/.svelte-kit/**', + '**/.output/**', + '**/.turbo/**', + '**/.cache/**', + '**/.parcel-cache/**', + '**/.vite/**', + '**/.astro/**', + '**/.docusaurus/**', + '**/.gatsby/**', + '**/.webpack/**', + '**/.nx/**', + '**/.yarn/cache/**', + '**/.pnpm-store/**', + '**/storybook-static/**', + + // React Native / Expo + '**/.expo/**', + '**/web-build/**', + '**/ios/Pods/**', + '**/ios/build/**', + '**/android/build/**', + '**/android/.gradle/**', + + // Python + '**/__pycache__/**', + '**/.venv/**', + '**/venv/**', + '**/site-packages/**', + '**/dist-packages/**', + '**/.pytest_cache/**', + '**/.mypy_cache/**', + '**/.ruff_cache/**', + '**/.tox/**', + '**/.nox/**', + '**/*.egg-info/**', + '**/.eggs/**', + + // Go + '**/go/pkg/mod/**', + + // Rust + '**/target/debug/**', + '**/target/release/**', + + // Java/Kotlin/Gradle + '**/.gradle/**', + '**/.m2/**', + '**/generated-sources/**', + '**/.kotlin/**', + + // Dart/Flutter + '**/.dart_tool/**', + + // C#/.NET + '**/.vs/**', + '**/.nuget/**', + '**/artifacts/**', + '**/publish/**', + + // C/C++ + '**/cmake-build-*/**', + '**/CMakeFiles/**', + '**/bazel-*/**', + '**/vcpkg_installed/**', + '**/.conan/**', + '**/Debug/**', + '**/Release/**', + '**/x64/**', + '**/.pio/**', 
// Platform.io (IoT/embedded build artifacts and library deps) + + // Electron + '**/release/**', + '**/*.app/**', + '**/*.asar', + + // Swift/iOS/Xcode + '**/DerivedData/**', + '**/.build/**', + '**/.swiftpm/**', + '**/xcuserdata/**', + '**/Carthage/Build/**', + '**/SourcePackages/**', + + // Delphi/Pascal + '**/__history/**', + '**/__recovery/**', + '**/*.dcu', + + // PHP + '**/.composer/**', + '**/storage/framework/**', + '**/bootstrap/cache/**', + + // Ruby + '**/.bundle/**', + '**/tmp/cache/**', + '**/public/assets/**', + '**/public/packs/**', + '**/.yardoc/**', + + // Testing/Coverage + '**/coverage/**', + '**/htmlcov/**', + '**/.nyc_output/**', + '**/test-results/**', + '**/.coverage/**', + + // IDE/Editor + '**/.idea/**', + + // Logs and temp + '**/logs/**', + '**/tmp/**', + '**/temp/**', + + // Documentation build output + '**/_build/**', + '**/docs/_build/**', + '**/site/**', + ], + languages: [], + frameworks: [], + maxFileSize: 1024 * 1024, // 1MB + extractDocstrings: true, + trackCallSites: true, + enableCentrality: true, + enableChurn: true, + enableIssueHistory: true, + enableConfigRefs: true, + enableSqlRefs: true, +}; + +Object.defineProperty(baseConfig, 'include', { + get: () => buildIncludeGlobs(), + enumerable: true, + configurable: true, +}); + +export const DEFAULT_CONFIG: CodeGraphConfig = baseConfig; diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index df264fb3..5c2aec09 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -4,77 +4,63 @@ * Uses web-tree-sitter (WASM) for universal cross-platform support. * Grammars are loaded lazily — only languages actually present in the project * are compiled, keeping V8 WASM memory pressure low on large codebases. + * + * As of the language-registry refactor, all per-language metadata + * (WASM filenames, file extensions, display names, vendored flag) + * lives in `./languages/.ts` and is auto-collected by + * `./languages/registry.ts`. The constants exported here + * (`EXTENSION_MAP`, `getSupportedLanguages`, `getLanguageDisplayName`) + * remain for backward compat but are derived from the registry. */ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; +import { getLanguageDefs, getLanguageDefByExtension, getLanguageDefByName } from './languages/registry'; export type GrammarLanguage = Exclude; /** - * WASM filename map — maps each language to its .wasm grammar file - * in the tree-sitter-wasms package. + * File extension → Language mapping, computed lazily on first read. + * + * Cannot be a top-level IIFE: the registry transitively pulls in + * `tree-sitter.ts` (via custom-extractor language defs), which + * imports this file — building the map at module load would TDZ + * against `ALL_DEFS` in the registry. Use the `getExtensionMap()` + * function for an explicit lazy entry point, or read + * `EXTENSION_MAP` (a Proxy that materialises on first property + * access). 
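+ * In other words `EXTENSION_MAP['.ts']` and `getExtensionMap()['.ts']` are + * equivalent; the Proxy's `get` trap simply forwards to the lazy builder.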
*/ -const WASM_GRAMMAR_FILES: Record = { - typescript: 'tree-sitter-typescript.wasm', - tsx: 'tree-sitter-tsx.wasm', - javascript: 'tree-sitter-javascript.wasm', - jsx: 'tree-sitter-javascript.wasm', - python: 'tree-sitter-python.wasm', - go: 'tree-sitter-go.wasm', - rust: 'tree-sitter-rust.wasm', - java: 'tree-sitter-java.wasm', - c: 'tree-sitter-c.wasm', - cpp: 'tree-sitter-cpp.wasm', - csharp: 'tree-sitter-c_sharp.wasm', - php: 'tree-sitter-php.wasm', - ruby: 'tree-sitter-ruby.wasm', - swift: 'tree-sitter-swift.wasm', - kotlin: 'tree-sitter-kotlin.wasm', - dart: 'tree-sitter-dart.wasm', - pascal: 'tree-sitter-pascal.wasm', -}; +let _extensionMapCache: Record | null = null; +export function getExtensionMap(): Record { + if (_extensionMapCache) return _extensionMapCache; + const out: Record = {}; + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + out[ext.toLowerCase()] = def.name as Language; + } + } + _extensionMapCache = out; + return out; +} /** - * File extension to Language mapping + * Backward-compat: a Proxy that lazy-builds the extension map on + * first property access. Existing callers can keep doing + * `EXTENSION_MAP['.ts']` without changes. */ -export const EXTENSION_MAP: Record = { - '.ts': 'typescript', - '.tsx': 'tsx', - '.js': 'javascript', - '.mjs': 'javascript', - '.cjs': 'javascript', - '.jsx': 'jsx', - '.py': 'python', - '.pyw': 'python', - '.go': 'go', - '.rs': 'rust', - '.java': 'java', - '.c': 'c', - '.h': 'c', // Could also be C++, defaulting to C - '.cpp': 'cpp', - '.cc': 'cpp', - '.cxx': 'cpp', - '.hpp': 'cpp', - '.hxx': 'cpp', - '.cs': 'csharp', - '.php': 'php', - '.rb': 'ruby', - '.rake': 'ruby', - '.swift': 'swift', - '.kt': 'kotlin', - '.kts': 'kotlin', - '.dart': 'dart', - '.liquid': 'liquid', - '.svelte': 'svelte', - '.pas': 'pascal', - '.dpr': 'pascal', - '.dpk': 'pascal', - '.lpr': 'pascal', - '.dfm': 'pascal', - '.fmx': 'pascal', -}; +export const EXTENSION_MAP: Record = new Proxy({} as Record, { + get(_t, key: string) { return getExtensionMap()[key]; }, + has(_t, key: string) { return key in getExtensionMap(); }, + ownKeys() { return Object.keys(getExtensionMap()); }, + getOwnPropertyDescriptor(_t, key: string) { + const map = getExtensionMap(); + if (key in map) { + return { configurable: true, enumerable: true, writable: false, value: map[key] }; + } + return undefined; + }, +}); /** * Caches for loaded grammars and parsers @@ -108,21 +94,28 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise - lang in WASM_GRAMMAR_FILES && - !languageCache.has(lang) && - !unavailableGrammarErrors.has(lang) - ); + // Deduplicate; filter to languages that have a tree-sitter grammar + // (registry's `def.grammar` field) and aren't already loaded. 
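+ // e.g. ['typescript', 'typescript', 'svelte'] yields a single typescript + // entry in toLoad (the duplicate is dropped), and svelte never reaches the + // WASM loader because its registry def has no `grammar` field (custom extractor).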
+ const seen = new Set(); + const toLoad: Array<{ lang: Language; wasmFile: string; vendored: boolean }> = []; + for (const lang of languages) { + if (seen.has(lang)) continue; + seen.add(lang); + if (languageCache.has(lang) || unavailableGrammarErrors.has(lang)) continue; + const def = getLanguageDefByName(lang); + if (!def?.grammar) continue; + toLoad.push({ + lang, + wasmFile: def.grammar.wasmFile, + vendored: def.grammar.vendored === true, + }); + } // Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+ // See: https://github.com/tree-sitter/tree-sitter/issues/2338 - for (const lang of toLoad) { - const wasmFile = WASM_GRAMMAR_FILES[lang]; + for (const { lang, wasmFile, vendored } of toLoad) { try { - // Pascal ships its own WASM (not in tree-sitter-wasms) - const wasmPath = lang === 'pascal' + const wasmPath = vendored ? path.join(__dirname, 'wasm', wasmFile) : require.resolve(`tree-sitter-wasms/out/${wasmFile}`); const language = await WasmLanguage.load(wasmPath); @@ -140,7 +133,9 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise { - const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]; + const allLanguages = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name as Language); await loadGrammarsForLanguages(allLanguages); } @@ -176,7 +171,8 @@ export function getParser(language: Language): Parser | null { */ export function detectLanguage(filePath: string, source?: string): Language { const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase(); - const lang = EXTENSION_MAP[ext] || 'unknown'; + const def = getLanguageDefByExtension(ext); + const lang = (def?.name as Language) ?? 'unknown'; // .h files could be C or C++ — check source content for C++ features if (lang === 'c' && ext === '.h' && source) { @@ -196,29 +192,30 @@ function looksLikeCpp(source: string): boolean { } /** - * Check if a language is supported (has a grammar defined). - * Returns true if the grammar exists, even if not yet loaded. + * Check if a language is supported (has a grammar or custom extractor). + * Returns true if a registry entry exists, even if its grammar isn't loaded. */ export function isLanguageSupported(language: Language): boolean { - if (language === 'svelte') return true; // custom extractor (script block delegation) - if (language === 'liquid') return true; // custom regex extractor if (language === 'unknown') return false; - return language in WASM_GRAMMAR_FILES; + return getLanguageDefByName(language) !== undefined; } /** * Check if a grammar has been loaded and is ready for parsing. + * Custom-extractor languages (no `grammar` field) are always "ready". */ export function isGrammarLoaded(language: Language): boolean { - if (language === 'svelte' || language === 'liquid') return true; + const def = getLanguageDefByName(language); + if (!def) return false; + if (!def.grammar) return true; // custom extractor — always available return languageCache.has(language); } /** - * Get all supported languages (those with grammar definitions). + * Get all supported languages from the registry. */ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'liquid']; + return getLanguageDefs().map((d) => d.name as Language); } /** @@ -237,54 +234,33 @@ export function resetParser(language: Language): void { } /** - * Clear parser/grammar caches (useful for testing) + * Clear parser cache (useful for testing). 
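+ * E.g. a vitest suite might call afterEach(() => clearParserCache()) so each + * case builds fresh Parser instances (hypothetical usage, not part of this diff).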
+ * + * Note: `languageCache` is intentionally NOT cleared — the WASM + * `Language` modules are expensive to load and stay cached so a + * subsequent `getParser` call can rebuild a fresh `Parser` instance + * without re-reading the .wasm file. To fully re-init, set + * `parserInitialized = false` and call `initGrammars()` again. */ export function clearParserCache(): void { for (const parser of parserCache.values()) { - parser.delete(); + try { parser.delete(); } catch { /* ignore */ } } parserCache.clear(); - // Note: languageCache is NOT cleared — WASM languages persist. - // To fully re-init, set parserInitialized = false and call initGrammars() again. unavailableGrammarErrors.clear(); } /** - * Report grammars that failed to load. + * Get unavailable grammar errors (for diagnostics) */ -export function getUnavailableGrammarErrors(): Partial> { - const out: Partial> = {}; - for (const [language, message] of unavailableGrammarErrors.entries()) { - out[language] = message; - } - return out; +export function getUnavailableGrammarErrors(): Record { + return Object.fromEntries(unavailableGrammarErrors); } /** - * Get language display name + * Human-readable display name (e.g. "TypeScript", "Pascal / Delphi"). + * Returns the canonical name unchanged if no display name is registered. */ export function getLanguageDisplayName(language: Language): string { - const names: Record = { - typescript: 'TypeScript', - javascript: 'JavaScript', - tsx: 'TypeScript (TSX)', - jsx: 'JavaScript (JSX)', - python: 'Python', - go: 'Go', - rust: 'Rust', - java: 'Java', - c: 'C', - cpp: 'C++', - csharp: 'C#', - php: 'PHP', - ruby: 'Ruby', - swift: 'Swift', - kotlin: 'Kotlin', - dart: 'Dart', - svelte: 'Svelte', - liquid: 'Liquid', - pascal: 'Pascal / Delphi', - unknown: 'Unknown', - }; - return names[language] || language; + return getLanguageDefByName(language)?.displayName ?? language; } diff --git a/src/extraction/hcl-extractor.ts b/src/extraction/hcl-extractor.ts new file mode 100644 index 00000000..3d810c88 --- /dev/null +++ b/src/extraction/hcl-extractor.ts @@ -0,0 +1,587 @@ +import type { Node as SyntaxNode } from 'web-tree-sitter'; +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, NodeKind } from '../types'; +import { generateNodeId, getNodeText } from './tree-sitter-helpers'; +import { getParser } from './grammars'; + +/** + * HclExtractor — extracts a Terraform/HCL file into the graph. + * + * HCL is a declarative configuration language: there are no functions, + * classes, or methods. The unit of structure is the **block**: + * + * [