/**
 * Unit tests for the field-qualified query parser and bounded
 * edit distance — the two algorithms behind `kind:`/`lang:`/`path:`/
 * `name:` filtering and the fuzzy typo fallback.
 */

import { describe, it, expect } from 'vitest';
import { parseQuery, boundedEditDistance } from '../src/search/query-parser';

describe('parseQuery', () => {
  it('returns plain text for a query with no field prefixes', () => {
    const r = parseQuery('authenticate user');
    expect(r.text).toBe('authenticate user');
    expect(r.kinds).toEqual([]);
    expect(r.languages).toEqual([]);
    expect(r.pathFilters).toEqual([]);
    expect(r.nameFilters).toEqual([]);
  });

  it('extracts kind: filter and removes it from text', () => {
    const r = parseQuery('kind:function auth');
    expect(r.kinds).toEqual(['function']);
    expect(r.text).toBe('auth');
  });

  it('extracts lang: and language: as the same filter family', () => {
    const a = parseQuery('lang:typescript foo');
    const b = parseQuery('language:typescript foo');
    expect(a.languages).toEqual(['typescript']);
    expect(b.languages).toEqual(['typescript']);
  });

  it('handles multiple kind: filters as an OR set', () => {
    const r = parseQuery('kind:function kind:method auth');
    // Sort a copy: the assertion is about set membership, and sorting
    // in place would mutate the parsed result under test.
    expect([...r.kinds].sort()).toEqual(['function', 'method']);
  });

  it('extracts path: and name: as substring filters (kept verbatim)', () => {
    const r = parseQuery('path:src/api name:Handler');
    expect(r.pathFilters).toEqual(['src/api']);
    expect(r.nameFilters).toEqual(['Handler']);
  });

  it('preserves quoted spans as a single token (whitespace in path:)', () => {
    const r = parseQuery('path:"my dir/file" foo');
    expect(r.pathFilters).toEqual(['my dir/file']);
    expect(r.text).toBe('foo');
  });

  it('passes URL-like tokens through to text (does not match http: as a field)', () => {
    const r = parseQuery('http://example.com');
    expect(r.text).toBe('http://example.com');
    expect(r.kinds).toEqual([]);
  });

  it('passes empty-value tokens through as text (kind: → "kind:")', () => {
    const r = parseQuery('kind: foo');
    expect(r.kinds).toEqual([]);
    // The trailing-colon token comes back as plain text
    expect(r.text.includes('kind:')).toBe(true);
  });

  it('passes unknown field prefixes through as text (TODO: keeps the colon)', () => {
    const r = parseQuery('TODO: needs review');
    expect(r.text).toBe('TODO: needs review');
    expect(r.kinds).toEqual([]);
  });

  it('rejects unknown values for kind: (passes the whole token to text)', () => {
    const r = parseQuery('kind:invalid foo');
    // Invalid kind value falls back to text
    expect(r.kinds).toEqual([]);
    expect(r.text).toContain('kind:invalid');
  });

  it('handles all-filters-no-text query', () => {
    const r = parseQuery('kind:function lang:typescript');
    expect(r.kinds).toEqual(['function']);
    expect(r.languages).toEqual(['typescript']);
    expect(r.text).toBe('');
  });

  it('survives empty input', () => {
    const r = parseQuery('');
    expect(r.text).toBe('');
    expect(r.kinds).toEqual([]);
  });

  it('survives a very long input (no allocation explosion)', () => {
    const huge = 'foo '.repeat(5000); // 20k chars
    const r = parseQuery(huge);
    expect(r.text.length).toBeGreaterThan(0);
  });
});

describe('boundedEditDistance', () => {
  it('returns 0 for identical strings', () => {
    expect(boundedEditDistance('user', 'user', 2)).toBe(0);
  });

  it('returns 1 for a single substitution', () => {
    expect(boundedEditDistance('user', 'usar', 2)).toBe(1);
  });

  it('returns 1 for a single insertion', () => {
    expect(boundedEditDistance('user', 'users', 2)).toBe(1);
  });

  it('returns 1 for a single deletion', () => {
    expect(boundedEditDistance('users', 'user', 2)).toBe(1);
  });

  it('returns 2 for an adjacent transposition (two substitutions in basic Levenshtein)', () => {
    // 'form' → 'from' swaps 'o' and 'r'. Plain Levenshtein has no
    // transposition operation, so the swap costs two substitutions.
    expect(boundedEditDistance('form', 'from', 2)).toBe(2);
  });

  it('returns 2 for two insertions', () => {
    // 'confg' → 'configX': insert 'i', then append 'X'.
    expect(boundedEditDistance('confg', 'configX', 2)).toBe(2);
  });

  it('returns maxDist+1 when distance clearly exceeds budget', () => {
    expect(boundedEditDistance('foo', 'completely-different', 2)).toBe(3);
  });

  it('respects length-difference shortcut', () => {
    // |len(a) - len(b)| > maxDist must immediately be over budget
    expect(boundedEditDistance('a', 'aaaaaaa', 2)).toBe(3);
  });

  it('handles empty inputs', () => {
    expect(boundedEditDistance('', '', 2)).toBe(0);
    expect(boundedEditDistance('a', '', 2)).toBe(1);
    // Length difference 3 exceeds the budget of 2, so this is maxDist + 1.
    expect(boundedEditDistance('', 'abc', 2)).toBe(3);
  });

  it('is case-sensitive — caller must lowercase if case-insensitive match wanted', () => {
    expect(boundedEditDistance('Foo', 'foo', 2)).toBe(1);
  });

  it('early-exits when row min exceeds budget (correctness, not just perf)', () => {
    // 'aaaaa' vs 'bbbbb': distance is 5, well over budget 2
    expect(boundedEditDistance('aaaaa', 'bbbbb', 2)).toBe(3);
  });
});
Anything not recognised stays in `text` and goes + // to FTS unchanged. Filters compose with the SearchOptions arg — + // both are applied (intersection-style). + const parsed = parseQuery(query); + const mergedKinds = + parsed.kinds.length > 0 + ? Array.from(new Set([...(options.kinds ?? []), ...parsed.kinds])) + : options.kinds; + const mergedLanguages = + parsed.languages.length > 0 + ? Array.from(new Set([...(options.languages ?? []), ...parsed.languages])) + : options.languages; + const pathFilters = parsed.pathFilters; + const nameFilters = parsed.nameFilters; + // The text portion drives FTS/LIKE; if all the user typed was + // filters (`kind:function`), we still need *some* candidate set, + // so synthesise an empty-text path that returns everything matching + // the filters. + const text = parsed.text; + const kinds = mergedKinds; + const languages = mergedLanguages; // First try FTS5 with prefix matching - let results = this.searchNodesFTS(query, { kinds, languages, limit, offset }); + let results = text + ? this.searchNodesFTS(text, { kinds, languages, limit, offset }) + // Over-fetch by 5× when running filter-only (no text). The + // post-scoring path: + name: filters can be very selective, so + // a smaller multiplier risks returning fewer than `limit` + // results despite the DB having plenty of matches. + : this.searchAllByFilters({ kinds, languages, limit: limit * 5 }); // If no FTS results, try LIKE-based substring search - if (results.length === 0 && query.length >= 2) { - results = this.searchNodesLike(query, { kinds, languages, limit, offset }); + if (results.length === 0 && text.length >= 2) { + results = this.searchNodesLike(text, { kinds, languages, limit, offset }); + } + + // Final fuzzy fallback: scan all known names and keep those within + // a tight Levenshtein distance. Only fires when both FTS and LIKE + // returned nothing AND there's a text portion long enough to be + // worth fuzzing (1-char queries would match too much). 
+ if (results.length === 0 && text.length >= 3) { + results = this.searchNodesFuzzy(text, { kinds, languages, limit }); } // Supplement: ensure exact name matches are always candidates. @@ -521,13 +559,14 @@ export class QueryBuilder { } // Apply multi-signal scoring - if (results.length > 0 && query) { + if (results.length > 0 && (text || query)) { + const scoringQuery = text || query; results = results.map(r => ({ ...r, score: r.score + kindBonus(r.node.kind) - + scorePathRelevance(r.node.filePath, query) - + nameMatchBonus(r.node.name, query), + + scorePathRelevance(r.node.filePath, scoringQuery) + + nameMatchBonus(r.node.name, scoringQuery), })); results.sort((a, b) => b.score - a.score); // Trim to requested limit after rescoring @@ -536,6 +575,117 @@ export class QueryBuilder { } } + // Apply path: + name: filters AFTER scoring. Scoring already uses + // path/name as a soft signal; the explicit filters here are a hard + // gate. Done last so the FTS limit fetched plenty of candidates to + // narrow from. + if (pathFilters.length > 0) { + const lowered = pathFilters.map((p) => p.toLowerCase()); + results = results.filter((r) => { + const fp = r.node.filePath.toLowerCase(); + return lowered.some((p) => fp.includes(p)); + }); + } + if (nameFilters.length > 0) { + const lowered = nameFilters.map((n) => n.toLowerCase()); + results = results.filter((r) => { + const nm = r.node.name.toLowerCase(); + return lowered.some((n) => nm.includes(n)); + }); + } + + return results; + } + + /** + * Match-everything path used when the user supplied only field + * filters (`kind:function lang:typescript`) with no text. Returns + * candidates ordered by name; the caller's filter pass narrows to + * what was asked for. 
+ */ + private searchAllByFilters(options: { + kinds?: NodeKind[]; + languages?: Language[]; + limit: number; + }): SearchResult[] { + const { kinds, languages, limit } = options; + let sql = 'SELECT * FROM nodes WHERE 1=1'; + const params: (string | number)[] = []; + if (kinds && kinds.length > 0) { + sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`; + params.push(...kinds); + } + if (languages && languages.length > 0) { + sql += ` AND language IN (${languages.map(() => '?').join(',')})`; + params.push(...languages); + } + sql += ' ORDER BY name LIMIT ?'; + params.push(limit); + const rows = this.db.prepare(sql).all(...params) as NodeRow[]; + return rows.map((row) => ({ node: rowToNode(row), score: 1 })); + } + + /** + * Fuzzy fallback: when zero FTS/LIKE hits, try an edit-distance + * sweep over the distinct symbol-name set. Caps `maxDist` at 2 so + * `getUssr` finds `getUser` but `process` doesn't match `prosody`. + * Bounded edit distance keeps each comparison cheap; the per-query + * scan is O(distinct-name-count) which is far smaller than total + * node count on any real codebase. + */ + private searchNodesFuzzy( + text: string, + options: { kinds?: NodeKind[]; languages?: Language[]; limit: number } + ): SearchResult[] { + const { kinds, languages, limit } = options; + const lowered = text.toLowerCase(); + const maxDist = lowered.length <= 4 ? 1 : 2; + + // Pull the distinct name list once. The set is cached on QueryBuilder + // by getAllNodeNames(); even on a 200k-node project the distinct + // name set is typically O(10k) because most names repeat. The + // candidate-cap below bounds memory regardless. 
+ const allNames = this.getAllNodeNames(); + const candidates: Array<{ name: string; dist: number }> = []; + for (const name of allNames) { + const dist = boundedEditDistance(name.toLowerCase(), lowered, maxDist); + if (dist <= maxDist) candidates.push({ name, dist }); + } + candidates.sort((a, b) => a.dist - b.dist); + + // Cap the per-name follow-up queries. Each survivor triggers a + // separate `SELECT * FROM nodes WHERE name = ?`; without this cap + // a project with many similar names (`getUser1`, `getUser2`...) + // could fan out far beyond `limit` queries before the inner-loop + // limit kicks in. + const FUZZY_FOLLOWUP_CAP = Math.max(limit * 2, 50); + const cappedCandidates = candidates.slice(0, FUZZY_FOLLOWUP_CAP); + + const results: SearchResult[] = []; + const seen = new Set(); + for (const c of cappedCandidates) { + if (results.length >= limit) break; + let sql = 'SELECT * FROM nodes WHERE name = ?'; + const params: (string | number)[] = [c.name]; + if (kinds && kinds.length > 0) { + sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`; + params.push(...kinds); + } + if (languages && languages.length > 0) { + sql += ` AND language IN (${languages.map(() => '?').join(',')})`; + params.push(...languages); + } + sql += ' LIMIT 5'; + const rows = this.db.prepare(sql).all(...params) as NodeRow[]; + for (const row of rows) { + if (seen.has(row.id)) continue; + seen.add(row.id); + // Lower the score for each edit step away from the query so + // exact-match fallbacks (dist 0) outrank dist-2 typos. + results.push({ node: rowToNode(row), score: 1 / (1 + c.dist) }); + if (results.length >= limit) break; + } + } return results; } diff --git a/src/search/query-parser.ts b/src/search/query-parser.ts new file mode 100644 index 00000000..05007287 --- /dev/null +++ b/src/search/query-parser.ts @@ -0,0 +1,184 @@ +/** + * Field-qualified search query parser. 
+ * + * Splits a raw query like + * + * kind:function name:auth path:src/api authenticate + * + * into structured filters (kind=function, name="auth", path prefix + * "src/api") plus the free-text portion ("authenticate") that goes + * to FTS. Free-text and filters compose: filters narrow the result + * set, FTS scores within the narrowed set. + * + * Recognised fields (case-insensitive, value is the rest until + * whitespace): + * + * kind: one of function|method|class|interface|struct|... + * lang: one of typescript|python|go|... (alias: language:) + * path: case-insensitive substring of file_path + * name: case-insensitive substring of the symbol's name + * + * Unknown field prefixes (e.g. `foo:bar`) are passed through to FTS + * as plain text — that's how someone searching for `TODO:` gets a + * result instead of a parse error. + * + * Quoting: + * kind:function path:"src/some path/with spaces" → handled by stripping + * the surrounding double quotes from the value (single token only, + * no nested escapes). + */ + +import { NODE_KINDS, LANGUAGES } from '../types'; +import type { NodeKind, Language } from '../types'; + +export interface ParsedQuery { + /** Free-text portion to feed to FTS / LIKE. May be empty. */ + text: string; + /** kind: filters (OR'd). Empty when none specified. */ + kinds: NodeKind[]; + /** lang:/language: filters (OR'd). Empty when none specified. */ + languages: Language[]; + /** path: filters (OR'd, case-insensitive substring of file_path). Empty when none. */ + pathFilters: string[]; + /** name: filters (OR'd, case-insensitive substring of node.name). */ + nameFilters: string[]; +} + +// Derived from the canonical `NODE_KINDS` / `LANGUAGES` arrays in +// types.ts so adding a new kind or language doesn't silently fall +// through to plain text here. +const KIND_VALUES: ReadonlySet = new Set(NODE_KINDS); +const LANGUAGE_VALUES: ReadonlySet = new Set(LANGUAGES); + +/** + * Strip a surrounding pair of double quotes from `s`. 
/**
 * Remove the double quotes wrapping a filter value so users can keep
 * whitespace inside it: `path:"my dir/file"` yields `my dir/file`.
 *
 * A value that opens a quote but never closes it (the tokenizer
 * forgives unterminated quotes) loses its dangling opening quote, so
 * the resulting filter never carries a literal `"`. Any other value is
 * returned unchanged.
 */
function unquote(s: string): string {
  if (s.length >= 2 && s.startsWith('"') && s.endsWith('"')) return s.slice(1, -1);
  if (s.startsWith('"') && s.indexOf('"', 1) === -1) return s.slice(1);
  return s;
}

/** Matches one whitespace character; hoisted so the tokenizer does not rebuild it per character. */
const WHITESPACE = /\s/;

/**
 * Split `raw` on whitespace, keeping double-quoted spans inside the
 * current token. Quotes may open a token (`"…"`) or appear mid-token
 * (`path:"…"`); either way everything up to the matching `"` stays in
 * the token, whitespace included. An unterminated quote swallows the
 * rest of the input as one token rather than throwing.
 */
function tokenizeQuery(raw: string): string[] {
  const tokens: string[] = [];
  let i = 0;
  while (i < raw.length) {
    while (i < raw.length && WHITESPACE.test(raw.charAt(i))) i++;
    if (i >= raw.length) break;
    const start = i;
    while (i < raw.length && !WHITESPACE.test(raw.charAt(i))) {
      if (raw.charAt(i) === '"') {
        const close = raw.indexOf('"', i + 1);
        i = close === -1 ? raw.length : close + 1;
      } else {
        i++;
      }
    }
    tokens.push(raw.slice(start, i));
  }
  return tokens;
}

/** Append `item` to `list` unless it is already present, so repeated filters do not duplicate. */
function pushUnique<T>(list: T[], item: T): void {
  if (!list.includes(item)) list.push(item);
}

/**
 * Parse a raw query into structured filters + remaining text.
 * Always returns a value; never throws.
 *
 * A token becomes a filter only when it has the shape `key:value` with
 * a recognised key (`kind`, `lang`/`language`, `path`, `name`) and a
 * non-empty value. Keys are case-insensitive. `kind:` and `lang:`
 * values are lowercased and must be known kinds/languages; otherwise
 * the whole token falls back to text. `path:` and `name:` values are
 * kept verbatim (minus surrounding quotes). Everything else — unknown
 * prefixes such as `TODO:` or `http://…`, and tokens with an empty
 * value — is passed through as free text.
 *
 * @param raw The query string exactly as the user typed it.
 * @returns The extracted filters plus the remaining free text, joined
 *          with single spaces and trimmed.
 */
export function parseQuery(raw: string): ParsedQuery {
  const out: ParsedQuery = {
    text: '',
    kinds: [],
    languages: [],
    pathFilters: [],
    nameFilters: [],
  };

  const textParts: string[] = [];
  for (const tok of tokenizeQuery(raw)) {
    const colon = tok.indexOf(':');
    // No colon, a leading colon, or a trailing colon: not a field token.
    if (colon <= 0 || colon === tok.length - 1) {
      textParts.push(tok);
      continue;
    }
    const key = tok.slice(0, colon).toLowerCase();
    const value = unquote(tok.slice(colon + 1));
    if (!value) {
      // e.g. `path:""` — nothing to filter on, keep the token as text.
      textParts.push(tok);
      continue;
    }
    switch (key) {
      case 'kind': {
        // Lowercased like `lang:` so `kind:Function` is a filter, not text.
        const kind = value.toLowerCase();
        if (KIND_VALUES.has(kind)) {
          pushUnique(out.kinds, kind as NodeKind);
        } else {
          textParts.push(tok);
        }
        break;
      }
      case 'lang':
      case 'language': {
        const language = value.toLowerCase();
        if (LANGUAGE_VALUES.has(language)) {
          pushUnique(out.languages, language as Language);
        } else {
          textParts.push(tok);
        }
        break;
      }
      case 'path':
        pushUnique(out.pathFilters, value);
        break;
      case 'name':
        pushUnique(out.nameFilters, value);
        break;
      default:
        textParts.push(tok);
    }
  }

  out.text = textParts.join(' ').trim();
  return out;
}

/**
 * Bounded Levenshtein edit distance (insert / delete / substitute; an
 * adjacent transposition counts as two edits).
 *
 * Returns the exact distance when it is at most `maxDist`, and exactly
 * `maxDist + 1` otherwise, so the result always lies in
 * `[0, maxDist + 1]` and callers can test `result <= maxDist`. The
 * scan stops as soon as a whole DP row exceeds the budget, which keeps
 * the fuzzy fallback cheap over large name lists.
 *
 * The comparison is case-sensitive on UTF-16 code units; callers that
 * want case-insensitive matching must lowercase both inputs first.
 * Memory is O(min(len(a), len(b))): the shorter string is laid out
 * along the DP columns (the distance is symmetric).
 *
 * @param a       First string.
 * @param b       Second string.
 * @param maxDist Largest distance the caller cares about; negative
 *                values are treated as 0.
 * @returns The distance, capped at `maxDist + 1`.
 */
export function boundedEditDistance(a: string, b: string, maxDist: number): number {
  if (a === b) return 0;

  const budget = Math.max(0, maxDist);
  const overBudget = budget + 1;

  const longer = a.length >= b.length ? a : b;
  const shorter = a.length >= b.length ? b : a;
  const rows = longer.length;
  const cols = shorter.length;

  // At least |len(a) - len(b)| insertions/deletions are unavoidable.
  if (rows - cols > budget) return overBudget;
  // Against an empty string the distance is the other length, which the
  // check above has already shown to be within budget.
  if (cols === 0) return rows;

  let prev = Array.from({ length: cols + 1 }, (_, j) => j);
  let cur = new Array<number>(cols + 1).fill(0);

  for (let i = 1; i <= rows; i++) {
    const ch = longer.charCodeAt(i - 1);
    cur[0] = i;
    let rowMin = i;
    for (let j = 1; j <= cols; j++) {
      const cost = ch === shorter.charCodeAt(j - 1) ? 0 : 1;
      const best = Math.min(cur[j - 1]! + 1, prev[j]! + 1, prev[j - 1]! + cost);
      cur[j] = best;
      if (best < rowMin) rowMin = best;
    }
    // Later rows can only grow from this row's minimum, so once it is
    // over budget the final distance is too.
    if (rowMin > budget) return overBudget;
    [prev, cur] = [cur, prev];
  }
  // The last row's minimum may sit in a different column than the
  // answer, so the final cell can still exceed the budget: clamp it.
  return Math.min(prev[cols]!, overBudget);
}
*/ -export type Language = - | 'typescript' - | 'javascript' - | 'tsx' - | 'jsx' - | 'python' - | 'go' - | 'rust' - | 'java' - | 'c' - | 'cpp' - | 'csharp' - | 'php' - | 'ruby' - | 'swift' - | 'kotlin' - | 'dart' - | 'svelte' - | 'liquid' - | 'pascal' - | 'unknown'; +export const LANGUAGES = [ + 'typescript', + 'javascript', + 'tsx', + 'jsx', + 'python', + 'go', + 'rust', + 'java', + 'c', + 'cpp', + 'csharp', + 'php', + 'ruby', + 'swift', + 'kotlin', + 'dart', + 'svelte', + 'liquid', + 'pascal', + 'unknown', +] as const; + +export type Language = (typeof LANGUAGES)[number]; // ============================================================================= // Core Graph Types