Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions __tests__/search-query-parser.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/**
* Unit tests for the field-qualified query parser and bounded
* edit distance — the two algorithms behind `kind:`/`lang:`/`path:`/
* `name:` filtering and the fuzzy typo fallback.
*/

import { describe, it, expect } from 'vitest';
import { parseQuery, boundedEditDistance } from '../src/search/query-parser';

// Behavioural spec for parseQuery: which tokens are lifted into the
// kinds / languages / pathFilters / nameFilters arrays and which are left
// in the free-text portion (`text`).
describe('parseQuery', () => {
  it('returns plain text for a query with no field prefixes', () => {
    const r = parseQuery('authenticate user');
    expect(r.text).toBe('authenticate user');
    expect(r.kinds).toEqual([]);
    expect(r.languages).toEqual([]);
    expect(r.pathFilters).toEqual([]);
    expect(r.nameFilters).toEqual([]);
  });

  it('extracts kind: filter and removes it from text', () => {
    const r = parseQuery('kind:function auth');
    expect(r.kinds).toEqual(['function']);
    expect(r.text).toBe('auth');
  });

  it('extracts lang: and language: as the same filter family', () => {
    const a = parseQuery('lang:typescript foo');
    const b = parseQuery('language:typescript foo');
    expect(a.languages).toEqual(['typescript']);
    expect(b.languages).toEqual(['typescript']);
  });

  it('handles multiple kind: filters as an OR set', () => {
    const r = parseQuery('kind:function kind:method auth');
    // Sort a copy: the order of `kinds` is not part of the contract, and
    // sorting in place would mutate the value under test.
    expect([...r.kinds].sort()).toEqual(['function', 'method']);
  });

  it('extracts path: and name: as substring filters (kept verbatim)', () => {
    const r = parseQuery('path:src/api name:Handler');
    expect(r.pathFilters).toEqual(['src/api']);
    expect(r.nameFilters).toEqual(['Handler']);
  });

  it('preserves quoted spans as a single token (whitespace in path:)', () => {
    const r = parseQuery('path:"my dir/file" foo');
    expect(r.pathFilters).toEqual(['my dir/file']);
    expect(r.text).toBe('foo');
  });

  it('passes URL-like tokens through to text (does not match http: as a field)', () => {
    const r = parseQuery('http://example.com');
    expect(r.text).toBe('http://example.com');
    expect(r.kinds).toEqual([]);
  });

  it('passes empty-value tokens through as text (kind: → "kind:")', () => {
    const r = parseQuery('kind: foo');
    expect(r.kinds).toEqual([]);
    // The trailing-colon token comes back as plain text. toContain (rather
    // than includes(...) === true) prints the actual text on failure.
    expect(r.text).toContain('kind:');
  });

  it('passes unknown field prefixes through as text (TODO: keeps the colon)', () => {
    const r = parseQuery('TODO: needs review');
    expect(r.text).toBe('TODO: needs review');
    expect(r.kinds).toEqual([]);
  });

  it('rejects unknown values for kind: (passes the whole token to text)', () => {
    const r = parseQuery('kind:invalid foo');
    // Invalid kind value falls back to text
    expect(r.kinds).toEqual([]);
    expect(r.text).toContain('kind:invalid');
  });

  it('handles all-filters-no-text query', () => {
    const r = parseQuery('kind:function lang:typescript');
    expect(r.kinds).toEqual(['function']);
    expect(r.languages).toEqual(['typescript']);
    expect(r.text).toBe('');
  });

  it('survives empty input', () => {
    const r = parseQuery('');
    expect(r.text).toBe('');
    expect(r.kinds).toEqual([]);
  });

  it('survives a very long input (no allocation explosion)', () => {
    const huge = 'foo '.repeat(5000); // 20k chars
    const r = parseQuery(huge);
    // Smoke check only: the call must return and keep some text.
    expect(r.text.length).toBeGreaterThan(0);
  });
});

// Behavioural spec for boundedEditDistance(a, b, maxDist). Every case here
// uses maxDist = 2, so an over-budget result is reported as 3 (maxDist + 1).
describe('boundedEditDistance', () => {
  it('returns 0 for identical strings', () => {
    expect(boundedEditDistance('user', 'user', 2)).toBe(0);
  });

  it('returns 1 for a single substitution', () => {
    expect(boundedEditDistance('user', 'usar', 2)).toBe(1);
  });

  it('returns 1 for a single insertion', () => {
    expect(boundedEditDistance('user', 'users', 2)).toBe(1);
  });

  it('returns 1 for a single deletion', () => {
    expect(boundedEditDistance('users', 'user', 2)).toBe(1);
  });

  it('returns 2 for two insertions', () => {
    // 'confg' → 'config' (insert 'i') → 'configX' (insert 'X').
    expect(boundedEditDistance('confg', 'configX', 2)).toBe(2);
  });

  it('counts an adjacent transposition as two edits (basic Levenshtein)', () => {
    // 'form' vs 'from': swapping 'o' and 'r' costs two substitutions because
    // plain Levenshtein has no transposition operation.
    expect(boundedEditDistance('form', 'from', 2)).toBe(2);
  });

  it('returns maxDist+1 when distance clearly exceeds budget', () => {
    expect(boundedEditDistance('foo', 'completely-different', 2)).toBe(3);
  });

  it('respects length-difference shortcut', () => {
    // |len(a) - len(b)| > maxDist must immediately be over budget
    expect(boundedEditDistance('a', 'aaaaaaa', 2)).toBe(3);
  });

  it('handles empty inputs', () => {
    expect(boundedEditDistance('', '', 2)).toBe(0);
    expect(boundedEditDistance('a', '', 2)).toBe(1);
    // 3 is both the true distance and maxDist + 1 here.
    expect(boundedEditDistance('', 'abc', 2)).toBe(3);
  });

  it('is case-sensitive — caller must lowercase if case-insensitive match wanted', () => {
    expect(boundedEditDistance('Foo', 'foo', 2)).toBe(1);
  });

  it('early-exits when row min exceeds budget (correctness, not just perf)', () => {
    // 'aaaaa' vs 'bbbbb': distance is 5, well over budget 2
    expect(boundedEditDistance('aaaaa', 'bbbbb', 2)).toBe(3);
  });
});
164 changes: 157 additions & 7 deletions src/db/queries.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
} from '../types';
import { safeJsonParse } from '../utils';
import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
import { parseQuery, boundedEditDistance } from '../search/query-parser';

/**
* Database row types (snake_case from SQLite)
Expand Down Expand Up @@ -478,14 +479,51 @@ export class QueryBuilder {
* 3. Score results based on match quality
*/
searchNodes(query: string, options: SearchOptions = {}): SearchResult[] {
const { kinds, languages, limit = 100, offset = 0 } = options;
const { limit = 100, offset = 0 } = options;

// Parse field-qualified bits out of the raw query (kind:, lang:,
// path:, name:). Anything not recognised stays in `text` and goes
// to FTS unchanged. kind:/lang: values are merged with the
// SearchOptions arg as a de-duplicated union (not an intersection).
const parsed = parseQuery(query);
const mergedKinds =
parsed.kinds.length > 0
? Array.from(new Set([...(options.kinds ?? []), ...parsed.kinds]))
: options.kinds;
const mergedLanguages =
parsed.languages.length > 0
? Array.from(new Set([...(options.languages ?? []), ...parsed.languages]))
: options.languages;
const pathFilters = parsed.pathFilters;
const nameFilters = parsed.nameFilters;
// The text portion drives FTS/LIKE; if all the user typed was
// filters (`kind:function`), we still need *some* candidate set,
// so synthesise an empty-text path that returns everything matching
// the filters.
const text = parsed.text;
const kinds = mergedKinds;
const languages = mergedLanguages;

// First try FTS5 with prefix matching
let results = this.searchNodesFTS(query, { kinds, languages, limit, offset });
let results = text
? this.searchNodesFTS(text, { kinds, languages, limit, offset })
// Over-fetch by 5× when running filter-only (no text). The
// post-scoring path: + name: filters can be very selective, so
// a smaller multiplier risks returning fewer than `limit`
// results despite the DB having plenty of matches.
: this.searchAllByFilters({ kinds, languages, limit: limit * 5 });

// If no FTS results, try LIKE-based substring search
if (results.length === 0 && query.length >= 2) {
results = this.searchNodesLike(query, { kinds, languages, limit, offset });
if (results.length === 0 && text.length >= 2) {
results = this.searchNodesLike(text, { kinds, languages, limit, offset });
}

// Final fuzzy fallback: scan all known names and keep those within
// a tight Levenshtein distance. Only fires when both FTS and LIKE
// returned nothing AND there's a text portion long enough to be
// worth fuzzing (1-char queries would match too much).
if (results.length === 0 && text.length >= 3) {
results = this.searchNodesFuzzy(text, { kinds, languages, limit });
}

// Supplement: ensure exact name matches are always candidates.
Expand Down Expand Up @@ -521,13 +559,14 @@ export class QueryBuilder {
}

// Apply multi-signal scoring
if (results.length > 0 && query) {
if (results.length > 0 && (text || query)) {
const scoringQuery = text || query;
results = results.map(r => ({
...r,
score: r.score
+ kindBonus(r.node.kind)
+ scorePathRelevance(r.node.filePath, query)
+ nameMatchBonus(r.node.name, query),
+ scorePathRelevance(r.node.filePath, scoringQuery)
+ nameMatchBonus(r.node.name, scoringQuery),
}));
results.sort((a, b) => b.score - a.score);
// Trim to requested limit after rescoring
Expand All @@ -536,6 +575,117 @@ export class QueryBuilder {
}
}

// Apply path: + name: filters AFTER scoring. Scoring already uses
// path/name as a soft signal; the explicit filters here are a hard
// gate. Done last so the FTS limit fetched plenty of candidates to
// narrow from.
if (pathFilters.length > 0) {
const lowered = pathFilters.map((p) => p.toLowerCase());
results = results.filter((r) => {
const fp = r.node.filePath.toLowerCase();
return lowered.some((p) => fp.includes(p));
});
}
if (nameFilters.length > 0) {
const lowered = nameFilters.map((n) => n.toLowerCase());
results = results.filter((r) => {
const nm = r.node.name.toLowerCase();
return lowered.some((n) => nm.includes(n));
});
}

return results;
}

/**
 * Candidate fetch for filter-only queries (e.g.
 * `kind:function lang:typescript`) where there is no text to search on.
 *
 * Selects rows from `nodes`, restricted to the given kinds and/or
 * languages when those lists are non-empty, ordered by name and capped
 * at `limit`. Every row is returned with a flat score of 1.
 *
 * @param options.kinds     Optional kind allow-list; ignored when empty.
 * @param options.languages Optional language allow-list; ignored when empty.
 * @param options.limit     Maximum number of rows to fetch.
 * @returns One SearchResult per fetched row.
 */
private searchAllByFilters(options: {
  kinds?: NodeKind[];
  languages?: Language[];
  limit: number;
}): SearchResult[] {
  const kindList = options.kinds ?? [];
  const languageList = options.languages ?? [];

  // Assemble the statement piecewise; bindings are pushed in the same
  // order as their `?` placeholders appear.
  const sqlParts: string[] = ['SELECT * FROM nodes WHERE 1=1'];
  const bindings: (string | number)[] = [];

  if (kindList.length > 0) {
    const slots = kindList.map(() => '?').join(',');
    sqlParts.push(` AND kind IN (${slots})`);
    bindings.push(...kindList);
  }

  if (languageList.length > 0) {
    const slots = languageList.map(() => '?').join(',');
    sqlParts.push(` AND language IN (${slots})`);
    bindings.push(...languageList);
  }

  sqlParts.push(' ORDER BY name LIMIT ?');
  bindings.push(options.limit);

  const fetched = this.db.prepare(sqlParts.join('')).all(...bindings) as NodeRow[];

  const matches: SearchResult[] = [];
  for (const row of fetched) {
    matches.push({ node: rowToNode(row), score: 1 });
  }
  return matches;
}

/**
 * Fuzzy fallback: an edit-distance sweep over the names returned by
 * getAllNodeNames(), intended for queries that produced no FTS/LIKE hits.
 *
 * The distance budget is 1 for queries of up to 4 characters and 2
 * otherwise, so `getUssr` finds `getUser` but `process` doesn't match
 * `prosody`. Comparison is case-insensitive (both sides are lowercased).
 * The sweep is linear in the number of names.
 *
 * @param text             Free-text portion of the query.
 * @param options.kinds     Optional kind allow-list for the follow-up lookups.
 * @param options.languages Optional language allow-list for the follow-up lookups.
 * @param options.limit     Maximum number of results to return.
 * @returns Up to `limit` results scored `1 / (1 + distance)`, closest
 *          names first.
 */
private searchNodesFuzzy(
  text: string,
  options: { kinds?: NodeKind[]; languages?: Language[]; limit: number }
): SearchResult[] {
  const { kinds, languages, limit } = options;
  const lowered = text.toLowerCase();
  const maxDist = lowered.length <= 4 ? 1 : 2;

  // Collect every name within budget. This list is not bounded until the
  // slice below, so it can grow to the size of the name set.
  const allNames = this.getAllNodeNames();
  const candidates: Array<{ name: string; dist: number }> = [];
  for (const name of allNames) {
    const dist = boundedEditDistance(name.toLowerCase(), lowered, maxDist);
    if (dist <= maxDist) candidates.push({ name, dist });
  }
  candidates.sort((a, b) => a.dist - b.dist);

  // Cap the per-name follow-up queries. Each survivor triggers a
  // separate lookup by exact name; without this cap a project with many
  // similar names (`getUser1`, `getUser2`...) could fan out far beyond
  // `limit` queries before the inner-loop limit kicks in.
  const FUZZY_FOLLOWUP_CAP = Math.max(limit * 2, 50);
  const cappedCandidates = candidates.slice(0, FUZZY_FOLLOWUP_CAP);

  // Nothing to look up, or no room for results: skip statement preparation.
  if (cappedCandidates.length === 0 || limit <= 0) return [];

  // The follow-up SQL depends only on the kind/language filters, not on
  // the candidate, so build and prepare it once instead of once per name.
  let sql = 'SELECT * FROM nodes WHERE name = ?';
  const filterParams: (string | number)[] = [];
  if (kinds && kinds.length > 0) {
    sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
    filterParams.push(...kinds);
  }
  if (languages && languages.length > 0) {
    sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
    filterParams.push(...languages);
  }
  sql += ' LIMIT 5';
  const lookup = this.db.prepare(sql);

  const results: SearchResult[] = [];
  const seen = new Set<string>();
  for (const c of cappedCandidates) {
    if (results.length >= limit) break;
    const rows = lookup.all(c.name, ...filterParams) as NodeRow[];
    for (const row of rows) {
      if (seen.has(row.id)) continue;
      seen.add(row.id);
      // Lower the score for each edit step away from the query so
      // case-only differences (dist 0) outrank dist-2 typos.
      results.push({ node: rowToNode(row), score: 1 / (1 + c.dist) });
      if (results.length >= limit) break;
    }
  }
  return results;
}

Expand Down
Loading