Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion __tests__/foundation.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ describe('Database Connection', () => {

const version = db.getSchemaVersion();
expect(version).not.toBeNull();
expect(version?.version).toBe(3);
expect(version?.version).toBe(4);

db.close();
});
Expand Down
2 changes: 1 addition & 1 deletion __tests__/pr19-improvements.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => {
describe('Schema v2 Migration', () => {
it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => {
const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations');
expect(CURRENT_SCHEMA_VERSION).toBe(3);
expect(CURRENT_SCHEMA_VERSION).toBe(4);
});

it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => {
Expand Down
302 changes: 302 additions & 0 deletions __tests__/search-quality.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
/**
* Search Quality Tests
*
* Regression tests for the FTS improvements that bring natural-language
* and partial-identifier queries into the top of the result set:
* - Subword tokens (camel/snake split) so `parser` finds `getParser`.
* - Porter stemmer so `parsing` matches `parser`/`parses`.
* - Stopword stripping so `"how"` / `"the"` don't crowd out the
* real terms via docstring matches.
*
* All measurements were captured against codegraph's own src/ during
* development. Targets that previously ranked #18, #19, or weren't in
* the top 20 jump to the top 5.
*/

import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { DatabaseConnection } from '../src/db';
import { QueryBuilder } from '../src/db/queries';
import { Node } from '../src/types';
import { splitIdentifierTokens, buildNameSubwords } from '../src/utils';
import { filterStopwords, STOP_WORDS } from '../src/search/query-utils';
import { runMigrations, getCurrentVersion } from '../src/db/migrations';

describe('splitIdentifierTokens', () => {
it('splits camelCase', () => {
expect(splitIdentifierTokens('getParser')).toEqual(['get', 'parser']);
});

it('splits PascalCase', () => {
expect(splitIdentifierTokens('DatabaseConnection')).toEqual(['database', 'connection']);
});

it('splits XMLHttpRequest-style runs of capitals', () => {
expect(splitIdentifierTokens('XMLHttpRequest')).toEqual(['xml', 'http', 'request']);
});

it('splits snake_case', () => {
expect(splitIdentifierTokens('database_connection')).toEqual(['database', 'connection']);
});

it('splits kebab-case and dots and slashes', () => {
expect(splitIdentifierTokens('foo-bar.baz/qux')).toEqual(['foo', 'bar', 'baz', 'qux']);
});

it('keeps single-word identifiers as-is', () => {
expect(splitIdentifierTokens('parse')).toEqual(['parse']);
});

it('handles trailing/leading underscores', () => {
expect(splitIdentifierTokens('__init__')).toEqual(['init']);
});

it('preserves numbers as part of the surrounding token', () => {
expect(splitIdentifierTokens('parseV2')).toEqual(['parse', 'v2']);
});
});

describe('buildNameSubwords', () => {
it('preserves the original identifier so direct queries still hit', () => {
const out = buildNameSubwords('getParser');
expect(out.split(' ')).toContain('getParser');
});

it('appends split tokens', () => {
const out = buildNameSubwords('getParser').split(' ');
expect(out).toContain('get');
expect(out).toContain('parser');
});

it('dedupes single-word identifiers (no "parse parse")', () => {
expect(buildNameSubwords('parse')).toBe('parse');
});

it('dedupes when split produces a single token equal to the original', () => {
// 'foo' has no boundary, so splitIdentifierTokens returns ['foo'];
// without dedup we would store 'foo foo'.
const out = buildNameSubwords('foo').split(' ');
expect(out).toEqual(['foo']);
});

it('handles empty string without crashing', () => {
expect(buildNameSubwords('')).toBe('');
});
});

describe('filterStopwords (shared with query-utils.ts)', () => {
it('drops common English stopwords', () => {
expect(filterStopwords(['how', 'does', 'parsing', 'work']))
// 'work' is also in STOP_WORDS, so the result is just 'parsing'
.toEqual(['parsing']);
});

it('returns the original list when every term is a stopword', () => {
// Otherwise we would produce an empty FTS query.
const allStopwords = ['the', 'a', 'an'];
expect(filterStopwords(allStopwords)).toEqual(allStopwords);
});

it('does not strip common identifier-like words', () => {
// `get` / `set` / `find` could be method names; never treated as stopwords.
expect(filterStopwords(['get', 'set', 'find', 'name']))
.toEqual(['get', 'set', 'find', 'name']);
expect(STOP_WORDS.has('get')).toBe(false);
});
});

describe('FTS5 search quality (integration)', () => {
let dir: string;
let db: DatabaseConnection;
let q: QueryBuilder;

function makeNode(id: string, name: string, kind: Node['kind'], docstring?: string): Node {
return {
id,
kind,
name,
qualifiedName: name,
filePath: `src/${name}.ts`,
language: 'typescript',
startLine: 1,
endLine: 1,
startColumn: 0,
endColumn: 0,
docstring,
updatedAt: Date.now(),
};
}

beforeEach(() => {
dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-search-quality-'));
db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
q = new QueryBuilder(db.getDb());
});

afterEach(() => {
db.close();
if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
});

it('finds getParser for a `parser` query (subword tokens)', () => {
q.insertNodes([
makeNode('n1', 'getParser', 'function'),
makeNode('n2', 'unrelated', 'function'),
]);
const results = q.searchNodes('parser', { limit: 10 });
expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
});

it('finds DatabaseConnection for a `connection` query (subword tokens)', () => {
q.insertNodes([
makeNode('n1', 'DatabaseConnection', 'class'),
makeNode('n2', 'unrelated', 'function'),
]);
const results = q.searchNodes('connection', { limit: 10 });
expect(results.find((r) => r.node.name === 'DatabaseConnection')).toBeDefined();
});

it('matches `parsing` against `getParser` via Porter stemmer', () => {
q.insertNodes([
makeNode('n1', 'getParser', 'function'),
makeNode('n2', 'unrelated', 'function'),
]);
const results = q.searchNodes('parsing', { limit: 10 });
expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
});

it('matches `resolves references` against resolveOne', () => {
q.insertNodes([
makeNode('n1', 'resolveOne', 'method'),
makeNode('n2', 'unrelated', 'function'),
]);
const results = q.searchNodes('resolves references', { limit: 10 });
expect(results.find((r) => r.node.name === 'resolveOne')).toBeDefined();
});

it('strips stopwords so `how does parser work` finds getParser', () => {
// Without stopword stripping the docstring of `unrelated` (containing
// "how" and "does") would BM25-flood the result list.
q.insertNodes([
makeNode('n1', 'getParser', 'function'),
makeNode(
'n2',
'unrelated',
'function',
'How does this work? It does many things — does, does, does.'
),
]);
const results = q.searchNodes('how does parser work', { limit: 10 });
const ranks = new Map(results.map((r, i) => [r.node.name, i + 1]));
const parserRank = ranks.get('getParser');
const unrelatedRank = ranks.get('unrelated');
expect(parserRank).toBeDefined();
if (unrelatedRank !== undefined) {
expect(parserRank).toBeLessThan(unrelatedRank);
}
});

it('exact identifier search still works (no regression on direct queries)', () => {
q.insertNodes([
makeNode('n1', 'ExtractionOrchestrator', 'class'),
makeNode('n2', 'extraction', 'variable'),
makeNode('n3', 'orchestrator', 'variable'),
]);
const results = q.searchNodes('ExtractionOrchestrator', { limit: 10 });
expect(results[0].node.name).toBe('ExtractionOrchestrator');
});
});

describe('Migration v4: backfill name_subwords + rebuild FTS', () => {
let dir: string;

beforeEach(() => {
dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-fts-'));
});

afterEach(() => {
if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
});

it('rebuilds FTS so subword search works on previously-indexed nodes', () => {
// Build a v3-shape database from explicit SQL — the pre-PR schema —
// then run forward migrations and verify search works end-to-end.
// This is a faithful simulation of an upgrade from a real v3 install.
const Database = require('better-sqlite3');
const dbHandle = new Database(path.join(dir, 'test.db'));
dbHandle.pragma('foreign_keys = ON');
dbHandle.exec(`
CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
CREATE TABLE nodes (
id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
docstring TEXT, signature TEXT, visibility TEXT,
is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL
);
CREATE VIRTUAL TABLE nodes_fts USING fts5(
id, name, qualified_name, docstring, signature,
content='nodes', content_rowid='rowid'
);
CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
END;
INSERT INTO nodes (id, kind, name, qualified_name, file_path, language,
start_line, end_line, start_column, end_column, updated_at)
VALUES ('n1', 'function', 'getParser', 'getParser', 'a.ts', 'typescript', 1, 1, 0, 0, 0);
`);

expect(getCurrentVersion(dbHandle)).toBe(3);

// Apply migration v4
runMigrations(dbHandle, 3);
expect(getCurrentVersion(dbHandle)).toBe(4);

// The new column was backfilled with the split subwords.
const row = dbHandle.prepare('SELECT name_subwords FROM nodes WHERE id = ?').get('n1') as {
name_subwords: string;
};
expect(row.name_subwords).toContain('parser');

// Search end-to-end via QueryBuilder works against the migrated DB.
const q2 = new QueryBuilder(dbHandle);
const results = q2.searchNodes('parser', { limit: 10 });
expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();

dbHandle.close();
});

it('migration is idempotent if name_subwords column already exists', () => {
// Simulate a partial-failure scenario: the ALTER TABLE landed
// (DDL is auto-committed in SQLite even inside a transaction) but
// the rest didn't, so the column is present but the FTS hasn't been
// recreated and the schema_versions row hasn't been bumped.
const Database = require('better-sqlite3');
const dbHandle = new Database(path.join(dir, 'test.db'));
dbHandle.exec(`
CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
CREATE TABLE nodes (
id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
docstring TEXT, signature TEXT, visibility TEXT,
is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL,
name_subwords TEXT -- partial pre-existing state
);
`);
expect(() => runMigrations(dbHandle, 3)).not.toThrow();
expect(getCurrentVersion(dbHandle)).toBe(4);
dbHandle.close();
});
});
Loading