Skip to content

Commit ffb4e77

Browse files
committed
feat: wire TextProcessingPipeline + HnswIndexSidecar into RAG system
Integration:
- BM25Index accepts optional TextProcessingPipeline (replaces hardcoded regex tokenizer)
- SqlVectorStore.hybridSearch() uses pipeline when configured
- SqlVectorStore.query() uses HNSW sidecar fast path (O(log n)) when active
- SqlVectorStore.upsert() updates sidecar + checks activation threshold
- SqlVectorStore.initialize() creates HnswIndexSidecar alongside SQLite file
- SqlVectorStore.shutdown() persists and releases sidecar

Deprecation:
- HnswlibVectorStore marked deprecated with console.warn pointing to SqlVectorStore
- VectorStoreManager warns on type 'hnswlib' selection

Stop words:
- Add getNaturalStopWords() exporting natural's 170-word list as option
- Memory HnswSidecar annotated with forward reference to shared HnswIndexSidecar

779 tests passing (1 pre-existing AudioGen failure)
1 parent a9248d4 commit ffb4e77

7 files changed

Lines changed: 205 additions & 9 deletions

File tree

src/core/text-processing/filters/StopWordFilter.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ export const CODE_STOP_WORDS: ReadonlySet<string> = new Set([
4242
'or', 'if', 'while', 'about', 'up', 'out', 'also', 'it', 'its',
4343
]);
4444

45+
/**
46+
* Extended stop word list from the `natural` NLP library (170 words).
47+
* Loaded lazily — falls back to ENGLISH_STOP_WORDS if natural is unavailable.
48+
*/
49+
let _naturalStopWords: ReadonlySet<string> | null = null;
50+
export function getNaturalStopWords(): ReadonlySet<string> {
51+
if (_naturalStopWords) return _naturalStopWords;
52+
try {
53+
// eslint-disable-next-line @typescript-eslint/no-var-requires
54+
const natural = require('natural');
55+
if (natural.stopwords && Array.isArray(natural.stopwords)) {
56+
_naturalStopWords = new Set(natural.stopwords as string[]);
57+
return _naturalStopWords;
58+
}
59+
} catch { /* natural not installed */ }
60+
_naturalStopWords = ENGLISH_STOP_WORDS;
61+
return _naturalStopWords;
62+
}
63+
4564
/**
4665
* Filters tokens whose `.text` appears in the provided stop word set.
4766
* Case-sensitive — apply after LowercaseNormalizer for case-insensitive filtering.

src/core/text-processing/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export { LowercaseNormalizer } from './normalizers/LowercaseNormalizer';
1818
export { AccentStripper } from './normalizers/AccentStripper';
1919

2020
// Filters
21-
export { StopWordFilter, ENGLISH_STOP_WORDS, CODE_STOP_WORDS } from './filters/StopWordFilter';
21+
export { StopWordFilter, ENGLISH_STOP_WORDS, CODE_STOP_WORDS, getNaturalStopWords } from './filters/StopWordFilter';
2222

2323
// Stemmers
2424
export { PorterStemmer } from './stemmers/PorterStemmer';

src/memory/store/HnswSidecar.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,20 @@
99
* Auto-activates when trace count exceeds threshold (default: 1000).
1010
* Below that, brute-force cosine in the Memory facade is fast enough.
1111
*
12+
* NOTE: The generalized version of this pattern is now available at
13+
* `core/vector-search/HnswIndexSidecar`. This Memory-specific version
14+
* will be migrated to delegate to the shared module in a future update.
15+
* New code should use `HnswIndexSidecar` from `core/vector-search/` directly.
16+
*
1217
* Architecture:
1318
* ```
1419
* ~/.wunderland/agents/{name}/
1520
* ├── brain.sqlite ← source of truth
1621
* └── brain.hnsw ← HNSW index (rebuildable)
1722
* brain.hnsw.map.json ← label↔id mapping
1823
* ```
24+
*
25+
* @see core/vector-search/HnswIndexSidecar for the shared generalized version
1926
*/
2027

2128
import { existsSync, unlinkSync, writeFileSync, readFileSync } from 'node:fs';

src/rag/VectorStoreManager.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,12 @@ export class VectorStoreManager implements IVectorStoreManager {
180180
// Supports SQLite, PostgreSQL, IndexedDB, and more
181181
return new SqlVectorStore();
182182
case 'hnswlib':
183-
// HNSW-based vector store using hnswlib-node for fast ANN search
184-
// O(log n) queries, in-process, file-based persistence
183+
// DEPRECATED: Use SqlVectorStore (type: 'sql') instead — it now includes
184+
// automatic HNSW acceleration via HnswIndexSidecar.
185+
console.warn(
186+
'[DEPRECATED] VectorStore type "hnswlib" is deprecated. Use type "sql" instead — ' +
187+
'SqlVectorStore now includes automatic HNSW acceleration.',
188+
);
185189
return new HnswlibVectorStore();
186190
case 'qdrant':
187191
// Qdrant vector store via HTTP (self-hosted or cloud)

src/rag/implementations/vector_stores/HnswlibVectorStore.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
* @description HNSW-based vector store using hnswlib-node for fast approximate nearest neighbor search.
44
* Provides O(log n) query performance vs O(n) linear scan, with file-based persistence.
55
*
6+
* @deprecated Use SqlVectorStore instead — it now includes automatic HNSW acceleration
7+
* via the HnswIndexSidecar from core/vector-search/. SqlVectorStore provides the same
8+
* HNSW performance plus SQLite persistence, hybrid search (BM25 + vector), metadata
9+
* queries, and ACID transactions. Configure with `type: 'sql'` and the HNSW sidecar
10+
* activates automatically when document count crosses the threshold (default 1000).
11+
*
612
* @module AgentOS/RAG/VectorStores
713
* @version 1.0.0
814
*/
@@ -119,6 +125,11 @@ export class HnswlibVectorStore implements IVectorStore {
119125
private nodePath?: typeof import('node:path');
120126

121127
async initialize(config: VectorStoreProviderConfig): Promise<void> {
128+
console.warn(
129+
'[DEPRECATED] HnswlibVectorStore is deprecated. Use SqlVectorStore instead — ' +
130+
'it now includes automatic HNSW acceleration via sidecar index. ' +
131+
'See core/vector-search/HnswIndexSidecar for details.',
132+
);
122133
if (this.isInitialized) {
123134
console.warn(`[HnswlibVectorStore:${this.providerId}] Re-initializing.`);
124135
this.collections.clear();

src/rag/implementations/vector_stores/SqlVectorStore.ts

Lines changed: 141 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,28 @@ export interface SqlVectorStoreConfig extends VectorStoreProviderConfig {
112112
* @default 'agentos_rag_'
113113
*/
114114
tablePrefix?: string;
115+
116+
/**
117+
* Optional text processing pipeline for hybrid search tokenization.
118+
* Replaces the built-in regex tokenizer with configurable stemming,
119+
* lemmatization, and stop word handling.
120+
* @see createRagPipeline from core/text-processing
121+
*/
122+
pipeline?: import('../../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
123+
124+
/**
125+
* Document count threshold before HNSW sidecar activates.
126+
* Below this count, brute-force cosine similarity is used.
127+
* Set to 0 to disable HNSW. Set to Infinity to always use brute-force.
128+
* @default 1000
129+
*/
130+
hnswThreshold?: number;
131+
132+
/**
133+
* Embedding dimensions for the HNSW sidecar index.
134+
* @default 1536
135+
*/
136+
hnswDimensions?: number;
115137
}
116138

117139
// ============================================================================
@@ -196,6 +218,12 @@ export class SqlVectorStore implements IVectorStore {
196218
private readonly providerId: string;
197219
private tablePrefix: string = 'agentos_rag_';
198220

221+
/** Optional HNSW sidecar for O(log n) vector search when available. */
222+
private sidecar: import('../../../core/vector-search/HnswIndexSidecar').HnswIndexSidecar | null = null;
223+
224+
/** Optional text processing pipeline for hybrid search tokenization. */
225+
private pipeline?: import('../../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
226+
199227
/**
200228
* Constructs a SqlVectorStore instance.
201229
* The store is not operational until `initialize()` is called.
@@ -246,8 +274,33 @@ export class SqlVectorStore implements IVectorStore {
246274
// Create schema
247275
await this.createSchema();
248276

277+
// Store pipeline reference
278+
this.pipeline = this.config.pipeline;
279+
280+
// Initialize HNSW sidecar for accelerated vector search
281+
if (this.config.hnswThreshold !== Infinity) {
282+
try {
283+
const { HnswIndexSidecar } = await import('../../../core/vector-search/HnswIndexSidecar');
284+
this.sidecar = new HnswIndexSidecar();
285+
286+
// Derive sidecar index path from adapter config
287+
const storagePath = (this.config.storage as any)?.filePath ?? (this.config.storage as any)?.database;
288+
const indexPath = storagePath ? `${storagePath}.hnsw` : `/tmp/agentos-rag-${this.providerId}.hnsw`;
289+
290+
await this.sidecar.initialize({
291+
indexPath,
292+
dimensions: this.config.hnswDimensions ?? this.config.defaultEmbeddingDimension ?? 1536,
293+
metric: this.config.similarityMetric ?? 'cosine',
294+
activationThreshold: this.config.hnswThreshold ?? 1000,
295+
});
296+
} catch {
297+
/* HNSW sidecar unavailable (hnswlib-node not installed) — brute-force fallback */
298+
this.sidecar = null;
299+
}
300+
}
301+
249302
this.isInitialized = true;
250-
console.log(`SqlVectorStore (ID: ${this.providerId}, Config ID: ${this.config.id}) initialized successfully.`);
303+
console.log(`SqlVectorStore (ID: ${this.providerId}, Config ID: ${this.config.id}) initialized successfully${this.sidecar?.isAvailable() ? ' (HNSW sidecar ready)' : ''}.`);
251304
}
252305

253306
/**
@@ -526,6 +579,37 @@ export class SqlVectorStore implements IVectorStore {
526579
[countResult?.count ?? 0, now, collectionName]
527580
);
528581

582+
// ── HNSW sidecar: add upserted vectors + check threshold ──────────
583+
if (this.sidecar?.isAvailable() && upsertedIds.length > 0) {
584+
const docsWithEmbeddings = documents
585+
.filter(d => upsertedIds.includes(d.id) && d.embedding?.length > 0)
586+
.map(d => ({ id: d.id, embedding: d.embedding }));
587+
588+
if (this.sidecar.isActive()) {
589+
await this.sidecar.addBatch(docsWithEmbeddings);
590+
} else {
591+
// Check if we just crossed the activation threshold
592+
const docCount = countResult?.count ?? 0;
593+
const threshold = this.config.hnswThreshold ?? 1000;
594+
if (docCount >= threshold) {
595+
// Load ALL embeddings from SQLite and rebuild the HNSW index
596+
const allRows = await this.adapter.all<DocumentRow>(
597+
`SELECT id, embedding_blob FROM ${this.tablePrefix}documents WHERE collection_name = ?`,
598+
[collectionName],
599+
);
600+
const allItems = allRows
601+
.map(row => ({
602+
id: row.id,
603+
embedding: isLegacyJsonBlob(row.embedding_blob)
604+
? JSON.parse(row.embedding_blob as string) as number[]
605+
: blobToEmbedding(row.embedding_blob as Buffer),
606+
}))
607+
.filter(item => item.embedding.length > 0);
608+
await this.sidecar.rebuildFromData(allItems);
609+
}
610+
}
611+
}
612+
529613
return {
530614
upsertedCount: upsertedIds.length,
531615
upsertedIds,
@@ -561,6 +645,51 @@ export class SqlVectorStore implements IVectorStore {
561645
);
562646
}
563647

648+
// ── HNSW fast path ─────────────────────────────────────────────────
649+
// When the sidecar is active, use O(log n) ANN search to get top
650+
// candidates by ID, then fetch full documents from SQLite. Falls through
651+
// to brute-force when the sidecar is inactive or unavailable.
652+
if (this.sidecar?.isActive()) {
653+
const hnswCandidates = await this.sidecar.search(queryEmbedding, topK * 3);
654+
if (hnswCandidates.length > 0) {
655+
const candidateIds = hnswCandidates.map(c => c.id);
656+
const placeholders = candidateIds.map(() => '?').join(',');
657+
let hnswQuery = `SELECT * FROM ${this.tablePrefix}documents WHERE collection_name = ? AND id IN (${placeholders})`;
658+
const hnswParams: unknown[] = [collectionName, ...candidateIds];
659+
if (options?.filter) {
660+
const filterSQL = this.buildMetadataFilterSQL(options.filter);
661+
hnswQuery += filterSQL.clause;
662+
hnswParams.push(...filterSQL.params);
663+
}
664+
const rows = await this.adapter.all<DocumentRow>(hnswQuery, hnswParams);
665+
666+
const scoreMap = new Map(hnswCandidates.map(c => [c.id, c.score]));
667+
const candidates: RetrievedVectorDocument[] = rows.map(row => {
668+
const metadata = row.metadata_json ? JSON.parse(row.metadata_json) : undefined;
669+
const embedding = options?.includeEmbedding
670+
? (isLegacyJsonBlob(row.embedding_blob) ? JSON.parse(row.embedding_blob as string) : blobToEmbedding(row.embedding_blob as Buffer))
671+
: [];
672+
const doc: RetrievedVectorDocument = {
673+
id: row.id,
674+
embedding,
675+
similarityScore: scoreMap.get(row.id) ?? 0,
676+
};
677+
if (options?.includeMetadata !== false && metadata) doc.metadata = metadata;
678+
if (options?.includeTextContent && row.text_content) doc.textContent = row.text_content;
679+
return doc;
680+
}).filter(d => options?.minSimilarityScore === undefined || d.similarityScore >= options.minSimilarityScore);
681+
682+
candidates.sort((a, b) => b.similarityScore - a.similarityScore);
683+
const results = candidates.slice(0, topK);
684+
return {
685+
documents: results,
686+
queryId: `sql-hnsw-query-${uuidv4()}`,
687+
stats: { totalCandidates: hnswCandidates.length, filteredCandidates: candidates.length, returnedCount: results.length },
688+
};
689+
}
690+
}
691+
692+
// ── Brute-force fallback ──────────────────────────────────────────
564693
// Build query with SQL-level metadata filtering via json_extract()
565694
let query = `SELECT * FROM ${this.tablePrefix}documents WHERE collection_name = ?`;
566695
const params: unknown[] = [collectionName];
@@ -692,11 +821,12 @@ export class SqlVectorStore implements IVectorStore {
692821
}
693822
const rows = await this.adapter.all<DocumentRow>(hybridQuery, hybridParams);
694823

695-
const tokenize = (text: string): string[] =>
696-
text
697-
.toLowerCase()
698-
.split(/[^a-z0-9_]+/g)
699-
.filter((t) => t.length > 2);
824+
const tokenize = (text: string): string[] => {
825+
/* Use pluggable pipeline when configured */
826+
if (this.pipeline) return this.pipeline.processToStrings(text);
827+
/* Fallback: built-in regex tokenizer */
828+
return text.toLowerCase().split(/[^a-z0-9_]+/g).filter((t) => t.length > 2);
829+
};
700830

701831
const queryTerms = tokenize(queryText);
702832
const queryTermSet = new Set(queryTerms);
@@ -1026,6 +1156,11 @@ export class SqlVectorStore implements IVectorStore {
10261156
return;
10271157
}
10281158

1159+
if (this.sidecar) {
1160+
await this.sidecar.shutdown();
1161+
this.sidecar = null;
1162+
}
1163+
10291164
if (this.ownsAdapter && this.adapter) {
10301165
await this.adapter.close();
10311166
}

src/rag/search/BM25Index.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ export interface BM25Config {
7575
k1?: number;
7676
/** Document length normalization factor. Default: 0.75. */
7777
b?: number;
78+
/**
79+
* Optional text processing pipeline for tokenization.
80+
* When provided, replaces the built-in regex tokenizer with configurable
81+
* stemming, lemmatization, and stop word handling.
82+
* @see createRagPipeline from core/text-processing for the recommended default.
83+
*/
84+
pipeline?: import('../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
7885
}
7986

8087
/**
@@ -181,6 +188,13 @@ export class BM25Index {
181188
/** Whether the IDF cache needs recomputation. */
182189
private idfDirty: boolean;
183190

191+
/**
192+
* Optional pluggable text processing pipeline. When set, replaces the
193+
* built-in regex tokenizer with configurable stemming, lemmatization,
194+
* and stop word handling.
195+
*/
196+
private pipeline?: import('../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
197+
184198
/**
185199
* Creates a new BM25 index.
186200
*
@@ -200,6 +214,7 @@ export class BM25Index {
200214
constructor(config?: BM25Config) {
201215
this.k1 = config?.k1 ?? 1.2;
202216
this.b = config?.b ?? 0.75;
217+
this.pipeline = config?.pipeline;
203218
this.documents = new Map();
204219
this.invertedIndex = new Map();
205220
this.idf = new Map();
@@ -224,6 +239,11 @@ export class BM25Index {
224239
* ```
225240
*/
226241
private tokenize(text: string): string[] {
242+
/* Use pluggable pipeline when configured (supports stemming, lemmatization, etc.) */
243+
if (this.pipeline) {
244+
return this.pipeline.processToStrings(text);
245+
}
246+
/* Fallback: built-in regex tokenizer (backwards compatible) */
227247
return text
228248
.toLowerCase()
229249
.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*~`+=]+/)

0 commit comments

Comments (0)