Skip to content

Commit ffb4e77

Browse files
committed
feat: wire TextProcessingPipeline + HnswIndexSidecar into RAG system
Integration:
- BM25Index accepts optional TextProcessingPipeline (replaces hardcoded regex tokenizer)
- SqlVectorStore.hybridSearch() uses pipeline when configured
- SqlVectorStore.query() uses HNSW sidecar fast path (O(log n)) when active
- SqlVectorStore.upsert() updates sidecar + checks activation threshold
- SqlVectorStore.initialize() creates HnswIndexSidecar alongside SQLite file
- SqlVectorStore.shutdown() persists and releases sidecar

Deprecation:
- HnswlibVectorStore marked deprecated with console.warn pointing to SqlVectorStore
- VectorStoreManager warns on type 'hnswlib' selection

Stop words:
- Add getNaturalStopWords() exporting natural's 170-word list as option
- Memory HnswSidecar annotated with forward reference to shared HnswIndexSidecar

779 tests passing (1 pre-existing AudioGen failure)
1 parent a9248d4 commit ffb4e77

7 files changed

Lines changed: 205 additions & 9 deletions

File tree

src/core/text-processing/filters/StopWordFilter.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ export const CODE_STOP_WORDS: ReadonlySet<string> = new Set([
4242
'or', 'if', 'while', 'about', 'up', 'out', 'also', 'it', 'its',
4343
]);
4444

45+
/**
46+
* Extended stop word list from the `natural` NLP library (170 words).
47+
* Loaded lazily — falls back to ENGLISH_STOP_WORDS if natural is unavailable.
48+
*/
49+
let _naturalStopWords: ReadonlySet<string> | null = null;
50+
export function getNaturalStopWords(): ReadonlySet<string> {
51+
if (_naturalStopWords) return _naturalStopWords;
52+
try {
53+
// eslint-disable-next-line @typescript-eslint/no-var-requires
54+
const natural = require('natural');
55+
if (natural.stopwords && Array.isArray(natural.stopwords)) {
56+
_naturalStopWords = new Set(natural.stopwords as string[]);
57+
return _naturalStopWords;
58+
}
59+
} catch { /* natural not installed */ }
60+
_naturalStopWords = ENGLISH_STOP_WORDS;
61+
return _naturalStopWords;
62+
}
63+
4564
/**
4665
* Filters tokens whose `.text` appears in the provided stop word set.
4766
* Case-sensitive — apply after LowercaseNormalizer for case-insensitive filtering.

src/core/text-processing/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export { LowercaseNormalizer } from './normalizers/LowercaseNormalizer';
1818
export { AccentStripper } from './normalizers/AccentStripper';
1919

2020
// Filters
21-
export { StopWordFilter, ENGLISH_STOP_WORDS, CODE_STOP_WORDS } from './filters/StopWordFilter';
21+
export { StopWordFilter, ENGLISH_STOP_WORDS, CODE_STOP_WORDS, getNaturalStopWords } from './filters/StopWordFilter';
2222

2323
// Stemmers
2424
export { PorterStemmer } from './stemmers/PorterStemmer';

src/memory/store/HnswSidecar.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,20 @@
99
* Auto-activates when trace count exceeds threshold (default: 1000).
1010
* Below that, brute-force cosine in the Memory facade is fast enough.
1111
*
12+
* NOTE: The generalized version of this pattern is now available at
13+
* `core/vector-search/HnswIndexSidecar`. This Memory-specific version
14+
* will be migrated to delegate to the shared module in a future update.
15+
* New code should use `HnswIndexSidecar` from `core/vector-search/` directly.
16+
*
1217
* Architecture:
1318
* ```
1419
* ~/.wunderland/agents/{name}/
1520
* ├── brain.sqlite ← source of truth
1621
* └── brain.hnsw ← HNSW index (rebuildable)
1722
* brain.hnsw.map.json ← label↔id mapping
1823
* ```
24+
*
25+
* @see core/vector-search/HnswIndexSidecar for the shared generalized version
1926
*/
2027

2128
import { existsSync, unlinkSync, writeFileSync, readFileSync } from 'node:fs';

src/rag/VectorStoreManager.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,12 @@ export class VectorStoreManager implements IVectorStoreManager {
180180
// Supports SQLite, PostgreSQL, IndexedDB, and more
181181
return new SqlVectorStore();
182182
case 'hnswlib':
183-
// HNSW-based vector store using hnswlib-node for fast ANN search
184-
// O(log n) queries, in-process, file-based persistence
183+
// DEPRECATED: Use SqlVectorStore (type: 'sql') instead — it now includes
184+
// automatic HNSW acceleration via HnswIndexSidecar.
185+
console.warn(
186+
'[DEPRECATED] VectorStore type "hnswlib" is deprecated. Use type "sql" instead — ' +
187+
'SqlVectorStore now includes automatic HNSW acceleration.',
188+
);
185189
return new HnswlibVectorStore();
186190
case 'qdrant':
187191
// Qdrant vector store via HTTP (self-hosted or cloud)

src/rag/implementations/vector_stores/HnswlibVectorStore.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
* @description HNSW-based vector store using hnswlib-node for fast approximate nearest neighbor search.
44
* Provides O(log n) query performance vs O(n) linear scan, with file-based persistence.
55
*
6+
* @deprecated Use SqlVectorStore instead — it now includes automatic HNSW acceleration
7+
* via the HnswIndexSidecar from core/vector-search/. SqlVectorStore provides the same
8+
* HNSW performance plus SQLite persistence, hybrid search (BM25 + vector), metadata
9+
* queries, and ACID transactions. Configure with `type: 'sql'` and the HNSW sidecar
10+
* activates automatically when document count crosses the threshold (default 1000).
11+
*
612
* @module AgentOS/RAG/VectorStores
713
* @version 1.0.0
814
*/
@@ -119,6 +125,11 @@ export class HnswlibVectorStore implements IVectorStore {
119125
private nodePath?: typeof import('node:path');
120126

121127
async initialize(config: VectorStoreProviderConfig): Promise<void> {
128+
console.warn(
129+
'[DEPRECATED] HnswlibVectorStore is deprecated. Use SqlVectorStore instead — ' +
130+
'it now includes automatic HNSW acceleration via sidecar index. ' +
131+
'See core/vector-search/HnswIndexSidecar for details.',
132+
);
122133
if (this.isInitialized) {
123134
console.warn(`[HnswlibVectorStore:${this.providerId}] Re-initializing.`);
124135
this.collections.clear();

src/rag/implementations/vector_stores/SqlVectorStore.ts

Lines changed: 141 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,28 @@ export interface SqlVectorStoreConfig extends VectorStoreProviderConfig {
112112
* @default 'agentos_rag_'
113113
*/
114114
tablePrefix?: string;
115+
116+
/**
117+
* Optional text processing pipeline for hybrid search tokenization.
118+
* Replaces the built-in regex tokenizer with configurable stemming,
119+
* lemmatization, and stop word handling.
120+
* @see createRagPipeline from core/text-processing
121+
*/
122+
pipeline?: import('../../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
123+
124+
/**
125+
* Document count threshold before HNSW sidecar activates.
126+
* Below this count, brute-force cosine similarity is used.
127+
* Set to 0 to disable HNSW. Set to Infinity to always use brute-force.
128+
* @default 1000
129+
*/
130+
hnswThreshold?: number;
131+
132+
/**
133+
* Embedding dimensions for the HNSW sidecar index.
134+
* @default 1536
135+
*/
136+
hnswDimensions?: number;
115137
}
116138

117139
// ============================================================================
@@ -196,6 +218,12 @@ export class SqlVectorStore implements IVectorStore {
196218
private readonly providerId: string;
197219
private tablePrefix: string = 'agentos_rag_';
198220

221+
/** Optional HNSW sidecar for O(log n) vector search when available. */
222+
private sidecar: import('../../../core/vector-search/HnswIndexSidecar').HnswIndexSidecar | null = null;
223+
224+
/** Optional text processing pipeline for hybrid search tokenization. */
225+
private pipeline?: import('../../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
226+
199227
/**
200228
* Constructs a SqlVectorStore instance.
201229
* The store is not operational until `initialize()` is called.
@@ -246,8 +274,33 @@ export class SqlVectorStore implements IVectorStore {
246274
// Create schema
247275
await this.createSchema();
248276

277+
// Store pipeline reference
278+
this.pipeline = this.config.pipeline;
279+
280+
// Initialize HNSW sidecar for accelerated vector search
281+
if (this.config.hnswThreshold !== Infinity) {
282+
try {
283+
const { HnswIndexSidecar } = await import('../../../core/vector-search/HnswIndexSidecar');
284+
this.sidecar = new HnswIndexSidecar();
285+
286+
// Derive sidecar index path from adapter config
287+
const storagePath = (this.config.storage as any)?.filePath ?? (this.config.storage as any)?.database;
288+
const indexPath = storagePath ? `${storagePath}.hnsw` : `/tmp/agentos-rag-${this.providerId}.hnsw`;
289+
290+
await this.sidecar.initialize({
291+
indexPath,
292+
dimensions: this.config.hnswDimensions ?? this.config.defaultEmbeddingDimension ?? 1536,
293+
metric: this.config.similarityMetric ?? 'cosine',
294+
activationThreshold: this.config.hnswThreshold ?? 1000,
295+
});
296+
} catch {
297+
/* HNSW sidecar unavailable (hnswlib-node not installed) — brute-force fallback */
298+
this.sidecar = null;
299+
}
300+
}
301+
249302
this.isInitialized = true;
250-
console.log(`SqlVectorStore (ID: ${this.providerId}, Config ID: ${this.config.id}) initialized successfully.`);
303+
console.log(`SqlVectorStore (ID: ${this.providerId}, Config ID: ${this.config.id}) initialized successfully${this.sidecar?.isAvailable() ? ' (HNSW sidecar ready)' : ''}.`);
251304
}
252305

253306
/**
@@ -526,6 +579,37 @@ export class SqlVectorStore implements IVectorStore {
526579
[countResult?.count ?? 0, now, collectionName]
527580
);
528581

582+
// ── HNSW sidecar: add upserted vectors + check threshold ──────────
583+
if (this.sidecar?.isAvailable() && upsertedIds.length > 0) {
584+
const docsWithEmbeddings = documents
585+
.filter(d => upsertedIds.includes(d.id) && d.embedding?.length > 0)
586+
.map(d => ({ id: d.id, embedding: d.embedding }));
587+
588+
if (this.sidecar.isActive()) {
589+
await this.sidecar.addBatch(docsWithEmbeddings);
590+
} else {
591+
// Check if we just crossed the activation threshold
592+
const docCount = countResult?.count ?? 0;
593+
const threshold = this.config.hnswThreshold ?? 1000;
594+
if (docCount >= threshold) {
595+
// Load ALL embeddings from SQLite and rebuild the HNSW index
596+
const allRows = await this.adapter.all<DocumentRow>(
597+
`SELECT id, embedding_blob FROM ${this.tablePrefix}documents WHERE collection_name = ?`,
598+
[collectionName],
599+
);
600+
const allItems = allRows
601+
.map(row => ({
602+
id: row.id,
603+
embedding: isLegacyJsonBlob(row.embedding_blob)
604+
? JSON.parse(row.embedding_blob as string) as number[]
605+
: blobToEmbedding(row.embedding_blob as Buffer),
606+
}))
607+
.filter(item => item.embedding.length > 0);
608+
await this.sidecar.rebuildFromData(allItems);
609+
}
610+
}
611+
}
612+
529613
return {
530614
upsertedCount: upsertedIds.length,
531615
upsertedIds,
@@ -561,6 +645,51 @@ export class SqlVectorStore implements IVectorStore {
561645
);
562646
}
563647

648+
// ── HNSW fast path ─────────────────────────────────────────────────
649+
// When the sidecar is active, use O(log n) ANN search to get top
650+
// candidates by ID, then fetch full documents from SQLite. Falls through
651+
// to brute-force when the sidecar is inactive or unavailable.
652+
if (this.sidecar?.isActive()) {
653+
const hnswCandidates = await this.sidecar.search(queryEmbedding, topK * 3);
654+
if (hnswCandidates.length > 0) {
655+
const candidateIds = hnswCandidates.map(c => c.id);
656+
const placeholders = candidateIds.map(() => '?').join(',');
657+
let hnswQuery = `SELECT * FROM ${this.tablePrefix}documents WHERE collection_name = ? AND id IN (${placeholders})`;
658+
const hnswParams: unknown[] = [collectionName, ...candidateIds];
659+
if (options?.filter) {
660+
const filterSQL = this.buildMetadataFilterSQL(options.filter);
661+
hnswQuery += filterSQL.clause;
662+
hnswParams.push(...filterSQL.params);
663+
}
664+
const rows = await this.adapter.all<DocumentRow>(hnswQuery, hnswParams);
665+
666+
const scoreMap = new Map(hnswCandidates.map(c => [c.id, c.score]));
667+
const candidates: RetrievedVectorDocument[] = rows.map(row => {
668+
const metadata = row.metadata_json ? JSON.parse(row.metadata_json) : undefined;
669+
const embedding = options?.includeEmbedding
670+
? (isLegacyJsonBlob(row.embedding_blob) ? JSON.parse(row.embedding_blob as string) : blobToEmbedding(row.embedding_blob as Buffer))
671+
: [];
672+
const doc: RetrievedVectorDocument = {
673+
id: row.id,
674+
embedding,
675+
similarityScore: scoreMap.get(row.id) ?? 0,
676+
};
677+
if (options?.includeMetadata !== false && metadata) doc.metadata = metadata;
678+
if (options?.includeTextContent && row.text_content) doc.textContent = row.text_content;
679+
return doc;
680+
}).filter(d => options?.minSimilarityScore === undefined || d.similarityScore >= options.minSimilarityScore);
681+
682+
candidates.sort((a, b) => b.similarityScore - a.similarityScore);
683+
const results = candidates.slice(0, topK);
684+
return {
685+
documents: results,
686+
queryId: `sql-hnsw-query-${uuidv4()}`,
687+
stats: { totalCandidates: hnswCandidates.length, filteredCandidates: candidates.length, returnedCount: results.length },
688+
};
689+
}
690+
}
691+
692+
// ── Brute-force fallback ──────────────────────────────────────────
564693
// Build query with SQL-level metadata filtering via json_extract()
565694
let query = `SELECT * FROM ${this.tablePrefix}documents WHERE collection_name = ?`;
566695
const params: unknown[] = [collectionName];
@@ -692,11 +821,12 @@ export class SqlVectorStore implements IVectorStore {
692821
}
693822
const rows = await this.adapter.all<DocumentRow>(hybridQuery, hybridParams);
694823

695-
const tokenize = (text: string): string[] =>
696-
text
697-
.toLowerCase()
698-
.split(/[^a-z0-9_]+/g)
699-
.filter((t) => t.length > 2);
824+
const tokenize = (text: string): string[] => {
825+
/* Use pluggable pipeline when configured */
826+
if (this.pipeline) return this.pipeline.processToStrings(text);
827+
/* Fallback: built-in regex tokenizer */
828+
return text.toLowerCase().split(/[^a-z0-9_]+/g).filter((t) => t.length > 2);
829+
};
700830

701831
const queryTerms = tokenize(queryText);
702832
const queryTermSet = new Set(queryTerms);
@@ -1026,6 +1156,11 @@ export class SqlVectorStore implements IVectorStore {
10261156
return;
10271157
}
10281158

1159+
if (this.sidecar) {
1160+
await this.sidecar.shutdown();
1161+
this.sidecar = null;
1162+
}
1163+
10291164
if (this.ownsAdapter && this.adapter) {
10301165
await this.adapter.close();
10311166
}

src/rag/search/BM25Index.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ export interface BM25Config {
7575
k1?: number;
7676
/** Document length normalization factor. Default: 0.75. */
7777
b?: number;
78+
/**
79+
* Optional text processing pipeline for tokenization.
80+
* When provided, replaces the built-in regex tokenizer with configurable
81+
* stemming, lemmatization, and stop word handling.
82+
* @see createRagPipeline from core/text-processing for the recommended default.
83+
*/
84+
pipeline?: import('../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
7885
}
7986

8087
/**
@@ -181,6 +188,13 @@ export class BM25Index {
181188
/** Whether the IDF cache needs recomputation. */
182189
private idfDirty: boolean;
183190

191+
/**
192+
* Optional pluggable text processing pipeline. When set, replaces the
193+
* built-in regex tokenizer with configurable stemming, lemmatization,
194+
* and stop word handling.
195+
*/
196+
private pipeline?: import('../../core/text-processing/TextProcessingPipeline').TextProcessingPipeline;
197+
184198
/**
185199
* Creates a new BM25 index.
186200
*
@@ -200,6 +214,7 @@ export class BM25Index {
200214
constructor(config?: BM25Config) {
201215
this.k1 = config?.k1 ?? 1.2;
202216
this.b = config?.b ?? 0.75;
217+
this.pipeline = config?.pipeline;
203218
this.documents = new Map();
204219
this.invertedIndex = new Map();
205220
this.idf = new Map();
@@ -224,6 +239,11 @@ export class BM25Index {
224239
* ```
225240
*/
226241
private tokenize(text: string): string[] {
242+
/* Use pluggable pipeline when configured (supports stemming, lemmatization, etc.) */
243+
if (this.pipeline) {
244+
return this.pipeline.processToStrings(text);
245+
}
246+
/* Fallback: built-in regex tokenizer (backwards compatible) */
227247
return text
228248
.toLowerCase()
229249
.split(/[\s\-_.,;:!?'"()[\]{}<>/\\|@#$%^&*~`+=]+/)

0 commit comments

Comments (0)