framersai
diff --git a/src/rag/index.ts b/src/rag/index.ts
Lines changed: 6 additions & 0 deletions
diff --git a/src/rag/multimodal/LLMVisionAdapter.ts b/src/rag/multimodal/LLMVisionAdapter.ts
Lines changed: 49 additions & 0 deletions
diff --git a/src/rag/multimodal/SpeechProviderAdapter.ts b/src/rag/multimodal/SpeechProviderAdapter.ts
Lines changed: 167 additions & 0 deletions
diff --git a/src/rag/multimodal/__tests__/LLMVisionAdapter.spec.ts b/src/rag/multimodal/__tests__/LLMVisionAdapter.spec.ts
Lines changed: 66 additions & 0 deletions
@@ -193,6 +193,12 @@ export {
 // ============================================================================
 
 export { MultimodalIndexer } from './multimodal/index.js';
+export { SpeechProviderAdapter } from './multimodal/index.js';
+export { LLMVisionAdapter, type LLMVisionAdapterConfig } from './multimodal/index.js';
+export {
+  createMultimodalIndexerFromResolver,
+  type MultimodalIndexerFromResolverOptions,
+} from './multimodal/index.js';
 
 export type {
   ContentModality,
 
@@ -0,0 +1,49 @@
+/**
+ * @module rag/multimodal/LLMVisionAdapter
+ *
+ * Wraps a vision-capable LLM as an {@link IVisionProvider} for the
+ * multimodal RAG indexer.
+ *
+ * Unlike the full {@link VisionPipeline}, which runs OCR, handwriting,
+ * and document-AI tiers before escalating to cloud, this adapter goes
+ * straight to the LLM — making it the simplest path for teams that
+ * only need cloud vision and don't want the multi-tier pipeline.
+ *
+ * ## Relationship to LLMVisionProvider
+ *
+ * The `core/vision/providers/LLMVisionProvider` class fills the same
+ * role and already exists. This file re-exports it under the multimodal
+ * module namespace so consumers importing from `rag/multimodal` can
+ * access it without reaching into `core/vision/`. The underlying
+ * implementation is identical — this is a convenience re-export plus
+ * an alias type.
+ *
+ * @see {@link LLMVisionProvider} for the implementation.
+ * @see {@link PipelineVisionProvider} for the full-pipeline alternative.
+ * @see {@link IVisionProvider} for the interface contract.
+ *
+ * @example
+ * ```typescript
+ * import { LLMVisionAdapter } from './LLMVisionAdapter.js';
+ *
+ * const vision = new LLMVisionAdapter({
+ *   provider: 'openai',
+ *   model: 'gpt-4o',
+ *   prompt: 'Describe this image for a RAG search index.',
+ * });
+ *
+ * const indexer = new MultimodalIndexer({
+ *   embeddingManager,
+ *   vectorStore,
+ *   visionProvider: vision,
+ * });
+ * ```
+ */
+
+// Re-export the existing LLMVisionProvider from core/vision so that
+// consumers importing from the multimodal module don't need to reach
+// into core/vision/ directly. The underlying class is unchanged.
+export {
+  LLMVisionProvider as LLMVisionAdapter,
+  type LLMVisionProviderConfig as LLMVisionAdapterConfig,
+} from '../../core/vision/providers/LLMVisionProvider.js';
@@ -0,0 +1,167 @@
+/**
+ * @module rag/multimodal/SpeechProviderAdapter
+ *
+ * Adapts the voice-pipeline's {@link SpeechToTextProvider} to the narrow
+ * {@link ISpeechToTextProvider} interface expected by the multimodal RAG
+ * indexer.
+ *
+ * ## Why this adapter exists
+ *
+ * The speech subsystem (`src/speech/`) and the multimodal RAG pipeline
+ * (`src/rag/multimodal/`) each define their own STT contract:
+ *
+ * | Contract                  | Input                | Output                          |
+ * |---------------------------|----------------------|---------------------------------|
+ * | `SpeechToTextProvider`    | `SpeechAudioInput`   | `SpeechTranscriptionResult`     |
+ * | `ISpeechToTextProvider`   | `Buffer`             | `string`                        |
+ *
+ * The voice pipeline's providers (Whisper, Deepgram, AssemblyAI, Azure)
+ * implement the richer `SpeechToTextProvider` contract. This adapter
+ * wraps any of them so the multimodal indexer can consume them without
+ * requiring separate STT configuration.
+ *
+ * ## Mapping details
+ *
+ * - **Input**: The raw `Buffer` is wrapped in a `SpeechAudioInput` with
+ *   a default MIME type of `audio/wav`. The optional `language` parameter
+ *   is forwarded via `SpeechTranscriptionOptions.language`.
+ *
+ * - **Output**: The full `SpeechTranscriptionResult` is reduced to just
+ *   the `text` string. Rich metadata (segments, confidence, usage) is
+ *   intentionally discarded because the indexer only needs the text for
+ *   embedding generation.
+ *
+ * @see {@link SpeechToTextProvider} for the voice pipeline contract.
+ * @see {@link ISpeechToTextProvider} for the multimodal indexer contract.
+ * @see {@link SpeechProviderResolver} for resolving STT providers.
+ *
+ * @example
+ * ```typescript
+ * import { SpeechProviderResolver } from '../../speech/SpeechProviderResolver.js';
+ * import { SpeechProviderAdapter } from './SpeechProviderAdapter.js';
+ *
+ * const resolver = new SpeechProviderResolver();
+ * await resolver.refresh();
+ * const stt = resolver.resolveSTT();
+ * const adapter = new SpeechProviderAdapter(stt);
+ *
+ * const indexer = new MultimodalIndexer({ sttProvider: adapter, ... });
+ * ```
+ */
+
+import type { SpeechToTextProvider } from '../../speech/types.js';
+import type { ISpeechToTextProvider } from './types.js';
+
+// ---------------------------------------------------------------------------
+// Implementation
+// ---------------------------------------------------------------------------
+
+/**
+ * Bridges the voice-pipeline's `SpeechToTextProvider` to the multimodal
+ * indexer's `ISpeechToTextProvider` interface.
+ *
+ * Converts raw `Buffer` audio into the `SpeechAudioInput` shape expected
+ * by voice providers, forwards the language hint through
+ * `SpeechTranscriptionOptions`, and extracts the plain transcript text
+ * from the rich `SpeechTranscriptionResult`.
+ *
+ * @example
+ * ```typescript
+ * const whisper = resolver.resolveSTT();
+ * const adapted = new SpeechProviderAdapter(whisper);
+ *
+ * // Now usable by the multimodal indexer:
+ * const text = await adapted.transcribe(audioBuffer, 'en');
+ * ```
+ */
+export class SpeechProviderAdapter implements ISpeechToTextProvider {
+  /**
+   * Voice-pipeline STT provider that every transcription call is
+   * delegated to. Assigned once in the constructor and never replaced.
+   */
+  private readonly _provider: SpeechToTextProvider;
+
+  /**
+   * MIME type attached to each audio buffer handed to the provider.
+   * Fixed per adapter instance at construction time.
+   */
+  private readonly _defaultMimeType: string;
+
+  /**
+   * Wrap a voice-pipeline STT provider for use by the multimodal indexer.
+   *
+   * @param provider - The `SpeechToTextProvider` instance to delegate to.
+   * @param defaultMimeType - MIME type reported for every buffer passed
+   *   to {@link SpeechProviderAdapter.transcribe}. Defaults to
+   *   `'audio/wav'`; pass e.g. `'audio/mpeg'` or `'audio/ogg'` when the
+   *   buffers hold MP3/OGG data.
+   *
+   * @throws {Error} If `provider` is falsy (e.g. `null` or `undefined`).
+   *
+   * @example
+   * ```typescript
+   * const adapter = new SpeechProviderAdapter(whisperProvider);
+   * const mp3Adapter = new SpeechProviderAdapter(whisperProvider, 'audio/mpeg');
+   * ```
+   */
+  constructor(provider: SpeechToTextProvider, defaultMimeType = 'audio/wav') {
+    // Fail fast: without a provider there is nothing to delegate to.
+    if (!provider) {
+      const message =
+        'SpeechProviderAdapter: a SpeechToTextProvider instance is required.';
+      throw new Error(message);
+    }
+
+    this._defaultMimeType = defaultMimeType;
+    this._provider = provider;
+  }
+
+  /**
+   * Transcribe an audio buffer and return only the transcript text.
+   *
+   * The buffer is paired with this adapter's configured MIME type and
+   * passed to the wrapped provider. An options object carrying the
+   * language is supplied only when `language` is truthy; otherwise the
+   * provider receives `undefined` for its options argument. Everything
+   * on the provider's result except `text` is dropped.
+   *
+   * @param audio - Raw audio data as a Buffer.
+   * @param language - Optional language hint (e.g. `'en'`, `'es'`, `'ja'`)
+   *   forwarded to the provider as `{ language }`.
+   * @returns The `text` field of the provider's transcription result.
+   *
+   * @throws Whatever the wrapped provider's `transcribe` rejects with;
+   *   this method does not catch or wrap provider errors.
+   *
+   * @example
+   * ```typescript
+   * const transcript = await adapter.transcribe(wavBuffer);
+   * const spanishTranscript = await adapter.transcribe(audioBuffer, 'es');
+   * ```
+   */
+  async transcribe(audio: Buffer, language?: string): Promise<string> {
+    const audioInput = { data: audio, mimeType: this._defaultMimeType };
+    const options = language ? { language } : undefined;
+
+    const { text } = await this._provider.transcribe(audioInput, options);
+    return text;
+  }
+
+  /**
+   * Name of the wrapped STT provider, for logging and diagnostics.
+   *
+   * @returns The provider's `displayName` when it is neither `null` nor
+   *   `undefined`; otherwise the provider's `id`.
+   *
+   * @example
+   * ```typescript
+   * console.log(`STT via: ${adapter.getProviderName()}`);
+   * ```
+   */
+  getProviderName(): string {
+    const label = this._provider.displayName;
+    return label ?? this._provider.id;
+  }
+}
@@ -0,0 +1,66 @@
+/**
+ * @module rag/multimodal/__tests__/LLMVisionAdapter.spec
+ *
+ * Unit tests for the {@link LLMVisionAdapter} re-export.
+ *
+ * The LLMVisionAdapter is a convenience re-export of `LLMVisionProvider`
+ * from `core/vision/providers/`. These tests verify that:
+ *
+ * - The re-export resolves to the correct class
+ * - The adapter implements IVisionProvider
+ * - Constructor validates required config
+ *
+ * More thorough LLMVisionProvider tests live in
+ * `core/vision/__tests__/LLMVisionProvider.spec.ts`. This file only
+ * validates the re-export wiring and basic contract.
+ */
+
+import { describe, it, expect } from 'vitest';
+import { LLMVisionAdapter, type LLMVisionAdapterConfig } from '../LLMVisionAdapter.js';
+import { LLMVisionProvider } from '../../../core/vision/providers/LLMVisionProvider.js';
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe('LLMVisionAdapter', () => {
+  it('should be the same class as LLMVisionProvider', () => {
+    // Identity check: the alias must point at the very same constructor,
+    // not a subclass or wrapper.
+    expect(LLMVisionAdapter).toBe(LLMVisionProvider);
+  });
+
+  it('should throw if provider name is missing', () => {
+    const constructWithEmptyProvider = () => new LLMVisionAdapter({ provider: '' });
+
+    expect(constructWithEmptyProvider).toThrow(/provider name is required/);
+  });
+
+  it('should construct with valid config', () => {
+    const validConfig: LLMVisionAdapterConfig = { provider: 'openai', model: 'gpt-4o' };
+
+    expect(new LLMVisionAdapter(validConfig)).toBeInstanceOf(LLMVisionProvider);
+  });
+
+  it('should have a describeImage method (implements IVisionProvider)', () => {
+    const visionAdapter = new LLMVisionAdapter({ provider: 'openai' });
+    const memberKind = typeof visionAdapter.describeImage;
+
+    expect(memberKind).toBe('function');
+  });
+
+  it('should accept custom prompt in config', () => {
+    // Only the config shape is under test here: construction with every
+    // optional field populated must not throw.
+    const fullConfig: LLMVisionAdapterConfig = {
+      provider: 'anthropic',
+      model: 'claude-sonnet-4-20250514',
+      prompt: 'Describe this image for a search index.',
+      apiKey: 'test-key',
+      baseUrl: 'https://custom.endpoint.com',
+    };
+    const constructWithFullConfig = () => new LLMVisionAdapter(fullConfig);
+
+    expect(constructWithFullConfig).not.toThrow();
+  });
+});