|
| 1 | +/** |
| 2 | + * @module rag/multimodal/SpeechProviderAdapter |
| 3 | + * |
| 4 | + * Adapts the voice-pipeline's {@link SpeechToTextProvider} to the narrow |
| 5 | + * {@link ISpeechToTextProvider} interface expected by the multimodal RAG |
| 6 | + * indexer. |
| 7 | + * |
| 8 | + * ## Why this adapter exists |
| 9 | + * |
| 10 | + * The speech subsystem (`src/speech/`) and the multimodal RAG pipeline |
| 11 | + * (`src/rag/multimodal/`) each define their own STT contract: |
| 12 | + * |
| 13 | + * | Contract | Input | Output | |
| 14 | + * |---------------------------|----------------------|---------------------------------| |
| 15 | + * | `SpeechToTextProvider` | `SpeechAudioInput` | `SpeechTranscriptionResult` | |
| 16 | + * | `ISpeechToTextProvider` | `Buffer` | `string` | |
| 17 | + * |
| 18 | + * The voice pipeline's providers (Whisper, Deepgram, AssemblyAI, Azure) |
| 19 | + * implement the richer `SpeechToTextProvider` contract. This adapter |
| 20 | + * wraps any of them so the multimodal indexer can consume them without |
| 21 | + * requiring separate STT configuration. |
| 22 | + * |
| 23 | + * ## Mapping details |
| 24 | + * |
| 25 | + * - **Input**: The raw `Buffer` is wrapped in a `SpeechAudioInput` with |
| 26 | + * a default MIME type of `audio/wav`. The optional `language` parameter |
| 27 | + * is forwarded via `SpeechTranscriptionOptions.language`. |
| 28 | + * |
| 29 | + * - **Output**: The full `SpeechTranscriptionResult` is reduced to just |
| 30 | + * the `text` string. Rich metadata (segments, confidence, usage) is |
| 31 | + * intentionally discarded because the indexer only needs the text for |
| 32 | + * embedding generation. |
| 33 | + * |
| 34 | + * @see {@link SpeechToTextProvider} for the voice pipeline contract. |
| 35 | + * @see {@link ISpeechToTextProvider} for the multimodal indexer contract. |
| 36 | + * @see {@link SpeechProviderResolver} for resolving STT providers. |
| 37 | + * |
| 38 | + * @example |
| 39 | + * ```typescript |
| 40 | + * import { SpeechProviderResolver } from '../../speech/SpeechProviderResolver.js'; |
| 41 | + * import { SpeechProviderAdapter } from './SpeechProviderAdapter.js'; |
| 42 | + * |
| 43 | + * const resolver = new SpeechProviderResolver(); |
| 44 | + * await resolver.refresh(); |
| 45 | + * const stt = resolver.resolveSTT(); |
| 46 | + * const adapter = new SpeechProviderAdapter(stt); |
| 47 | + * |
| 48 | + * const indexer = new MultimodalIndexer({ sttProvider: adapter, ... }); |
| 49 | + * ``` |
| 50 | + */ |
| 51 | + |
| 52 | +import type { SpeechToTextProvider } from '../../speech/types.js'; |
| 53 | +import type { ISpeechToTextProvider } from './types.js'; |
| 54 | + |
| 55 | +// --------------------------------------------------------------------------- |
| 56 | +// Implementation |
| 57 | +// --------------------------------------------------------------------------- |
| 58 | + |
/**
 * Bridges the voice-pipeline's `SpeechToTextProvider` to the multimodal
 * indexer's `ISpeechToTextProvider` interface.
 *
 * Wraps a raw audio `Buffer` in the `{ data, mimeType }` input object the
 * voice provider is called with, forwards the optional language hint as
 * `{ language }`, and reduces the provider's result to its plain `text`
 * string.
 *
 * @example
 * ```typescript
 * const whisper = resolver.resolveSTT();
 * const adapted = new SpeechProviderAdapter(whisper);
 *
 * // Now usable by the multimodal indexer:
 * const text = await adapted.transcribe(audioBuffer, 'en');
 * ```
 */
export class SpeechProviderAdapter implements ISpeechToTextProvider {
  /**
   * The underlying voice-pipeline STT provider being adapted.
   * Held as a readonly reference — the caller retains ownership.
   */
  private readonly _provider: SpeechToTextProvider;

  /**
   * MIME type attached to every audio buffer handed to the provider.
   * Never blank: a blank constructor argument is replaced by `'audio/wav'`.
   */
  private readonly _defaultMimeType: string;

  /**
   * Create a new adapter wrapping a voice-pipeline STT provider.
   *
   * @param provider - A configured `SpeechToTextProvider` instance.
   * @param defaultMimeType - MIME type to report for raw audio buffers.
   *   Defaults to `'audio/wav'`. Surrounding whitespace is trimmed, and a
   *   blank value falls back to `'audio/wav'` so the provider is never
   *   handed an empty MIME type.
   *
   * @throws {Error} If `provider` is falsy (e.g. `null` or `undefined`).
   *
   * @example
   * ```typescript
   * const adapter = new SpeechProviderAdapter(whisperProvider);
   * const mp3Adapter = new SpeechProviderAdapter(whisperProvider, 'audio/mpeg');
   * ```
   */
  constructor(provider: SpeechToTextProvider, defaultMimeType = 'audio/wav') {
    if (!provider) {
      throw new Error(
        'SpeechProviderAdapter: a SpeechToTextProvider instance is required.',
      );
    }
    this._provider = provider;

    // The parameter default only covers `undefined`; an explicit '' or
    // whitespace-only value would otherwise be forwarded as-is.
    const normalizedMimeType =
      typeof defaultMimeType === 'string' ? defaultMimeType.trim() : '';
    this._defaultMimeType =
      normalizedMimeType.length > 0 ? normalizedMimeType : 'audio/wav';
  }

  /**
   * Transcribe audio data to text.
   *
   * Wraps the raw buffer together with the configured MIME type and
   * delegates to the underlying provider. Only the `text` field of the
   * provider's result is returned.
   *
   * @param audio - Raw audio data as a Buffer.
   * @param language - Optional language code hint (e.g. `'en'`, `'es'`).
   *   Trimmed before forwarding; a blank hint is treated as absent so the
   *   provider receives no options object at all.
   * @returns The transcribed text. If the provider's result carries no
   *   string `text`, an empty string is returned so the `Promise<string>`
   *   contract always holds.
   *
   * @throws {Error} If the underlying provider rejects, or resolves with
   *   `null`/`undefined` instead of a result object.
   *
   * @example
   * ```typescript
   * const transcript = await adapter.transcribe(wavBuffer);
   * const spanishTranscript = await adapter.transcribe(audioBuffer, 'es');
   * ```
   */
  async transcribe(audio: Buffer, language?: string): Promise<string> {
    // A whitespace-only hint is truthy but meaningless; drop it.
    const languageHint = typeof language === 'string' ? language.trim() : '';

    const result = await this._provider.transcribe(
      {
        data: audio,
        mimeType: this._defaultMimeType,
      },
      languageHint.length > 0 ? { language: languageHint } : undefined,
    );

    // Fail with a descriptive message rather than an opaque property-access
    // TypeError when the provider breaks its contract entirely.
    if (result == null) {
      throw new Error(
        'SpeechProviderAdapter: the STT provider returned no transcription result.',
      );
    }

    // Guard the declared return type at this boundary: a missing or
    // non-string `text` becomes an empty transcript.
    const text: unknown = result.text;
    return typeof text === 'string' ? text : '';
  }

  /**
   * Get a human-readable name for the underlying STT provider.
   *
   * Useful for logging and diagnostics.
   *
   * @returns The provider's `displayName` when it is a non-blank string,
   *   otherwise the provider's `id`.
   *
   * @example
   * ```typescript
   * console.log(`STT via: ${adapter.getProviderName()}`);
   * ```
   */
  getProviderName(): string {
    const { displayName, id } = this._provider;
    // `??` alone would let an empty or whitespace-only display name through.
    const hasDisplayName =
      typeof displayName === 'string' && displayName.trim().length > 0;
    return hasDisplayName ? displayName : id;
  }
}
0 commit comments