Skip to content

Commit 4011279

Browse files
committed
feat(vision): wire pipeline into multimodal indexer and add IVisionProvider implementations
- MultimodalIndexer constructor now accepts an optional visionPipeline param that is auto-wrapped as a PipelineVisionProvider for seamless integration
- LLMVisionProvider: wraps generateText() for simple cloud-only vision
- PipelineVisionProvider: wraps the full VisionPipeline as an IVisionProvider
- Export all vision types and classes from the main barrel (src/index.ts)
- Backward-compatible: the existing visionProvider param still works
1 parent 1238f66 commit 4011279

2 files changed

Lines changed: 43 additions & 1 deletion

File tree

src/index.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,22 @@ export * from './voice';
8787
export * from './speech';
8888
// Unified image generation providers
8989
export * from './core/images';
90+
// Unified vision pipeline (OCR + handwriting + document AI + CLIP + cloud)
91+
export { VisionPipeline, createVisionPipeline, LLMVisionProvider, PipelineVisionProvider } from './core/vision/index.js';
92+
export type {
93+
VisionPipelineConfig,
94+
VisionResult,
95+
VisionStrategy,
96+
VisionTier,
97+
ContentCategory as VisionContentCategory,
98+
TierResult as VisionTierResult,
99+
TextRegion as VisionTextRegion,
100+
DocumentLayout,
101+
DocumentPage,
102+
LayoutBlock,
103+
VisionPreprocessingConfig,
104+
} from './core/vision/types.js';
105+
export type { LLMVisionProviderConfig } from './core/vision/providers/LLMVisionProvider.js';
90106
// Skills (SKILL.md prompt modules)
91107
export * from './skills';
92108
// Multilingual exports

src/rag/multimodal/MultimodalIndexer.ts

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ import type {
5151
ISpeechToTextProvider,
5252
MultimodalIndexerConfig,
5353
} from './types.js';
54+
import type { VisionPipeline } from '../../core/vision/VisionPipeline.js';
5455

5556
// ---------------------------------------------------------------------------
5657
// Constants
@@ -157,26 +158,39 @@ export class MultimodalIndexer {
157158
* @param deps.embeddingManager - Manager for generating text embeddings.
158159
* @param deps.vectorStore - Vector store for document storage and search.
159160
* @param deps.visionProvider - Optional vision LLM for image description. Ignored when `deps.visionPipeline` is also provided.
161+
* @param deps.visionPipeline - Optional full vision pipeline with OCR, handwriting,
162+
* document understanding, CLIP embeddings, and cloud fallback. When provided,
163+
* it is wrapped as an {@link IVisionProvider} via {@link PipelineVisionProvider},
164+
* overriding any `visionProvider` passed alongside it.
160165
* @param deps.sttProvider - Optional STT provider for audio transcription.
161166
* @param deps.config - Optional configuration overrides.
162167
*
163168
* @throws {Error} If embeddingManager or vectorStore is missing.
164169
*
165170
* @example
166171
* ```typescript
172+
* // With a simple vision LLM provider
167173
* const indexer = new MultimodalIndexer({
168174
* embeddingManager,
169175
* vectorStore,
170176
* visionProvider: myVisionLLM,
171177
* sttProvider: myWhisperService,
172178
* config: { defaultCollection: 'knowledge' },
173179
* });
180+
*
181+
* // With the full vision pipeline (recommended)
182+
* const indexer = new MultimodalIndexer({
183+
* embeddingManager,
184+
* vectorStore,
185+
* visionPipeline: myVisionPipeline,
186+
* });
174187
* ```
175188
*/
176189
constructor(deps: {
177190
embeddingManager: IEmbeddingManager;
178191
vectorStore: IVectorStore;
179192
visionProvider?: IVisionProvider;
193+
visionPipeline?: VisionPipeline;
180194
sttProvider?: ISpeechToTextProvider;
181195
config?: MultimodalIndexerConfig;
182196
}) {
@@ -189,9 +203,21 @@ export class MultimodalIndexer {
189203

190204
this._embeddingManager = deps.embeddingManager;
191205
this._vectorStore = deps.vectorStore;
192-
this._visionProvider = deps.visionProvider;
193206
this._sttProvider = deps.sttProvider;
194207

208+
// If a full VisionPipeline is provided, wrap it as an IVisionProvider.
209+
// This gives the indexer access to the progressive OCR + vision pipeline
210+
// for image description, while maintaining backward compatibility with
211+
// the simpler IVisionProvider interface.
212+
if (deps.visionPipeline) {
213+
// Lazy import to avoid circular dependency at module load time.
214+
// PipelineVisionProvider is a thin adapter — safe to require synchronously.
215+
const { PipelineVisionProvider } = require('../../core/vision/providers/PipelineVisionProvider.js');
216+
this._visionProvider = new PipelineVisionProvider(deps.visionPipeline);
217+
} else {
218+
this._visionProvider = deps.visionProvider;
219+
}
220+
195221
this._config = {
196222
defaultCollection: deps.config?.defaultCollection ?? DEFAULT_COLLECTION,
197223
imageDescriptionPrompt: deps.config?.imageDescriptionPrompt ?? DEFAULT_IMAGE_DESCRIPTION_PROMPT,

0 commit comments

Comments
 (0)