Commit 67b3201

feat(rag): multi-hypothesis HyDE for improved recall via diverse perspectives
1 parent 65dca28 commit 67b3201

2 files changed: 528 additions & 0 deletions

File tree

src/rag/HydeRetriever.ts

Lines changed: 262 additions & 0 deletions
@@ -38,6 +38,21 @@ export interface HydeConfig {
   hypothesisSystemPrompt?: string;
   /** Use full-answer granularity (recommended by research). Default: true. */
   fullAnswerGranularity?: boolean;
+  /**
+   * Number of diverse hypothetical documents to generate per query.
+   *
+   * Multi-hypothesis HyDE generates N hypotheses from different perspectives
+   * (technical, practical/example, overview) and searches with each embedding.
+   * Results are deduplicated by chunk ID, keeping the highest score.
+   *
+   * Higher values improve recall at the cost of additional LLM calls.
+   * - 1: Original single-hypothesis HyDE (fastest)
+   * - 3: Recommended default (good diversity/cost tradeoff)
+   * - 5: Maximum diversity (highest recall, most expensive)
+   *
+   * Default: 3.
+   */
+  hypothesisCount?: number;
 }

 export const DEFAULT_HYDE_CONFIG: Required<HydeConfig> = {
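As a quick illustration of the new knob, a caller trades latency for recall through the resolved config; a minimal sketch, assuming a sibling-module import of the exported resolver:

```typescript
import { resolveHydeConfig } from './HydeRetriever'; // import path assumed

// Latency-sensitive path: classic single-hypothesis HyDE (one LLM call).
const fast = resolveHydeConfig({ hypothesisCount: 1 });

// Recall-sensitive path: five perspectives, so five embeddings and five searches.
const thorough = resolveHydeConfig({ hypothesisCount: 5 });
```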
@@ -51,6 +66,7 @@ export const DEFAULT_HYDE_CONFIG: Required<HydeConfig> = {
     'You are a knowledgeable assistant. Generate a concise, factual answer to the following question. ' +
     'This answer will be used for semantic search, so be specific and include relevant technical terms.',
   fullAnswerGranularity: true,
+  hypothesisCount: 3,
 };

 function clampUnitInterval(value: unknown, fallback: number): number {
@@ -78,12 +94,18 @@ export function resolveHydeConfig(partial?: Partial<HydeConfig>): Required<HydeC
       ? Math.floor(merged.maxHypothesisTokens)
       : DEFAULT_HYDE_CONFIG.maxHypothesisTokens;

+  const hypothesisCount =
+    typeof merged.hypothesisCount === 'number' && Number.isFinite(merged.hypothesisCount) && merged.hypothesisCount >= 1
+      ? Math.floor(merged.hypothesisCount)
+      : DEFAULT_HYDE_CONFIG.hypothesisCount;
+
   return {
     ...merged,
     initialThreshold,
     minThreshold,
     thresholdStep,
     maxHypothesisTokens,
+    hypothesisCount,
   };
 }
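The validation above fails soft rather than throwing, so malformed values quietly fall back to the default; a minimal sketch of the resolved behavior as written:

```typescript
import { resolveHydeConfig } from './HydeRetriever'; // import path assumed

resolveHydeConfig({ hypothesisCount: 4.7 }).hypothesisCount; // 4 (floored)
resolveHydeConfig({ hypothesisCount: 0 }).hypothesisCount;   // 3 (below 1, default used)
resolveHydeConfig({ hypothesisCount: NaN }).hypothesisCount; // 3 (not finite, default used)
```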

@@ -114,6 +136,27 @@ export interface HydeRetrievalResult {
   retrievalLatencyMs: number;
 }

+/**
+ * Result from multi-hypothesis HyDE retrieval.
+ *
+ * Contains all generated hypotheses and the deduplicated, merged result set
+ * from searching with each hypothesis embedding.
+ *
+ * @interface HydeMultiRetrievalResult
+ */
+export interface HydeMultiRetrievalResult {
+  /** All generated hypotheses. */
+  hypotheses: string[];
+  /** Deduplicated query result (union of all hypothesis searches, highest score per doc). */
+  queryResult: QueryResult;
+  /** Number of hypotheses generated. */
+  hypothesisCount: number;
+  /** Total time for all hypothesis generations (ms). */
+  hypothesisLatencyMs: number;
+  /** Total time for all embedding + retrieval passes (ms). */
+  retrievalLatencyMs: number;
+}
+
 // ── Core Retriever ─────────────────────────────────────────────────────────

 /**
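Since the retrieval timer starts only after generation finishes (see `retrieveMulti` below), the two latency fields do not overlap and roughly sum to end-to-end time. A hypothetical consumer; `logMultiHyde` is invented for illustration and not part of this commit:

```typescript
function logMultiHyde(result: HydeMultiRetrievalResult): void {
  // The two phases are timed back to back, so their sum approximates total latency.
  const totalMs = result.hypothesisLatencyMs + result.retrievalLatencyMs;
  console.log(
    `${result.hypothesisCount} hypotheses -> ` +
      `${result.queryResult.documents.length} unique docs in ~${totalMs}ms`,
  );
}
```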
@@ -168,6 +211,225 @@ export class HydeRetriever {
     };
   }

+  /**
+   * Generate multiple hypothetical documents from different perspectives.
+   *
+   * Each hypothesis approaches the query from a different angle, improving
+   * recall by covering more of the semantic space. Uses chain-of-thought
+   * prompting to ensure diverse, high-quality hypotheses.
+   *
+   * The system prompt asks the LLM to generate N diverse hypotheses:
+   * - Hypothesis 1: Technical/formal perspective
+   * - Hypothesis 2: Practical/example perspective
+   * - Hypothesis 3: Overview/summary perspective
+   * - (Additional hypotheses explore further angles)
+   *
+   * @param {string} query - The user query to generate hypotheses for.
+   * @param {number} [count] - Number of hypotheses to generate. Default: config.hypothesisCount (3).
+   * @returns {Promise<{ hypotheses: string[]; latencyMs: number }>} Generated hypotheses and timing.
+   * @throws {Error} If the LLM call fails.
+   *
+   * @example
+   * ```typescript
+   * const { hypotheses, latencyMs } = await retriever.generateMultipleHypotheses(
+   *   'How does BM25 scoring work?',
+   *   3,
+   * );
+   * // hypotheses[0]: Technical explanation with formulas
+   * // hypotheses[1]: Practical example with code
+   * // hypotheses[2]: High-level conceptual overview
+   * ```
+   */
+  async generateMultipleHypotheses(
+    query: string,
+    count?: number,
+  ): Promise<{ hypotheses: string[]; latencyMs: number }> {
+    const n = count ?? this.config.hypothesisCount;
+
+    // For n=1, fall back to the single-hypothesis path
+    if (n <= 1) {
+      const result = await this.generateHypothesis(query);
+      return { hypotheses: [result.hypothesis], latencyMs: result.latencyMs };
+    }
+
+    const start = Date.now();
+
+    const systemPrompt = [
+      this.config.hypothesisSystemPrompt,
+      this.config.fullAnswerGranularity
+        ? 'Write complete hypothetical answers in natural language prose.'
+        : 'Write concise hypothetical answers suitable for semantic retrieval.',
+      `Keep each answer under ${this.config.maxHypothesisTokens} tokens.`,
+    ].join(' ');
+
+    const userPrompt = [
+      'Think step by step:',
+      '1. What is this question really asking?',
+      '2. What kind of document would contain the answer?',
+      '3. What vocabulary and terminology would that document use?',
+      '4. Write a brief version of that hypothetical document.',
+      '',
+      `Generate ${n} diverse hypothetical documents that would answer: "${query}"`,
+      '',
+      'Each hypothesis MUST take a DIFFERENT perspective or focus on a',
+      'DIFFERENT aspect of the question. Be diverse in vocabulary and approach.',
+      '',
+      ...Array.from({ length: n }, (_, i) => {
+        const perspectives = [
+          'technical/formal perspective with precise terminology',
+          'practical/example perspective with concrete use cases',
+          'overview/summary perspective with broad context',
+          'troubleshooting/diagnostic perspective',
+          'comparative perspective contrasting with alternatives',
+        ];
+        const perspectiveLabel = perspectives[i % perspectives.length];
+        return `Hypothesis ${i + 1} (${perspectiveLabel}):`;
+      }),
+    ].join('\n');
+
+    const rawResponse = await this.llmCaller(systemPrompt, userPrompt);
+
+    // Parse the response: split on "Hypothesis N:" markers
+    const hypotheses: string[] = [];
+    const hypothesisRegex = /Hypothesis\s+\d+\s*(?:\([^)]*\))?:\s*/gi;
+    const parts = rawResponse.split(hypothesisRegex).filter((p) => p.trim().length > 0);
+
+    for (const part of parts) {
+      const trimmed = part.trim();
+      if (trimmed.length > 0) {
+        hypotheses.push(trimmed);
+      }
+    }
+
+    // If parsing failed (the LLM didn't follow the format), treat the entire
+    // response as one hypothesis; the loop below fills in the remainder.
+    if (hypotheses.length === 0) {
+      hypotheses.push(rawResponse.trim());
+    }
+
+    // If we got fewer hypotheses than requested, generate the rest individually
+    while (hypotheses.length < n) {
+      const fallbackResult = await this.generateHypothesis(query);
+      hypotheses.push(fallbackResult.hypothesis);
+    }
+
+    // Trim to exactly n hypotheses
+    return {
+      hypotheses: hypotheses.slice(0, n),
+      latencyMs: Date.now() - start,
+    };
+  }
+
+  /**
+   * Multi-hypothesis retrieval: generates N diverse hypotheses, searches with each,
+   * and merges results by deduplication (keeping the highest score per document).
+   *
+   * This improves recall over single-hypothesis HyDE because a single off-target
+   * hypothesis no longer sinks the retrieval: the other hypotheses can still find
+   * relevant documents from different angles.
+   *
+   * Pipeline:
+   * 1. Generate N hypotheses via {@link generateMultipleHypotheses}
+   * 2. Embed each hypothesis
+   * 3. Search the vector store with each embedding
+   * 4. Union all results, deduplicate by document ID, keep the highest score
+   *
+   * @param {object} opts - Retrieval options.
+   * @param {string} opts.query - The user query.
+   * @param {IVectorStore} opts.vectorStore - Vector store to search.
+   * @param {string} opts.collectionName - Collection to search in.
+   * @param {Partial<QueryOptions>} [opts.queryOptions] - Additional query options.
+   * @param {number} [opts.hypothesisCount] - Override hypothesis count for this call.
+   * @returns {Promise<HydeMultiRetrievalResult>} Deduplicated results from all hypotheses.
+   *
+   * @example
+   * ```typescript
+   * const result = await retriever.retrieveMulti({
+   *   query: 'How does BM25 work?',
+   *   vectorStore: myStore,
+   *   collectionName: 'knowledge-base',
+   *   hypothesisCount: 3,
+   * });
+   * console.log(`Found ${result.queryResult.documents.length} unique docs from ${result.hypothesisCount} hypotheses`);
+   * ```
+   */
+  async retrieveMulti(opts: {
+    query: string;
+    vectorStore: IVectorStore;
+    collectionName: string;
+    queryOptions?: Partial<QueryOptions>;
+    hypothesisCount?: number;
+  }): Promise<HydeMultiRetrievalResult> {
+    const count = opts.hypothesisCount ?? this.config.hypothesisCount;
+
+    // Step 1: Generate multiple hypotheses
+    const { hypotheses, latencyMs: hypothesisLatencyMs } =
+      await this.generateMultipleHypotheses(opts.query, count);
+
+    // Step 2: Embed all hypotheses
+    const retrievalStart = Date.now();
+    const embeddingResponse = await this.embeddingManager.generateEmbeddings({
+      texts: hypotheses,
+    });
+
+    if (!embeddingResponse.embeddings || embeddingResponse.embeddings.length === 0) {
+      return {
+        hypotheses,
+        queryResult: { documents: [] },
+        hypothesisCount: hypotheses.length,
+        hypothesisLatencyMs,
+        retrievalLatencyMs: Date.now() - retrievalStart,
+      };
+    }
+
+    // Step 3: Search with each embedding in parallel. minSimilarityScore is
+    // deliberately ignored per pass; the merge below ranks by best score instead.
+    const {
+      minSimilarityScore: _ignoredMinSimilarityScore,
+      ...extraQueryOptions
+    } = opts.queryOptions ?? {};
+
+    const searchPromises = embeddingResponse.embeddings
+      .filter((emb) => emb && emb.length > 0)
+      .map((embedding) =>
+        opts.vectorStore.query(opts.collectionName, embedding, {
+          topK: extraQueryOptions.topK ?? 5,
+          includeTextContent: true,
+          includeMetadata: true,
+          ...extraQueryOptions,
+        }),
+      );
+
+    const searchResults = await Promise.all(searchPromises);
+
+    // Step 4: Merge and deduplicate: keep the highest score per document ID
+    const docMap = new Map<string, (typeof searchResults)[0]['documents'][0]>();
+
+    for (const result of searchResults) {
+      for (const doc of result.documents) {
+        const existing = docMap.get(doc.id);
+        if (!existing || doc.similarityScore > existing.similarityScore) {
+          docMap.set(doc.id, doc);
+        }
+      }
+    }
+
+    // Sort by similarity score descending
+    const mergedDocs = Array.from(docMap.values()).sort(
+      (a, b) => b.similarityScore - a.similarityScore,
+    );
+
+    // Apply topK limit
+    const topK = opts.queryOptions?.topK ?? 5;
+
+    return {
+      hypotheses,
+      queryResult: { documents: mergedDocs.slice(0, topK) },
+      hypothesisCount: hypotheses.length,
+      hypothesisLatencyMs,
+      retrievalLatencyMs: Date.now() - retrievalStart,
+    };
+  }
+
   /**
    * Embed the hypothesis and search the vector store.
    * Uses adaptive thresholding: starts at initialThreshold, steps down
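Two standalone sketches for reviewers. First, how the `Hypothesis N:` split in `generateMultipleHypotheses` behaves on a well-formed response (the sample text is invented):

```typescript
const hypothesisRegex = /Hypothesis\s+\d+\s*(?:\([^)]*\))?:\s*/gi;

const rawResponse = [
  'Hypothesis 1 (technical/formal perspective with precise terminology):',
  'BM25 ranks documents using term-frequency saturation and inverse document frequency.',
  'Hypothesis 2 (practical/example perspective with concrete use cases):',
  'A search engine scores each indexed document against the query terms with BM25.',
].join('\n');

// The split consumes the "Hypothesis N (...):" labels, leaving only the bodies.
const parts = rawResponse
  .split(hypothesisRegex)
  .map((p) => p.trim())
  .filter((p) => p.length > 0);
// parts.length === 2
```

Second, a toy run of the step-4 merge policy (highest similarity score wins per document ID); the `Doc` shape stands in for the store's document type:

```typescript
interface Doc {
  id: string;
  similarityScore: number;
}

// Two per-hypothesis result sets that overlap on document 'a'.
const passes: Doc[][] = [
  [{ id: 'a', similarityScore: 0.71 }, { id: 'b', similarityScore: 0.64 }],
  [{ id: 'a', similarityScore: 0.83 }, { id: 'c', similarityScore: 0.6 }],
];

const docMap = new Map<string, Doc>();
for (const docs of passes) {
  for (const doc of docs) {
    const existing = docMap.get(doc.id);
    if (!existing || doc.similarityScore > existing.similarityScore) {
      docMap.set(doc.id, doc); // keep the best-scoring copy of each document
    }
  }
}

const merged = [...docMap.values()].sort((a, b) => b.similarityScore - a.similarityScore);
// merged order: 'a' (0.83), 'b' (0.64), 'c' (0.6)
```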

0 commit comments