/**
 * @file SessionSummarizer.ts
 * @description Session-level contextual retrieval (Anthropic Sep 2024
 * variant, adapted to conversational memory).
 *
 * ## What this does
 *
 * For each session in a benchmark case (e.g. a `conv-26` chat thread in
 * LOCOMO or a `haystack_sessions[i]` entry in LongMemEval), an LLM
 * generates a dense 50–100 token summary that captures the topic,
 * key user-stated facts (names, numbers, dates, preferences), and key
 * assistant-stated facts. That summary is then prepended to *every*
 * chunk produced from that session before embedding — giving the
 * embedding vector global session context it would otherwise lack.
 *
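 * A rough sketch of the intended indexing flow (illustrative only; the
 * `chunksOfSession` and `embedder` names below are placeholders, not
 * bench APIs):
 *
 * ```ts
 * const summary = await summarizer.summarize(sessionKey, sessionText);
 * for (const chunk of chunksOfSession) {
 *   // Summary prepended for embedding; the original chunk text stays verbatim.
 *   const contextualized = `${summary}\n\n${chunk.text}`;
 *   chunk.vector = await embedder.embed(contextualized);
 * }
 * ```
 *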
 * ## Why session-granularity (not per-chunk)
 *
 * Anthropic's canonical
 * {@link https://www.anthropic.com/news/contextual-retrieval Contextual Retrieval}
 * prepends context *per chunk*. That's right for heterogeneous documents
 * where each chunk might cover a different topic. Conversational data is
 * different: a session is a topically-coherent thread. Adjacent chunks in
 * the same session share context, so per-chunk contextualization would
 * fire ~10× as many LLM calls at the *same* summarization model for the
 * same downstream embedding benefit — this is a same-model
 * granularity comparison, not a cross-reader pricing claim.
 *
 * The closest industry analog is Mastra Observational Memory's Observer
 * phase (rewrites full messages into dense observations). Ours is a
 * lighter variant — we *prepend* a summary to preserve the original
 * chunk text verbatim, rather than rewriting it.
 *
 * ## Caching
 *
 * Summaries are cached to disk under `<cacheDir>/<sha256-hex>.txt`. The
 * cache key hashes the session text, model id, and template version so
 * any of those three changing invalidates the cache cleanly. Mirrors
 * the {@link CachedEmbedder} pattern for consistency.
 *
 * ## Cost
 *
 * Single LLM call per unique session. For LongMemEval-S (~50 sessions
 * per case × 500 cases) that is ~25,000 calls, each reading roughly
 * 2,000 input tokens (the session text) and emitting 50–100 output
 * tokens, i.e. about 50M input and 1.25–2.5M output tokens in total.
 * At gpt-5-mini / Haiku-class pricing that works out to roughly $50–90
 * one-time across all cases. Cached thereafter.
 *
 * ## Expected lift
 *
 * Anthropic's published numbers: −35% retrieval failure with contextual
 * embeddings alone, −49% with contextual BM25 added, and −67% when
 * combined with reranking. We already have Cohere rerank-v3.5 wired, so
 * the upper bound applies. Our Phase A multi-session ceiling of 50% is
 * the main target; expected lift +8–15pp on multi-session categories
 * across LongMemEval and LOCOMO.
 *
 * @module agentos-bench/cognitive/SessionSummarizer
 */

import { promises as fs } from 'node:fs';
import { createHash } from 'node:crypto';
import path from 'node:path';

/**
 * Callable that invokes a chat LLM given a system + user prompt and
 * returns the generated text. The bench constructs one from the
 * existing {@link IReader} so summarization reuses the same pricing +
 * timeout plumbing as the benchmark's reader.
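 *
 * A deterministic stub satisfies the same contract for unit tests (a
 * minimal sketch; the stub and its canned token counts are illustrative,
 * not part of the bench):
 *
 * ```ts
 * const stubInvoker: SessionSummarizerInvoker = async (_system, user) => ({
 *   text: `Stub summary of a ${user.length}-character session.`,
 *   tokensIn: Math.ceil(user.length / 4), // crude chars-to-tokens estimate
 *   tokensOut: 12,
 *   model: 'stub',
 * });
 * ```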
 */
export type SessionSummarizerInvoker = (
  system: string,
  user: string,
) => Promise<{
  text: string;
  tokensIn: number;
  tokensOut: number;
  model: string;
}>;

/**
 * Options for constructing a {@link SessionSummarizer}.
 */
export interface SessionSummarizerOptions {
  /** LLM invoker — produces the summary text. */
  invoker: SessionSummarizerInvoker;
  /**
   * Optional directory for persistent disk cache. When set, summaries
   * survive across process restarts and re-runs. Mirrors the
   * {@link CachedEmbedder} cache layout.
   */
  cacheDir?: string;
  /**
   * Model identifier baked into the cache key so switching models
   * invalidates the cache automatically. Should match the invoker's
   * underlying model.
   */
  modelId: string;
  /**
   * Maximum tokens to ask the LLM to emit. Default 140 (generous headroom
   * over the 50–100 target; truncate post-hoc if needed).
   */
  maxTokens?: number;
  /**
   * Template version. Bump whenever the summarization prompt changes so
   * disk caches from prior versions are invalidated.
   * Current: `'v1-2026-04-19'`.
   */
  templateVersion?: string;
  /** Optional cost-tracker hook. Called after every uncached call. */
  onCallCost?: (tokensIn: number, tokensOut: number, model: string) => void;
}

/**
 * Summary cache stats for diagnostics / budget tracking.
 */
export interface SummarizerStats {
  hits: number;
  misses: number;
  writes: number;
  /** Total tokens consumed on uncached LLM calls. */
  tokensIn: number;
  tokensOut: number;
}

/** Default summarization prompt — see file docstring for rationale. */
const DEFAULT_SYSTEM_PROMPT = [
  'You produce a concise search-retrieval summary of a conversation session.',
  'Your output will be prepended to individual turn-level chunks before vector embedding, so the embedding captures the session-wide context each chunk alone would miss.',
  'Target length: 50–100 tokens. No preamble, no sign-off — emit only the summary.',
  'Structure the summary as dense prose:',
  ' 1. The topic or theme of the session (one short clause).',
  ' 2. The specific facts the user stated — names, numbers, dates, preferences, decisions, named items (e.g. "Wells Fargo mortgage", "turbinado sugar", "mid-century dresser").',
  ' 3. The specific facts the assistant stated, suggested, or provided (numbers, recommendations, named entities).',
  'Be concrete. Use exact nouns and numbers from the conversation. Do not generalize. Do not editorialize.',
].join(' ');

const DEFAULT_TEMPLATE_VERSION = 'v1-2026-04-19';

/**
 * LLM-backed session summarizer with a persistent on-disk cache.
 *
 * @example
 * ```ts
 * const summarizer = new SessionSummarizer({
 *   invoker: async (system, user) => {
 *     const resp = await reader.invoke({ system, user, maxTokens: 140, temperature: 0 });
 *     return { text: resp.text, tokensIn: resp.tokensIn, tokensOut: resp.tokensOut, model: resp.model };
 *   },
 *   cacheDir: '/path/to/data/.session-summary-cache',
 *   modelId: 'gpt-5-mini',
 * });
 *
 * const summary = await summarizer.summarize('conv-26-session-3', sessionText);
 * // => "User discussed adopting a new rescue dog from a Portland shelter..."
 * ```
 */
export class SessionSummarizer {
  /** Running stats for diagnostics. */
  readonly stats: SummarizerStats = {
    hits: 0,
    misses: 0,
    writes: 0,
    tokensIn: 0,
    tokensOut: 0,
  };

  private readonly systemPrompt: string;
  private readonly templateVersion: string;
  private readonly maxTokens: number;

  constructor(private readonly opts: SessionSummarizerOptions) {
    this.systemPrompt = DEFAULT_SYSTEM_PROMPT;
    this.templateVersion = opts.templateVersion ?? DEFAULT_TEMPLATE_VERSION;
    this.maxTokens = opts.maxTokens ?? 140;
  }

  /**
   * Summarize a single session. Returns cached result if available,
   * otherwise calls the LLM and writes to cache.
   *
   * @param _sessionKey - a stable identifier for the session (e.g. `${caseId}:${sessionId}`).
   *   Currently unused by this method (the cache key is content-addressed); kept for
   *   logging/diagnostics at call sites.
   * @param sessionText - the raw text of the session (all turns concatenated).
   */
  async summarize(_sessionKey: string, sessionText: string): Promise<string> {
    const trimmed = sessionText.trim();
    if (!trimmed) return '';

    const cacheKey = this.computeCacheKey(trimmed);

    // Try disk cache
    if (this.opts.cacheDir) {
      const cachePath = path.join(this.opts.cacheDir, `${cacheKey}.txt`);
      try {
        const cached = await fs.readFile(cachePath, 'utf8');
        this.stats.hits += 1;
        return cached;
      } catch {
        // File doesn't exist or can't be read — fall through to LLM call.
      }
    }

    this.stats.misses += 1;
    const response = await this.opts.invoker(this.systemPrompt, trimmed);
    const summary = response.text.trim();
    this.stats.tokensIn += response.tokensIn;
    this.stats.tokensOut += response.tokensOut;

    if (this.opts.onCallCost) {
      this.opts.onCallCost(response.tokensIn, response.tokensOut, response.model);
    }

    // Persist to disk cache (best-effort; exclusive create so concurrent
    // writers don't tear). Non-fatal on write failure — caller still
    // gets the summary for this call.
    if (this.opts.cacheDir) {
      try {
        await fs.mkdir(this.opts.cacheDir, { recursive: true });
        const cachePath = path.join(this.opts.cacheDir, `${cacheKey}.txt`);
        await fs.writeFile(cachePath, summary, { encoding: 'utf8', flag: 'wx' });
        this.stats.writes += 1;
      } catch (err: unknown) {
        const code = (err as NodeJS.ErrnoException)?.code;
        // EEXIST means another writer got there first — their content is
        // valid. Any other error is logged but non-fatal.
        if (code !== 'EEXIST') {
          // eslint-disable-next-line no-console
          console.warn(
            `[SessionSummarizer] Failed to write cache for key ${cacheKey.slice(0, 8)}...: ${String(err)}`,
          );
        }
      }
    }

    return summary;
  }

  /**
   * Build the SHA-256 cache key from session content + model + template.
   * Exposed for tests; callers should use {@link summarize}.
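   *
   * For instance (assuming a constructed `summarizer` as in the class-level
   * example above):
   *
   * ```ts
   * // Content-addressed: the same session text (under the same model id and
   * // template version) always yields the same 64-character hex key.
   * const a = summarizer.computeCacheKey('User: hi\nAssistant: hello');
   * const b = summarizer.computeCacheKey('User: hi\nAssistant: hello');
   * // a === b
   * ```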
   */
  computeCacheKey(sessionText: string): string {
    return createHash('sha256')
      .update(this.opts.modelId)
      .update('\n')
      .update(this.templateVersion)
      .update('\n')
      .update(sessionText)
      .digest('hex');
  }

  /** Expose the resolved template version — useful for cache-key fingerprints in other layers. */
  getTemplateVersion(): string {
    return this.templateVersion;
  }
}