Commit cc83791

memory: restore SessionSummarizer + tests (tracked file missing from prior push — unblocks CI)

1 parent ca44097 · commit cc83791

2 files changed: 508 additions & 0 deletions

SessionSummarizer.ts: 254 additions & 0 deletions

@@ -0,0 +1,254 @@
/**
 * @file SessionSummarizer.ts
 * @description Session-level contextual retrieval (Anthropic Sep 2024
 * variant, adapted to conversational memory).
 *
 * ## What this does
 *
 * For each session in a benchmark case (e.g. a `conv-26` chat thread in
 * LOCOMO or a `haystack_sessions[i]` entry in LongMemEval), an LLM
 * generates a dense 50–100 token summary that captures the topic,
 * key user-stated facts (names, numbers, dates, preferences), and key
 * assistant-stated facts. That summary is then prepended to *every*
 * chunk produced from that session before embedding — giving the
 * embedding vector global session context it would otherwise lack.
 *
 * ## Why session-granularity (not per-chunk)
 *
 * Anthropic's canonical
 * {@link https://www.anthropic.com/news/contextual-retrieval Contextual Retrieval}
 * prepends context *per chunk*. That's right for heterogeneous documents
 * where each chunk might cover a different topic. Conversational data is
 * different: a session is a topically-coherent thread. Adjacent chunks in
 * the same session share context, so per-chunk contextualization would
 * fire ~10× as many LLM calls at the *same* summarization model for the
 * same downstream embedding benefit — this is a same-model
 * granularity comparison, not a cross-reader pricing claim.
 *
 * The closest industry analog is the Observer phase of Mastra's
 * Observational Memory, which rewrites full messages into dense
 * observations. Ours is a lighter variant — we *prepend* a summary to
 * preserve the original chunk text verbatim, rather than rewriting it.
 *
 * ## Caching
 *
 * Summaries are cached to disk under `<cacheDir>/<sha256-hex>.txt`. The
 * cache key hashes the session text, model id, and template version, so
 * any of those three changing invalidates the cache cleanly. Mirrors
 * the {@link CachedEmbedder} pattern for consistency.
 *
 * ## Cost
 *
 * Single LLM call per unique session. For LongMemEval-S (~50 sessions
 * per case × 500 cases) with gpt-5-mini / Haiku pricing:
 * ~25,000 calls × 50–100 output tokens × ~2,000 input tokens (session)
 * ≈ $50–90 one-time across all cases. Cached thereafter.
 *
 * ## Expected lift
 *
 * Anthropic's published numbers: −49% retrieval failure with contextual
 * embeddings plus contextual BM25, −67% with reranking added on top. We
 * already have Cohere rerank-v3.5 wired, so the upper bound applies. Our
 * Phase A multi-session ceiling of 50% is the main target; expected lift
 * is +8–15pp on multi-session categories across LongMemEval and LOCOMO.
 *
 * @module agentos-bench/cognitive/SessionSummarizer
 */

import { promises as fs } from 'node:fs';
import { createHash } from 'node:crypto';
import path from 'node:path';

/**
 * Callable that invokes a chat LLM given a system + user prompt and
 * returns the generated text. The bench constructs one from the
 * existing {@link IReader} so summarization reuses the same pricing +
 * timeout plumbing as the benchmark's reader.
 */
export type SessionSummarizerInvoker = (
  system: string,
  user: string,
) => Promise<{
  text: string;
  tokensIn: number;
  tokensOut: number;
  model: string;
}>;

/**
 * Options for constructing a {@link SessionSummarizer}.
 */
export interface SessionSummarizerOptions {
  /** LLM invoker — produces the summary text. */
  invoker: SessionSummarizerInvoker;
  /**
   * Optional directory for persistent disk cache. When set, summaries
   * survive across process restarts and re-runs. Mirrors the
   * {@link CachedEmbedder} cache layout.
   */
  cacheDir?: string;
  /**
   * Model identifier baked into the cache key so switching models
   * invalidates the cache automatically. Should match the invoker's
   * underlying model.
   */
  modelId: string;
  /**
   * Maximum tokens to ask the LLM to emit. Default 140 (generous headroom
   * over the 50–100 target; truncate post-hoc if needed).
   */
  maxTokens?: number;
  /**
   * Template version. Bump whenever the summarization prompt changes so
   * disk caches from prior versions are invalidated.
   * Current: `'v1-2026-04-19'`.
   */
  templateVersion?: string;
  /** Optional cost-tracker hook. Called after every uncached call. */
  onCallCost?: (tokensIn: number, tokensOut: number, model: string) => void;
}

/**
 * Summary cache stats for diagnostics / budget tracking.
 */
export interface SummarizerStats {
  hits: number;
  misses: number;
  writes: number;
  /** Total tokens consumed on uncached LLM calls. */
  tokensIn: number;
  tokensOut: number;
}

/** Default summarization prompt — see file docstring for rationale. */
const DEFAULT_SYSTEM_PROMPT = [
  'You produce a concise search-retrieval summary of a conversation session.',
  'Your output will be prepended to individual turn-level chunks before vector embedding, so the embedding captures the session-wide context each chunk alone would miss.',
  'Target length: 50–100 tokens. No preamble, no sign-off — emit only the summary.',
  'Structure the summary as dense prose:',
  ' 1. The topic or theme of the session (one short clause).',
  ' 2. The specific facts the user stated — names, numbers, dates, preferences, decisions, named items (e.g. "Wells Fargo mortgage", "turbinado sugar", "mid-century dresser").',
  ' 3. The specific facts the assistant stated, suggested, or provided (numbers, recommendations, named entities).',
  'Be concrete. Use exact nouns and numbers from the conversation. Do not generalize. Do not editorialize.',
].join(' ');

const DEFAULT_TEMPLATE_VERSION = 'v1-2026-04-19';

/**
 * LLM-backed session summarizer with a persistent on-disk cache.
 *
 * @example
 * ```ts
 * const summarizer = new SessionSummarizer({
 *   invoker: async (system, user) => {
 *     const resp = await reader.invoke({ system, user, maxTokens: 140, temperature: 0 });
 *     return { text: resp.text, tokensIn: resp.tokensIn, tokensOut: resp.tokensOut, model: resp.model };
 *   },
 *   cacheDir: '/path/to/data/.session-summary-cache',
 *   modelId: 'gpt-5-mini',
 * });
 *
 * const summary = await summarizer.summarize('conv-26-session-3', sessionText);
 * // => "User discussed adopting a new rescue dog from a Portland shelter..."
 * ```
 */
export class SessionSummarizer {
  /** Running stats for diagnostics. */
  readonly stats: SummarizerStats = {
    hits: 0,
    misses: 0,
    writes: 0,
    tokensIn: 0,
    tokensOut: 0,
  };

  private readonly systemPrompt: string;
  private readonly templateVersion: string;
  private readonly maxTokens: number;

  constructor(private readonly opts: SessionSummarizerOptions) {
    this.systemPrompt = DEFAULT_SYSTEM_PROMPT;
    this.templateVersion = opts.templateVersion ?? DEFAULT_TEMPLATE_VERSION;
    this.maxTokens = opts.maxTokens ?? 140;
  }

  /**
   * Summarize a single session. Returns cached result if available,
   * otherwise calls the LLM and writes to cache.
   *
   * @param sessionKey — a stable identifier for the session (e.g. `${caseId}:${sessionId}`).
   *   Used only for logging; the cache key is content-addressed.
   * @param sessionText — the raw text of the session (all turns concatenated).
   */
  async summarize(_sessionKey: string, sessionText: string): Promise<string> {
    const trimmed = sessionText.trim();
    if (!trimmed) return '';

    const cacheKey = this.computeCacheKey(trimmed);

    // Try disk cache
    if (this.opts.cacheDir) {
      const cachePath = path.join(this.opts.cacheDir, `${cacheKey}.txt`);
      try {
        const cached = await fs.readFile(cachePath, 'utf8');
        this.stats.hits += 1;
        return cached;
      } catch {
        // File doesn't exist or can't be read — fall through to LLM call.
      }
    }

    this.stats.misses += 1;
    const response = await this.opts.invoker(this.systemPrompt, trimmed);
    const summary = response.text.trim();
    this.stats.tokensIn += response.tokensIn;
    this.stats.tokensOut += response.tokensOut;

    if (this.opts.onCallCost) {
      this.opts.onCallCost(response.tokensIn, response.tokensOut, response.model);
    }

    // Persist to disk cache (best-effort; exclusive create so concurrent
    // writers don't tear). Non-fatal on write failure — caller still
    // gets the summary for this call.
    if (this.opts.cacheDir) {
      try {
        await fs.mkdir(this.opts.cacheDir, { recursive: true });
        const cachePath = path.join(this.opts.cacheDir, `${cacheKey}.txt`);
        await fs.writeFile(cachePath, summary, { encoding: 'utf8', flag: 'wx' });
        this.stats.writes += 1;
      } catch (err: unknown) {
        const code = (err as NodeJS.ErrnoException)?.code;
        // EEXIST means another writer got there first — their content is
        // valid. Any other error is logged but non-fatal.
        if (code !== 'EEXIST') {
          // eslint-disable-next-line no-console
          console.warn(
            `[SessionSummarizer] Failed to write cache for key ${cacheKey.slice(0, 8)}...: ${String(err)}`,
          );
        }
      }
    }

    return summary;
  }

  /**
   * Build the SHA-256 cache key from session content + model + template.
   * Exposed for tests; callers should use {@link summarize}.
   */
  computeCacheKey(sessionText: string): string {
    return createHash('sha256')
      .update(this.opts.modelId)
      .update('\n')
      .update(this.templateVersion)
      .update('\n')
      .update(sessionText)
      .digest('hex');
  }
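
  // Illustrative note (comment only, not from this commit): because
  // modelId, templateVersion, and the session text all feed the hash
  // above, changing any one of them yields a different hex digest. Two
  // instances that differ only in modelId return different
  // computeCacheKey(sessionText) values for identical text, which is
  // exactly what lets a model or prompt switch invalidate stale summaries.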

  /** Expose the resolved template version — useful for cache-key fingerprints in other layers. */
  getTemplateVersion(): string {
    return this.templateVersion;
  }
}
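
For context, a minimal sketch of the consuming side the file docstring describes: prepend the session summary to each chunk before embedding, keeping the chunk text verbatim. The chunker and embedder interfaces here are hypothetical stand-ins, not APIs from this commit.

// Hypothetical wiring sketch: `chunkSession` and `embedder` are assumed
// helpers; only SessionSummarizer comes from this commit.
import { SessionSummarizer } from './SessionSummarizer';

async function embedSessionChunks(
  summarizer: SessionSummarizer,
  embedder: { embed(texts: string[]): Promise<number[][]> }, // assumed interface
  chunkSession: (text: string) => string[],                  // assumed chunker
  sessionKey: string,
  sessionText: string,
): Promise<Array<{ chunk: string; vector: number[] }>> {
  const summary = await summarizer.summarize(sessionKey, sessionText);
  const chunks = chunkSession(sessionText);
  // Prepend the session-level summary so each chunk's embedding carries
  // global session context; the original chunk text stays verbatim.
  const contextualized = chunks.map((c) => (summary ? `${summary}\n\n${c}` : c));
  const vectors = await embedder.embed(contextualized);
  return chunks.map((chunk, i) => ({ chunk, vector: vectors[i] }));
}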
