Skip to content

Commit b59cd43

Browse files
committed
feat(ingest-router,memory-router): Stage I entity-linking executors + ranker
Mem0-v3-style entity-linking primitives shipped in agentos core: ingest-router/executors/: - EntityExtractor: regex-based detection of proper nouns, quoted text, and compound noun phrases. Zero LLM cost. Configurable via properNounMinLength + compoundNounMaxLength. - EntityLinkingIngestExecutor: ingest-side reference executor for the IngestRouter 'fact-graph' strategy. Surfaces entities alongside chunks for downstream entity-overlap re-ranking. - entity-types.ts: shared types for both ingest + recall sides. memory-router/backends/: - EntityRetrievalRanker: recall-stage re-ranker that boosts candidates by entity-overlap with the query. combinedScore = (1-w)*semanticScore + w*(overlap/queryEntityCount). Mem0-v3 default weight ~0.5. Both barrel exports updated. The IngestRouter 'fact-graph' strategy ID now has a working reference executor. Consumers using '@framers/agentos/memory-router' get the entity-overlap re-ranker. 20 new tests, 163/163 passing across ingest-router + memory-router. Reference: docs.mem0.ai/migration/oss-v2-to-v3
1 parent fc7cf50 commit b59cd43

11 files changed

Lines changed: 555 additions & 0 deletions

File tree

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/**
2+
* @file EntityExtractor.ts
3+
* @description Mem0-v3-style entity extractor. Three entity kinds:
4+
* - proper nouns (standalone capitalized tokens)
5+
* - quoted text (within "..." or '...')
6+
* - compound noun phrases (consecutive capitalized tokens)
7+
*
8+
* Order of detection: quoted text first (preserves verbatim spans),
9+
* then compound noun phrases (greedy, longest match), then proper
10+
* nouns (whatever capitalized tokens are not already inside a compound).
11+
*
12+
* Reference: docs.mem0.ai/migration/oss-v2-to-v3 §"Entity Extraction".
13+
*
14+
* @module @framers/agentos/ingest-router/executors/EntityExtractor
15+
*/
16+
17+
import type {
18+
ExtractedEntity,
19+
EntityExtractionResult,
20+
EntityLinkingOptions,
21+
} from './entity-types.js';
22+
23+
/**
 * Regex-only entity extractor (no LLM calls).
 *
 * Detects three kinds of entity, emitted in this order:
 *  1. `quoted-text` — spans wrapped in a matching pair of `"` or `'`.
 *  2. `compound-noun-phrase` — runs of 2..N consecutive capitalized tokens.
 *  3. `proper-noun` — single capitalized tokens that do not start inside a
 *     compound noun phrase.
 *
 * Every occurrence is reported as its own entity with a single start offset
 * in `positions`; repeated mentions are not merged.
 */
export class EntityExtractor {
  /**
   * Quoted spans, 1–200 characters, never crossing a newline.
   *
   * - Group 1: content of a `"..."` pair (may contain apostrophes).
   * - Group 2: content of a `'...'` pair. The opening quote must not follow
   *   an alphanumeric and the closing quote must not precede one, so
   *   apostrophes in contractions/possessives ("Bob's", "don't") are not
   *   mistaken for quote delimiters. An apostrophe sitting between two
   *   alphanumerics is accepted as part of the content.
   *
   * The two content alternatives in group 2 are mutually exclusive, which
   * keeps backtracking linear in the span length.
   *
   * Only ever used through `String.prototype.matchAll`, which works on a
   * clone, so the shared `g`-flagged instance carries no `lastIndex` state.
   */
  private static readonly QUOTED_PATTERN =
    /"([^"\n]{1,200}?)"|(?<![A-Za-z0-9])'((?:[^'\n]|(?<=[A-Za-z0-9])'(?=[A-Za-z0-9])){1,200}?)'(?![A-Za-z0-9])/g;

  /**
   * A single capitalized ASCII token. The tail is `*` rather than `+` so that
   * one-letter capitals are visible to the `properNounMinLength` filter; with
   * the default minimum of 2 the accepted set is unchanged.
   */
  private static readonly PROPER_NOUN_PATTERN = /\b[A-Z][a-zA-Z]*\b/g;

  /** Minimum token length (in characters) for a proper noun to be kept. */
  private readonly properNounMinLength: number;
  /** Sanitized maximum number of tokens in one compound (>= 2, may be Infinity). */
  private readonly compoundNounMaxLength: number;
  /** Compound-phrase matcher, built once from `compoundNounMaxLength`. */
  private readonly compoundPattern: RegExp;

  /**
   * @param opts Optional tuning.
   *   - `properNounMinLength` (default 2): shorter proper nouns are dropped.
   *     `NaN` falls back to the default.
   *   - `compoundNounMaxLength` (default 5): floored to an integer and
   *     clamped to at least 2. `NaN` falls back to the default; `Infinity`
   *     (or any value too large to be a safe integer) means "no upper bound".
   */
  constructor(opts: EntityLinkingOptions = {}) {
    const minLength = opts.properNounMinLength ?? 2;
    this.properNounMinLength = Number.isNaN(minLength) ? 2 : minLength;

    const maxTokens = opts.compoundNounMaxLength ?? 5;
    this.compoundNounMaxLength = Number.isNaN(maxTokens)
      ? 5
      : Math.max(2, Math.floor(maxTokens));

    // A quantifier such as `{1,1.5}` or `{1,NaN}` is not a syntax error in
    // JavaScript: it is silently treated as literal text and the pattern
    // stops matching. Only ever interpolate a plain decimal integer, or omit
    // the upper bound entirely.
    const upperBound = Number.isSafeInteger(this.compoundNounMaxLength)
      ? String(this.compoundNounMaxLength - 1)
      : '';
    this.compoundPattern = new RegExp(
      `\\b[A-Z][a-zA-Z]+(?:\\s+[A-Z][a-zA-Z]+){1,${upperBound}}\\b`,
      'g',
    );
  }

  /**
   * Extracts entities from `text`.
   *
   * @param text Source text to scan.
   * @returns The entities in detection order (all quoted spans, then all
   *   compound noun phrases, then all remaining proper nouns) together with
   *   the unmodified input as `rawText`. For quoted text the reported offset
   *   points at the first character inside the quotes.
   */
  extract(text: string): EntityExtractionResult {
    const entities: ExtractedEntity[] = [];

    // 1. Quoted text — verbatim spans inside a matching quote pair.
    for (const found of text.matchAll(EntityExtractor.QUOTED_PATTERN)) {
      const inner = found[1] ?? found[2];
      if (inner === undefined) continue;
      entities.push({
        text: inner,
        kind: 'quoted-text',
        // +1 skips the opening quote character.
        positions: [(found.index ?? 0) + 1],
      });
    }

    // 2. Compound noun phrases — greedy, so the longest run (up to the
    //    configured maximum) wins. Ranges are remembered for step 3.
    const compoundRanges: Array<{ start: number; end: number }> = [];
    for (const found of text.matchAll(this.compoundPattern)) {
      const start = found.index ?? 0;
      compoundRanges.push({ start, end: start + found[0].length });
      entities.push({
        text: found[0],
        kind: 'compound-noun-phrase',
        positions: [start],
      });
    }

    // 3. Proper nouns — capitalized tokens that are long enough and do not
    //    begin inside any compound range collected above.
    for (const found of text.matchAll(EntityExtractor.PROPER_NOUN_PATTERN)) {
      if (found[0].length < this.properNounMinLength) continue;
      const start = found.index ?? 0;
      const insideCompound = compoundRanges.some(
        (range) => start >= range.start && start < range.end,
      );
      if (insideCompound) continue;
      entities.push({
        text: found[0],
        kind: 'proper-noun',
        positions: [start],
      });
    }

    return { entities, rawText: text };
  }
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/**
2+
* @file EntityLinkingIngestExecutor.ts
3+
* @description Mem0-v3-style ingest executor for the IngestRouter
4+
* `fact-graph` strategy. Extracts entities from content (proper nouns,
5+
* quoted text, compound noun phrases) and surfaces them on the result
6+
* for downstream entity-overlap indexing at recall time.
7+
*
8+
* Unlike Mem0 v2 (which kept a separate Neo4j/Memgraph graph store),
9+
* v3 uses parallel entity columns + multi-signal hybrid search. This
10+
* executor captures the v3 pattern: regex-based extraction at ingest,
11+
* no LLM cost, entities flow alongside chunks for retrieval-time
12+
* re-ranking via {@link EntityRetrievalRanker}.
13+
*
14+
* Reference: docs.mem0.ai/migration/oss-v2-to-v3.
15+
*
16+
* @module @framers/agentos/ingest-router/executors/EntityLinkingIngestExecutor
17+
*/
18+
19+
import { EntityExtractor } from './EntityExtractor.js';
20+
import type { IngestPayload } from './SummarizedIngestExecutor.js';
21+
import type { EntityLinkingOptions } from './entity-types.js';
22+
23+
/**
24+
* Outcome shape for the entity-linking executor.
25+
*/
26+
export interface EntityLinkingOutcome {
27+
/** Trace count; EntityLinkingIngestExecutor.ingest reports the number of chunks it processed. */
writtenTraces: number;
28+
/** Summary text; EntityLinkingIngestExecutor.ingest always returns an empty string here. */
summary: string;
29+
/** Texts to embed; EntityLinkingIngestExecutor.ingest passes the chunks through unchanged. */
embedTexts: string[];
30+
/** Distinct entities found across all chunks, deduplicated. */
31+
entities: string[];
32+
/** Entities found per chunk, in chunk order. */
33+
entitiesPerChunk: string[][];
34+
/** Input token count; EntityLinkingIngestExecutor.ingest always returns 0. */
tokensIn: number;
35+
/** Output token count; EntityLinkingIngestExecutor.ingest always returns 0. */
tokensOut: number;
36+
}
37+
38+
/**
39+
* Reference executor for the IngestRouter `fact-graph` strategy.
40+
* Wires entity extraction at ingest; the bench (or any consumer)
41+
* indexes the entities alongside chunks for entity-overlap re-ranking.
42+
*/
43+
/**
 * Ingest-side executor for the `fact-graph` strategy.
 *
 * Runs an {@link EntityExtractor} over every chunk and reports the entity
 * strings next to the chunk texts, which are passed through untouched.
 */
export class EntityLinkingIngestExecutor {
  /** Strategy ID expected by IngestRouter's FunctionIngestDispatcher registry. */
  readonly strategyId = 'fact-graph' as const;

  private readonly extractor: EntityExtractor;

  /**
   * @param opts Extraction tuning, forwarded unchanged to the EntityExtractor.
   */
  constructor(opts: EntityLinkingOptions = {}) {
    this.extractor = new EntityExtractor(opts);
  }

  /**
   * Extracts entities from the supplied chunks.
   *
   * @param content Full content; used as the single chunk when
   *   `payload.chunks` is null or undefined.
   * @param payload Ingest payload; only `chunks` is read here.
   * @returns One trace per chunk, the chunks themselves as `embedTexts`, the
   *   per-chunk entity strings, and their first-seen-ordered distinct union.
   *   `summary` is empty and both token counters are 0.
   */
  async ingest(
    content: string,
    payload: IngestPayload,
  ): Promise<EntityLinkingOutcome> {
    const chunks = payload.chunks ?? [content];

    const entitiesPerChunk: string[][] = [];
    // A Set iterates in insertion order, so the union keeps first-seen order.
    const distinct = new Set<string>();
    for (const chunk of chunks) {
      const names: string[] = [];
      for (const entity of this.extractor.extract(chunk).entities) {
        names.push(entity.text);
        distinct.add(entity.text);
      }
      entitiesPerChunk.push(names);
    }

    const outcome: EntityLinkingOutcome = {
      writtenTraces: chunks.length,
      summary: '',
      embedTexts: chunks,
      entities: Array.from(distinct),
      entitiesPerChunk,
      tokensIn: 0,
      tokensOut: 0,
    };
    return outcome;
  }
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/**
2+
* @file EntityExtractor.test.ts
3+
* @description Tests for the Mem0-v3-style entity extractor.
4+
* Validates proper noun, quoted text, and compound noun phrase
5+
* detection with character offsets.
6+
*/
7+
8+
import { describe, it, expect } from 'vitest';
9+
import { EntityExtractor } from '../EntityExtractor.js';
10+
11+
describe('EntityExtractor (Mem0 v3 style)', () => {
12+
const ex = new EntityExtractor();
13+
14+
it('extracts proper nouns', () => {
15+
const result = ex.extract('John works at Anthropic in San Francisco.');
16+
const proper = result.entities.filter((e) => e.kind === 'proper-noun').map((e) => e.text);
17+
expect(proper).toEqual(expect.arrayContaining(['John', 'Anthropic']));
18+
// San Francisco is detected as a compound noun phrase, not two proper nouns
19+
});
20+
21+
it('extracts quoted text (double quotes)', () => {
22+
const result = ex.extract('She said "deploy at midnight" before leaving.');
23+
const quoted = result.entities.filter((e) => e.kind === 'quoted-text');
24+
expect(quoted).toHaveLength(1);
25+
expect(quoted[0].text).toBe('deploy at midnight');
26+
});
27+
28+
it('extracts quoted text (single quotes)', () => {
29+
const result = ex.extract("Use 'rerank-v3.5' for now.");
30+
const quoted = result.entities.filter((e) => e.kind === 'quoted-text');
31+
expect(quoted).toHaveLength(1);
32+
expect(quoted[0].text).toBe('rerank-v3.5');
33+
});
34+
35+
it('extracts compound noun phrases (consecutive capitalized tokens)', () => {
36+
const result = ex.extract('San Francisco is north of Los Angeles.');
37+
const cnp = result.entities.filter((e) => e.kind === 'compound-noun-phrase').map((e) => e.text);
38+
expect(cnp).toEqual(expect.arrayContaining(['San Francisco', 'Los Angeles']));
39+
});
40+
41+
it('does not double-count tokens already inside compound noun phrases', () => {
42+
const result = ex.extract('Apple announced something in San Francisco.');
43+
const proper = result.entities.filter((e) => e.kind === 'proper-noun').map((e) => e.text);
44+
// "San" and "Francisco" are inside the compound, so they should NOT
45+
// also appear as standalone proper nouns
46+
expect(proper).not.toContain('San');
47+
expect(proper).not.toContain('Francisco');
48+
expect(proper).toContain('Apple');
49+
});
50+
51+
it('returns position offsets for each entity', () => {
52+
const result = ex.extract('Anthropic released Claude.');
53+
const anthropic = result.entities.find((e) => e.text === 'Anthropic');
54+
expect(anthropic).toBeDefined();
55+
expect(anthropic!.positions[0]).toBe(0);
56+
});
57+
58+
it('returns empty entities for text with no proper nouns', () => {
59+
const result = ex.extract('the quick brown fox jumps over the lazy dog');
60+
expect(result.entities).toHaveLength(0);
61+
});
62+
63+
it('respects properNounMinLength option', () => {
64+
const ex2 = new EntityExtractor({ properNounMinLength: 3 });
65+
const result = ex2.extract('At noon, Bob called Al.');
66+
const proper = result.entities.filter((e) => e.kind === 'proper-noun').map((e) => e.text);
67+
// 'Al' is length 2 with min 3 — should be filtered out
68+
expect(proper).not.toContain('Al');
69+
expect(proper).toContain('Bob');
70+
});
71+
72+
it('preserves raw text on the result', () => {
73+
const text = 'OpenAI built GPT-4.';
74+
const result = ex.extract(text);
75+
expect(result.rawText).toBe(text);
76+
});
77+
});
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/**
2+
* @file EntityLinkingIngestExecutor.test.ts
3+
* @description Tests for the ingest-side executor that extracts entities
4+
* from content and emits them alongside chunks for downstream indexing
5+
* (Stage I, Mem0-v3 style).
6+
*/
7+
8+
import { describe, it, expect } from 'vitest';
9+
import { EntityLinkingIngestExecutor } from '../EntityLinkingIngestExecutor.js';
10+
11+
describe('EntityLinkingIngestExecutor', () => {
12+
const executor = new EntityLinkingIngestExecutor();
13+
14+
it('returns the strategy ID expected by IngestRouter dispatcher', () => {
15+
expect(executor.strategyId).toBe('fact-graph');
16+
});
17+
18+
it('extracts entities and surfaces them on the result', async () => {
19+
const result = await executor.ingest('Anthropic released Claude 3 Opus.', {
20+
sessionId: 'sess-1',
21+
});
22+
expect(result.writtenTraces).toBe(1);
23+
expect(result.entities).toEqual(expect.arrayContaining(['Anthropic']));
24+
});
25+
26+
it('passes content through unchanged in embedTexts (no summary prepend)', async () => {
27+
const result = await executor.ingest('plain content', { sessionId: 'sess-2' });
28+
expect(result.embedTexts).toEqual(['plain content']);
29+
});
30+
31+
it('returns one entity-tagged trace per chunk when payload.chunks supplied', async () => {
32+
const result = await executor.ingest('full text', {
33+
sessionId: 'sess-multi',
34+
chunks: ['John works at Anthropic.', 'Bob works at OpenAI.'],
35+
});
36+
expect(result.writtenTraces).toBe(2);
37+
expect(result.embedTexts).toEqual([
38+
'John works at Anthropic.',
39+
'Bob works at OpenAI.',
40+
]);
41+
expect(result.entitiesPerChunk).toHaveLength(2);
42+
expect(result.entitiesPerChunk[0]).toEqual(expect.arrayContaining(['John', 'Anthropic']));
43+
expect(result.entitiesPerChunk[1]).toEqual(expect.arrayContaining(['Bob', 'OpenAI']));
44+
});
45+
46+
it('costs nothing in tokensIn/tokensOut (regex-only, no LLM)', async () => {
47+
const result = await executor.ingest('any text', { sessionId: 'sess' });
48+
expect(result.tokensIn).toBe(0);
49+
expect(result.tokensOut).toBe(0);
50+
});
51+
});
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/**
2+
* @file entity-types.ts
3+
* @description Types for the Mem0-v3-style entity-linking ingest
4+
* executor (Stage I).
5+
*
6+
* Mem0 v3 dropped its graph store in favor of single-pass ADD-only
7+
* fact extraction with multi-signal hybrid search. Entity extraction
8+
* powers the entity-overlap re-rank signal at recall time. See spec
9+
* §3.2 + STAGE_L_PHASE_A_FINDINGS for why this is the next-priority
10+
* accuracy push.
11+
*
12+
* Reference: docs.mem0.ai/migration/oss-v2-to-v3.
13+
*
14+
* @module @framers/agentos/ingest-router/executors/entity-types
15+
*/
16+
17+
/**
18+
* Three entity kinds extracted at ingest. Mem0 v3 spec.
19+
*/
20+
export type EntityKind = 'proper-noun' | 'quoted-text' | 'compound-noun-phrase';
21+
22+
/**
23+
* One extracted entity with its kind + character offsets.
24+
*/
25+
export interface ExtractedEntity {
26+
/** The literal entity string as it appears in the source text. */
27+
text: string;
28+
/** Classification of how the entity was identified. */
29+
kind: EntityKind;
30+
/** Character offsets in the source where this entity was matched. */
31+
positions: number[];
32+
}
33+
34+
/**
35+
* Result of running EntityExtractor on a piece of text.
36+
*/
37+
export interface EntityExtractionResult {
38+
/** Every entity found, in detection order. */
39+
entities: ExtractedEntity[];
40+
/** The raw text the extractor ran against. */
41+
rawText: string;
42+
}
43+
44+
/**
45+
* Tunable parameters for entity extraction.
46+
*/
47+
export interface EntityLinkingOptions {
48+
/**
49+
* Minimum length for a token to count as a proper noun. Default 2.
50+
* Filters out single-letter capitals (e.g., "I", "A").
51+
*/
52+
properNounMinLength?: number;
53+
/**
54+
* Maximum number of consecutive capitalized tokens to count as one
55+
* compound noun phrase. Default 5.
56+
*/
57+
compoundNounMaxLength?: number;
58+
}

src/ingest-router/executors/index.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,22 @@ export type { IngestOutcome, IngestPayload } from './SummarizedIngestExecutor.js
1919
export { RawChunksIngestExecutor } from './RawChunksIngestExecutor.js';
2020
export type { RawChunksOutcome } from './RawChunksIngestExecutor.js';
2121
export { SkipIngestExecutor } from './SkipIngestExecutor.js';
22+
export { EntityExtractor } from './EntityExtractor.js';
23+
export { EntityLinkingIngestExecutor } from './EntityLinkingIngestExecutor.js';
24+
export type { EntityLinkingOutcome } from './EntityLinkingIngestExecutor.js';
25+
export type {
26+
EntityKind,
27+
ExtractedEntity,
28+
EntityExtractionResult,
29+
EntityLinkingOptions,
30+
} from './entity-types.js';
2231

2332
import { SummarizedIngestExecutor } from './SummarizedIngestExecutor.js';
2433
import { RawChunksIngestExecutor } from './RawChunksIngestExecutor.js';
2534
import { SkipIngestExecutor } from './SkipIngestExecutor.js';
35+
import { EntityLinkingIngestExecutor } from './EntityLinkingIngestExecutor.js';
2636
import type { SessionSummarizer } from '../../memory/ingest/SessionSummarizer.js';
37+
import type { EntityLinkingOptions } from './entity-types.js';
2738

2839
export function createSummarizedIngestExecutor(opts: {
2940
summarizer: SessionSummarizer;
@@ -38,3 +49,9 @@ export function createRawChunksIngestExecutor(): RawChunksIngestExecutor {
3849
export function createSkipIngestExecutor(): SkipIngestExecutor {
3950
return new SkipIngestExecutor();
4051
}
52+
53+
/**
 * Factory counterpart to the other `create*IngestExecutor` helpers.
 *
 * @param opts Entity-extraction tuning, forwarded unchanged to the
 *   EntityLinkingIngestExecutor constructor. Defaults to `{}`.
 * @returns A new EntityLinkingIngestExecutor instance.
 */
export function createEntityLinkingIngestExecutor(
  opts: EntityLinkingOptions = {},
): EntityLinkingIngestExecutor {
  const executor = new EntityLinkingIngestExecutor(opts);
  return executor;
}

0 commit comments

Comments
 (0)