Skip to content

Commit 6156cdc

Browse files
committed
feat(validation): add centralized extractJson for LLM output parsing
Handles raw JSON, markdown fenced blocks, <thinking> block stripping, brace/bracket matching for embedded JSON, and JSONL multi-line output. Replaces duplicated extraction logic across 30+ AgentOS files.
1 parent 94b8720 commit 6156cdc

2 files changed

Lines changed: 248 additions & 0 deletions

File tree

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { extractJson } from '../extractJson.js';
3+
4+
describe('extractJson', () => {
  /** One scenario whose extraction result is compared against an exact value. */
  interface ExactCase {
    /** Test title passed to `it`. */
    readonly title: string;
    /** Raw text handed to `extractJson`. */
    readonly input: string;
    /** Expected return value; `null` means no JSON should be found. */
    readonly expected: string | null;
  }

  /** Registers one `it` per scenario, in table order. */
  const registerExactCases = (cases: readonly ExactCase[]): void => {
    for (const { title, input, expected } of cases) {
      it(title, () => {
        const actual = extractJson(input);
        if (expected === null) {
          expect(actual).toBeNull();
        } else {
          expect(actual).toBe(expected);
        }
      });
    }
  };

  // Inputs that are already valid JSON must come back unchanged.
  const rawObject = '{"type":"semantic","content":"fact"}';
  const rawArray = '[{"a":1},{"b":2}]';
  const escapedQuotes = '{"content":"She said \\"hello\\""}';

  registerExactCases([
    {
      title: 'returns raw JSON string when input is valid JSON object',
      input: rawObject,
      expected: rawObject,
    },
    {
      title: 'returns raw JSON string when input is valid JSON array',
      input: rawArray,
      expected: rawArray,
    },
    {
      title: 'extracts JSON from markdown fenced block with json tag',
      input: 'Here is the result:\n```json\n{"type":"semantic"}\n```\nDone.',
      expected: '{"type":"semantic"}',
    },
    {
      title: 'extracts JSON from markdown fenced block without tag',
      input: '```\n{"key":"value"}\n```',
      expected: '{"key":"value"}',
    },
    {
      title: 'strips <thinking> blocks before extraction',
      input: '<thinking>Let me reason about this.</thinking>\n{"type":"episodic","content":"event"}',
      expected: '{"type":"episodic","content":"event"}',
    },
    {
      title: 'extracts first JSON object via brace matching',
      input: 'Some preamble text {"type":"semantic","content":"a fact"} and trailing text',
      expected: '{"type":"semantic","content":"a fact"}',
    },
    {
      title: 'extracts first JSON array via bracket matching',
      input: 'Results: [{"a":1},{"b":2}] done',
      expected: '[{"a":1},{"b":2}]',
    },
    {
      title: 'handles nested objects in brace extraction',
      input: 'Result: {"outer":{"inner":"value"},"list":[1,2]}',
      expected: '{"outer":{"inner":"value"},"list":[1,2]}',
    },
  ]);

  // JSONL output is re-serialized as an array, so the parsed shape is
  // inspected rather than the exact string.
  it('parses JSONL (multiple JSON objects on separate lines) as array', () => {
    const extracted = extractJson('{"type":"a","content":"1"}\n{"type":"b","content":"2"}');
    expect(extracted).not.toBeNull();
    const records = JSON.parse(extracted!);
    expect(records).toHaveLength(2);
    expect(records[0].type).toBe('a');
    expect(records[1].type).toBe('b');
  });

  registerExactCases([
    {
      title: 'returns null for completely malformed input',
      input: 'This is just plain text with no JSON',
      expected: null,
    },
    {
      title: 'returns null for empty string',
      input: '',
      expected: null,
    },
    {
      title: 'handles strings with escaped quotes inside JSON',
      input: escapedQuotes,
      expected: escapedQuotes,
    },
  ]);
});

src/core/validation/extractJson.ts

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/**
2+
* @fileoverview Centralized JSON extraction from messy LLM output.
3+
*
4+
* LLMs return structured data in many formats: raw JSON, markdown-fenced
5+
* blocks, JSON wrapped in prose, JSONL, or JSON preceded by chain-of-thought
6+
* `<thinking>` blocks. This module handles all of them with a priority-ordered
7+
* extraction pipeline.
8+
*
9+
* Replaces the ad-hoc `JSON.parse` + `match()` + `split('\n')` patterns
10+
* duplicated across 30+ files in AgentOS and wilds-ai.
11+
*
12+
* @module agentos/core/validation/extractJson
13+
*/
14+
15+
/**
 * Extract JSON from raw LLM output text.
 *
 * Tries multiple extraction strategies in priority order:
 * 1. Raw JSON (the entire trimmed string is a valid JSON object or array)
 * 2. Markdown fenced blocks (```json ... ``` or ``` ... ```); every untagged
 *    or `json`-tagged block is tried in document order, first valid one wins
 * 3. Strip `<thinking>` blocks (any letter case), then retry on the remainder
 * 4. JSONL (two or more JSON objects/arrays on separate lines → array)
 * 5. First balanced `{...}` or `[...]` that parses, via brace/bracket matching
 *
 * JSONL is deliberately checked before brace matching so multi-line output
 * is not truncated to just its first object.
 *
 * @param rawText - Raw LLM output that may contain JSON
 * @returns Extracted JSON string, or null if no valid JSON found
 *
 * @example
 * ```ts
 * extractJson('```json\n{"key": "value"}\n```')    // '{"key": "value"}'
 * extractJson('<thinking>hmm</thinking>\n{"a":1}') // '{"a":1}'
 * extractJson('see {this}: {"a":1}')               // '{"a":1}'
 * extractJson('no json here')                      // null
 * ```
 */
export function extractJson(rawText: string): string | null {
  // Guard against non-string values coming from untyped (plain JS) callers.
  if (typeof rawText !== 'string') return null;

  const trimmed = rawText.trim();
  if (trimmed.length === 0) return null;

  // Strategy 1: Raw JSON — entire string is a valid JSON object or array.
  const looksLikeObject = trimmed.startsWith('{') && trimmed.endsWith('}');
  const looksLikeArray = trimmed.startsWith('[') && trimmed.endsWith(']');
  if ((looksLikeObject || looksLikeArray) && isValidJson(trimmed)) {
    return trimmed;
  }

  // Strategy 2: Markdown fenced blocks.
  // The pattern consumes fences with ANY language tag so that opening and
  // closing fences pair up correctly, but only untagged or `json`-tagged
  // blocks are considered as JSON candidates. The regex is created per call
  // because a global regex carries `lastIndex` state between `exec` calls.
  const fencePattern = /```([^\s`]*)\s*\n([\s\S]*?)\n\s*```/g;
  let fence: RegExpExecArray | null;
  while ((fence = fencePattern.exec(trimmed)) !== null) {
    const tag = (fence[1] ?? '').toLowerCase();
    if (tag !== '' && tag !== 'json') continue;
    const content = (fence[2] ?? '').trim();
    if (content.length > 0 && isValidJson(content)) {
      return content;
    }
  }

  // Strategy 3: Strip <thinking>...</thinking> blocks, then retry.
  // Chain-of-thought reasoning often precedes the actual JSON output. The
  // replacement is case-insensitive, and the "did anything change" check
  // below is the only guard, so differently-cased tags are handled too.
  const withoutThinking = trimmed
    .replace(/<thinking>[\s\S]*?<\/thinking>/gi, '')
    .trim();
  if (withoutThinking.length > 0 && withoutThinking !== trimmed) {
    const fromRemainder = extractJson(withoutThinking);
    if (fromRemainder !== null) return fromRemainder;
  }

  // Strategy 4: JSONL — multiple JSON objects on separate lines.
  // Non-JSON lines (commentary) and JSON scalars are skipped; at least two
  // object/array lines are required before the text is treated as JSONL.
  const lines = trimmed
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  if (lines.length >= 2) {
    const records: unknown[] = [];
    for (const line of lines) {
      try {
        const parsed: unknown = JSON.parse(line);
        if (typeof parsed === 'object' && parsed !== null) {
          records.push(parsed);
        }
      } catch {
        // Not a JSON line — ignore and keep scanning.
      }
    }
    if (records.length >= 2) {
      return JSON.stringify(records);
    }
  }

  // Strategy 5: First balanced {...} or [...] that is valid JSON.
  // Handles JSON embedded in prose like "Here is the result: {...} done."
  return extractByBraceMatching(trimmed);
}

/**
 * Report whether `text` parses as JSON without throwing.
 *
 * @param text - Candidate JSON text
 * @returns true when `JSON.parse` accepts the text
 */
function isValidJson(text: string): boolean {
  try {
    JSON.parse(text);
    return true;
  } catch {
    return false;
  }
}

/**
 * Find the index of the bracket that closes the one at `start`.
 *
 * Only the bracket type found at `start` is depth-counted; characters inside
 * double-quoted strings (including backslash-escaped quotes) are ignored.
 *
 * @param text - Text being scanned
 * @param start - Index of an opening `{` or `[`
 * @returns Index of the matching close character, or -1 if never balanced
 */
function findMatchingClose(text: string, start: number): number {
  const openChar = text[start];
  const closeChar = openChar === '{' ? '}' : ']';

  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];

    if (escaped) {
      // Character after a backslash inside a string is always literal.
      escaped = false;
      continue;
    }
    if (inString) {
      if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }

    if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) return i;
    }
  }

  return -1;
}

/**
 * Extract the first balanced JSON object or array from text using
 * brace/bracket depth counting. Handles nested structures and
 * string escaping.
 *
 * A balanced span that is not valid JSON (for example `{placeholder}` or
 * `[see docs]` in prose) is skipped and the search continues after it, so
 * each character is scanned at most once. If an opener is never balanced the
 * search stops and returns null rather than returning a nested fragment of
 * what may be truncated output.
 *
 * @param text - Text that may contain embedded JSON
 * @returns Extracted JSON string, or null if no valid balanced structure found
 */
function extractByBraceMatching(text: string): string | null {
  let cursor = 0;

  while (cursor < text.length) {
    const ch = text[cursor];
    if (ch !== '{' && ch !== '[') {
      cursor++;
      continue;
    }

    const end = findMatchingClose(text, cursor);
    if (end === -1) return null;

    const candidate = text.slice(cursor, end + 1);
    if (isValidJson(candidate)) return candidate;

    // Balanced but not JSON — resume the search after this span.
    cursor = end + 1;
  }

  return null;
}

0 commit comments

Comments
 (0)