Skip to content

Commit 6af478d

Browse files
committed
feat(memory): add PdfLoader (unpdf), OcrPdfLoader (tesseract.js), DoclingLoader, DocxLoader
Implements Task 6 of the memory ingestion engine: - PdfLoader: tiered extraction (unpdf primary, OCR fallback for sparse text, Docling when available) - OcrPdfLoader: optional Tesseract.js loader with factory returning null when not installed - DoclingLoader: Python subprocess loader with spawnSync availability check - DocxLoader: mammoth-based DOCX text extraction with wordCount metadata - LoaderRegistry: .pdf and .docx now registered by default; optional loaders conditionally registered - Tests: 18 tests in pdf.test.ts + updated loaders.test.ts (30 tests) — all passing
1 parent 3c3f770 commit 6af478d

8 files changed

Lines changed: 1096 additions & 16 deletions

File tree

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,9 +230,11 @@
230230
"axios": "^1.7.7",
231231
"gray-matter": "^4.0.3",
232232
"lru-cache": "^11.1.0",
233+
"mammoth": "^1.12.0",
233234
"natural": "^6.12.0",
234235
"openredaction": "^1.1.2",
235236
"pino": "^9.14.0",
237+
"unpdf": "^1.4.0",
236238
"uuid": "^11.1.0",
237239
"yaml": "^2.8.1",
238240
"zod": "^4.3.6"
Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
/**
2+
* @fileoverview DoclingLoader — high-fidelity PDF/DOCX extraction via Python Docling.
3+
*
4+
* Docling (https://github.com/DS4SD/docling) is an IBM Research open-source
5+
* library that converts PDFs and office documents to structured JSON, preserving
6+
* tables, figures, and layout information far beyond what pure-JS text extraction
7+
* can achieve.
8+
*
9+
* This module provides a factory function {@link createDoclingLoader} that:
10+
 * 1. Checks whether `python3 -m docling --version` succeeds, using the `python3` found on the current PATH.
11+
* 2. If it does, returns a {@link DoclingLoader} instance that spawns a
12+
* `python3 -m docling` subprocess for each document.
13+
* 3. If Docling is not installed, returns `null` gracefully.
14+
*
15+
* ### Opting in
16+
* ```sh
17+
* pip install docling
18+
* ```
19+
*
20+
* @module memory/ingestion/DoclingLoader
21+
*/
22+
23+
import { spawn, spawnSync } from 'node:child_process';
24+
import path from 'node:path';
25+
import os from 'node:os';
26+
import fs from 'node:fs/promises';
27+
import type { IDocumentLoader } from './IDocumentLoader.js';
28+
import type { LoadOptions, LoadedDocument, DocumentMetadata } from '../facade/types.js';
29+
30+
// ---------------------------------------------------------------------------
31+
// Constants
32+
// ---------------------------------------------------------------------------
33+
34+
/**
 * File extensions (lower-case, including the leading dot) that this loader
 * accepts: PDF and DOCX.
 */
const SUPPORTED_EXTENSIONS: readonly ['.pdf', '.docx'] = ['.pdf', '.docx'];
36+
37+
// ---------------------------------------------------------------------------
38+
// Helpers
39+
// ---------------------------------------------------------------------------
40+
41+
/**
 * Extracts the extension of a path and normalises it to lower case.
 *
 * @param filePath - Absolute or relative file path.
 * @returns The extension including its leading dot (e.g. `.pdf`), or an
 *   empty string when the path has no extension.
 */
function extOf(filePath: string): string {
  const rawExtension = path.extname(filePath);
  return rawExtension.toLowerCase();
}
49+
50+
// ---------------------------------------------------------------------------
51+
// Docling JSON output shape (minimal — we only map what we need)
52+
// ---------------------------------------------------------------------------
53+
54+
/**
 * Shape of the `metadata` object inside Docling's JSON output. Both the
 * camelCase and snake_case page-count spellings are declared because the
 * loader accepts either.
 *
 * @internal
 */
interface DoclingMetadataBlock {
  title?: string;
  author?: string;
  pageCount?: number;
  page_count?: number;
}

/**
 * Minimal view of the JSON document Docling emits when invoked with
 * `--output-format json`. Only the fields this loader reads are typed; any
 * other key is admitted through the index signature as `unknown`.
 *
 * @internal
 */
interface DoclingJsonOutput {
  /** Full extracted text (Docling v2+). */
  text?: string;

  /** Document metadata block. */
  metadata?: DoclingMetadataBlock;

  /** Older Docling format: array of per-page text blocks. */
  pages?: Array<{ text?: string }>;

  /** Index signature kept for forward compatibility with unknown keys. */
  [key: string]: unknown;
}
79+
80+
// ---------------------------------------------------------------------------
81+
// DoclingLoader (internal class)
82+
// ---------------------------------------------------------------------------
83+
84+
/**
85+
* High-fidelity document loader that delegates to a `python3 -m docling`
86+
* subprocess.
87+
*
88+
* Consumers should use {@link createDoclingLoader} rather than constructing
89+
* this class directly so that the Python availability check is always run
90+
* before first use.
91+
*
92+
* @implements {IDocumentLoader}
93+
*/
94+
class DoclingLoader implements IDocumentLoader {
95+
/** @inheritdoc */
96+
readonly supportedExtensions: string[] = [...SUPPORTED_EXTENSIONS];
97+
98+
// -------------------------------------------------------------------------
99+
// canLoad
100+
// -------------------------------------------------------------------------
101+
102+
/** @inheritdoc */
103+
canLoad(source: string | Buffer): boolean {
104+
if (Buffer.isBuffer(source)) {
105+
// Without an extension we can't determine compatibility from bytes alone.
106+
return false;
107+
}
108+
return (SUPPORTED_EXTENSIONS as readonly string[]).includes(extOf(source) as '.pdf' | '.docx');
109+
}
110+
111+
// -------------------------------------------------------------------------
112+
// load
113+
// -------------------------------------------------------------------------
114+
115+
/** @inheritdoc */
116+
async load(source: string | Buffer, _options?: LoadOptions): Promise<LoadedDocument> {
117+
let filePath: string;
118+
let tempFile: string | null = null;
119+
120+
if (Buffer.isBuffer(source)) {
121+
// Write buffer to a temp file so Docling has a real path to read.
122+
tempFile = path.join(os.tmpdir(), `docling-input-${Date.now()}.pdf`);
123+
await fs.writeFile(tempFile, source);
124+
filePath = tempFile;
125+
} else {
126+
filePath = source;
127+
}
128+
129+
try {
130+
const jsonOutput = await this._runDocling(filePath);
131+
return this._mapToLoadedDocument(jsonOutput, Buffer.isBuffer(source) ? undefined : source);
132+
} finally {
133+
// Clean up any temp file we created.
134+
if (tempFile !== null) {
135+
await fs.unlink(tempFile).catch(() => { /* ignore cleanup errors */ });
136+
}
137+
}
138+
}
139+
140+
// -------------------------------------------------------------------------
141+
// Private: subprocess invocation
142+
// -------------------------------------------------------------------------
143+
144+
/**
145+
* Spawn `python3 -m docling --output-format json <filePath>` and collect
146+
* stdout.
147+
*
148+
* @param filePath - Absolute path to the PDF or DOCX file.
149+
* @returns Parsed Docling JSON output.
150+
* @throws When the subprocess exits with a non-zero code or stdout is not
151+
* valid JSON.
152+
*/
153+
private async _runDocling(filePath: string): Promise<DoclingJsonOutput> {
154+
return new Promise((resolve, reject) => {
155+
let stdout = '';
156+
let stderr = '';
157+
158+
const proc = spawn('python3', ['-m', 'docling', '--output-format', 'json', filePath], {
159+
stdio: ['ignore', 'pipe', 'pipe'],
160+
});
161+
162+
proc.stdout.on('data', (chunk: Buffer) => {
163+
stdout += chunk.toString('utf8');
164+
});
165+
166+
proc.stderr.on('data', (chunk: Buffer) => {
167+
stderr += chunk.toString('utf8');
168+
});
169+
170+
proc.on('close', (code) => {
171+
if (code !== 0) {
172+
reject(new Error(
173+
`DoclingLoader: python3 -m docling exited with code ${code}.\n${stderr.slice(0, 500)}`,
174+
));
175+
return;
176+
}
177+
178+
try {
179+
const parsed = JSON.parse(stdout) as DoclingJsonOutput;
180+
resolve(parsed);
181+
} catch (err) {
182+
reject(new Error(
183+
`DoclingLoader: failed to parse Docling JSON output: ${String(err)}\n` +
184+
`stdout (first 500 chars): ${stdout.slice(0, 500)}`,
185+
));
186+
}
187+
});
188+
189+
proc.on('error', (err) => {
190+
reject(new Error(`DoclingLoader: failed to spawn python3: ${err.message}`));
191+
});
192+
});
193+
}
194+
195+
// -------------------------------------------------------------------------
196+
// Private: JSON → LoadedDocument mapping
197+
// -------------------------------------------------------------------------
198+
199+
/**
200+
* Convert a Docling JSON output object to a {@link LoadedDocument}.
201+
*
202+
* Handles both the newer (`text` top-level string) and older
203+
* (`pages[].text` array) Docling output shapes.
204+
*
205+
* @param json - Parsed Docling JSON.
206+
* @param resolvedPath - Original source path for the `source` metadata field.
207+
*/
208+
private _mapToLoadedDocument(
209+
json: DoclingJsonOutput,
210+
resolvedPath?: string,
211+
): LoadedDocument {
212+
// Prefer top-level `text` (Docling v2+), fall back to concatenating pages.
213+
let content: string;
214+
if (typeof json['text'] === 'string') {
215+
content = json['text'];
216+
} else if (Array.isArray(json['pages'])) {
217+
content = json['pages']
218+
.map((p) => (typeof p['text'] === 'string' ? p['text'] : ''))
219+
.join('\n\n');
220+
} else {
221+
content = '';
222+
}
223+
224+
const rawMeta = json['metadata'] ?? {};
225+
const pageCount: number | undefined =
226+
typeof rawMeta['pageCount'] === 'number' ? rawMeta['pageCount'] :
227+
typeof rawMeta['page_count'] === 'number' ? rawMeta['page_count'] :
228+
undefined;
229+
230+
const meta: DocumentMetadata = {
231+
...(typeof rawMeta['title'] === 'string' && rawMeta['title']
232+
? { title: rawMeta['title'] }
233+
: {}),
234+
...(typeof rawMeta['author'] === 'string' ? { author: rawMeta['author'] } : {}),
235+
...(pageCount !== undefined ? { pageCount } : {}),
236+
...(resolvedPath ? { source: resolvedPath } : {}),
237+
};
238+
239+
return {
240+
content,
241+
metadata: meta,
242+
format: 'pdf',
243+
};
244+
}
245+
}
246+
247+
// ---------------------------------------------------------------------------
248+
// Factory
249+
// ---------------------------------------------------------------------------
250+
251+
/**
 * Probes for a usable Docling installation and returns a loader when one is
 * found.
 *
 * The probe runs `python3 -m docling --version` synchronously via `spawnSync`
 * with output discarded and a 5000 ms timeout. Docling is considered
 * available only when the probe reports no `error` and exits with status 0;
 * in every other case (including an exception raised by the probe itself)
 * the function returns `null`.
 *
 * ### Usage
 * ```ts
 * import { createDoclingLoader } from './DoclingLoader.js';
 * import { PdfLoader } from './PdfLoader.js';
 *
 * const doclingLoader = createDoclingLoader();
 * const loader = new PdfLoader(null, doclingLoader);
 * ```
 *
 * @returns A `DoclingLoader` instance when Docling is installed, or `null`.
 */
export function createDoclingLoader(): IDocumentLoader | null {
  try {
    const { error, status } = spawnSync('python3', ['-m', 'docling', '--version'], {
      stdio: 'ignore',
      timeout: 5000,
    });

    // A missing binary or a timed-out probe is reported through `error`; a
    // non-zero (or null) status means the module did not run successfully.
    const doclingAvailable = error === undefined && status === 0;
    return doclingAvailable ? new DoclingLoader() : null;
  } catch {
    // python3 is not in PATH or docling is not installed.
    return null;
  }
}

0 commit comments

Comments
 (0)