Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Docling-agent simplifies agentic operation on documents, such as writing, editin
- [Document writing](examples/example_01_write_report.py): Generate well-structured reports from natural prompts and export to JSON/Markdown/HTML.
- [Targeted editing](examples/example_02_edit_report.py): Load an existing Docling JSON and apply focused edits with natural-language tasks.
- [Schema-guided extraction](examples/example_03_extract_schema.py): Extract typed fields from PDFs/images using a simple schema and produce HTML reports. See examples on curriculum_vitae, papers, invoices, etc.
- [Document enrichment](examples/example_04_enrich_document.py): Enrich existing documents with summaries, search keywords, key entities, and item classifications (language/function).
- Model-agnostic: Plug in different backends via [Mellea](https://github.com/generative-computing/mellea) `model_ids` (e.g., OpenAI GPT OSS, IBM Granite).
- Simple API surface: Use `agent.run(...)` with `DoclingDocument` in/out; save via `save_as_*` helpers.
- Optional tools: Integrate external tools (e.g., MCP) when available.
Expand Down Expand Up @@ -86,6 +87,24 @@ report = agent.run(task=str(schema), sources=sources)
report.save_as_html("./scratch/invoices_extraction_report.html")
```

### Enrich an existing document (see [example](examples/example_04_enrich_document.py)):

Run enrichment passes such as summaries, search keywords, key entities, and item classifications on an existing Docling JSON document.

```python
from pathlib import Path
from mellea.backends import model_ids
from docling_core.types.doc.document import DoclingDocument
from docling_agent.agents import DoclingEnrichingAgent

ipath = Path("./examples/example_02_edit_resources/20250815_125216.json")
doc = DoclingDocument.load_from_json(ipath)

agent = DoclingEnrichingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])
enriched = agent.run(task="Summarize each paragraph, table, and section header.", document=doc)
enriched.save_as_html("./scratch/enriched_summaries.html")
```

## Documentation

**Coming soon**
Expand Down
1 change: 1 addition & 0 deletions docling_agent/agent/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class DoclingAgentType(Enum):
DOCLING_DOCUMENT_WRITER = "writer"
DOCLING_DOCUMENT_EDITOR = "editor"
DOCLING_DOCUMENT_EXTRACTOR = "extractor"
DOCLING_DOCUMENT_ENRICHER = "enricher"

def __str__(self) -> str:
"""Return the string value of the enum."""
Expand Down
136 changes: 136 additions & 0 deletions docling_agent/agent/enricher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from pathlib import Path
from typing import Any, ClassVar

from mellea.backends.model_ids import ModelIdentifier
from mellea.stdlib.sampling import RejectionSamplingStrategy
from pydantic import Field

from docling_core.types.doc.document import (
DoclingDocument,
)

from docling_agent.agent.base import BaseDoclingAgent, DoclingAgentType
from docling_agent.agent.base_functions import find_json_dicts
from docling_agent.agent_models import setup_local_session, view_linear_context
from docling_agent.logging import logger


class DoclingEnrichingAgent(BaseDoclingAgent):
    """Agent for enriching a document with metadata like summaries, keywords,
    entities, and classifications.

    This scaffold routes the task to one of several enrichment operations using
    a small reasoning step that returns a JSON instruction containing an
    `operation` field. Each operation function currently iterates items and is
    left for concrete implementation.
    """

    # Simple system prompt to route enrichment tasks
    system_prompt_for_enrichment_routing: ClassVar[str] = (
        """
You are a precise document enrichment router. Given a natural language task description, select exactly one operation to run and return only one JSON object in a ```json ...``` block, with the following schema:

{
"operation": "summarize_items" | "find_search_keywords" | "detect_key_entities" | "classify_items",
"args": { }
}

Return no extra commentary. If multiple seem plausible, choose the single best fit.
"""
    )

    # Store last chosen operation for introspection/debugging (optional)
    last_operation: dict[str, Any] = Field(default_factory=dict)

    def __init__(self, *, model_id: ModelIdentifier, tools: list | None = None):
        """Create an enriching agent.

        Args:
            model_id: Model identifier forwarded to the base agent.
            tools: Optional tools forwarded to the base agent. Omitting the
                argument (or passing ``None``) is equivalent to passing an
                empty list.
        """
        super().__init__(
            agent_type=DoclingAgentType.DOCLING_DOCUMENT_ENRICHER,
            model_id=model_id,
            tools=[] if tools is None else tools,
        )

    def run(
        self,
        task: str,
        document: DoclingDocument | None = None,
        sources: list[DoclingDocument | Path] | None = None,
        **kwargs,
    ) -> DoclingDocument:
        """Route ``task`` to one enrichment operation and run it on ``document``.

        Args:
            task: Natural-language description of the enrichment to perform.
            document: The document to enrich; required despite the ``None``
                default.
            sources: Accepted for signature compatibility; not used here.
            **kwargs: Accepted for signature compatibility; not used here.

        Returns:
            The same ``document`` object that was passed in.

        Raises:
            ValueError: If ``document`` is ``None``, if the routing result
                names an unknown operation, or if its ``args`` is not a JSON
                object.
        """
        if document is None:
            raise ValueError("Document must not be None")

        op = self._choose_operation(task=task)
        self.last_operation = op

        operation = op.get("operation")

        # The router is a language model: `args` may be missing, null, or not
        # an object at all. Normalise it before unpacking as keyword arguments.
        raw_args = op.get("args") or {}
        if not isinstance(raw_args, dict):
            raise ValueError(
                f"`args` must be a JSON object in routing result: {op}"
            )
        # `document` is always supplied by this method; drop a colliding key so
        # the call below cannot fail with a duplicate keyword argument.
        args = {key: value for key, value in raw_args.items() if key != "document"}

        logger.info(f"Chosen enrichment operation: {operation}")

        handlers = {
            "summarize_items": self._summarize_items,
            "find_search_keywords": self._find_search_keywords,
            "detect_key_entities": self._detect_key_entities,
            "classify_items": self._classify_items,
        }
        # Guard the lookup: a non-string (possibly unhashable) `operation`
        # must end up in the "unknown operation" error below.
        handler = handlers.get(operation) if isinstance(operation, str) else None
        if handler is None:
            raise ValueError(
                f"Unknown enrichment operation: {operation}. Op payload: {op}"
            )

        handler(document=document, **args)
        return document

    def _choose_operation(self, *, task: str, loop_budget: int = 5) -> dict[str, Any]:
        """Ask the reasoning model which enrichment operation fits ``task``.

        Args:
            task: Natural-language task description sent as the instruction.
            loop_budget: Budget passed to ``RejectionSamplingStrategy``.

        Returns:
            The first JSON dict found in the model answer; it is guaranteed to
            contain an ``operation`` key.

        Raises:
            ValueError: If no JSON dict is found in the answer, or the first
                one lacks an ``operation`` key.
        """
        logger.info(f"task: {task}")

        m = setup_local_session(
            model_id=self.get_reasoning_model_id(),
            system_prompt=self.system_prompt_for_enrichment_routing,
        )

        answer = m.instruct(
            task,
            strategy=RejectionSamplingStrategy(loop_budget=loop_budget),
        )

        # NOTE(review): presumably prints/logs the session context for
        # debugging -- confirm against `agent_models.view_linear_context`.
        view_linear_context(m)

        ops = find_json_dicts(text=answer.value)
        if not ops:
            raise ValueError("No routing operation detected in model response")

        first = ops[0]
        if "operation" not in first:
            raise ValueError(f"`operation` not found in routing result: {first}")
        return first

    # --- Enrichment operations (scaffolds) ---
    def _summarize_items(self, *, document: DoclingDocument, **kwargs) -> None:
        """Scaffold: walk all items (groups included); summarization is not implemented yet."""
        logger.info("_summarize_items: iterating over document items")
        for _item, _level in document.iterate_items(with_groups=True):
            # TODO: implement summarization per item
            # e.g., update item.meta.summary
            pass

    def _find_search_keywords(self, *, document: DoclingDocument, **kwargs) -> None:
        """Scaffold: walk all items (groups included); keyword extraction is not implemented yet."""
        logger.info("_find_search_keywords: iterating over document items")
        for _item, _level in document.iterate_items(with_groups=True):
            # TODO: implement keyword extraction per item
            pass

    def _detect_key_entities(self, *, document: DoclingDocument, **kwargs) -> None:
        """Scaffold: walk all items (groups included); entity detection is not implemented yet."""
        logger.info("_detect_key_entities: iterating over document items")
        for _item, _level in document.iterate_items(with_groups=True):
            # TODO: implement entity detection per item
            pass

    def _classify_items(self, *, document: DoclingDocument, **kwargs) -> None:
        """Scaffold: walk all items (groups included); classification is not implemented yet."""
        logger.info("_classify_items: iterating over document items")
        for _item, _level in document.iterate_items(with_groups=True):
            # TODO: implement classification per item (language, function, etc.)
            pass

Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from docling_agent.agent_models import setup_local_session
from docling_agent.logging import logger

# Use shared logger from docling_agent.agents


class DoclingExtractingAgent(BaseDoclingAgent):
system_prompt_schema_extraction: ClassVar[str] = (
Expand Down Expand Up @@ -152,3 +150,4 @@ def validate_json_str(text: str) -> bool:
)

return json.loads(answer.value)

4 changes: 3 additions & 1 deletion docling_agent/agents.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# Public re-exports for convenience imports in examples.
# NOTE: the extracting agent lives in `docling_agent.agent.extractor`; the
# former `docling_agent.agent.extraction` module path is no longer imported.
from docling_agent.agent.editor import DoclingEditingAgent
from docling_agent.agent.enricher import DoclingEnrichingAgent
from docling_agent.agent.extractor import DoclingExtractingAgent
from docling_agent.agent.writer import DoclingWritingAgent
from docling_agent.logging import logger

__all__ = [
    "DoclingEditingAgent",
    "DoclingExtractingAgent",
    "DoclingEnrichingAgent",
    "DoclingWritingAgent",
    "logger",
]
64 changes: 64 additions & 0 deletions examples/example_04_enrich_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import os
from pathlib import Path

from mellea.backends import model_ids

from docling_core.types.doc.document import DoclingDocument

from docling_agent.agents import DoclingEnrichingAgent, logger


def run_task(
    ipath: Path,
    task: str,
    suffix: str,
    model_id=model_ids.OPENAI_GPT_OSS_20B,
    tools: list | None = None,
):
    """Enrich one Docling JSON document and write the result as HTML.

    Args:
        ipath: Path to the Docling JSON document to load.
        task: Natural-language enrichment task handed to the agent.
        suffix: Text appended to the input file stem to name the output file.
        model_id: Model identifier used to build the agent.
        tools: Optional tools for the agent; a falsy value becomes ``[]``.
    """
    source_doc = DoclingDocument.load_from_json(ipath)

    enricher = DoclingEnrichingAgent(model_id=model_id, tools=tools or [])
    enriched_doc = enricher.run(task=task, document=source_doc)

    # Reports land in ./scratch, which is created on demand.
    scratch_dir = Path("./scratch")
    os.makedirs(scratch_dir, exist_ok=True)

    opath = scratch_dir / f"{ipath.stem}{suffix}.html"
    enriched_doc.save_as_html(filename=opath)
    logger.info(f"enrichment report written to `{opath}`")


def main():
    """Run the four example enrichment tasks against the sample document."""
    model_id = model_ids.OPENAI_GPT_OSS_20B

    # Example document to enrich (reuse the editing sample document)
    ipath = Path("./examples/example_02_edit_resources/20250815_125216.json")

    # Task prompt -> output-file suffix; dict order is the execution order.
    suffix_by_task: dict[str, str] = {
        "Summarize each paragraph, table, and section header in this document.": "_summaries",
        "Find search keywords for each paragraph, table, and section header.": "_keywords",
        "Detect key entities across paragraphs, tables, and sections.": "_entities",
        "Classify items by language and function (e.g., title, abstract, claim, reference).": "_classifications",
    }

    for task, suffix in suffix_by_task.items():
        run_task(ipath=ipath, task=task, suffix=suffix, model_id=model_id)


if __name__ == "__main__":
    main()

13 changes: 12 additions & 1 deletion tests/test_agents_instantiation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,20 @@ def test_instantiate_docling_editing_agent():
def test_instantiate_docling_extracting_agent():
    """Smoke test: DoclingExtractingAgent can be constructed without raising."""
    from mellea.backends import model_ids

    # Import from `extractor` only; the old `extraction` module path is stale
    # and importing it would fail before the agent under test is exercised.
    from docling_agent.agent.extractor import DoclingExtractingAgent

    try:
        _ = DoclingExtractingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])
    except Exception as e:
        # Any constructor error is reported as a test failure with its message.
        pytest.fail(f"DoclingExtractingAgent instantiation raised: {e}")


def test_instantiate_docling_enriching_agent():
    """Smoke test: DoclingEnrichingAgent can be constructed without raising."""
    from mellea.backends import model_ids

    from docling_agent.agent.enricher import DoclingEnrichingAgent

    try:
        DoclingEnrichingAgent(
            model_id=model_ids.OPENAI_GPT_OSS_20B,
            tools=[],
        )
    except Exception as exc:
        # Surface any constructor error as an explicit test failure.
        pytest.fail(f"DoclingEnrichingAgent instantiation raised: {exc}")
Loading