diff --git a/README.md b/README.md
index 49d6b70..fb397c8 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ Docling-agent simplifies agentic operation on documents, such as writing, editin
 - [Document writing](examples/example_01_write_report.py): Generate well-structured reports from natural prompts and export to JSON/Markdown/HTML.
 - [Targeted editing](examples/example_02_edit_report.py): Load an existing Docling JSON and apply focused edits with natural-language tasks.
 - [Schema-guided extraction](examples/example_03_extract_schema.py): Extract typed fields from PDFs/images using a simple schema and produce HTML reports. See examples on curriculum_vitae, papers, invoices, etc.
+- [Document enrichment](examples/example_04_enrich_document.py): Enrich existing documents with summaries, search keywords, key entities, and item classifications (language/function).
 - Model-agnostic: Plug in different backends via [Mellea](https://github.com/generative-computing/mellea) `model_ids` (e.g., OpenAI GPT OSS, IBM Granite).
 - Simple API surface: Use `agent.run(...)` with `DoclingDocument` in/out; save via `save_as_*` helpers.
 - Optional tools: Integrate external tools (e.g., MCP) when available.
@@ -86,6 +87,24 @@ report = agent.run(task=str(schema), sources=sources)
 report.save_as_html("./scratch/invoices_extraction_report.html")
 ```
 
+### Enrich an existing document (see [example](examples/example_04_enrich_document.py)):
+
+Run enrichment passes such as summaries, keywords, entities, and classifications on an existing Docling JSON.
+
+```python
+from pathlib import Path
+from mellea.backends import model_ids
+from docling_core.types.doc.document import DoclingDocument
+from docling_agent.agents import DoclingEnrichingAgent
+
+ipath = Path("./examples/example_02_edit_resources/20250815_125216.json")
+doc = DoclingDocument.load_from_json(ipath)
+
+agent = DoclingEnrichingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])
+enriched = agent.run(task="Summarize each paragraph, table, and section header.", document=doc)
+enriched.save_as_html("./scratch/enriched_summaries.html")
+```
+
 ## Documentation
 
 **Coming soon**
diff --git a/docling_agent/agent/base.py b/docling_agent/agent/base.py
index 5362f98..96073e3 100644
--- a/docling_agent/agent/base.py
+++ b/docling_agent/agent/base.py
@@ -23,6 +23,7 @@ class DoclingAgentType(Enum):
     DOCLING_DOCUMENT_WRITER = "writer"
     DOCLING_DOCUMENT_EDITOR = "editor"
     DOCLING_DOCUMENT_EXTRACTOR = "extractor"
+    DOCLING_DOCUMENT_ENRICHER = "enricher"
 
     def __str__(self) -> str:
         """Return the string value of the enum."""
diff --git a/docling_agent/agent/enricher.py b/docling_agent/agent/enricher.py
new file mode 100644
index 0000000..d9220c0
--- /dev/null
+++ b/docling_agent/agent/enricher.py
@@ -0,0 +1,136 @@
+from pathlib import Path
+from typing import Any, ClassVar
+
+from mellea.backends.model_ids import ModelIdentifier
+from mellea.stdlib.sampling import RejectionSamplingStrategy
+from pydantic import Field
+
+from docling_core.types.doc.document import (
+    DoclingDocument,
+)
+
+from docling_agent.agent.base import BaseDoclingAgent, DoclingAgentType
+from docling_agent.agent.base_functions import find_json_dicts
+from docling_agent.agent_models import setup_local_session, view_linear_context
+from docling_agent.logging import logger
+
+
+class DoclingEnrichingAgent(BaseDoclingAgent):
+    """Agent for enriching a document with metadata such as summaries, keywords,
+    entities, and classifications.
+
+    This scaffold routes the task to one of several enrichment operations using
+    a small reasoning step that returns a JSON instruction containing an
+    `operation` field. Each operation function currently iterates over items and
+    is left for concrete implementation.
+    """
+
+    # Simple system prompt to route enrichment tasks
+    system_prompt_for_enrichment_routing: ClassVar[str] = (
+        """
+You are a precise document enrichment router. Given a natural language task description, select exactly one operation to run and return only one JSON object in a ```json ...``` block, with the following schema:
+
+{
+  "operation": "summarize_items" | "find_search_keywords" | "detect_key_entities" | "classify_items",
+  "args": { }
+}
+
+Return no extra commentary. If multiple operations seem plausible, choose the single best fit.
+    """
+    )
+
+    # Store last chosen operation for introspection/debugging (optional)
+    last_operation: dict[str, Any] = Field(default_factory=dict)
+
+    def __init__(self, *, model_id: ModelIdentifier, tools: list):
+        super().__init__(
+            agent_type=DoclingAgentType.DOCLING_DOCUMENT_ENRICHER,
+            model_id=model_id,
+            tools=tools,
+        )
+
+    def run(
+        self,
+        task: str,
+        document: DoclingDocument | None = None,
+        sources: list[DoclingDocument | Path] = [],
+        **kwargs,
+    ) -> DoclingDocument:
+        if document is None:
+            raise ValueError("Document must not be None")
+
+        op = self._choose_operation(task=task)
+        self.last_operation = op
+
+        operation = op.get("operation")
+        args = op.get("args", {})
+
+        logger.info(f"Chosen enrichment operation: {operation}")
+
+        if operation == "summarize_items":
+            self._summarize_items(document=document, **args)
+        elif operation == "find_search_keywords":
+            self._find_search_keywords(document=document, **args)
+        elif operation == "detect_key_entities":
+            self._detect_key_entities(document=document, **args)
+        elif operation == "classify_items":
+            self._classify_items(document=document, **args)
+        else:
+            raise ValueError(
+                f"Unknown enrichment operation: {operation}. Op payload: {op}"
+            )
+
+        return document
+
+    def _choose_operation(self, *, task: str, loop_budget: int = 5) -> dict[str, Any]:
+        logger.info(f"task: {task}")
+
+        m = setup_local_session(
+            model_id=self.get_reasoning_model_id(),
+            system_prompt=self.system_prompt_for_enrichment_routing,
+        )
+
+        answer = m.instruct(
+            task,
+            strategy=RejectionSamplingStrategy(loop_budget=loop_budget),
+        )
+
+        view_linear_context(m)
+
+        ops = find_json_dicts(text=answer.value)
+        if not ops:
+            raise ValueError("No routing operation detected in model response")
+        if "operation" not in ops[0]:
+            raise ValueError(f"`operation` not found in routing result: {ops[0]}")
+        return ops[0]
+
+    # --- Enrichment operations (scaffolds) ---
+    def _summarize_items(self, *, document: DoclingDocument, **kwargs) -> None:
+        logger.info("_summarize_items: iterating over document items")
+        for item, level in document.iterate_items(with_groups=True):
+            _ = (item, level)  # placeholder to avoid unused warnings
+            # TODO: implement summarization per item
+            # e.g., update item.meta.summary
+            pass
+
+    def _find_search_keywords(self, *, document: DoclingDocument, **kwargs) -> None:
+        logger.info("_find_search_keywords: iterating over document items")
+        for item, level in document.iterate_items(with_groups=True):
+            _ = (item, level)
+            # TODO: implement keyword extraction per item
+            pass
+
+    def _detect_key_entities(self, *, document: DoclingDocument, **kwargs) -> None:
+        logger.info("_detect_key_entities: iterating over document items")
+        for item, level in document.iterate_items(with_groups=True):
+            _ = (item, level)
+            # TODO: implement entity detection per item
+            pass
+
+    def _classify_items(self, *, document: DoclingDocument, **kwargs) -> None:
+        logger.info("_classify_items: iterating over document items")
+        for item, level in document.iterate_items(with_groups=True):
+            _ = (item, level)
+            # TODO: implement classification per item (language, function, etc.)
+            pass
+
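
To make the scaffolds above more concrete, here is a rough sketch (not part of the patch) of how `_summarize_items` could eventually be filled in. It reuses `setup_local_session` and `iterate_items(with_groups=True)` exactly as the scaffold already does; the per-item prompt wording, the `max_chars` truncation, the `getattr(item, "text", None)` filter, and returning a plain `self_ref -> summary` mapping instead of writing onto the items (the TODO hints at `item.meta.summary`) are all illustrative assumptions.

```python
# Illustrative only -- not part of the diff above. Assumes the helpers shown in
# enricher.py behave as in _choose_operation (session.instruct(...) returning an
# object with a string `.value`) and that text-bearing items expose `.text`.
from mellea.backends.model_ids import ModelIdentifier

from docling_core.types.doc.document import DoclingDocument

from docling_agent.agent_models import setup_local_session


def summarize_items_sketch(
    document: DoclingDocument,
    *,
    model_id: ModelIdentifier,
    max_chars: int = 4000,  # assumed truncation limit to keep per-item prompts small
) -> dict[str, str]:
    """Return a mapping from item `self_ref` to a short model-written summary."""
    m = setup_local_session(
        model_id=model_id,
        system_prompt="You summarize document items in one or two sentences.",
    )

    summaries: dict[str, str] = {}
    for item, _level in document.iterate_items(with_groups=True):
        text = getattr(item, "text", None)  # skip groups and non-text items
        if not text:
            continue
        answer = m.instruct(f"Summarize the following passage:\n\n{text[:max_chars]}")
        summaries[item.self_ref] = answer.value.strip()
    return summaries
```

The other three scaffolds could follow the same per-item pattern with a different prompt and result type (a keyword list, an entity list, or a label); where to persist the results on the `DoclingDocument` is left open here.
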
diff --git a/docling_agent/agent/extraction.py b/docling_agent/agent/extractor.py
similarity index 99%
rename from docling_agent/agent/extraction.py
rename to docling_agent/agent/extractor.py
index 8c19411..095267b 100644
--- a/docling_agent/agent/extraction.py
+++ b/docling_agent/agent/extractor.py
@@ -21,8 +21,6 @@ from docling_agent.agent_models import setup_local_session
 from docling_agent.logging import logger
 
 
-# Use shared logger from docling_agent.agents
-
 
 class DoclingExtractingAgent(BaseDoclingAgent):
     system_prompt_schema_extraction: ClassVar[str] = (
@@ -152,3 +150,4 @@ def validate_json_str(text: str) -> bool:
     )
 
     return json.loads(answer.value)
+
diff --git a/docling_agent/agents.py b/docling_agent/agents.py
index 689362a..5ac9fb1 100644
--- a/docling_agent/agents.py
+++ b/docling_agent/agents.py
@@ -1,12 +1,14 @@
 # Public re-exports for convenience imports in examples
 from docling_agent.agent.editor import DoclingEditingAgent
-from docling_agent.agent.extraction import DoclingExtractingAgent
+from docling_agent.agent.extractor import DoclingExtractingAgent
+from docling_agent.agent.enricher import DoclingEnrichingAgent
 from docling_agent.agent.writer import DoclingWritingAgent
 from docling_agent.logging import logger
 
 __all__ = [
     "DoclingEditingAgent",
     "DoclingExtractingAgent",
+    "DoclingEnrichingAgent",
     "DoclingWritingAgent",
     "logger",
 ]
diff --git a/examples/example_04_enrich_document.py b/examples/example_04_enrich_document.py
new file mode 100644
index 0000000..cf13945
--- /dev/null
+++ b/examples/example_04_enrich_document.py
@@ -0,0 +1,64 @@
+import os
+from pathlib import Path
+
+from mellea.backends import model_ids
+
+from docling_core.types.doc.document import DoclingDocument
+
+from docling_agent.agents import DoclingEnrichingAgent, logger
+
+
+def run_task(
+    ipath: Path,
+    task: str,
+    suffix: str,
+    model_id=model_ids.OPENAI_GPT_OSS_20B,
+    tools: list | None = None,
+):
+    document = DoclingDocument.load_from_json(ipath)
+
+    agent = DoclingEnrichingAgent(model_id=model_id, tools=tools or [])
+
+    document = agent.run(
+        task=task,
+        document=document,
+    )
+
+    os.makedirs("./scratch", exist_ok=True)
+    opath = Path("./scratch") / f"{ipath.stem}{suffix}.html"
+    document.save_as_html(filename=opath)
+    logger.info(f"enrichment report written to `{opath}`")
+
+
+def main():
+    model_id = model_ids.OPENAI_GPT_OSS_20B
+
+    # Example document to enrich (reuse the editing sample document)
+    ipath = Path("./examples/example_02_edit_resources/20250815_125216.json")
+
+    tasks: list[tuple[str, str]] = [
+        (
+            "Summarize each paragraph, table, and section header in this document.",
+            "_summaries",
+        ),
+        (
+            "Find search keywords for each paragraph, table, and section header.",
+            "_keywords",
+        ),
+        (
+            "Detect key entities across paragraphs, tables, and sections.",
+            "_entities",
+        ),
+        (
+            "Classify items by language and function (e.g., title, abstract, claim, reference).",
+            "_classifications",
+        ),
+    ]
+
+    for task, suffix in tasks:
+        run_task(ipath=ipath, task=task, suffix=suffix, model_id=model_id)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tests/test_agents_instantiation.py b/tests/test_agents_instantiation.py
index f9623a0..305468e 100644
--- a/tests/test_agents_instantiation.py
+++ b/tests/test_agents_instantiation.py
@@ -26,9 +26,20 @@ def test_instantiate_docling_extracting_agent():
     from mellea.backends import model_ids
 
-    from docling_agent.agent.extraction import DoclingExtractingAgent
+    from docling_agent.agent.extractor import DoclingExtractingAgent
 
     try:
         _ = DoclingExtractingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])
     except Exception as e:
         pytest.fail(f"DoclingExtractingAgent instantiation raised: {e}")
+
+
+def test_instantiate_docling_enriching_agent():
+    from mellea.backends import model_ids
+
+    from docling_agent.agent.enricher import DoclingEnrichingAgent
+
+    try:
+        _ = DoclingEnrichingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])
+    except Exception as e:
+        pytest.fail(f"DoclingEnrichingAgent instantiation raised: {e}")
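
A possible follow-up test, sketched here rather than added to the patch, would pin down the guard clause in `DoclingEnrichingAgent.run()`. It assumes only what the instantiation tests above already assume, namely that the agent can be constructed in the test environment; the call fails fast before any routing, so no model is needed.

```python
import pytest


def test_enriching_agent_requires_document():
    from mellea.backends import model_ids

    from docling_agent.agent.enricher import DoclingEnrichingAgent

    agent = DoclingEnrichingAgent(model_id=model_ids.OPENAI_GPT_OSS_20B, tools=[])

    # run() checks the document before choosing an operation, so no model call is made.
    with pytest.raises(ValueError):
        agent.run(task="Summarize each paragraph.", document=None)
```
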