From 5f60969a901a0dbf4936253c6a8b3b661824d94d Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:57:52 +0200 Subject: [PATCH 1/3] fix: align chunk ref format with one used in Document Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/transforms/chunker/base.py | 25 ++++++++++++++++++- .../chunker/hierarchical_chunker.py | 4 --- ..._out_chunks_with_meta_heading_in_text.json | 10 ++++---- .../0_out_chunks_with_meta_incl_heading.json | 8 +++--- test/data/chunker/0_out_chunks_wout_meta.json | 10 ++++---- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py index 17650e7e..a9e734ae 100644 --- a/docling_core/transforms/chunker/base.py +++ b/docling_core/transforms/chunker/base.py @@ -7,17 +7,33 @@ from abc import ABC, abstractmethod from typing import Iterator, Optional -from pydantic import BaseModel +from pydantic import BaseModel, model_validator from docling_core.types import BoundingBox, Document +def _create_path(pos: int, path_prefix: str = "main-text") -> str: + return f"#/{path_prefix}/{pos}" + + class Chunk(BaseModel): """Data model for Chunk.""" path: str text: str + @model_validator(mode="before") + @classmethod + def _json_pointer_from_json_path(cls, data): + path = data.get("path") + if path.startswith("$."): + parts = path.split("[") + data["path"] = _create_path( + pos=parts[1][:-1], + path_prefix=parts[0][2:], + ) + return data + class ChunkWithMetadata(Chunk): """Data model for Chunk including metadata.""" @@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]: Iterator[Chunk]: iterator over extracted chunks """ raise NotImplementedError() + + @classmethod + def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str: + return _create_path( + pos=pos, + path_prefix=path_prefix, + ) diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 5b1831da..efa89cbe 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -83,10 +83,6 @@ def _triplet_serialize(cls, table) -> Optional[str]: return output_text - @classmethod - def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str: - return f"$.{path_prefix}[{pos}]" - class _MainTextItemNode(BaseModel): parent: Optional[int] = None children: list[int] = [] diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json index c33fc210..054bbc59 100644 --- a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json +++ b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json @@ -1,7 +1,7 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk.", "page": 1, "bbox": [ @@ -12,7 +12,7 @@ ] }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.", "page": 3, "bbox": [ @@ -23,7 +23,7 @@ ] }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany", "page": 4, "bbox": [ @@ -34,7 +34,7 @@ ] }, { - "path": "$.main-text[7]", + "path": "#/main-text/7", "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.", "page": 4, "bbox": [ @@ -45,7 +45,7 @@ ] }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", "page": 4, "bbox": [ diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json index 6c441096..de7a0bfa 100644 --- a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json +++ b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json @@ -1,7 +1,7 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk.", "page": 1, "bbox": [ @@ -12,7 +12,7 @@ ] }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "This one should also include the subtitle above since it is long enough.", "page": 3, "bbox": [ @@ -24,7 +24,7 @@ "heading": "Some subtitle" }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany", "page": 4, "bbox": [ @@ -36,7 +36,7 @@ "heading": "Acquisitions" }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", "page": 4, "bbox": [ diff --git a/test/data/chunker/0_out_chunks_wout_meta.json b/test/data/chunker/0_out_chunks_wout_meta.json index 5bd0da82..994b19bb 100644 --- a/test/data/chunker/0_out_chunks_wout_meta.json +++ b/test/data/chunker/0_out_chunks_wout_meta.json @@ -1,23 +1,23 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk." }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough." }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany" }, { - "path": "$.main-text[7]", + "path": "#/main-text/7", "text": "Acquisitions\nThis paragraph should actually include the latest subtitle." }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here." } ] From 7c8aa2279cd1ad15fa1fe0e974834d6e2fa00920 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:31:55 +0200 Subject: [PATCH 2/3] move heading to base `Chunk`, extend & update tests Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/transforms/chunker/base.py | 2 +- .../chunker/hierarchical_chunker.py | 24 ++++++++------ ...t_chunks_heading_in_meta_with_extras.json} | 12 +++---- ...ut_chunks_heading_in_meta_wout_extras.json | 23 +++++++++++++ ...t_chunks_heading_in_text_with_extras.json} | 0 ...t_chunks_heading_in_text_wout_extras.json} | 0 test/test_hierarchical_chunker.py | 32 +++++++++++++------ 7 files changed, 66 insertions(+), 27 deletions(-) rename test/data/chunker/{0_out_chunks_with_meta_incl_heading.json => 0_out_chunks_heading_in_meta_with_extras.json} (89%) create mode 100644 test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json rename test/data/chunker/{0_out_chunks_with_meta_heading_in_text.json => 0_out_chunks_heading_in_text_with_extras.json} (100%) rename test/data/chunker/{0_out_chunks_wout_meta.json => 0_out_chunks_heading_in_text_wout_extras.json} (100%) diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py index a9e734ae..c77a59ad 100644 --- a/docling_core/transforms/chunker/base.py +++ b/docling_core/transforms/chunker/base.py @@ -21,6 +21,7 @@ class Chunk(BaseModel): path: str text: str + heading: Optional[str] = None @model_validator(mode="before") @classmethod @@ -40,7 +41,6 @@ class ChunkWithMetadata(Chunk): page: Optional[int] = None bbox: Optional[BoundingBox] = None - heading: Optional[str] = None class BaseChunker(BaseModel, ABC): diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index efa89cbe..39a54ce4 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -12,7 +12,7 @@ from typing import Any, Iterator, Optional, Union import pandas as pd -from pydantic import BaseModel, PositiveInt +from pydantic import BaseModel, Field, PositiveInt from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata from docling_core.types import BaseText @@ -25,9 +25,17 @@ class HierarchicalChunker(BaseChunker): """Chunker implementation leveraging the document layout.""" - include_metadata: bool = True - heading_as_metadata: bool = False - min_chunk_len: PositiveInt = 64 + heading_as_metadata: bool = Field( + default=False, + description="Whether heading should be in metadata", + ) + include_metadata: bool = Field( + default=True, + description="Whether to include extras in the metadata", + ) + min_chunk_len: PositiveInt = Field( + default=64, description="Minimum chunk text length to consider (in chars)" + ) class _NodeType(str, Enum): PARAGRAPH = "paragraph" @@ -300,14 +308,15 @@ def _build_chunk( return ChunkWithMetadata( text=concat, path=path, + heading=heading, page=item.prov[0].page if item.prov else None, bbox=item.prov[0].bbox if item.prov else None, - heading=heading, ) else: return Chunk( text=concat, path=path, + heading=heading, ) else: return None @@ -323,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk Yields: Iterator[Chunk]: iterator over extracted chunks """ - if (not self.include_metadata) and self.heading_as_metadata: - raise RuntimeError( - "To enable `heading_as_metadata`, also `include_metadata` must be True." - ) - if dl_doc.main_text: # extract doc structure incl. metadata for # each item (e.g. parent, children) diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json similarity index 89% rename from test/data/chunker/0_out_chunks_with_meta_incl_heading.json rename to test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json index de7a0bfa..af629629 100644 --- a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json +++ b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json @@ -14,38 +14,38 @@ { "path": "#/main-text/4", "text": "This one should also include the subtitle above since it is long enough.", + "heading": "Some subtitle", "page": 3, "bbox": [ 5.0, 6.0, 7.0, 8.0 - ], - "heading": "Some subtitle" + ] }, { "path": "#/tables/0", "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany", + "heading": "Acquisitions", "page": 4, "bbox": [ 8.0, 9.0, 10.0, 11.0 - ], - "heading": "Acquisitions" + ] }, { "path": "#/main-text/8", "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", + "heading": "Acquisitions", "page": 4, "bbox": [ 8.0, 9.0, 10.0, 11.0 - ], - "heading": "Acquisitions" + ] } ] } diff --git a/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json new file mode 100644 index 00000000..d45de944 --- /dev/null +++ b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json @@ -0,0 +1,23 @@ +{ + "root": [ + { + "path": "#/main-text/0", + "text": "This paragraph is marginally long enough for getting accepted as a chunk." + }, + { + "path": "#/main-text/4", + "text": "This one should also include the subtitle above since it is long enough.", + "heading": "Some subtitle" + }, + { + "path": "#/tables/0", + "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany", + "heading": "Acquisitions" + }, + { + "path": "#/main-text/8", + "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", + "heading": "Acquisitions" + } + ] +} diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_heading_in_text_with_extras.json similarity index 100% rename from test/data/chunker/0_out_chunks_with_meta_heading_in_text.json rename to test/data/chunker/0_out_chunks_heading_in_text_with_extras.json diff --git a/test/data/chunker/0_out_chunks_wout_meta.json b/test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json similarity index 100% rename from test/data/chunker/0_out_chunks_wout_meta.json rename to test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 00ef9d5f..fe9cba10 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -9,37 +9,49 @@ from docling_core.types import Document as DLDocument -def test_chunk_without_metadata(): +def test_chunk_heading_in_text_wout_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=False) + chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict(root=[n.model_dump() for n in chunks]) - with open("test/data/chunker/0_out_chunks_wout_meta.json") as f: + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f: + exp_data = json.load(fp=f) + assert exp_data == act_data + + +def test_chunk_heading_in_text_with_extras(): + with open("test/data/chunker/0_inp_dl_doc.json") as f: + data_json = f.read() + dl_doc = DLDocument.model_validate_json(data_json) + chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True) + chunks = chunker.chunk(dl_doc=dl_doc) + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data -def test_chunk_with_metadata_heading_in_text(): +def test_chunk_heading_in_meta_wout_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False) + chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False) chunks = chunker.chunk(dl_doc=dl_doc) act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) - with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f: + with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data -def test_chunk_with_metadata_incl_heading(): +def test_chunk_heading_in_meta_with_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True) + chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True) chunks = chunker.chunk(dl_doc=dl_doc) act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) - with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f: + with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data From c1da27c0df8dc8e23a0c22799cfca785d0065489 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:36:32 +0200 Subject: [PATCH 3/3] improve heading flag description Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/transforms/chunker/hierarchical_chunker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 39a54ce4..2ad17d03 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -27,7 +27,7 @@ class HierarchicalChunker(BaseChunker): heading_as_metadata: bool = Field( default=False, - description="Whether heading should be in metadata", + description="Whether heading should be in metadata (instead of text)", ) include_metadata: bool = Field( default=True,