From 5f60969a901a0dbf4936253c6a8b3b661824d94d Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Mon, 30 Sep 2024 15:57:52 +0200
Subject: [PATCH 1/3] fix: align chunk ref format with the one used in Document

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling_core/transforms/chunker/base.py       | 25 ++++++++++++++++++-
 .../chunker/hierarchical_chunker.py           |  4 ---
 ..._out_chunks_with_meta_heading_in_text.json | 10 ++++----
 .../0_out_chunks_with_meta_incl_heading.json  |  8 +++---
 test/data/chunker/0_out_chunks_wout_meta.json | 10 ++++----
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py
index 17650e7e..a9e734ae 100644
--- a/docling_core/transforms/chunker/base.py
+++ b/docling_core/transforms/chunker/base.py
@@ -7,17 +7,33 @@
 from abc import ABC, abstractmethod
 from typing import Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from docling_core.types import BoundingBox, Document
 
 
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
     path: str
     text: str
 
+    @model_validator(mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, data):
+        path = data.get("path")
+        if path.startswith("$."):
+            parts = path.split("[")
+            data["path"] = _create_path(
+                pos=parts[1][:-1],
+                path_prefix=parts[0][2:],
+            )
+        return data
+
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
@@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 5b1831da..efa89cbe 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -83,10 +83,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json
index c33fc210..054bbc59 100644
--- a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json
+++ b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json
@@ -1,7 +1,7 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
             "page": 1,
             "bbox": [
@@ -12,7 +12,7 @@
             ]
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
             "page": 3,
             "bbox": [
@@ -23,7 +23,7 @@
             ]
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
             "page": 4,
             "bbox": [
@@ -34,7 +34,7 @@
             ]
         },
         {
-            "path": "$.main-text[7]",
+            "path": "#/main-text/7",
             "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
             "page": 4,
             "bbox": [
@@ -45,7 +45,7 @@
             ]
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
             "page": 4,
             "bbox": [
diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json
index 6c441096..de7a0bfa 100644
--- a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json
+++ b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json
@@ -1,7 +1,7 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
             "page": 1,
             "bbox": [
@@ -12,7 +12,7 @@
             ]
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "This one should also include the subtitle above since it is long enough.",
             "page": 3,
             "bbox": [
@@ -24,7 +24,7 @@
             "heading": "Some subtitle"
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
             "page": 4,
             "bbox": [
@@ -36,7 +36,7 @@
             "heading": "Acquisitions"
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
             "page": 4,
             "bbox": [
diff --git a/test/data/chunker/0_out_chunks_wout_meta.json b/test/data/chunker/0_out_chunks_wout_meta.json
index 5bd0da82..994b19bb 100644
--- a/test/data/chunker/0_out_chunks_wout_meta.json
+++ b/test/data/chunker/0_out_chunks_wout_meta.json
@@ -1,23 +1,23 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk."
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
         },
         {
-            "path": "$.main-text[7]",
+            "path": "#/main-text/7",
             "text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
         }
     ]

From 7c8aa2279cd1ad15fa1fe0e974834d6e2fa00920 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 1 Oct 2024 11:31:55 +0200
Subject: [PATCH 2/3] move heading to base `Chunk`, extend & update tests

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling_core/transforms/chunker/base.py       |  2 +-
 .../chunker/hierarchical_chunker.py           | 24 ++++++++------
 ...t_chunks_heading_in_meta_with_extras.json} | 12 +++----
 ...ut_chunks_heading_in_meta_wout_extras.json | 23 +++++++++++++
 ...t_chunks_heading_in_text_with_extras.json} |  0
 ...t_chunks_heading_in_text_wout_extras.json} |  0
 test/test_hierarchical_chunker.py             | 32 +++++++++++++------
 7 files changed, 66 insertions(+), 27 deletions(-)
 rename test/data/chunker/{0_out_chunks_with_meta_incl_heading.json => 0_out_chunks_heading_in_meta_with_extras.json} (89%)
 create mode 100644 test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
 rename test/data/chunker/{0_out_chunks_with_meta_heading_in_text.json => 0_out_chunks_heading_in_text_with_extras.json} (100%)
 rename test/data/chunker/{0_out_chunks_wout_meta.json => 0_out_chunks_heading_in_text_wout_extras.json} (100%)

diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py
index a9e734ae..c77a59ad 100644
--- a/docling_core/transforms/chunker/base.py
+++ b/docling_core/transforms/chunker/base.py
@@ -21,6 +21,7 @@ class Chunk(BaseModel):
 
     path: str
     text: str
+    heading: Optional[str] = None
 
     @model_validator(mode="before")
     @classmethod
@@ -40,7 +41,6 @@ class ChunkWithMetadata(Chunk):
 
     page: Optional[int] = None
     bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index efa89cbe..39a54ce4 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -12,7 +12,7 @@
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -300,14 +308,15 @@ def _build_chunk(
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -323,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
-
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json
similarity index 89%
rename from test/data/chunker/0_out_chunks_with_meta_incl_heading.json
rename to test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json
index de7a0bfa..af629629 100644
--- a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json
+++ b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json
@@ -14,38 +14,38 @@
         {
             "path": "#/main-text/4",
             "text": "This one should also include the subtitle above since it is long enough.",
+            "heading": "Some subtitle",
             "page": 3,
             "bbox": [
                 5.0,
                 6.0,
                 7.0,
                 8.0
-            ],
-            "heading": "Some subtitle"
+            ]
         },
         {
             "path": "#/tables/0",
             "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+            "heading": "Acquisitions",
             "page": 4,
             "bbox": [
                 8.0,
                 9.0,
                 10.0,
                 11.0
-            ],
-            "heading": "Acquisitions"
+            ]
         },
         {
             "path": "#/main-text/8",
             "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+            "heading": "Acquisitions",
             "page": 4,
             "bbox": [
                 8.0,
                 9.0,
                 10.0,
                 11.0
-            ],
-            "heading": "Acquisitions"
+            ]
         }
     ]
 }
diff --git a/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
new file mode 100644
index 00000000..d45de944
--- /dev/null
+++ b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
@@ -0,0 +1,23 @@
+{
+    "root": [
+        {
+            "path": "#/main-text/0",
+            "text": "This paragraph is marginally long enough for getting accepted as a chunk."
+        },
+        {
+            "path": "#/main-text/4",
+            "text": "This one should also include the subtitle above since it is long enough.",
+            "heading": "Some subtitle"
+        },
+        {
+            "path": "#/tables/0",
+            "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+            "heading": "Acquisitions"
+        },
+        {
+            "path": "#/main-text/8",
+            "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+            "heading": "Acquisitions"
+        }
+    ]
+}
diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_heading_in_text_with_extras.json
similarity index 100%
rename from test/data/chunker/0_out_chunks_with_meta_heading_in_text.json
rename to test/data/chunker/0_out_chunks_heading_in_text_with_extras.json
diff --git a/test/data/chunker/0_out_chunks_wout_meta.json b/test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json
similarity index 100%
rename from test/data/chunker/0_out_chunks_wout_meta.json
rename to test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json
diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
index 00ef9d5f..fe9cba10 100644
--- a/test/test_hierarchical_chunker.py
+++ b/test/test_hierarchical_chunker.py
@@ -9,37 +9,49 @@
 from docling_core.types import Document as DLDocument
 
 
-def test_chunk_without_metadata():
+def test_chunk_heading_in_text_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
+
+
+def test_chunk_heading_in_text_with_extras():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_heading_in_text():
+def test_chunk_heading_in_meta_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_incl_heading():
+def test_chunk_heading_in_meta_with_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data

From c1da27c0df8dc8e23a0c22799cfca785d0065489 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 1 Oct 2024 12:36:32 +0200
Subject: [PATCH 3/3] improve heading flag description

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling_core/transforms/chunker/hierarchical_chunker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 39a54ce4..2ad17d03 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -27,7 +27,7 @@ class HierarchicalChunker(BaseChunker):
 
     heading_as_metadata: bool = Field(
         default=False,
-        description="Whether heading should be in metadata",
+        description="Whether heading should be in metadata (instead of text)",
     )
     include_metadata: bool = Field(
         default=True,