27 changes: 25 additions & 2 deletions docling_core/transforms/chunker/base.py
@@ -7,24 +7,40 @@
from abc import ABC, abstractmethod
from typing import Iterator, Optional

from pydantic import BaseModel
from pydantic import BaseModel, model_validator

from docling_core.types import BoundingBox, Document


def _create_path(pos: int, path_prefix: str = "main-text") -> str:
return f"#/{path_prefix}/{pos}"


class Chunk(BaseModel):
"""Data model for Chunk."""

path: str
text: str
heading: Optional[str] = None

@model_validator(mode="before")
@classmethod
def _json_pointer_from_json_path(cls, data):
path = data.get("path")
if path.startswith("$."):
parts = path.split("[")
data["path"] = _create_path(
pos=parts[1][:-1],
path_prefix=parts[0][2:],
)
return data


class ChunkWithMetadata(Chunk):
"""Data model for Chunk including metadata."""

page: Optional[int] = None
bbox: Optional[BoundingBox] = None
heading: Optional[str] = None


class BaseChunker(BaseModel, ABC):
@@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
Iterator[Chunk]: iterator over extracted chunks
"""
raise NotImplementedError()

@classmethod
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
return _create_path(
pos=pos,
path_prefix=path_prefix,
)
28 changes: 14 additions & 14 deletions docling_core/transforms/chunker/hierarchical_chunker.py
@@ -12,7 +12,7 @@
from typing import Any, Iterator, Optional, Union

import pandas as pd
from pydantic import BaseModel, PositiveInt
from pydantic import BaseModel, Field, PositiveInt

from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
from docling_core.types import BaseText
@@ -25,9 +25,17 @@
class HierarchicalChunker(BaseChunker):
"""Chunker implementation leveraging the document layout."""

include_metadata: bool = True
heading_as_metadata: bool = False
min_chunk_len: PositiveInt = 64
heading_as_metadata: bool = Field(
default=False,
description="Whether heading should be in metadata (instead of text)",
)
include_metadata: bool = Field(
default=True,
description="Whether to include extras in the metadata",
)
min_chunk_len: PositiveInt = Field(
default=64, description="Minimum chunk text length to consider (in chars)"
)

class _NodeType(str, Enum):
PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:

return output_text

@classmethod
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
return f"$.{path_prefix}[{pos}]"

class _MainTextItemNode(BaseModel):
parent: Optional[int] = None
children: list[int] = []
@@ -304,14 +308,15 @@ def _build_chunk(
return ChunkWithMetadata(
text=concat,
path=path,
heading=heading,
page=item.prov[0].page if item.prov else None,
bbox=item.prov[0].bbox if item.prov else None,
heading=heading,
)
else:
return Chunk(
text=concat,
path=path,
heading=heading,
)
else:
return None
@@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
Yields:
Iterator[Chunk]: iterator over extracted chunks
"""
if (not self.include_metadata) and self.heading_as_metadata:
raise RuntimeError(
"To enable `heading_as_metadata`, also `include_metadata` must be True."
)

if dl_doc.main_text:
# extract doc structure incl. metadata for
# each item (e.g. parent, children)
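
A usage sketch of the reworked flags, mirroring the tests further below; importing HierarchicalChunker from the docling_core.transforms.chunker package root is an assumption (it may instead need to come from docling_core.transforms.chunker.hierarchical_chunker):

import json

from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument

with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

# With the RuntimeError guard removed, heading_as_metadata and
# include_metadata can now be combined freely, e.g. heading kept in the
# metadata but no page/bbox extras:
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
chunks = [c.model_dump(exclude_none=True) for c in chunker.chunk(dl_doc=dl_doc)]
print(json.dumps(chunks, indent=2))

The expected output for this particular combination is the new 0_out_chunks_heading_in_meta_wout_extras.json fixture added below.
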
@@ -1,7 +1,7 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
"page": 1,
"bbox": [
@@ -12,40 +12,40 @@
]
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "This one should also include the subtitle above since it is long enough.",
"heading": "Some subtitle",
"page": 3,
"bbox": [
5.0,
6.0,
7.0,
8.0
],
"heading": "Some subtitle"
]
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"heading": "Acquisitions",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
]
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"heading": "Acquisitions",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
]
}
]
}
23 changes: 23 additions & 0 deletions test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
@@ -0,0 +1,23 @@
{
"root": [
{
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk."
},
{
"path": "#/main-text/4",
"text": "This one should also include the subtitle above since it is long enough.",
"heading": "Some subtitle"
},
{
"path": "#/tables/0",
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"heading": "Acquisitions"
},
{
"path": "#/main-text/8",
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"heading": "Acquisitions"
}
]
}
@@ -1,7 +1,7 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
"page": 1,
"bbox": [
@@ -12,7 +12,7 @@
]
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
"page": 3,
"bbox": [
@@ -23,7 +23,7 @@
]
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"page": 4,
"bbox": [
@@ -34,7 +34,7 @@
]
},
{
"path": "$.main-text[7]",
"path": "#/main-text/7",
"text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
"page": 4,
"bbox": [
@@ -45,7 +45,7 @@
]
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"page": 4,
"bbox": [
@@ -1,23 +1,23 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk."
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
},
{
"path": "$.main-text[7]",
"path": "#/main-text/7",
"text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
}
]
32 changes: 22 additions & 10 deletions test/test_hierarchical_chunker.py
@@ -9,37 +9,49 @@
from docling_core.types import Document as DLDocument


def test_chunk_without_metadata():
def test_chunk_heading_in_text_wout_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=False)
chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump() for n in chunks])
with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_heading_in_text_with_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_with_metadata_heading_in_text():
def test_chunk_heading_in_meta_wout_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_with_metadata_incl_heading():
def test_chunk_heading_in_meta_with_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data