diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 25b0be8e..547199e6 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -69,7 +69,11 @@ def serialize(
parts.append(cap_res)
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
- table_df = item.export_to_dataframe(doc)
+ table_df = item.export_to_dataframe(
+ doc,
+ doc_serializer=doc_serializer,
+ **kwargs,
+ )
if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
# copy header as first row and shift all rows by one
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 654fab99..8b562f06 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1840,7 +1840,9 @@ def _migrate_annotations_to_meta(self) -> Self:
return self
def export_to_dataframe(
- self, doc: Optional["DoclingDocument"] = None
+ self,
+ doc: Optional["DoclingDocument"] = None,
+ **kwargs: Any,
) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""
if doc is None:
@@ -1876,14 +1878,14 @@ def export_to_dataframe(
columns = ["" for _ in range(self.data.num_cols)]
for i in range(num_headers):
for j, cell in enumerate(self.data.grid[i]):
- col_name = cell._get_text(doc=doc)
+ col_name = cell._get_text(doc=doc, **kwargs)
if columns[j] != "":
col_name = f".{col_name}"
columns[j] += col_name
# Create table data
table_data = [
- [cell._get_text(doc=doc) for cell in row]
+ [cell._get_text(doc=doc, **kwargs) for cell in row]
for row in self.data.grid[num_headers:]
]
diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json
new file mode 100644
index 00000000..7569e2b8
--- /dev/null
+++ b/test/data/chunker/0c_out_chunks.json
@@ -0,0 +1,39 @@
+{
+ "root": [
+ {
+ "text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = text in italic
Some text in a generic group.
\nMore text in the group.
, 1 = cell 4,1", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "label": "table", + "prov": [] + } + ], + "headings": [ + "Rich tables" + ] + } + } + ] +} diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 687d72ba..2e01d308 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -10,12 +10,15 @@ ChunkingDocSerializer, ChunkingSerializerProvider, DocChunk, + TripletTableSerializer, ) +from docling_core.transforms.serializer.html import HTMLDocSerializer from docling_core.transforms.serializer.markdown import MarkdownTableSerializer from docling_core.types.doc import DoclingDocument as DLDocument from docling_core.types.doc.document import DoclingDocument from .test_data_gen_flag import GEN_TEST_DATA +from .test_docling_doc import _construct_rich_table_doc def _process(act_data, exp_path_str): @@ -71,3 +74,29 @@ def get_serializer(self, doc: DoclingDocument): act_data=act_data, exp_path_str="test/data/chunker/0b_out_chunks.json", ) + + +def test_chunk_rich_table_custom_serializer(): + doc = _construct_rich_table_doc() + + class MySerializerProvider(ChunkingSerializerProvider): + def get_serializer(self, doc: DoclingDocument): + return HTMLDocSerializer( + doc=doc, + table_serializer=TripletTableSerializer(), + ) + + chunker = HierarchicalChunker( + merge_list_items=True, + serializer_provider=MySerializerProvider(), + ) + + chunks = chunker.chunk(dl_doc=doc) + act_data = dict( + root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] + ) + + _process( + act_data=act_data, + exp_path_str="test/data/chunker/0c_out_chunks.json", + )