From 4dd9c851d4917d97c569a4fc94117907d09c2cf4 Mon Sep 17 00:00:00 2001 From: Vdaleke Date: Sun, 16 Nov 2025 23:33:01 +0300 Subject: [PATCH] fix: rich table triplet serialization Signed-off-by: Vdaleke --- .../chunker/hierarchical_chunker.py | 6 ++- docling_core/types/doc/document.py | 8 ++-- test/data/chunker/0c_out_chunks.json | 39 +++++++++++++++++++ test/test_hierarchical_chunker.py | 29 ++++++++++++++ 4 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 test/data/chunker/0c_out_chunks.json diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 25b0be8e..547199e6 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -69,7 +69,11 @@ def serialize( parts.append(cap_res) if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - table_df = item.export_to_dataframe(doc) + table_df = item.export_to_dataframe( + doc, + doc_serializer=doc_serializer, + **kwargs, + ) if table_df.shape[0] >= 1 and table_df.shape[1] >= 2: # copy header as first row and shift all rows by one diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 654fab99..8b562f06 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1840,7 +1840,9 @@ def _migrate_annotations_to_meta(self) -> Self: return self def export_to_dataframe( - self, doc: Optional["DoclingDocument"] = None + self, + doc: Optional["DoclingDocument"] = None, + **kwargs: Any, ) -> pd.DataFrame: """Export the table as a Pandas DataFrame.""" if doc is None: @@ -1876,14 +1878,14 @@ def export_to_dataframe( columns = ["" for _ in range(self.data.num_cols)] for i in range(num_headers): for j, cell in enumerate(self.data.grid[i]): - col_name = cell._get_text(doc=doc) + col_name = cell._get_text(doc=doc, **kwargs) if columns[j] != "": col_name = f".{col_name}" columns[j] += col_name # Create table data table_data = [ - [cell._get_text(doc=doc) for cell in row] + [cell._get_text(doc=doc, **kwargs) for cell in row] for row in self.data.grid[num_headers:] ] diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json new file mode 100644 index 00000000..7569e2b8 --- /dev/null +++ b/test/data/chunker/0c_out_chunks.json @@ -0,0 +1,39 @@ +{ + "root": [ + { + "text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 =

text in italic

. , 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2.

Some text in a generic group.

\n

More text in the group.

, 1 = cell 4,1", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "label": "table", + "prov": [] + } + ], + "headings": [ + "Rich tables" + ] + } + } + ] +} diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 687d72ba..2e01d308 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -10,12 +10,15 @@ ChunkingDocSerializer, ChunkingSerializerProvider, DocChunk, + TripletTableSerializer, ) +from docling_core.transforms.serializer.html import HTMLDocSerializer from docling_core.transforms.serializer.markdown import MarkdownTableSerializer from docling_core.types.doc import DoclingDocument as DLDocument from docling_core.types.doc.document import DoclingDocument from .test_data_gen_flag import GEN_TEST_DATA +from .test_docling_doc import _construct_rich_table_doc def _process(act_data, exp_path_str): @@ -71,3 +74,29 @@ def get_serializer(self, doc: DoclingDocument): act_data=act_data, exp_path_str="test/data/chunker/0b_out_chunks.json", ) + + +def test_chunk_rich_table_custom_serializer(): + doc = _construct_rich_table_doc() + + class MySerializerProvider(ChunkingSerializerProvider): + def get_serializer(self, doc: DoclingDocument): + return HTMLDocSerializer( + doc=doc, + table_serializer=TripletTableSerializer(), + ) + + chunker = HierarchicalChunker( + merge_list_items=True, + serializer_provider=MySerializerProvider(), + ) + + chunks = chunker.chunk(dl_doc=doc) + act_data = dict( + root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] + ) + + _process( + act_data=act_data, + exp_path_str="test/data/chunker/0c_out_chunks.json", + )