From 4dd9c851d4917d97c569a4fc94117907d09c2cf4 Mon Sep 17 00:00:00 2001
From: Vdaleke <vdalekesmirnov@gmail.com>
Date: Sun, 16 Nov 2025 23:33:01 +0300
Subject: [PATCH] fix: rich table triplet serialization

Signed-off-by: Vdaleke <vdalekesmirnov@gmail.com>
---
 .../chunker/hierarchical_chunker.py           |  6 ++-
 docling_core/types/doc/document.py            |  8 ++--
 test/data/chunker/0c_out_chunks.json          | 39 +++++++++++++++++++
 test/test_hierarchical_chunker.py             | 29 ++++++++++++++
 4 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 test/data/chunker/0c_out_chunks.json
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 25b0be8e..547199e6 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -69,7 +69,11 @@ def serialize(
             parts.append(cap_res)
 
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
-            table_df = item.export_to_dataframe(doc)
+            table_df = item.export_to_dataframe(
+                doc,
+                doc_serializer=doc_serializer,
+                **kwargs,
+            )
             if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
 
                 # copy header as first row and shift all rows by one
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 654fab99..8b562f06 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1840,7 +1840,9 @@ def _migrate_annotations_to_meta(self) -> Self:
         return self
 
     def export_to_dataframe(
-        self, doc: Optional["DoclingDocument"] = None
+        self,
+        doc: Optional["DoclingDocument"] = None,
+        **kwargs: Any,
     ) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
         if doc is None:
@@ -1876,14 +1878,14 @@ def export_to_dataframe(
             columns = ["" for _ in range(self.data.num_cols)]
             for i in range(num_headers):
                 for j, cell in enumerate(self.data.grid[i]):
-                    col_name = cell._get_text(doc=doc)
+                    col_name = cell._get_text(doc=doc, **kwargs)
                     if columns[j] != "":
                         col_name = f".{col_name}"
                     columns[j] += col_name
 
         # Create table data
         table_data = [
-            [cell._get_text(doc=doc) for cell in row]
+            [cell._get_text(doc=doc, **kwargs) for cell in row]
             for row in self.data.grid[num_headers:]
         ]
 
diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json
new file mode 100644
index 00000000..7569e2b8
--- /dev/null
+++ b/test/data/chunker/0c_out_chunks.json
@@ -0,0 +1,39 @@
+{
+    "root": [
+        {
+            "text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = <em><p>text in italic</p></em>. <ul>\n<li>list item 1</li>\n<li>list item 2</li>\n</ul>, 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2. <p>Some text in a generic group.</p>\n<p>More text in the group.</p>, 1 = cell 4,1",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/tables/0",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [
+                            {
+                                "$ref": "#/texts/1"
+                            },
+                            {
+                                "$ref": "#/groups/0"
+                            },
+                            {
+                                "$ref": "#/tables/1"
+                            },
+                            {
+                                "$ref": "#/groups/1"
+                            }
+                        ],
+                        "content_layer": "body",
+                        "label": "table",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Rich tables"
+                ]
+            }
+        }
+    ]
+}
diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
index 687d72ba..2e01d308 100644
--- a/test/test_hierarchical_chunker.py
+++ b/test/test_hierarchical_chunker.py
@@ -10,12 +10,15 @@
     ChunkingDocSerializer,
     ChunkingSerializerProvider,
     DocChunk,
+    TripletTableSerializer,
 )
+from docling_core.transforms.serializer.html import HTMLDocSerializer
 from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
 from docling_core.types.doc import DoclingDocument as DLDocument
 from docling_core.types.doc.document import DoclingDocument
 
 from .test_data_gen_flag import GEN_TEST_DATA
+from .test_docling_doc import _construct_rich_table_doc
 
 
 def _process(act_data, exp_path_str):
@@ -71,3 +74,29 @@ def get_serializer(self, doc: DoclingDocument):
         act_data=act_data,
         exp_path_str="test/data/chunker/0b_out_chunks.json",
     )
+
+
+def test_chunk_rich_table_custom_serializer():
+    """Chunk a rich-table doc using HTML serialization with triplet tables."""
+    rich_doc = _construct_rich_table_doc()
+
+    class MySerializerProvider(ChunkingSerializerProvider):
+        def get_serializer(self, doc: DoclingDocument):
+            table_ser = TripletTableSerializer()
+            return HTMLDocSerializer(doc=doc, table_serializer=table_ser)
+
+    provider = MySerializerProvider()
+    chunker = HierarchicalChunker(
+        merge_list_items=True,
+        serializer_provider=provider,
+    )
+
+    # Validate each produced chunk as a DocChunk and export it to a JSON dict.
+    exported = [
+        DocChunk.model_validate(chunk).export_json_dict()
+        for chunk in chunker.chunk(dl_doc=rich_doc)
+    ]
+    _process(
+        act_data={"root": exported},
+        exp_path_str="test/data/chunker/0c_out_chunks.json",
+    )