From c22af5fe95e5d367164e6ec0582dca22d7c5df77 Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Wed, 19 Feb 2025 16:05:02 +0100 Subject: [PATCH 1/8] add export_to_document_tokens method for KeyValueItem Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 59 ++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 6c1fb145..986824b1 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1443,6 +1443,65 @@ class KeyValueItem(FloatingItem): graph: GraphData + def export_to_document_tokens( + self, + doc: "DoclingDocument", + new_line: str = "", + xsize: int = 500, + ysize: int = 500, + add_location: bool = True, + add_content: bool = True, + ): + r"""Export key value item to document tokens format. + + :param doc: "DoclingDocument": + :param new_line: str (Default value = "") + :param xsize: int: (Default value = 500) + :param ysize: int: (Default value = 500) + :param add_location: bool: (Default value = True) + :param add_content: bool: (Default value = True) + + """ + body = f"<{self.label.value}>{new_line}" + + if add_location: + body += self.get_location_tokens( + doc=doc, + new_line=new_line, + xsize=xsize, + ysize=ysize, + ) + + # mapping from source_cell_id to a list of target_cell_ids + source_to_targets: Dict[int, List[int]] = {} + for link in self.graph.links: + source_to_targets.setdefault(link.source_cell_id, []).append( + link.target_cell_id + ) + + for cell in self.graph.cells: + body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" + if cell.prov is not None: + body = self.get_location_tokens( + doc=doc, + new_line=new_line, + xsize=xsize, + ysize=ysize, + ) + if add_content: + body += f"{cell.text.strip()}{new_line}" + + if cell.cell_id in source_to_targets: + targets = source_to_targets[cell.cell_id] + targets_str = ",".join(str(t) for t in targets) + body += f"{new_line}" + + body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" + + body += f"\n" + + return body + class FormItem(FloatingItem): """FormItem.""" From b5314477862e08e2ade353abe6624790c00ca62f Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Wed, 19 Feb 2025 16:37:51 +0100 Subject: [PATCH 2/8] fix export_to_document_tokens kv-item Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 986824b1..775f90a1 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1482,7 +1482,7 @@ def export_to_document_tokens( for cell in self.graph.cells: body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" if cell.prov is not None: - body = self.get_location_tokens( + body += self.get_location_tokens( doc=doc, new_line=new_line, xsize=xsize, @@ -1498,7 +1498,7 @@ def export_to_document_tokens( body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" - body += f"\n" + body += f"{new_line}" return body From 4aa2d309a7e891ce02b3bc531842e0f73c6ebf7c Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Wed, 19 Feb 2025 17:23:49 +0100 Subject: [PATCH 3/8] update key-link representations in document tokens Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 775f90a1..b32aa8f3 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -79,6 +79,7 @@ DocItemLabel.REFERENCE, DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER, + DocItemLabel.KEY_VALUE_REGION, } DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy() @@ -1480,7 +1481,7 @@ def export_to_document_tokens( ) for cell in self.graph.cells: - body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" + body += f"<{cell.label.value}_{cell.cell_id}>{new_line}" if cell.prov is not None: body += self.get_location_tokens( doc=doc, @@ -1493,10 +1494,10 @@ def export_to_document_tokens( if cell.cell_id in source_to_targets: targets = source_to_targets[cell.cell_id] - targets_str = ",".join(str(t) for t in targets) - body += f"{new_line}" + for target in targets: + body += f"{new_line}" - body += f"<{cell.label.value} id='{cell.cell_id}'>{new_line}" + body += f"{new_line}" body += f"{new_line}" From 1d9479a921ae0b7ff215b73cd971be3f099cc23b Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Tue, 25 Feb 2025 11:16:38 +0100 Subject: [PATCH 4/8] fix key-value cell location for doctags Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index b32aa8f3..c619cf26 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1465,6 +1465,10 @@ def export_to_document_tokens( """ body = f"<{self.label.value}>{new_line}" + page_no = 0 + if len(self.prov) > 0: + page_no = self.prov[0].page_no + if add_location: body += self.get_location_tokens( doc=doc, @@ -1483,12 +1487,15 @@ def export_to_document_tokens( for cell in self.graph.cells: body += f"<{cell.label.value}_{cell.cell_id}>{new_line}" if cell.prov is not None: - body += self.get_location_tokens( - doc=doc, - new_line=new_line, - xsize=xsize, - ysize=ysize, - ) + if len(doc.pages.keys()): + page_w, page_h = doc.pages[page_no].size.as_tuple() + body += DocumentToken.get_location( + bbox=cell.prov.bbox.to_bottom_left_origin(page_h).as_tuple(), + page_w=page_w, + page_h=page_h, + xsize=xsize, + ysize=ysize, + ) if add_content: body += f"{cell.text.strip()}{new_line}" From 22005c7b3b308b2ce159535fc0e4b2219c18f25f Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Wed, 26 Feb 2025 17:04:06 +0100 Subject: [PATCH 5/8] set default page_no to 1 Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index c619cf26..310f3e41 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1465,7 +1465,7 @@ def export_to_document_tokens( """ body = f"<{self.label.value}>{new_line}" - page_no = 0 + page_no = 1 if len(self.prov) > 0: page_no = self.prov[0].page_no From 45754b878ab8ddf303c07f68f92a8d4468f82f6e Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Sat, 1 Mar 2025 18:20:17 +0100 Subject: [PATCH 6/8] fix get_location call with to_top_left_origin Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 310f3e41..e3380da2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1490,7 +1490,7 @@ def export_to_document_tokens( if len(doc.pages.keys()): page_w, page_h = doc.pages[page_no].size.as_tuple() body += DocumentToken.get_location( - bbox=cell.prov.bbox.to_bottom_left_origin(page_h).as_tuple(), + bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(), page_w=page_w, page_h=page_h, xsize=xsize, From 12a0abe340890839fb0709d525994e4f40dd12e5 Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Fri, 21 Mar 2025 16:36:33 +0100 Subject: [PATCH 7/8] integrate export_to_document_tokens to doctag serializer Signed-off-by: Saidgurbuz --- .../experimental/serializer/doctags.py | 53 +++++++++++++++++-- test/data/doc/constructed_doc.dt | 1 + test/data/doc/constructed_doc.dt.gt | 1 + test/data/doc/constructed_document.yaml.dt | 1 + 4 files changed, 52 insertions(+), 4 deletions(-) diff --git a/docling_core/experimental/serializer/doctags.py b/docling_core/experimental/serializer/doctags.py index 84c9ae38..0ab0a0ae 100644 --- a/docling_core/experimental/serializer/doctags.py +++ b/docling_core/experimental/serializer/doctags.py @@ -3,7 +3,7 @@ import html from enum import Enum from pathlib import Path -from typing import Optional, Union +from typing import Dict, List, Optional, Union from pydantic import AnyUrl, BaseModel from typing_extensions import override @@ -279,9 +279,54 @@ def serialize( **kwargs, ) -> SerializationResult: """Serializes the passed item.""" - # TODO add actual implementation - text_res = "" - return SerializationResult(text=text_res) + params = DocTagsParams(**kwargs) + + body = f"<{item.label.value}>{params.new_line}" + + page_no = 1 + if len(item.prov) > 0: + page_no = item.prov[0].page_no + + if params.add_location: + body += item.get_location_tokens( + doc=doc, + new_line=params.new_line, + xsize=params.xsize, + ysize=params.ysize, + ) + + # mapping from source_cell_id to a list of target_cell_ids + source_to_targets: Dict[int, List[int]] = {} + for link in item.graph.links: + source_to_targets.setdefault(link.source_cell_id, []).append( + link.target_cell_id + ) + + for cell in item.graph.cells: + body += f"<{cell.label.value}_{cell.cell_id}>{params.new_line}" + if cell.prov is not None: + if len(doc.pages.keys()): + page_w, page_h = doc.pages[page_no].size.as_tuple() + body += DocumentToken.get_location( + bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(), + page_w=page_w, + page_h=page_h, + xsize=params.xsize, + ysize=params.ysize, + ) + if params.add_content: + body += f"{cell.text.strip()}{params.new_line}" + + if cell.cell_id in source_to_targets: + targets = source_to_targets[cell.cell_id] + for target in targets: + body += f"{params.new_line}" + + body += f"{params.new_line}" + + body += f"{params.new_line}" + + return SerializationResult(text=body) class DocTagsFormSerializer(BaseFormSerializer): diff --git a/test/data/doc/constructed_doc.dt b/test/data/doc/constructed_doc.dt index ed3e994f..f4c4538a 100644 --- a/test/data/doc/constructed_doc.dt +++ b/test/data/doc/constructed_doc.dt @@ -43,6 +43,7 @@ Affiliation 2 <_unknown_>print("Hello world") Here a formula block: E=mc^2 +number1 Some formatting chops: bold italic diff --git a/test/data/doc/constructed_doc.dt.gt b/test/data/doc/constructed_doc.dt.gt index ed3e994f..f4c4538a 100644 --- a/test/data/doc/constructed_doc.dt.gt +++ b/test/data/doc/constructed_doc.dt.gt @@ -43,6 +43,7 @@ Affiliation 2 <_unknown_>print("Hello world") Here a formula block: E=mc^2 +number1 Some formatting chops: bold italic diff --git a/test/data/doc/constructed_document.yaml.dt b/test/data/doc/constructed_document.yaml.dt index 9d513d60..5d3d71e5 100644 --- a/test/data/doc/constructed_document.yaml.dt +++ b/test/data/doc/constructed_document.yaml.dt @@ -43,6 +43,7 @@ Affiliation 2 <_unknown_>print("Hello world") Here a formula block: E=mc^2 +number1 Some formatting chops: bold italic From ad00546ffefe0a1b33bafd022fe0a4c1bf0301b2 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 24 Mar 2025 10:26:38 +0100 Subject: [PATCH 8/8] Add DocTags serializer dispatching, deprecate new_line param (#212) * updates for key value region Signed-off-by: Panos Vagenas * deprecate "new_line" parameter Signed-off-by: Panos Vagenas --------- Signed-off-by: Panos Vagenas Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../experimental/serializer/doctags.py | 69 +++++++------- docling_core/types/doc/document.py | 94 ++++++------------- 2 files changed, 62 insertions(+), 101 deletions(-) diff --git a/docling_core/experimental/serializer/doctags.py b/docling_core/experimental/serializer/doctags.py index 0ab0a0ae..8b0d28d5 100644 --- a/docling_core/experimental/serializer/doctags.py +++ b/docling_core/experimental/serializer/doctags.py @@ -23,6 +23,7 @@ from docling_core.experimental.serializer.common import CommonParams, DocSerializer from docling_core.types.doc.document import ( CodeItem, + DocItem, DoclingDocument, Formatting, FormItem, @@ -54,7 +55,6 @@ class Mode(str, Enum): MINIFIED = "minified" HUMAN_FRIENDLY = "human_friendly" - new_line: str = "" xsize: int = 500 ysize: int = 500 add_location: bool = True @@ -67,13 +67,13 @@ class Mode(str, Enum): mode: Mode = Mode.HUMAN_FRIENDLY -def _get_delim(mode: DocTagsParams.Mode) -> str: - if mode == DocTagsParams.Mode.HUMAN_FRIENDLY: +def _get_delim(params: DocTagsParams) -> str: + if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY: delim = "\n" - elif mode == DocTagsParams.Mode.MINIFIED: + elif params.mode == DocTagsParams.Mode.MINIFIED: delim = "" else: - raise RuntimeError(f"Unknown DocTags mode: {mode}") + raise RuntimeError(f"Unknown DocTags mode: {params.mode}") return delim @@ -102,7 +102,6 @@ def serialize( if params.add_location: location = item.get_location_tokens( doc=doc, - new_line=params.new_line, xsize=params.xsize, ysize=params.ysize, ) @@ -158,7 +157,6 @@ def serialize( if params.add_location: body += item.get_location_tokens( doc=doc, - new_line=params.new_line, xsize=params.xsize, ysize=params.ysize, ) @@ -178,15 +176,14 @@ def serialize( body += f"<{DocumentToken.CAPTION.value}>" for caption in item.captions: if caption.cref not in doc_serializer.get_excluded_refs(**kwargs): - body += caption.resolve(doc).get_location_tokens( - doc=doc, - new_line=params.new_line, - xsize=params.xsize, - ysize=params.ysize, - ) + if isinstance(cap := caption.resolve(doc), DocItem): + body += cap.get_location_tokens( + doc=doc, + xsize=params.xsize, + ysize=params.ysize, + ) body += f"{text.strip()}" body += f"" - body += f"{params.new_line}" if body: body = _wrap(text=body, wrap_tag=DocumentToken.OTSL.value) @@ -208,7 +205,6 @@ def serialize( ) -> SerializationResult: """Serializes the passed item.""" params = DocTagsParams(**kwargs) - parts: list[str] = [] if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): @@ -216,7 +212,6 @@ def serialize( if params.add_location: body += item.get_location_tokens( doc=doc, - new_line=params.new_line, xsize=params.xsize, ysize=params.ysize, ) @@ -246,13 +241,13 @@ def serialize( body = "" for caption in item.captions: if caption.cref not in doc_serializer.get_excluded_refs(**kwargs): - body += caption.resolve(doc).get_location_tokens( - doc=doc, - new_line=params.new_line, - xsize=params.xsize, - ysize=params.ysize, - ) - body += f"{text.strip()}" + if isinstance(cap := caption.resolve(doc), DocItem): + body += cap.get_location_tokens( + doc=doc, + xsize=params.xsize, + ysize=params.ysize, + ) + body += f"{text.strip()}" if body: body = _wrap(text=body, wrap_tag=DocumentToken.CAPTION.value) parts.append(body) @@ -281,7 +276,7 @@ def serialize( """Serializes the passed item.""" params = DocTagsParams(**kwargs) - body = f"<{item.label.value}>{params.new_line}" + body = "" page_no = 1 if len(item.prov) > 0: @@ -290,7 +285,6 @@ def serialize( if params.add_location: body += item.get_location_tokens( doc=doc, - new_line=params.new_line, xsize=params.xsize, ysize=params.ysize, ) @@ -303,11 +297,11 @@ def serialize( ) for cell in item.graph.cells: - body += f"<{cell.label.value}_{cell.cell_id}>{params.new_line}" + cell_txt = "" if cell.prov is not None: if len(doc.pages.keys()): page_w, page_h = doc.pages[page_no].size.as_tuple() - body += DocumentToken.get_location( + cell_txt += DocumentToken.get_location( bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(), page_w=page_w, page_h=page_h, @@ -315,17 +309,20 @@ def serialize( ysize=params.ysize, ) if params.add_content: - body += f"{cell.text.strip()}{params.new_line}" + cell_txt += cell.text.strip() if cell.cell_id in source_to_targets: targets = source_to_targets[cell.cell_id] for target in targets: - body += f"{params.new_line}" - - body += f"{params.new_line}" + # TODO centralize token creation + cell_txt += f"" - body += f"{params.new_line}" + # TODO centralize token creation + tok = f"{cell.label.value}_{cell.cell_id}" + cell_txt = _wrap(text=cell_txt, wrap_tag=tok) + body += cell_txt + body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value) return SerializationResult(text=body) @@ -374,7 +371,7 @@ def serialize( visited=my_visited, **kwargs, ) - delim = _get_delim(mode=params.mode) + delim = _get_delim(params=params) if parts: text_res = delim.join( [ @@ -419,7 +416,7 @@ def serialize( **kwargs, ) wrap_tag = DocumentToken.INLINE.value - delim = _get_delim(mode=params.mode) + delim = _get_delim(params=params) text_res = delim.join([p.text for p in parts if p.text]) if text_res: text_res = f"{text_res}{delim}" @@ -482,14 +479,14 @@ def post_process( @override def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult: """Serialize a page out of its parts.""" - delim = _get_delim(mode=self.params.mode) + delim = _get_delim(params=self.params) text_res = delim.join([p.text for p in parts]) return SerializationResult(text=text_res) @override def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult: """Serialize a document out of its pages.""" - delim = _get_delim(mode=self.params.mode) + delim = _get_delim(params=self.params) if self.params.add_page_break: page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}" content = page_sep.join([p.text for p in pages if p.text]) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index e3380da2..5a33cf3b 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -644,7 +644,7 @@ class DocItem( def get_location_tokens( self, doc: "DoclingDocument", - new_line: str, + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, ) -> str: @@ -663,7 +663,7 @@ def get_location_tokens( xsize=xsize, ysize=ysize, ) - location += f"{loc_str}{new_line}" + location += loc_str return location @@ -726,7 +726,7 @@ class TextItem(DocItem): def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -735,7 +735,7 @@ def export_to_document_tokens( r"""Export text element to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) @@ -750,7 +750,6 @@ def export_to_document_tokens( serializer = DocTagsDocSerializer( doc=doc, params=DocTagsParams( - new_line=new_line, xsize=xsize, ysize=ysize, add_location=add_location, @@ -780,7 +779,7 @@ class SectionHeaderItem(TextItem): def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -789,7 +788,7 @@ def export_to_document_tokens( r"""Export text element to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) @@ -804,7 +803,6 @@ def export_to_document_tokens( serializer = DocTagsDocSerializer( doc=doc, params=DocTagsParams( - new_line=new_line, xsize=xsize, ysize=ysize, add_location=add_location, @@ -866,7 +864,7 @@ class CodeItem(FloatingItem, TextItem): def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -875,7 +873,7 @@ def export_to_document_tokens( r"""Export text element to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) @@ -890,7 +888,6 @@ def export_to_document_tokens( serializer = DocTagsDocSerializer( doc=doc, params=DocTagsParams( - new_line=new_line, xsize=xsize, ysize=ysize, add_location=add_location, @@ -1031,7 +1028,7 @@ def export_to_html( def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -1041,7 +1038,7 @@ def export_to_document_tokens( r"""Export picture to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) @@ -1058,7 +1055,6 @@ def export_to_document_tokens( serializer = DocTagsDocSerializer( doc=doc, params=DocTagsParams( - new_line=new_line, xsize=xsize, ysize=ysize, add_location=add_location, @@ -1344,7 +1340,7 @@ def export_to_otsl( def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -1355,7 +1351,7 @@ def export_to_document_tokens( r"""Export table to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) @@ -1372,7 +1368,6 @@ def export_to_document_tokens( serializer = DocTagsDocSerializer( doc=doc, params=DocTagsParams( - new_line=new_line, xsize=xsize, ysize=ysize, add_location=add_location, @@ -1447,7 +1442,7 @@ class KeyValueItem(FloatingItem): def export_to_document_tokens( self, doc: "DoclingDocument", - new_line: str = "", + new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, add_location: bool = True, @@ -1456,59 +1451,29 @@ def export_to_document_tokens( r"""Export key value item to document tokens format. :param doc: "DoclingDocument": - :param new_line: str (Default value = "") + :param new_line: str (Default value = "") Deprecated :param xsize: int: (Default value = 500) :param ysize: int: (Default value = 500) :param add_location: bool: (Default value = True) :param add_content: bool: (Default value = True) """ - body = f"<{self.label.value}>{new_line}" - - page_no = 1 - if len(self.prov) > 0: - page_no = self.prov[0].page_no + from docling_core.experimental.serializer.doctags import ( + DocTagsDocSerializer, + DocTagsParams, + ) - if add_location: - body += self.get_location_tokens( - doc=doc, - new_line=new_line, + serializer = DocTagsDocSerializer( + doc=doc, + params=DocTagsParams( xsize=xsize, ysize=ysize, - ) - - # mapping from source_cell_id to a list of target_cell_ids - source_to_targets: Dict[int, List[int]] = {} - for link in self.graph.links: - source_to_targets.setdefault(link.source_cell_id, []).append( - link.target_cell_id - ) - - for cell in self.graph.cells: - body += f"<{cell.label.value}_{cell.cell_id}>{new_line}" - if cell.prov is not None: - if len(doc.pages.keys()): - page_w, page_h = doc.pages[page_no].size.as_tuple() - body += DocumentToken.get_location( - bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(), - page_w=page_w, - page_h=page_h, - xsize=xsize, - ysize=ysize, - ) - if add_content: - body += f"{cell.text.strip()}{new_line}" - - if cell.cell_id in source_to_targets: - targets = source_to_targets[cell.cell_id] - for target in targets: - body += f"{new_line}" - - body += f"{new_line}" - - body += f"{new_line}" - - return body + add_location=add_location, + add_content=add_content, + ), + ) + text = serializer.serialize(item=self).text + return text class FormItem(FloatingItem): @@ -3557,7 +3522,7 @@ def save_as_doctags( def export_to_document_tokens( # noqa: C901 self, - delim: str = "", + delim: str = "", # deprecated from_element: int = 0, to_element: int = sys.maxsize, labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS, @@ -3575,7 +3540,7 @@ def export_to_document_tokens( # noqa: C901 Operates on a slice of the document's body as defined through arguments from_element and to_element; defaulting to the whole main_text. - :param delim: str: (Default value = "") + :param delim: str: (Default value = "") Deprecated :param from_element: int: (Default value = 0) :param to_element: Optional[int]: (Default value = None) :param labels: set[DocItemLabel] @@ -3601,7 +3566,6 @@ def export_to_document_tokens( # noqa: C901 # layers=..., # not exposed start_idx=from_element, stop_idx=to_element, - new_line=delim, xsize=xsize, ysize=ysize, add_location=add_location,