diff --git a/docling_core/experimental/serializer/doctags.py b/docling_core/experimental/serializer/doctags.py index 650f0aa3..9a3871de 100644 --- a/docling_core/experimental/serializer/doctags.py +++ b/docling_core/experimental/serializer/doctags.py @@ -23,6 +23,7 @@ CodeItem, DocItem, DoclingDocument, + FloatingItem, FormItem, InlineGroup, KeyValueItem, @@ -126,10 +127,15 @@ def serialize( if text_part: parts.append(text_part) - res = "".join(parts) + if params.add_caption and isinstance(item, FloatingItem): + cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text + if cap_text: + parts.append(cap_text) + + text_res = "".join(parts) if wrap_tag is not None: - res = _wrap(text=res, wrap_tag=wrap_tag) - return SerializationResult(text=res) + text_res = _wrap(text=text_res, wrap_tag=wrap_tag) + return SerializationResult(text=text_res) class DocTagsTableSerializer(BaseTableSerializer): @@ -147,44 +153,36 @@ def serialize( """Serializes the passed item.""" params = DocTagsParams(**kwargs) - body = "" + parts: list[str] = [] if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): if params.add_location: - body += item.get_location_tokens( + loc_text = item.get_location_tokens( doc=doc, xsize=params.xsize, ysize=params.ysize, ) + parts.append(loc_text) - body += item.export_to_otsl( + otsl_text = item.export_to_otsl( doc=doc, add_cell_location=params.add_table_cell_location, add_cell_text=params.add_table_cell_text, xsize=params.xsize, ysize=params.ysize, ) + parts.append(otsl_text) - if params.add_caption and len(item.captions): - text = doc_serializer.serialize_captions(item, **kwargs).text - - if len(text): - body += f"<{DocumentToken.CAPTION.value}>" - for caption in item.captions: - if caption.cref not in doc_serializer.get_excluded_refs(**kwargs): - if isinstance(cap := caption.resolve(doc), DocItem): - body += cap.get_location_tokens( - doc=doc, - xsize=params.xsize, - ysize=params.ysize, - ) - body += f"{text.strip()}" - body += f"" - - if body: - body = _wrap(text=body, wrap_tag=DocumentToken.OTSL.value) + if params.add_caption: + cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text + if cap_text: + parts.append(cap_text) - return SerializationResult(text=body) + text_res = "".join(parts) + if text_res: + text_res = _wrap(text=text_res, wrap_tag=DocumentToken.OTSL.value) + + return SerializationResult(text=text_res) class DocTagsPictureSerializer(BasePictureSerializer): @@ -230,31 +228,18 @@ def serialize( ) parts.append(body) - if params.add_caption and len(item.captions): - text = doc_serializer.serialize_captions(item, **kwargs).text - - if len(text): - body = "" - for caption in item.captions: - if caption.cref not in doc_serializer.get_excluded_refs(**kwargs): - if isinstance(cap := caption.resolve(doc), DocItem): - body += cap.get_location_tokens( - doc=doc, - xsize=params.xsize, - ysize=params.ysize, - ) - body += f"{text.strip()}" - if body: - body = _wrap(text=body, wrap_tag=DocumentToken.CAPTION.value) - parts.append(body) - - text = "".join(parts) - if text: + if params.add_caption: + cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text + if cap_text: + parts.append(cap_text) + + text_res = "".join(parts) + if text_res: token = DocumentToken.create_token_name_from_doc_item_label( label=item.label ) - text = _wrap(text=text, wrap_tag=token) - return SerializationResult(text=text) + text_res = _wrap(text=text_res, wrap_tag=token) + return SerializationResult(text=text_res) class DocTagsKeyValueSerializer(BaseKeyValueSerializer): @@ -318,6 +303,11 @@ def serialize( cell_txt = _wrap(text=cell_txt, wrap_tag=tok) body += cell_txt + if params.add_caption: + cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text + if cap_text: + body += cap_text + body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value) return SerializationResult(text=body) @@ -471,3 +461,32 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult wrap_tag = DocumentToken.DOCUMENT.value text_res = f"<{wrap_tag}>{content}{delim}" return SerializationResult(text=text_res) + + @override + def serialize_captions( + self, + item: FloatingItem, + **kwargs, + ) -> SerializationResult: + """Serialize the item's captions.""" + params = DocTagsParams(**kwargs) + parts: list[str] = [] + + if item.captions: + cap_text = super().serialize_captions(item, **kwargs).text + if cap_text: + if params.add_location: + for caption in item.captions: + if caption.cref not in self.get_excluded_refs(**kwargs): + if isinstance(cap := caption.resolve(self.doc), DocItem): + loc_txt = cap.get_location_tokens( + doc=self.doc, + xsize=params.xsize, + ysize=params.ysize, + ) + parts.append(loc_txt) + parts.append(cap_text) + text_res = "".join(parts) + if text_res: + text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value) + return SerializationResult(text=text_res) diff --git a/docling_core/experimental/serializer/markdown.py b/docling_core/experimental/serializer/markdown.py index 99d7d394..f54155ca 100644 --- a/docling_core/experimental/serializer/markdown.py +++ b/docling_core/experimental/serializer/markdown.py @@ -33,6 +33,7 @@ ContentLayer, DocItem, DoclingDocument, + FloatingItem, Formatting, FormItem, FormulaItem, @@ -77,37 +78,46 @@ def serialize( ) -> SerializationResult: """Serializes the passed item.""" params = MarkdownParams(**kwargs) + parts: list[str] = [] escape_html = True escape_underscores = True if isinstance(item, TitleItem): - res = f"# {item.text}" + text = f"# {item.text}" elif isinstance(item, SectionHeaderItem): - res = f"{(item.level + 1) * '#'} {item.text}" + text = f"{(item.level + 1) * '#'} {item.text}" elif isinstance(item, CodeItem): - res = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```" + text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```" escape_html = False escape_underscores = False elif isinstance(item, FormulaItem): if item.text: - res = f"${item.text}$" if is_inline_scope else f"$${item.text}$$" + text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$" elif item.orig: - res = "" + text = "" else: - res = "" + text = "" escape_html = False escape_underscores = False elif params.wrap_width: - res = textwrap.fill(item.text, width=params.wrap_width) + text = textwrap.fill(item.text, width=params.wrap_width) else: - res = item.text - res = doc_serializer.post_process( - text=res, + text = item.text + parts.append(text) + + if isinstance(item, FloatingItem): + cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text + if cap_text: + parts.append(cap_text) + + text_res = (" " if is_inline_scope else "\n\n").join(parts) + text_res = doc_serializer.post_process( + text=text_res, escape_html=escape_html, escape_underscores=escape_underscores, formatting=item.formatting, hyperlink=item.hyperlink, ) - return SerializationResult(text=res) + return SerializationResult(text=text_res) class MarkdownTableSerializer(BaseTableSerializer):