From ea6be4eb0fb980ca5e40cbe2bd815842b69f94dd Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Thu, 3 Apr 2025 09:27:35 +0200
Subject: [PATCH 01/34] initial attempt at HTML serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 698 +++++++++++++++++++
 test/test_serializer_html.py                 |  24 +
 2 files changed, 722 insertions(+)
 create mode 100644 docling_core/experimental/serializer/html.py
 create mode 100644 test/test_serializer_html.py
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
new file mode 100644
index 00000000..48137aec
--- /dev/null
+++ b/docling_core/experimental/serializer/html.py
@@ -0,0 +1,698 @@
+#
+# Copyright IBM Corp. 2024 - 2025
+# SPDX-License-Identifier: MIT
+#
+
+"""Define classes for HTML serialization."""
+import html
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import override
+
+from docling_core.experimental.serializer.base import (
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.experimental.serializer.common import CommonParams, DocSerializer
+from docling_core.types.doc.base import ImageRefMode
+from docling_core.types.doc.document import (
+    CodeItem,
+    ContentLayer,
+    DocItem,
+    DoclingDocument,
+    FloatingItem,
+    Formatting,
+    FormItem,
+    FormulaItem,
+    ImageRef,
+    InlineGroup,
+    KeyValueItem,
+    NodeItem,
+    OrderedList,
+    PictureClassificationData,
+    PictureItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+    TitleItem,
+    UnorderedList,
+)
+from docling_core.types.doc.utils import get_html_tag_with_text_direction, get_text_direction
+
+
+class HTMLParams(CommonParams):
+    """HTML-specific serialization parameters."""
+
+    layers: set[ContentLayer] = {ContentLayer.BODY}
+    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
+    image_placeholder: str = "<!-- image -->"
+    add_page_break: bool = True
+    page_break_placeholder: str = '<div class="page-break"></div>'
+    css_styles: Optional[str] = None
+    html_lang: str = "en"
+    formula_to_mathml: bool = True
+    add_document_metadata: bool = True
+    prettify: bool = True  # Add indentation and line breaks
+    add_image_dimensions: bool = True  # Add width and height attributes to images
+
+
+class HTMLTextSerializer(BaseModel, BaseTextSerializer):
+    """HTML-specific text item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        is_inline_scope: bool = False,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        params = HTMLParams(**kwargs)
+        parts: List[str] = []
+        
+        text_content = html.escape(item.text, quote=False)
+        
+        # Replace newlines with <br> tags if not in code or formula
+        if not isinstance(item, (CodeItem, FormulaItem)):
+            text_content = text_content.replace("\n", "<br>")
+            
+        if isinstance(item, TitleItem):
+            text = get_html_tag_with_text_direction(html_tag="h1", text=text_content)
+        elif isinstance(item, SectionHeaderItem):
+            section_level = min(item.level + 1, 6)  # h1-h6 are valid in HTML
+            text = get_html_tag_with_text_direction(
+                html_tag=f"h{section_level}", text=text_content
+            )
+        elif isinstance(item, CodeItem):
+            language_attr = ""
+            if item.code_language.value != "unknown":
+                language_attr = f' class="language-{item.code_language.value.lower()}"'
+            
+            if is_inline_scope:
+                text = f'<code{language_attr}>{text_content}</code>'
+            else:
+                text = f'<pre><code{language_attr}>{text_content}</code></pre>'
+        elif isinstance(item, FormulaItem):
+            if params.formula_to_mathml and item.text:
+                # Simplified formula handling - full implementation would use latex2mathml
+                text = f'<div class="formula">{text_content}</div>'
+            elif item.text:
+                text = f'<div class="formula">{text_content}</div>'
+            elif item.orig:
+                text = '<div class="formula-not-decoded">Formula not decoded</div>'
+            else:
+                text = ''
+        else:
+            # Regular text
+            text = get_html_tag_with_text_direction(html_tag="p", text=text_content)
+        
+        parts.append(text)
+
+        # Handle captions for floating items
+        if isinstance(item, FloatingItem):
+            cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+            if cap_text:
+                parts.append(cap_text)
+
+        text_res = "\n".join(parts)
+        text_res = doc_serializer.post_process(
+            text=text_res,
+            formatting=item.formatting,
+            hyperlink=item.hyperlink,
+        )
+        
+        return SerializationResult(text=text_res)
+
+
+class HTMLTableSerializer(BaseTableSerializer):
+    """HTML-specific table item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+            
+        # Process captions first
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        
+        # Start building the table
+        rows = []
+        
+        for i, row in enumerate(item.data.grid):
+            row_cells = []
+            for j, cell in enumerate(row):
+                # Skip cells that are covered by rowspan or colspan from previous cells
+                if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
+                    continue
+                    
+                content = html.escape(cell.text.strip())
+                celltag = "th" if cell.column_header or cell.row_header else "td"
+                
+                attrs = []
+                if cell.row_span > 1:
+                    attrs.append(f'rowspan="{cell.row_span}"')
+                if cell.col_span > 1:
+                    attrs.append(f'colspan="{cell.col_span}"')
+                    
+                text_dir = get_text_direction(content)
+                if text_dir == "rtl":
+                    attrs.append(f'dir="{text_dir}"')
+                    
+                attrs_str = " ".join(attrs)
+                if attrs_str:
+                    attrs_str = " " + attrs_str
+                    
+                row_cells.append(f"<{celltag}{attrs_str}>{content}</{celltag}>")
+                
+            if row_cells:
+                rows.append(f"<tr>{''.join(row_cells)}</tr>")
+        
+        tbody = f"<tbody>{''.join(rows)}</tbody>" if rows else ""
+        
+        if cap_text:
+            table = f"<table>{cap_text}{tbody}</table>"
+        elif tbody:
+            table = f"<table>{tbody}</table>"
+        else:
+            table = "<table></table>"
+            
+        return SerializationResult(text=table)
+
+
+class HTMLPictureSerializer(BasePictureSerializer):
+    """HTML-specific picture item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        params = HTMLParams(**kwargs)
+        
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+            
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        
+        # Process the image based on image_mode
+        img_text = self._get_image_html(item, doc, params)
+        
+        # Add classification info if available
+        classification_text = ""
+        for annotation in item.annotations:
+            if isinstance(annotation, PictureClassificationData) and annotation.predicted_classes:
+                class_name = annotation.predicted_classes[0].class_name
+                confidence = annotation.predicted_classes[0].confidence
+                classification_text = f'<div class="image-classification">{html.escape(class_name)} ({confidence:.2f})</div>'
+                break
+                
+        figure = f"<figure>{img_text}{classification_text}{cap_text}</figure>"
+        return SerializationResult(text=figure)
+        
+    def _get_image_html(self, item: PictureItem, doc: DoclingDocument, params: HTMLParams) -> str:
+        """Generate HTML for the image based on image mode."""
+        if params.image_mode == ImageRefMode.PLACEHOLDER:
+            return params.image_placeholder
+            
+        elif params.image_mode == ImageRefMode.EMBEDDED:
+            # Try to use the embedded image
+            if (item.image is not None and 
+                isinstance(item.image.uri, AnyUrl) and 
+                item.image.uri.scheme == "data"):
+                return self._create_img_tag(item.image.uri, item, params)
+                
+            # Try to get the image from document
+            img = item.get_image(doc)
+            if img is not None:
+                imgb64 = item._image_to_base64(img)
+                return self._create_img_tag(f"data:image/png;base64,{imgb64}", item, params)
+                
+            return params.image_placeholder
+            
+        elif params.image_mode == ImageRefMode.REFERENCED:
+            if item.image is not None:
+                if isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme != "data":
+                    return self._create_img_tag(item.image.uri, item, params)
+                elif isinstance(item.image.uri, Path):
+                    return self._create_img_tag(item.image.uri, item, params)
+                    
+            return params.image_placeholder
+            
+        return params.image_placeholder
+        
+    def _create_img_tag(self, src: Union[str, AnyUrl, Path], item: PictureItem, params: HTMLParams) -> str:
+        """Create an HTML img tag with appropriate attributes."""
+        attrs = [f'src="{src}"', 'alt="Image"']
+        
+        if params.add_image_dimensions and item.image is not None:
+            attrs.append(f'width="{item.image.size.width}"')
+            attrs.append(f'height="{item.image.size.height}"')
+            
+        return f"<img {' '.join(attrs)}>"
+
+
+class HTMLKeyValueSerializer(BaseKeyValueSerializer):
+    """HTML-specific key-value item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+            
+        # Create a definition list (dl) for key-value pairs
+        parts = ['<dl class="key-value-region">']
+        
+        # Group cells by their keys
+        key_to_values: Dict[int, List[int]] = {}
+        for link in item.graph.links:
+            key_to_values.setdefault(link.source_cell_id, []).append(link.target_cell_id)
+            
+        # Find all cells
+        cell_by_id = {cell.cell_id: cell for cell in item.graph.cells}
+        
+        # Process each key-value pair
+        for key_id, value_ids in key_to_values.items():
+            if key_id in cell_by_id:
+                key_cell = cell_by_id[key_id]
+                key_text = html.escape(key_cell.text)
+                parts.append(f'<dt>{key_text}</dt>')
+                
+                for value_id in value_ids:
+                    if value_id in cell_by_id:
+                        value_cell = cell_by_id[value_id]
+                        value_text = html.escape(value_cell.text)
+                        parts.append(f'<dd>{value_text}</dd>')
+        
+        parts.append('</dl>')
+        
+        # Add caption if available
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        if cap_text:
+            parts.append(cap_text)
+            
+        return SerializationResult(text="\n".join(parts))
+
+
+class HTMLFormSerializer(BaseFormSerializer):
+    """HTML-specific form item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+            
+        # Create a form representation (non-functional HTML form)
+        parts = ['<div class="form-container">']
+        
+        # Simple representation of form items
+        for cell in item.graph.cells:
+            cell_text = html.escape(cell.text)
+            cell_label = cell.label.value
+            parts.append(f'<div class="form-item form-item-{cell_label}">{cell_text}</div>')
+            
+        parts.append('</div>')
+        
+        # Add caption if available
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        if cap_text:
+            parts.append(cap_text)
+            
+        return SerializationResult(text="\n".join(parts))
+
+
+class HTMLListSerializer(BaseModel, BaseListSerializer):
+    """HTML-specific list serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: Union[UnorderedList, OrderedList],
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        list_level: int = 0,
+        is_inline_scope: bool = False,
+        visited: Optional[set[str]] = None,  # refs of visited items
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        my_visited = visited or set()
+        parts = doc_serializer.get_parts(
+            item=item,
+            list_level=list_level + 1,
+            is_inline_scope=is_inline_scope,
+            visited=my_visited,
+            **kwargs,
+        )
+        
+        # Determine list type
+        tag = "ol" if isinstance(item, OrderedList) else "ul"
+        
+        # Build list items
+        items = []
+        for part in parts:
+            if part.text:
+                # If the part is already wrapped in <li>, use it directly
+                if part.text.startswith("<li") and part.text.endswith("</li>"):
+                    items.append(part.text)
+                else:
+                    # Otherwise wrap it in <li>
+                    items.append(f"<li>{part.text}</li>")
+                    
+        list_html = f"<{tag}>{''.join(items)}</{tag}>"
+        return SerializationResult(text=list_html)
+
+
+class HTMLInlineSerializer(BaseInlineSerializer):
+    """HTML-specific inline group serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        list_level: int = 0,
+        visited: Optional[set[str]] = None,  # refs of visited items
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        my_visited = visited or set()
+        parts = doc_serializer.get_parts(
+            item=item,
+            list_level=list_level,
+            is_inline_scope=True,
+            visited=my_visited,
+            **kwargs,
+        )
+        
+        # Join parts with spaces for inline content
+        inline_content = " ".join([p.text for p in parts if p.text])
+        if inline_content:
+            return SerializationResult(text=f"<span>{inline_content}</span>")
+        return SerializationResult(text="")
+
+
+class HTMLFallbackSerializer(BaseFallbackSerializer):
+    """HTML-specific fallback serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        if isinstance(item, DocItem):
+            return SerializationResult(text=f"<!-- Unsupported item type: {item.label} -->")
+        return SerializationResult(text="")
+
+
+class HTMLDocSerializer(DocSerializer):
+    """HTML-specific document serializer."""
+
+    text_serializer: BaseTextSerializer = HTMLTextSerializer()
+    table_serializer: BaseTableSerializer = HTMLTableSerializer()
+    picture_serializer: BasePictureSerializer = HTMLPictureSerializer()
+    key_value_serializer: BaseKeyValueSerializer = HTMLKeyValueSerializer()
+    form_serializer: BaseFormSerializer = HTMLFormSerializer()
+    fallback_serializer: BaseFallbackSerializer = HTMLFallbackSerializer()
+
+    list_serializer: BaseListSerializer = HTMLListSerializer()
+    inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
+
+    params: HTMLParams = HTMLParams()
+
+    @override
+    def serialize_bold(self, text: str, **kwargs) -> str:
+        """Apply HTML-specific bold serialization."""
+        return f"<strong>{text}</strong>"
+
+    @override
+    def serialize_italic(self, text: str, **kwargs) -> str:
+        """Apply HTML-specific italic serialization."""
+        return f"<em>{text}</em>"
+
+    @override
+    def serialize_underline(self, text: str, **kwargs) -> str:
+        """Apply HTML-specific underline serialization."""
+        return f"<u>{text}</u>"
+
+    @override
+    def serialize_strikethrough(self, text: str, **kwargs) -> str:
+        """Apply HTML-specific strikethrough serialization."""
+        return f"<s>{text}</s>"
+
+    @override
+    def serialize_hyperlink(self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs) -> str:
+        """Apply HTML-specific hyperlink serialization."""
+        return f'<a href="{hyperlink}">{text}</a>'
+
+    @override
+    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
+        """Serialize a page out of its parts."""
+        params = self.params
+        if params.prettify:
+            text_res = "\n".join([p.text for p in parts if p.text])
+        else:
+            text_res = "".join([p.text for p in parts if p.text])
+        return SerializationResult(text=text_res)
+
+    @override
+    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
+        """Serialize a document out of its pages."""
+        params = self.params
+        
+        # Join pages with page breaks if specified
+        if params.add_page_break and params.page_break_placeholder:
+            page_sep = f"\n{params.page_break_placeholder}\n"
+            content = page_sep.join([p.text for p in pages if p.text])
+        else:
+            content = self.serialize_page(parts=pages).text
+            
+        # Add HTML document structure
+        head = self._generate_head()
+        body = f"<body>\n{content}\n</body>"
+        
+        # Create full HTML document
+        html_doc = f"<!DOCTYPE html>\n<html lang=\"{params.html_lang}\">\n{head}\n{body}\n</html>"
+        
+        return SerializationResult(text=html_doc)
+        
+    def _generate_head(self) -> str:
+        """Generate the HTML head section with metadata and styles."""
+        params = self.params
+        
+        head_parts = ['<head>', '<meta charset="UTF-8">']
+        
+        # Add metadata if requested
+        if params.add_document_metadata:
+            if self.doc.name:
+                head_parts.append(f'<title>{html.escape(self.doc.name)}</title>')
+            else:
+                head_parts.append('<title>Docling Document</title>')
+                
+            head_parts.append('<meta name="generator" content="Docling HTML Serializer">')
+            
+        # Add default styles or custom CSS
+        if params.css_styles:
+            head_parts.append(f'<style>\n{params.css_styles}\n</style>')
+        else:
+            head_parts.append(self._get_default_css())
+            
+        head_parts.append('</head>')
+        
+        if params.prettify:
+            return '\n'.join(head_parts)
+        else:
+            return ''.join(head_parts)
+            
+    def _get_default_css(self) -> str:
+        """Return default CSS styles for the HTML document."""
+        return """<style>
+    html {
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    body {
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
+    }
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
+    }
+    table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    th, td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    th {
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
+    }
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
+    }
+    .formula-not-decoded {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>"""
+
+    @override
+    def serialize_captions(
+        self,
+        item: FloatingItem,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serialize the item's captions."""
+        params = HTMLParams(**kwargs)
+        
+        caption_parts = []
+        for cap_ref in item.captions:
+            cap_item = cap_ref.resolve(self.doc)
+            if isinstance(cap_item, TextItem) and cap_item.self_ref not in self.get_excluded_refs(**kwargs):
+                caption_text = html.escape(cap_item.text)
+                caption_parts.append(caption_text)
+                
+        if caption_parts:
+            caption_text = " ".join(caption_parts)
+            result = f"<figcaption>{caption_text}</figcaption>"
+            return SerializationResult(text=result)
+            
+        return SerializationResult(text="")
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
new file mode 100644
index 00000000..694cf9b7
--- /dev/null
+++ b/test/test_serializer_html.py
@@ -0,0 +1,24 @@
+import unittest
+from pathlib import Path
+from typing import Optional
+
+from docling_core.types.doc.base import ImageRefMode, Size
+from docling_core.types.doc.document import (
+    DoclingDocument,
+    TextItem,
+    TableData,
+    TableCell,
+    Formatting,
+    ProvenanceItem,
+    BoundingBox,
+    CoordOrigin,
+)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.experimental.serializer.html import HTMLDocSerializer, HTMLParams
+
+def test_html_export():
+
+    
+    
+    assert True
+

From 4207c7749b4147572c5cc95b7b08368a4aab9c20 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Thu, 3 Apr 2025 11:41:28 +0200
Subject: [PATCH 02/34] first version, to be tested thoroughly

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 237 +++++++++++--------
 test/test_serializer_html.py                 |  21 --
 2 files changed, 135 insertions(+), 123 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 48137aec..c43ce3c2 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -5,9 +5,8 @@
 
 """Define classes for HTML serialization."""
 import html
-import re
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Union
 
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
@@ -32,10 +31,8 @@
     DocItem,
     DoclingDocument,
     FloatingItem,
-    Formatting,
     FormItem,
     FormulaItem,
-    ImageRef,
     InlineGroup,
     KeyValueItem,
     NodeItem,
@@ -48,7 +45,10 @@
     TitleItem,
     UnorderedList,
 )
-from docling_core.types.doc.utils import get_html_tag_with_text_direction, get_text_direction
+from docling_core.types.doc.utils import (
+    get_html_tag_with_text_direction,
+    get_text_direction,
+)
 
 
 class HTMLParams(CommonParams):
@@ -83,13 +83,13 @@ def serialize(
         """Serializes the passed item."""
         params = HTMLParams(**kwargs)
         parts: List[str] = []
-        
+
         text_content = html.escape(item.text, quote=False)
-        
+
         # Replace newlines with <br> tags if not in code or formula
         if not isinstance(item, (CodeItem, FormulaItem)):
             text_content = text_content.replace("\n", "<br>")
-            
+
         if isinstance(item, TitleItem):
             text = get_html_tag_with_text_direction(html_tag="h1", text=text_content)
         elif isinstance(item, SectionHeaderItem):
@@ -101,25 +101,26 @@ def serialize(
             language_attr = ""
             if item.code_language.value != "unknown":
                 language_attr = f' class="language-{item.code_language.value.lower()}"'
-            
+
             if is_inline_scope:
-                text = f'<code{language_attr}>{text_content}</code>'
+                text = f"<code{language_attr}>{text_content}</code>"
             else:
-                text = f'<pre><code{language_attr}>{text_content}</code></pre>'
+                text = f"<pre><code{language_attr}>{text_content}</code></pre>"
         elif isinstance(item, FormulaItem):
             if params.formula_to_mathml and item.text:
-                # Simplified formula handling - full implementation would use latex2mathml
+                # Simplified formula handling - full implementation
+                # would use latex2mathml
                 text = f'<div class="formula">{text_content}</div>'
             elif item.text:
                 text = f'<div class="formula">{text_content}</div>'
             elif item.orig:
                 text = '<div class="formula-not-decoded">Formula not decoded</div>'
             else:
-                text = ''
+                text = ""
         else:
             # Regular text
             text = get_html_tag_with_text_direction(html_tag="p", text=text_content)
-        
+
         parts.append(text)
 
         # Handle captions for floating items
@@ -134,7 +135,7 @@ def serialize(
             formatting=item.formatting,
             hyperlink=item.hyperlink,
         )
-        
+
         return SerializationResult(text=text_res)
 
 
@@ -153,51 +154,51 @@ def serialize(
         """Serializes the passed item."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-            
+
         # Process captions first
         cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        
+
         # Start building the table
         rows = []
-        
+
         for i, row in enumerate(item.data.grid):
             row_cells = []
             for j, cell in enumerate(row):
                 # Skip cells that are covered by rowspan or colspan from previous cells
                 if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                     continue
-                    
+
                 content = html.escape(cell.text.strip())
                 celltag = "th" if cell.column_header or cell.row_header else "td"
-                
+
                 attrs = []
                 if cell.row_span > 1:
                     attrs.append(f'rowspan="{cell.row_span}"')
                 if cell.col_span > 1:
                     attrs.append(f'colspan="{cell.col_span}"')
-                    
+
                 text_dir = get_text_direction(content)
                 if text_dir == "rtl":
                     attrs.append(f'dir="{text_dir}"')
-                    
+
                 attrs_str = " ".join(attrs)
                 if attrs_str:
                     attrs_str = " " + attrs_str
-                    
+
                 row_cells.append(f"<{celltag}{attrs_str}>{content}</{celltag}>")
-                
+
             if row_cells:
                 rows.append(f"<tr>{''.join(row_cells)}</tr>")
-        
+
         tbody = f"<tbody>{''.join(rows)}</tbody>" if rows else ""
-        
+
         if cap_text:
             table = f"<table>{cap_text}{tbody}</table>"
         elif tbody:
             table = f"<table>{tbody}</table>"
         else:
             table = "<table></table>"
-            
+
         return SerializationResult(text=table)
 
 
@@ -215,66 +216,83 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed item."""
         params = HTMLParams(**kwargs)
-        
+
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-            
+
         cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        
+
         # Process the image based on image_mode
         img_text = self._get_image_html(item, doc, params)
-        
+
         # Add classification info if available
         classification_text = ""
         for annotation in item.annotations:
-            if isinstance(annotation, PictureClassificationData) and annotation.predicted_classes:
+            if (
+                isinstance(annotation, PictureClassificationData)
+                and annotation.predicted_classes
+            ):
                 class_name = annotation.predicted_classes[0].class_name
                 confidence = annotation.predicted_classes[0].confidence
-                classification_text = f'<div class="image-classification">{html.escape(class_name)} ({confidence:.2f})</div>'
+                classification_text = (
+                    '<div class="image-classification">'
+                    + f"{html.escape(class_name)} ({confidence:.2f})</div>"
+                )
                 break
-                
+
         figure = f"<figure>{img_text}{classification_text}{cap_text}</figure>"
         return SerializationResult(text=figure)
-        
-    def _get_image_html(self, item: PictureItem, doc: DoclingDocument, params: HTMLParams) -> str:
+
+    def _get_image_html(
+        self, item: PictureItem, doc: DoclingDocument, params: HTMLParams
+    ) -> str:
         """Generate HTML for the image based on image mode."""
         if params.image_mode == ImageRefMode.PLACEHOLDER:
             return params.image_placeholder
-            
+
         elif params.image_mode == ImageRefMode.EMBEDDED:
             # Try to use the embedded image
-            if (item.image is not None and 
-                isinstance(item.image.uri, AnyUrl) and 
-                item.image.uri.scheme == "data"):
+            if (
+                item.image is not None
+                and isinstance(item.image.uri, AnyUrl)
+                and item.image.uri.scheme == "data"
+            ):
                 return self._create_img_tag(item.image.uri, item, params)
-                
+
             # Try to get the image from document
             img = item.get_image(doc)
             if img is not None:
                 imgb64 = item._image_to_base64(img)
-                return self._create_img_tag(f"data:image/png;base64,{imgb64}", item, params)
-                
+                return self._create_img_tag(
+                    f"data:image/png;base64,{imgb64}", item, params
+                )
+
             return params.image_placeholder
-            
+
         elif params.image_mode == ImageRefMode.REFERENCED:
             if item.image is not None:
-                if isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme != "data":
+                if (
+                    isinstance(item.image.uri, AnyUrl)
+                    and item.image.uri.scheme != "data"
+                ):
                     return self._create_img_tag(item.image.uri, item, params)
                 elif isinstance(item.image.uri, Path):
                     return self._create_img_tag(item.image.uri, item, params)
-                    
+
             return params.image_placeholder
-            
+
         return params.image_placeholder
-        
-    def _create_img_tag(self, src: Union[str, AnyUrl, Path], item: PictureItem, params: HTMLParams) -> str:
+
+    def _create_img_tag(
+        self, src: Union[str, AnyUrl, Path], item: PictureItem, params: HTMLParams
+    ) -> str:
         """Create an HTML img tag with appropriate attributes."""
         attrs = [f'src="{src}"', 'alt="Image"']
-        
+
         if params.add_image_dimensions and item.image is not None:
             attrs.append(f'width="{item.image.size.width}"')
             attrs.append(f'height="{item.image.size.height}"')
-            
+
         return f"<img {' '.join(attrs)}>"
 
 
@@ -293,38 +311,40 @@ def serialize(
         """Serializes the passed item."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-            
+
         # Create a definition list (dl) for key-value pairs
         parts = ['<dl class="key-value-region">']
-        
+
         # Group cells by their keys
         key_to_values: Dict[int, List[int]] = {}
         for link in item.graph.links:
-            key_to_values.setdefault(link.source_cell_id, []).append(link.target_cell_id)
-            
+            key_to_values.setdefault(link.source_cell_id, []).append(
+                link.target_cell_id
+            )
+
         # Find all cells
         cell_by_id = {cell.cell_id: cell for cell in item.graph.cells}
-        
+
         # Process each key-value pair
         for key_id, value_ids in key_to_values.items():
             if key_id in cell_by_id:
                 key_cell = cell_by_id[key_id]
                 key_text = html.escape(key_cell.text)
-                parts.append(f'<dt>{key_text}</dt>')
-                
+                parts.append(f"<dt>{key_text}</dt>")
+
                 for value_id in value_ids:
                     if value_id in cell_by_id:
                         value_cell = cell_by_id[value_id]
                         value_text = html.escape(value_cell.text)
-                        parts.append(f'<dd>{value_text}</dd>')
-        
-        parts.append('</dl>')
-        
+                        parts.append(f"<dd>{value_text}</dd>")
+
+        parts.append("</dl>")
+
         # Add caption if available
         cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
         if cap_text:
             parts.append(cap_text)
-            
+
         return SerializationResult(text="\n".join(parts))
 
 
@@ -343,23 +363,25 @@ def serialize(
         """Serializes the passed item."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-            
+
         # Create a form representation (non-functional HTML form)
         parts = ['<div class="form-container">']
-        
+
         # Simple representation of form items
         for cell in item.graph.cells:
             cell_text = html.escape(cell.text)
             cell_label = cell.label.value
-            parts.append(f'<div class="form-item form-item-{cell_label}">{cell_text}</div>')
-            
-        parts.append('</div>')
-        
+            parts.append(
+                f'<div class="form-item form-item-{cell_label}">{cell_text}</div>'
+            )
+
+        parts.append("</div>")
+
         # Add caption if available
         cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
         if cap_text:
             parts.append(cap_text)
-            
+
         return SerializationResult(text="\n".join(parts))
 
 
@@ -387,10 +409,10 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
-        
+
         # Determine list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
-        
+
         # Build list items
         items = []
         for part in parts:
@@ -401,7 +423,7 @@ def serialize(
                 else:
                     # Otherwise wrap it in <li>
                     items.append(f"<li>{part.text}</li>")
-                    
+
         list_html = f"<{tag}>{''.join(items)}</{tag}>"
         return SerializationResult(text=list_html)
 
@@ -429,7 +451,7 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
-        
+
         # Join parts with spaces for inline content
         inline_content = " ".join([p.text for p in parts if p.text])
         if inline_content:
@@ -451,7 +473,9 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed item."""
         if isinstance(item, DocItem):
-            return SerializationResult(text=f"<!-- Unsupported item type: {item.label} -->")
+            return SerializationResult(
+                text=f"<!-- Unsupported item type: {item.label} -->"
+            )
         return SerializationResult(text="")
 
 
@@ -491,7 +515,9 @@ def serialize_strikethrough(self, text: str, **kwargs) -> str:
         return f"<s>{text}</s>"
 
     @override
-    def serialize_hyperlink(self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs) -> str:
+    def serialize_hyperlink(
+        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
+    ) -> str:
         """Apply HTML-specific hyperlink serialization."""
         return f'<a href="{hyperlink}">{text}</a>'
 
@@ -509,51 +535,56 @@ def serialize_page(self, parts: list[SerializationResult]) -> SerializationResul
     def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
         """Serialize a document out of its pages."""
         params = self.params
-        
+
         # Join pages with page breaks if specified
         if params.add_page_break and params.page_break_placeholder:
             page_sep = f"\n{params.page_break_placeholder}\n"
             content = page_sep.join([p.text for p in pages if p.text])
         else:
             content = self.serialize_page(parts=pages).text
-            
+
         # Add HTML document structure
         head = self._generate_head()
         body = f"<body>\n{content}\n</body>"
-        
+
         # Create full HTML document
-        html_doc = f"<!DOCTYPE html>\n<html lang=\"{params.html_lang}\">\n{head}\n{body}\n</html>"
-        
+        html_doc = (
+            '<!DOCTYPE html>\n<html lang="'
+            + f'{params.html_lang}">\n{head}\n{body}\n</html>'
+        )
+
         return SerializationResult(text=html_doc)
-        
+
     def _generate_head(self) -> str:
         """Generate the HTML head section with metadata and styles."""
         params = self.params
-        
-        head_parts = ['<head>', '<meta charset="UTF-8">']
-        
+
+        head_parts = ["<head>", '<meta charset="UTF-8">']
+
         # Add metadata if requested
         if params.add_document_metadata:
             if self.doc.name:
-                head_parts.append(f'<title>{html.escape(self.doc.name)}</title>')
+                head_parts.append(f"<title>{html.escape(self.doc.name)}</title>")
             else:
-                head_parts.append('<title>Docling Document</title>')
-                
-            head_parts.append('<meta name="generator" content="Docling HTML Serializer">')
-            
+                head_parts.append("<title>Docling Document</title>")
+
+            head_parts.append(
+                '<meta name="generator" content="Docling HTML Serializer">'
+            )
+
         # Add default styles or custom CSS
         if params.css_styles:
-            head_parts.append(f'<style>\n{params.css_styles}\n</style>')
+            head_parts.append(f"<style>\n{params.css_styles}\n</style>")
         else:
             head_parts.append(self._get_default_css())
-            
-        head_parts.append('</head>')
-        
+
+        head_parts.append("</head>")
+
         if params.prettify:
-            return '\n'.join(head_parts)
+            return "\n".join(head_parts)
         else:
-            return ''.join(head_parts)
-            
+            return "".join(head_parts)
+
     def _get_default_css(self) -> str:
         """Return default CSS styles for the HTML document."""
         return """<style>
@@ -681,18 +712,20 @@ def serialize_captions(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
-        params = HTMLParams(**kwargs)
-        
+        HTMLParams(**kwargs)
+
         caption_parts = []
         for cap_ref in item.captions:
             cap_item = cap_ref.resolve(self.doc)
-            if isinstance(cap_item, TextItem) and cap_item.self_ref not in self.get_excluded_refs(**kwargs):
+            if isinstance(
+                cap_item, TextItem
+            ) and cap_item.self_ref not in self.get_excluded_refs(**kwargs):
                 caption_text = html.escape(cap_item.text)
                 caption_parts.append(caption_text)
-                
+
         if caption_parts:
             caption_text = " ".join(caption_parts)
             result = f"<figcaption>{caption_text}</figcaption>"
             return SerializationResult(text=result)
-            
+
         return SerializationResult(text="")
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 694cf9b7..007a1249 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -1,24 +1,3 @@
-import unittest
-from pathlib import Path
-from typing import Optional
-
-from docling_core.types.doc.base import ImageRefMode, Size
-from docling_core.types.doc.document import (
-    DoclingDocument,
-    TextItem,
-    TableData,
-    TableCell,
-    Formatting,
-    ProvenanceItem,
-    BoundingBox,
-    CoordOrigin,
-)
-from docling_core.types.doc.labels import DocItemLabel
-from docling_core.experimental.serializer.html import HTMLDocSerializer, HTMLParams
-
 def test_html_export():
 
-    
-    
     assert True
-

From 91d972928de1f30f43445ea3753bcbe391e5c5b8 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 4 Apr 2025 10:49:27 +0200
Subject: [PATCH 03/34] added a new test for the HTML serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 test/test_serializer_html.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 007a1249..3f925502 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -1,3 +1,29 @@
+"""Examples of using the HTML Serializer for DoclingDocument."""
+
+from pathlib import Path
+from docling_core.types.doc.base import ImageRefMode, Size
+from docling_core.types.doc.document import DoclingDocument, Formatting
+from docling_core.types.doc.labels import DocItemLabel, CodeLanguageLabel
+from docling_core.experimental.serializer.html import HTMLDocSerializer, HTMLParams
+
+from test.test_docling_doc import _construct_doc
+
 def test_html_export():
+    
+    doc = _construct_doc()
+    
+    # Create the serializer with default parameters
+    serializer = HTMLDocSerializer(doc=doc)
+    
+    # Serialize the document
+    html_output = serializer.serialize().text
+    
+    # Save to file
+    with open("example_document.new.html", "w", encoding="utf-8") as f:
+        f.write(html_output)
 
+    doc.save_as_html(filename="example_document.old.html")
+        
+    print("Basic example saved to 'example_document.html'")
+    
     assert True

From b81d66a242a94ef06cc1e836e5b65757eac4c03a Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 4 Apr 2025 13:57:05 +0200
Subject: [PATCH 04/34] carefully rewrote the export-to-html logic in the new serializer framework

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 559 ++++++++-----------
 1 file changed, 229 insertions(+), 330 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index c43ce3c2..f46eb0d1 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -5,11 +5,16 @@
 
 """Define classes for HTML serialization."""
 import html
+import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union, List
 
+import latex2mathml.converter
+import latex2mathml.exceptions
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
+from xml.etree.cElementTree import SubElement, tostring
+from xml.sax.saxutils import unescape
 
 from docling_core.experimental.serializer.base import (
     BaseDocSerializer,
@@ -31,13 +36,16 @@
     DocItem,
     DoclingDocument,
     FloatingItem,
+    Formatting,
     FormItem,
     FormulaItem,
+    GroupItem,
+    ImageRef,
     InlineGroup,
     KeyValueItem,
+    ListItem,
     NodeItem,
     OrderedList,
-    PictureClassificationData,
     PictureItem,
     SectionHeaderItem,
     TableItem,
@@ -45,26 +53,28 @@
     TitleItem,
     UnorderedList,
 )
-from docling_core.types.doc.utils import (
-    get_html_tag_with_text_direction,
-    get_text_direction,
-)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.utils import get_html_tag_with_text_direction, get_text_direction
 
 
 class HTMLParams(CommonParams):
     """HTML-specific serialization parameters."""
 
+    # Default layers to use for HTML export
     layers: set[ContentLayer] = {ContentLayer.BODY}
+    
+    # How to handle images
     image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
-    image_placeholder: str = "<!-- image -->"
-    add_page_break: bool = True
-    page_break_placeholder: str = '<div class="page-break"></div>'
-    css_styles: Optional[str] = None
+    
+    # HTML document properties
     html_lang: str = "en"
-    formula_to_mathml: bool = True
+    css_styles: Optional[str] = None
     add_document_metadata: bool = True
     prettify: bool = True  # Add indentation and line breaks
-    add_image_dimensions: bool = True  # Add width and height attributes to images
+    
+    # Formula rendering options
+    formula_to_mathml: bool = True
+    
 
 
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
@@ -80,63 +90,118 @@ def serialize(
         is_inline_scope: bool = False,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
+        """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
-        parts: List[str] = []
-
-        text_content = html.escape(item.text, quote=False)
-
-        # Replace newlines with <br> tags if not in code or formula
-        if not isinstance(item, (CodeItem, FormulaItem)):
-            text_content = text_content.replace("\n", "<br>")
-
+        
+        # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
-            text = get_html_tag_with_text_direction(html_tag="h1", text=text_content)
+            text_inner = self._prepare_content(item.text)
+            text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
+            
         elif isinstance(item, SectionHeaderItem):
-            section_level = min(item.level + 1, 6)  # h1-h6 are valid in HTML
+            section_level = min(item.level + 1, 6)
+            text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(
-                html_tag=f"h{section_level}", text=text_content
+                html_tag=f"h{section_level}", text=text_inner
             )
-        elif isinstance(item, CodeItem):
-            language_attr = ""
-            if item.code_language.value != "unknown":
-                language_attr = f' class="language-{item.code_language.value.lower()}"'
-
-            if is_inline_scope:
-                text = f"<code{language_attr}>{text_content}</code>"
-            else:
-                text = f"<pre><code{language_attr}>{text_content}</code></pre>"
+            
         elif isinstance(item, FormulaItem):
-            if params.formula_to_mathml and item.text:
-                # Simplified formula handling - full implementation
-                # would use latex2mathml
-                text = f'<div class="formula">{text_content}</div>'
-            elif item.text:
-                text = f'<div class="formula">{text_content}</div>'
-            elif item.orig:
-                text = '<div class="formula-not-decoded">Formula not decoded</div>'
-            else:
-                text = ""
+            text = self._process_formula(
+                item=item, 
+                doc=doc,
+                image_mode=params.image_mode,
+                formula_to_mathml=params.formula_to_mathml
+            )
+            
+        elif isinstance(item, CodeItem):
+            code_text = self._prepare_content(
+                item.text, do_escape_html=True, do_replace_newline=False
+            )
+            text = f"<pre><code>{code_text}</code></pre>"
+            
+        elif isinstance(item, ListItem):
+            # List items are handled by list serializer
+            text_inner = self._prepare_content(item.text)
+            text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
+            
         else:
-            # Regular text
-            text = get_html_tag_with_text_direction(html_tag="p", text=text_content)
-
-        parts.append(text)
-
-        # Handle captions for floating items
-        if isinstance(item, FloatingItem):
-            cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-            if cap_text:
-                parts.append(cap_text)
-
-        text_res = "\n".join(parts)
-        text_res = doc_serializer.post_process(
-            text=text_res,
+            # Regular text item
+            text_inner = self._prepare_content(item.text)
+            text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
+        
+        # Apply formatting and hyperlinks
+        text = doc_serializer.post_process(
+            text=text,
             formatting=item.formatting,
             hyperlink=item.hyperlink,
         )
-
-        return SerializationResult(text=text_res)
+        
+        return SerializationResult(text=text)
+        
+    def _prepare_content(
+        self, text: str, do_escape_html=True, do_replace_newline=True
+    ) -> str:
+        """Prepare text content for HTML inclusion."""
+        if do_escape_html:
+            text = html.escape(text, quote=False)
+        if do_replace_newline:
+            text = text.replace("\n", "<br>")
+        return text
+    
+    def _process_formula(
+        self, 
+        item: FormulaItem, 
+        doc: DoclingDocument,
+        image_mode: ImageRefMode,
+        formula_to_mathml: bool
+    ) -> str:
+        """Process a formula item to HTML/MathML."""
+        math_formula = self._prepare_content(
+            item.text, do_escape_html=False, do_replace_newline=False
+        )
+        
+        # If formula is empty, try to use an image fallback
+        if item.text == "" and item.orig != "":
+            img_fallback = self._get_formula_image_fallback(item, doc)
+            if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback:
+                return img_fallback
+                
+        # Try to generate MathML
+        if formula_to_mathml and math_formula:
+            try:
+                mathml_element = latex2mathml.converter.convert_to_element(
+                    math_formula, display="block"
+                )
+                annotation = SubElement(
+                    mathml_element, "annotation", dict(encoding="TeX")
+                )
+                annotation.text = math_formula
+                mathml = unescape(tostring(mathml_element, encoding="unicode"))
+                return f"<div>{mathml}</div>"
+            except Exception:
+                img_fallback = self._get_formula_image_fallback(item, doc)
+                if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback:
+                    return img_fallback
+                elif math_formula:
+                    return f"<pre>{math_formula}</pre>"
+        
+        # Fallback options if we got here
+        if math_formula:
+            return f"<pre>{math_formula}</pre>"
+        else:
+            return '<div class="formula-not-decoded">Formula not decoded</div>'
+    
+    def _get_formula_image_fallback(self, item: TextItem, doc: DoclingDocument) -> Optional[str]:
+        """Try to get an image fallback for a formula."""
+        item_image = item.get_image(doc=doc)
+        if item_image is not None:
+            img_ref = ImageRef.from_pil(item_image, dpi=72)
+            return (
+                "<figure>"
+                f'<img src="{img_ref.uri}" alt="{item.orig}" />'
+                "</figure>"
+            )
+        return None
 
 
 class HTMLTableSerializer(BaseTableSerializer):
@@ -151,55 +216,9 @@ def serialize(
         doc: DoclingDocument,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        # Process captions first
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-
-        # Start building the table
-        rows = []
-
-        for i, row in enumerate(item.data.grid):
-            row_cells = []
-            for j, cell in enumerate(row):
-                # Skip cells that are covered by rowspan or colspan from previous cells
-                if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
-                    continue
-
-                content = html.escape(cell.text.strip())
-                celltag = "th" if cell.column_header or cell.row_header else "td"
-
-                attrs = []
-                if cell.row_span > 1:
-                    attrs.append(f'rowspan="{cell.row_span}"')
-                if cell.col_span > 1:
-                    attrs.append(f'colspan="{cell.col_span}"')
-
-                text_dir = get_text_direction(content)
-                if text_dir == "rtl":
-                    attrs.append(f'dir="{text_dir}"')
-
-                attrs_str = " ".join(attrs)
-                if attrs_str:
-                    attrs_str = " " + attrs_str
-
-                row_cells.append(f"<{celltag}{attrs_str}>{content}</{celltag}>")
-
-            if row_cells:
-                rows.append(f"<tr>{''.join(row_cells)}</tr>")
-
-        tbody = f"<tbody>{''.join(rows)}</tbody>" if rows else ""
-
-        if cap_text:
-            table = f"<table>{cap_text}{tbody}</table>"
-        elif tbody:
-            table = f"<table>{tbody}</table>"
-        else:
-            table = "<table></table>"
-
-        return SerializationResult(text=table)
+        """Serializes the passed table item to HTML."""
+        text = item.export_to_html(doc=doc, add_caption=True)
+        return SerializationResult(text=text)
 
 
 class HTMLPictureSerializer(BasePictureSerializer):
@@ -214,86 +233,12 @@ def serialize(
         doc: DoclingDocument,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
+        """Serializes the passed picture item to HTML."""
         params = HTMLParams(**kwargs)
-
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-
-        # Process the image based on image_mode
-        img_text = self._get_image_html(item, doc, params)
-
-        # Add classification info if available
-        classification_text = ""
-        for annotation in item.annotations:
-            if (
-                isinstance(annotation, PictureClassificationData)
-                and annotation.predicted_classes
-            ):
-                class_name = annotation.predicted_classes[0].class_name
-                confidence = annotation.predicted_classes[0].confidence
-                classification_text = (
-                    '<div class="image-classification">'
-                    + f"{html.escape(class_name)} ({confidence:.2f})</div>"
-                )
-                break
-
-        figure = f"<figure>{img_text}{classification_text}{cap_text}</figure>"
-        return SerializationResult(text=figure)
-
-    def _get_image_html(
-        self, item: PictureItem, doc: DoclingDocument, params: HTMLParams
-    ) -> str:
-        """Generate HTML for the image based on image mode."""
-        if params.image_mode == ImageRefMode.PLACEHOLDER:
-            return params.image_placeholder
-
-        elif params.image_mode == ImageRefMode.EMBEDDED:
-            # Try to use the embedded image
-            if (
-                item.image is not None
-                and isinstance(item.image.uri, AnyUrl)
-                and item.image.uri.scheme == "data"
-            ):
-                return self._create_img_tag(item.image.uri, item, params)
-
-            # Try to get the image from document
-            img = item.get_image(doc)
-            if img is not None:
-                imgb64 = item._image_to_base64(img)
-                return self._create_img_tag(
-                    f"data:image/png;base64,{imgb64}", item, params
-                )
-
-            return params.image_placeholder
-
-        elif params.image_mode == ImageRefMode.REFERENCED:
-            if item.image is not None:
-                if (
-                    isinstance(item.image.uri, AnyUrl)
-                    and item.image.uri.scheme != "data"
-                ):
-                    return self._create_img_tag(item.image.uri, item, params)
-                elif isinstance(item.image.uri, Path):
-                    return self._create_img_tag(item.image.uri, item, params)
-
-            return params.image_placeholder
-
-        return params.image_placeholder
-
-    def _create_img_tag(
-        self, src: Union[str, AnyUrl, Path], item: PictureItem, params: HTMLParams
-    ) -> str:
-        """Create an HTML img tag with appropriate attributes."""
-        attrs = [f'src="{src}"', 'alt="Image"']
-
-        if params.add_image_dimensions and item.image is not None:
-            attrs.append(f'width="{item.image.size.width}"')
-            attrs.append(f'height="{item.image.size.height}"')
-
-        return f"<img {' '.join(attrs)}>"
+        text = item.export_to_html(
+            doc=doc, add_caption=True, image_mode=params.image_mode
+        )
+        return SerializationResult(text=text)
 
 
 class HTMLKeyValueSerializer(BaseKeyValueSerializer):
@@ -308,44 +253,10 @@ def serialize(
         doc: DoclingDocument,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        # Create a definition list (dl) for key-value pairs
-        parts = ['<dl class="key-value-region">']
-
-        # Group cells by their keys
-        key_to_values: Dict[int, List[int]] = {}
-        for link in item.graph.links:
-            key_to_values.setdefault(link.source_cell_id, []).append(
-                link.target_cell_id
-            )
-
-        # Find all cells
-        cell_by_id = {cell.cell_id: cell for cell in item.graph.cells}
-
-        # Process each key-value pair
-        for key_id, value_ids in key_to_values.items():
-            if key_id in cell_by_id:
-                key_cell = cell_by_id[key_id]
-                key_text = html.escape(key_cell.text)
-                parts.append(f"<dt>{key_text}</dt>")
-
-                for value_id in value_ids:
-                    if value_id in cell_by_id:
-                        value_cell = cell_by_id[value_id]
-                        value_text = html.escape(value_cell.text)
-                        parts.append(f"<dd>{value_text}</dd>")
-
-        parts.append("</dl>")
-
-        # Add caption if available
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        if cap_text:
-            parts.append(cap_text)
-
-        return SerializationResult(text="\n".join(parts))
+        """Serializes the passed key-value item to HTML."""
+        # This is a placeholder implementation - we could expand it
+        # to use a description list (dl/dt/dd) or a table
+        return SerializationResult(text="<div class='key-value-region'>Key-value data</div>")
 
 
 class HTMLFormSerializer(BaseFormSerializer):
@@ -360,29 +271,9 @@ def serialize(
         doc: DoclingDocument,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        # Create a form representation (non-functional HTML form)
-        parts = ['<div class="form-container">']
-
-        # Simple representation of form items
-        for cell in item.graph.cells:
-            cell_text = html.escape(cell.text)
-            cell_label = cell.label.value
-            parts.append(
-                f'<div class="form-item form-item-{cell_label}">{cell_text}</div>'
-            )
-
-        parts.append("</div>")
-
-        # Add caption if available
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        if cap_text:
-            parts.append(cap_text)
-
-        return SerializationResult(text="\n".join(parts))
+        """Serializes the passed form item to HTML."""
+        # This is a placeholder implementation
+        return SerializationResult(text="<div class='form'>Form data</div>")
 
 
 class HTMLListSerializer(BaseModel, BaseListSerializer):
@@ -400,8 +291,10 @@ def serialize(
         visited: Optional[set[str]] = None,  # refs of visited items
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
+        """Serializes a list to HTML."""
         my_visited = visited or set()
+        
+        # Get all child parts
         parts = doc_serializer.get_parts(
             item=item,
             list_level=list_level + 1,
@@ -409,23 +302,19 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
-
-        # Determine list type
+        
+        # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
-
-        # Build list items
-        items = []
+        list_html = [f"<{tag}>"]
+        
+        # Add all child parts
         for part in parts:
-            if part.text:
-                # If the part is already wrapped in <li>, use it directly
-                if part.text.startswith("<li") and part.text.endswith("</li>"):
-                    items.append(part.text)
-                else:
-                    # Otherwise wrap it in <li>
-                    items.append(f"<li>{part.text}</li>")
-
-        list_html = f"<{tag}>{''.join(items)}</{tag}>"
-        return SerializationResult(text=list_html)
+            list_html.append(part.text)
+            
+        # Close the list
+        list_html.append(f"</{tag}>")
+        
+        return SerializationResult(text="\n".join(list_html))
 
 
 class HTMLInlineSerializer(BaseInlineSerializer):
@@ -442,8 +331,10 @@ def serialize(
         visited: Optional[set[str]] = None,  # refs of visited items
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
+        """Serializes an inline group to HTML."""
         my_visited = visited or set()
+        
+        # Get all parts with inline scope
         parts = doc_serializer.get_parts(
             item=item,
             list_level=list_level,
@@ -451,12 +342,15 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
-
-        # Join parts with spaces for inline content
-        inline_content = " ".join([p.text for p in parts if p.text])
-        if inline_content:
-            return SerializationResult(text=f"<span>{inline_content}</span>")
-        return SerializationResult(text="")
+        
+        # Join all parts without separators
+        inline_html = "".join([p.text for p in parts])
+        
+        # Wrap in span if needed
+        if inline_html:
+            inline_html = f"<span class='inline-group'>{inline_html}</span>"
+            
+        return SerializationResult(text=inline_html)
 
 
 class HTMLFallbackSerializer(BaseFallbackSerializer):
@@ -471,12 +365,13 @@ def serialize(
         doc: DoclingDocument,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed item."""
-        if isinstance(item, DocItem):
-            return SerializationResult(
-                text=f"<!-- Unsupported item type: {item.label} -->"
-            )
-        return SerializationResult(text="")
+        """Fallback serializer for items not handled by other serializers."""
+        # For group items, we don't generate any markup
+        if isinstance(item, GroupItem):
+            return SerializationResult(text="")
+            
+        # For other doc items, add a comment
+        return SerializationResult(text=f"<!-- Unhandled item type: {item.__class__.__name__} -->")
 
 
 class HTMLDocSerializer(DocSerializer):
@@ -512,48 +407,76 @@ def serialize_underline(self, text: str, **kwargs) -> str:
     @override
     def serialize_strikethrough(self, text: str, **kwargs) -> str:
         """Apply HTML-specific strikethrough serialization."""
-        return f"<s>{text}</s>"
+        return f"<del>{text}</del>"
 
     @override
     def serialize_hyperlink(
         self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
     ) -> str:
         """Apply HTML-specific hyperlink serialization."""
-        return f'<a href="{hyperlink}">{text}</a>'
+        return f'<a href="{str(hyperlink)}">{text}</a>'
 
     @override
     def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
         """Serialize a page out of its parts."""
-        params = self.params
-        if params.prettify:
-            text_res = "\n".join([p.text for p in parts if p.text])
-        else:
-            text_res = "".join([p.text for p in parts if p.text])
-        return SerializationResult(text=text_res)
+        # Join all parts with newlines
+        body_content = "\n".join([p.text for p in parts if p.text])
+        return SerializationResult(text=f"<div class='page'>\n{body_content}\n</div>")
 
     @override
     def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
         """Serialize a document out of its pages."""
-        params = self.params
-
-        # Join pages with page breaks if specified
-        if params.add_page_break and params.page_break_placeholder:
-            page_sep = f"\n{params.page_break_placeholder}\n"
-            content = page_sep.join([p.text for p in pages if p.text])
-        else:
-            content = self.serialize_page(parts=pages).text
-
-        # Add HTML document structure
-        head = self._generate_head()
-        body = f"<body>\n{content}\n</body>"
-
-        # Create full HTML document
-        html_doc = (
-            '<!DOCTYPE html>\n<html lang="'
-            + f'{params.html_lang}">\n{head}\n{body}\n</html>'
-        )
-
-        return SerializationResult(text=html_doc)
+        # Create HTML structure
+        html_parts = [
+            "<!DOCTYPE html>",
+            self._generate_head(),
+            "<body>",
+        ]
+        
+        # Add all pages
+        for page in pages:
+            if page.text:
+                html_parts.append(page.text)
+                
+        # Close HTML structure
+        html_parts.extend(["</body>", "</html>"])
+        
+        # Join with newlines
+        html_content = "\n".join(html_parts)
+        
+        return SerializationResult(text=html_content)
+        
+    @override
+    def serialize_captions(
+        self,
+        item: FloatingItem,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serialize the item's captions."""
+        caption_parts = []
+        
+        # Extract caption text from all caption items
+        for cap in item.captions:
+            caption_item = cap.resolve(self.doc)
+            if isinstance(caption_item, TextItem):
+                caption_parts.append(caption_item.text)
+                
+        # Join all captions with a space
+        if caption_parts:
+            caption_text = " ".join(caption_parts)
+            text_dir = get_text_direction(caption_text)
+            
+            # Create proper HTML
+            if text_dir == "rtl":
+                return SerializationResult(
+                    text=f'<figcaption dir="{text_dir}">{html.escape(caption_text)}</figcaption>'
+                )
+            else:
+                return SerializationResult(
+                    text=f'<figcaption>{html.escape(caption_text)}</figcaption>'
+                )
+                
+        return SerializationResult(text="")
 
     def _generate_head(self) -> str:
         """Generate the HTML head section with metadata and styles."""
@@ -704,28 +627,4 @@ def _get_default_css(self) -> str:
         margin-top: 0.5em;
     }
 </style>"""
-
-    @override
-    def serialize_captions(
-        self,
-        item: FloatingItem,
-        **kwargs,
-    ) -> SerializationResult:
-        """Serialize the item's captions."""
-        HTMLParams(**kwargs)
-
-        caption_parts = []
-        for cap_ref in item.captions:
-            cap_item = cap_ref.resolve(self.doc)
-            if isinstance(
-                cap_item, TextItem
-            ) and cap_item.self_ref not in self.get_excluded_refs(**kwargs):
-                caption_text = html.escape(cap_item.text)
-                caption_parts.append(caption_text)
-
-        if caption_parts:
-            caption_text = " ".join(caption_parts)
-            result = f"<figcaption>{caption_text}</figcaption>"
-            return SerializationResult(text=result)
-
-        return SerializationResult(text="")
+    

From 1d1c91abecfb5ef5c0d37aad010689cd0dbeb8ba Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 4 Apr 2025 18:07:12 +0200
Subject: [PATCH 05/34] fixed the inline list-items

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 31 ++++++++++++++++++--
 test/test_serializer_html.py                 |  1 +
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index f46eb0d1..2f4dfa60 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -92,6 +92,8 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
+
+        print(" -> serialising text with label: ", item.label)
         
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
@@ -123,7 +125,11 @@ def serialize(
             # List items are handled by list serializer
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
+
+            print("text in list-item:", text_inner)
             
+        elif is_inline_scope:            
+            text = self._prepare_content(item.text)
         else:
             # Regular text item
             text_inner = self._prepare_content(item.text)
@@ -302,6 +308,10 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
+
+        print("parts of the list")
+        for _ in parts:
+            print(" -> list-parts: ", _)
         
         # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
@@ -309,10 +319,20 @@ def serialize(
         
         # Add all child parts
         for part in parts:
-            list_html.append(part.text)
-            
+            if part.text.startswith("<li>") and part.text.endswith("</li>"):
+                list_html.append(part.text)
+            elif part.text.startswith("<ol>") and part.text.endswith("</ol>"):
+                list_html.append(part.text)
+            elif part.text.startswith("<ul>") and part.text.endswith("</ul>"):
+                list_html.append(part.text)                
+            else:
+                print(f"WARNING: no <li> for {part.text}")
+                list_html.append(f"<li>{part.text}</li>")
+                
         # Close the list
         list_html.append(f"</{tag}>")
+
+        print(" => list: ", " ".join(list_html))
         
         return SerializationResult(text="\n".join(list_html))
 
@@ -342,13 +362,18 @@ def serialize(
             visited=my_visited,
             **kwargs,
         )
+
+        for _ in parts:
+            print("inline-parts: ", _)
         
         # Join all parts without separators
-        inline_html = "".join([p.text for p in parts])
+        inline_html = " ".join([p.text for p in parts])
         
         # Wrap in span if needed
         if inline_html:
             inline_html = f"<span class='inline-group'>{inline_html}</span>"
+
+        print(" => inline: ", inline_html)
             
         return SerializationResult(text=inline_html)
 
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 3f925502..cd3d99fb 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -23,6 +23,7 @@ def test_html_export():
         f.write(html_output)
 
     doc.save_as_html(filename="example_document.old.html")
+    doc.save_as_markdown(filename="example_document.old.md")
         
     print("Basic example saved to 'example_document.html'")
     

From 2d3ee27f1679436e432abdbd7b161392c6a88a84 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 06:28:29 +0200
Subject: [PATCH 06/34] added inline rendering for code and formula items

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 53 ++++++++++++++++----
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 2f4dfa60..d9ce3c69 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -112,14 +112,15 @@ def serialize(
                 item=item, 
                 doc=doc,
                 image_mode=params.image_mode,
-                formula_to_mathml=params.formula_to_mathml
+                formula_to_mathml=params.formula_to_mathml,
+                is_inline_scope=is_inline_scope
             )
             
         elif isinstance(item, CodeItem):
-            code_text = self._prepare_content(
-                item.text, do_escape_html=True, do_replace_newline=False
+            text = self._process_code(
+                item=item,
+                is_inline_scope=is_inline_scope
             )
-            text = f"<pre><code>{code_text}</code></pre>"
             
         elif isinstance(item, ListItem):
             # List items are handled by list serializer
@@ -153,13 +154,29 @@ def _prepare_content(
         if do_replace_newline:
             text = text.replace("\n", "<br>")
         return text
-    
+
+    def _process_code(
+        self, 
+        item: FormulaItem, 
+        is_inline_scope: bool,
+    ) -> str:
+        code_text = self._prepare_content(
+            item.text, do_escape_html=True, do_replace_newline=False
+        )
+        if is_inline_scope:
+            text = f"<code>{code_text}</code>"
+        else:
+            text = f"<pre><code>{code_text}</code></pre>"
+
+        return text
+            
     def _process_formula(
         self, 
         item: FormulaItem, 
         doc: DoclingDocument,
         image_mode: ImageRefMode,
-        formula_to_mathml: bool
+        formula_to_mathml: bool,
+        is_inline_scope: bool,
     ) -> str:
         """Process a formula item to HTML/MathML."""
         math_formula = self._prepare_content(
@@ -175,25 +192,39 @@ def _process_formula(
         # Try to generate MathML
         if formula_to_mathml and math_formula:
             try:
+                # Set display mode based on context
+                display_mode = "inline" if is_inline_scope else "block"
                 mathml_element = latex2mathml.converter.convert_to_element(
-                    math_formula, display="block"
+                    math_formula, display=display_mode
                 )
                 annotation = SubElement(
                     mathml_element, "annotation", dict(encoding="TeX")
                 )
                 annotation.text = math_formula
                 mathml = unescape(tostring(mathml_element, encoding="unicode"))
-                return f"<div>{mathml}</div>"
+
+                # Don't wrap in div for inline formulas
+                if is_inline_scope:
+                    return mathml
+                else:
+                    return f"<div>{mathml}</div>"
+                
             except Exception:
                 img_fallback = self._get_formula_image_fallback(item, doc)
                 if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback:
                     return img_fallback
                 elif math_formula:
                     return f"<pre>{math_formula}</pre>"
-        
+
+        _logger.warning("Could not parse formula with MathML")
+                
         # Fallback options if we got here
-        if math_formula:
-            return f"<pre>{math_formula}</pre>"
+        if math_formula and is_inline_scope:
+            return f"<code>{math_formula}</code>"
+        elif math_formula and (not is_inline_scope):
+            f"<pre>{math_formula}</pre>"
+        elif is_inline_scope:
+            return '<span class="formula-not-decoded">Formula not decoded</span>'
         else:
             return '<div class="formula-not-decoded">Formula not decoded</div>'
     

From 2835935061a68632b1a02bda71e61ac18ef40d1a Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 06:57:55 +0200
Subject: [PATCH 07/34] migrated the table HTML export code into the table serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 79 +++++++++++++++++++-
 1 file changed, 76 insertions(+), 3 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index d9ce3c69..02047ff6 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -254,9 +254,81 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed table item to HTML."""
-        text = item.export_to_html(doc=doc, add_caption=True)
+        #text = item.export_to_html(doc=doc, add_caption=True)
+        text = self._serialize(
+            item=item,
+            doc_serializer=doc_serializer,
+            doc=doc,
+            add_caption=True,
+            add_footnotes=True
+        )
         return SerializationResult(text=text)
 
+    def _serialize(
+        self,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        add_caption: bool = True,
+        add_footnotes: bool = True,
+    ) -> str:
+        """Export the table as html."""
+        nrows = item.data.num_rows
+        ncols = item.data.num_cols
+
+        caption_text = doc_serializer.serialize_captions(item=item, tag="caption")
+
+        body = ""
+
+        for i in range(nrows):
+            body += "<tr>"
+            for j in range(ncols):
+                cell: TableCell = item.data.grid[i][j]
+
+                rowspan, rowstart = (
+                    cell.row_span,
+                    cell.start_row_offset_idx,
+                )
+                colspan, colstart = (
+                    cell.col_span,
+                    cell.start_col_offset_idx,
+                )
+
+                if rowstart != i:
+                    continue
+                if colstart != j:
+                    continue
+
+                content = html.escape(cell.text.strip())
+                celltag = "td"
+                if cell.column_header:
+                    celltag = "th"
+
+                opening_tag = f"{celltag}"
+                if rowspan > 1:
+                    opening_tag += f' rowspan="{rowspan}"'
+                if colspan > 1:
+                    opening_tag += f' colspan="{colspan}"'
+
+                text_dir = get_text_direction(content)
+                if text_dir == "rtl":
+                    opening_tag += f' dir="{dir}"'
+
+                body += f"<{opening_tag}>{content}</{celltag}>"
+            body += "</tr>"
+
+        if len(caption_text.text) > 0 and len(body) > 0:
+            body = f"<table>{caption_text.text}<tbody>{body}</tbody></table>"
+        elif len(caption_text.text) == 0 and len(body) > 0:
+            body = f"<table><tbody>{body}</tbody></table>"            
+        elif len(caption_text.text) > 0 and len(body) == 0:            
+            body = f"<table>{caption_text.text}</table>"            
+        else:
+            body = "<table></table>"
+
+        return body
+
+    
 
 class HTMLPictureSerializer(BasePictureSerializer):
     """HTML-specific picture item serializer."""
@@ -506,6 +578,7 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
     def serialize_captions(
         self,
         item: FloatingItem,
+        tag: str = 'figcaption',
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
@@ -525,11 +598,11 @@ def serialize_captions(
             # Create proper HTML
             if text_dir == "rtl":
                 return SerializationResult(
-                    text=f'<figcaption dir="{text_dir}">{html.escape(caption_text)}</figcaption>'
+                    text=f'<{tag} dir="{text_dir}">{html.escape(caption_text)}</{tag}>'
                 )
             else:
                 return SerializationResult(
-                    text=f'<figcaption>{html.escape(caption_text)}</figcaption>'
+                    text=f'<{tag}>{html.escape(caption_text)}</{tag}>'
                 )
                 
         return SerializationResult(text="")

From 7fde69cd02156b06e9c2ffda027d4221ca1303b9 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 07:11:21 +0200
Subject: [PATCH 08/34] updated the picture HTML serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 67 +++++++++++++++-----
 1 file changed, 51 insertions(+), 16 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 02047ff6..d5f7a3d3 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -9,6 +9,8 @@
 from pathlib import Path
 from typing import Optional, Union, List
 
+import logging
+
 import latex2mathml.converter
 import latex2mathml.exceptions
 from pydantic import AnyUrl, BaseModel
@@ -56,6 +58,7 @@
 from docling_core.types.doc.labels import DocItemLabel
 from docling_core.types.doc.utils import get_html_tag_with_text_direction, get_text_direction
 
+_logger = logging.getLogger(__name__)
 
 class HTMLParams(CommonParams):
     """HTML-specific serialization parameters."""
@@ -92,8 +95,6 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
-
-        print(" -> serialising text with label: ", item.label)
         
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
@@ -126,8 +127,6 @@ def serialize(
             # List items are handled by list serializer
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
-
-            print("text in list-item:", text_inner)
             
         elif is_inline_scope:            
             text = self._prepare_content(item.text)
@@ -349,6 +348,53 @@ def serialize(
         )
         return SerializationResult(text=text)
 
+    def _serialize(
+        self,
+        item: PictureItem,            
+        doc: "DoclingDocument",
+        add_caption: bool = True,
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+    ) -> str:
+        """Export picture to HTML format."""
+        caption_text = doc_serializer.serialize_captions(item=item, tag="figcaption")
+
+        if image_mode == ImageRefMode.PLACEHOLDER:
+            return f"<figure>{caption_text}</figure>"
+
+        elif image_mode == ImageRefMode.EMBEDDED:
+            # short-cut: we already have the image in base64
+            if (
+                isinstance(self.image, ImageRef)
+                and isinstance(self.image.uri, AnyUrl)
+                and self.image.uri.scheme == "data"
+            ):
+                img_text = f'<img src="{self.image.uri}">'
+                return f"<figure>{caption_text}{img_text}</figure>"
+
+            # get the self.image._pil or crop it out of the page-image
+            img = item.get_image(doc)
+            
+            if img is not None:
+                imgb64 = item._image_to_base64(img)
+                img_text = f'<img src="data:image/png;base64,{imgb64}">'
+
+                return f"<figure>{caption_text}{img_text}</figure>"
+            else:
+                return f"<figure>{caption_text}</figure>"
+
+        elif image_mode == ImageRefMode.REFERENCED:
+
+            if not isinstance(self.image, ImageRef) or (
+                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
+            ):
+                return default_response
+
+            img_text = f'<img src="{quote(str(self.image.uri))}">'
+            return f"<figure>{caption_text}{img_text}</figure>"
+
+        else:
+            return f"<figure>{caption_text}</figure>"
+    
 
 class HTMLKeyValueSerializer(BaseKeyValueSerializer):
     """HTML-specific key-value item serializer."""
@@ -412,10 +458,6 @@ def serialize(
             **kwargs,
         )
 
-        print("parts of the list")
-        for _ in parts:
-            print(" -> list-parts: ", _)
-        
         # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
         list_html = [f"<{tag}>"]
@@ -429,13 +471,11 @@ def serialize(
             elif part.text.startswith("<ul>") and part.text.endswith("</ul>"):
                 list_html.append(part.text)                
             else:
-                print(f"WARNING: no <li> for {part.text}")
+                _logger.info(f"no <li>, <ol> or <ul> for {part.text}")
                 list_html.append(f"<li>{part.text}</li>")
                 
         # Close the list
         list_html.append(f"</{tag}>")
-
-        print(" => list: ", " ".join(list_html))
         
         return SerializationResult(text="\n".join(list_html))
 
@@ -466,17 +506,12 @@ def serialize(
             **kwargs,
         )
 
-        for _ in parts:
-            print("inline-parts: ", _)
-        
         # Join all parts without separators
         inline_html = " ".join([p.text for p in parts])
         
         # Wrap in span if needed
         if inline_html:
             inline_html = f"<span class='inline-group'>{inline_html}</span>"
-
-        print(" => inline: ", inline_html)
             
         return SerializationResult(text=inline_html)
 

From 6dc4ffafcf56393047409def4afb84652a6f3d0b Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 07:26:47 +0200
Subject: [PATCH 09/34] updated the html for Form and KeyValue

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 222 ++++++++++++-------
 test/test_serializer_html.py                 |  21 +-
 2 files changed, 149 insertions(+), 94 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index d5f7a3d3..183b53df 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -5,18 +5,16 @@
 
 """Define classes for HTML serialization."""
 import html
-import sys
-from pathlib import Path
-from typing import Optional, Union, List
-
 import logging
+from pathlib import Path
+from typing import Optional, Union
+from xml.etree.cElementTree import SubElement, tostring
+from xml.sax.saxutils import unescape
 
 import latex2mathml.converter
 import latex2mathml.exceptions
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
-from xml.etree.cElementTree import SubElement, tostring
-from xml.sax.saxutils import unescape
 
 from docling_core.experimental.serializer.base import (
     BaseDocSerializer,
@@ -35,10 +33,8 @@
 from docling_core.types.doc.document import (
     CodeItem,
     ContentLayer,
-    DocItem,
     DoclingDocument,
     FloatingItem,
-    Formatting,
     FormItem,
     FormulaItem,
     GroupItem,
@@ -55,29 +51,31 @@
     TitleItem,
     UnorderedList,
 )
-from docling_core.types.doc.labels import DocItemLabel
-from docling_core.types.doc.utils import get_html_tag_with_text_direction, get_text_direction
+from docling_core.types.doc.utils import (
+    get_html_tag_with_text_direction,
+    get_text_direction,
+)
 
 _logger = logging.getLogger(__name__)
 
+
 class HTMLParams(CommonParams):
     """HTML-specific serialization parameters."""
 
     # Default layers to use for HTML export
     layers: set[ContentLayer] = {ContentLayer.BODY}
-    
+
     # How to handle images
     image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
-    
+
     # HTML document properties
     html_lang: str = "en"
     css_styles: Optional[str] = None
     add_document_metadata: bool = True
     prettify: bool = True  # Add indentation and line breaks
-    
+
     # Formula rendering options
     formula_to_mathml: bool = True
-    
 
 
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
@@ -95,55 +93,52 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
-        
+
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
-            
+
         elif isinstance(item, SectionHeaderItem):
             section_level = min(item.level + 1, 6)
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(
                 html_tag=f"h{section_level}", text=text_inner
             )
-            
+
         elif isinstance(item, FormulaItem):
             text = self._process_formula(
-                item=item, 
+                item=item,
                 doc=doc,
                 image_mode=params.image_mode,
                 formula_to_mathml=params.formula_to_mathml,
-                is_inline_scope=is_inline_scope
+                is_inline_scope=is_inline_scope,
             )
-            
+
         elif isinstance(item, CodeItem):
-            text = self._process_code(
-                item=item,
-                is_inline_scope=is_inline_scope
-            )
-            
+            text = self._process_code(item=item, is_inline_scope=is_inline_scope)
+
         elif isinstance(item, ListItem):
             # List items are handled by list serializer
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
-            
-        elif is_inline_scope:            
+
+        elif is_inline_scope:
             text = self._prepare_content(item.text)
         else:
             # Regular text item
             text_inner = self._prepare_content(item.text)
             text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
-        
+
         # Apply formatting and hyperlinks
         text = doc_serializer.post_process(
             text=text,
             formatting=item.formatting,
             hyperlink=item.hyperlink,
         )
-        
+
         return SerializationResult(text=text)
-        
+
     def _prepare_content(
         self, text: str, do_escape_html=True, do_replace_newline=True
     ) -> str:
@@ -155,8 +150,8 @@ def _prepare_content(
         return text
 
     def _process_code(
-        self, 
-        item: FormulaItem, 
+        self,
+        item: FormulaItem,
         is_inline_scope: bool,
     ) -> str:
         code_text = self._prepare_content(
@@ -168,10 +163,10 @@ def _process_code(
             text = f"<pre><code>{code_text}</code></pre>"
 
         return text
-            
+
     def _process_formula(
-        self, 
-        item: FormulaItem, 
+        self,
+        item: FormulaItem,
         doc: DoclingDocument,
         image_mode: ImageRefMode,
         formula_to_mathml: bool,
@@ -181,13 +176,17 @@ def _process_formula(
         math_formula = self._prepare_content(
             item.text, do_escape_html=False, do_replace_newline=False
         )
-        
+
         # If formula is empty, try to use an image fallback
         if item.text == "" and item.orig != "":
             img_fallback = self._get_formula_image_fallback(item, doc)
-            if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback:
+            if (
+                image_mode == ImageRefMode.EMBEDDED
+                and len(item.prov) > 0
+                and img_fallback
+            ):
                 return img_fallback
-                
+
         # Try to generate MathML
         if formula_to_mathml and math_formula:
             try:
@@ -207,16 +206,20 @@ def _process_formula(
                     return mathml
                 else:
                     return f"<div>{mathml}</div>"
-                
+
             except Exception:
                 img_fallback = self._get_formula_image_fallback(item, doc)
-                if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0 and img_fallback:
+                if (
+                    image_mode == ImageRefMode.EMBEDDED
+                    and len(item.prov) > 0
+                    and img_fallback
+                ):
                     return img_fallback
                 elif math_formula:
                     return f"<pre>{math_formula}</pre>"
 
         _logger.warning("Could not parse formula with MathML")
-                
+
         # Fallback options if we got here
         if math_formula and is_inline_scope:
             return f"<code>{math_formula}</code>"
@@ -226,16 +229,16 @@ def _process_formula(
             return '<span class="formula-not-decoded">Formula not decoded</span>'
         else:
             return '<div class="formula-not-decoded">Formula not decoded</div>'
-    
-    def _get_formula_image_fallback(self, item: TextItem, doc: DoclingDocument) -> Optional[str]:
+
+    def _get_formula_image_fallback(
+        self, item: TextItem, doc: DoclingDocument
+    ) -> Optional[str]:
         """Try to get an image fallback for a formula."""
         item_image = item.get_image(doc=doc)
         if item_image is not None:
             img_ref = ImageRef.from_pil(item_image, dpi=72)
             return (
-                "<figure>"
-                f'<img src="{img_ref.uri}" alt="{item.orig}" />'
-                "</figure>"
+                "<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
             )
         return None
 
@@ -253,13 +256,13 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed table item to HTML."""
-        #text = item.export_to_html(doc=doc, add_caption=True)
+        # text = item.export_to_html(doc=doc, add_caption=True)
         text = self._serialize(
             item=item,
             doc_serializer=doc_serializer,
             doc=doc,
             add_caption=True,
-            add_footnotes=True
+            add_footnotes=True,
         )
         return SerializationResult(text=text)
 
@@ -319,15 +322,14 @@ def _serialize(
         if len(caption_text.text) > 0 and len(body) > 0:
             body = f"<table>{caption_text.text}<tbody>{body}</tbody></table>"
         elif len(caption_text.text) == 0 and len(body) > 0:
-            body = f"<table><tbody>{body}</tbody></table>"            
-        elif len(caption_text.text) > 0 and len(body) == 0:            
-            body = f"<table>{caption_text.text}</table>"            
+            body = f"<table><tbody>{body}</tbody></table>"
+        elif len(caption_text.text) > 0 and len(body) == 0:
+            body = f"<table>{caption_text.text}</table>"
         else:
             body = "<table></table>"
 
         return body
 
-    
 
 class HTMLPictureSerializer(BasePictureSerializer):
     """HTML-specific picture item serializer."""
@@ -350,7 +352,7 @@ def serialize(
 
     def _serialize(
         self,
-        item: PictureItem,            
+        item: PictureItem,
         doc: "DoclingDocument",
         add_caption: bool = True,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
@@ -373,7 +375,7 @@ def _serialize(
 
             # get the self.image._pil or crop it out of the page-image
             img = item.get_image(doc)
-            
+
             if img is not None:
                 imgb64 = item._image_to_base64(img)
                 img_text = f'<img src="data:image/png;base64,{imgb64}">'
@@ -394,7 +396,7 @@ def _serialize(
 
         else:
             return f"<figure>{caption_text}</figure>"
-    
+
 
 class HTMLKeyValueSerializer(BaseKeyValueSerializer):
     """HTML-specific key-value item serializer."""
@@ -409,9 +411,43 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed key-value item to HTML."""
-        # This is a placeholder implementation - we could expand it
-        # to use a description list (dl/dt/dd) or a table
-        return SerializationResult(text="<div class='key-value-region'>Key-value data</div>")
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+
+        # Create a definition list (dl) for key-value pairs
+        parts = ['<dl class="key-value-region">']
+
+        # Group cells by their keys
+        key_to_values: Dict[int, List[int]] = {}
+        for link in item.graph.links:
+            key_to_values.setdefault(link.source_cell_id, []).append(
+                link.target_cell_id
+            )
+
+        # Find all cells
+        cell_by_id = {cell.cell_id: cell for cell in item.graph.cells}
+
+        # Process each key-value pair
+        for key_id, value_ids in key_to_values.items():
+            if key_id in cell_by_id:
+                key_cell = cell_by_id[key_id]
+                key_text = html.escape(key_cell.text)
+                parts.append(f"<dt>{key_text}</dt>")
+
+                for value_id in value_ids:
+                    if value_id in cell_by_id:
+                        value_cell = cell_by_id[value_id]
+                        value_text = html.escape(value_cell.text)
+                        parts.append(f"<dd>{value_text}</dd>")
+
+        parts.append("</dl>")
+
+        # Add caption if available
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        if cap_text:
+            parts.append(cap_text)
+
+        return SerializationResult(text="\n".join(parts))
 
 
 class HTMLFormSerializer(BaseFormSerializer):
@@ -427,8 +463,29 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed form item to HTML."""
-        # This is a placeholder implementation
-        return SerializationResult(text="<div class='form'>Form data</div>")
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+
+        # Create a form representation (non-functional HTML form)
+        parts = ['<div class="form-container">']
+        
+        # Add caption if available
+        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
+        if cap_text:
+            parts.append(cap_text)
+
+        # Simple representation of form items
+        for cell in item.graph.cells:
+            cell_text = html.escape(cell.text)
+            cell_label = cell.label.value
+            parts.append(
+                f'<div class="form-item form-item-{cell_label}">{cell_text}</div>'
+            )
+
+        parts.append("</div>")
+
+        return SerializationResult(text="\n".join(parts))
+
 
 
 class HTMLListSerializer(BaseModel, BaseListSerializer):
@@ -448,7 +505,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes a list to HTML."""
         my_visited = visited or set()
-        
+
         # Get all child parts
         parts = doc_serializer.get_parts(
             item=item,
@@ -461,7 +518,7 @@ def serialize(
         # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
         list_html = [f"<{tag}>"]
-        
+
         # Add all child parts
         for part in parts:
             if part.text.startswith("<li>") and part.text.endswith("</li>"):
@@ -469,14 +526,14 @@ def serialize(
             elif part.text.startswith("<ol>") and part.text.endswith("</ol>"):
                 list_html.append(part.text)
             elif part.text.startswith("<ul>") and part.text.endswith("</ul>"):
-                list_html.append(part.text)                
+                list_html.append(part.text)
             else:
                 _logger.info(f"no <li>, <ol> or <ul> for {part.text}")
                 list_html.append(f"<li>{part.text}</li>")
-                
+
         # Close the list
         list_html.append(f"</{tag}>")
-        
+
         return SerializationResult(text="\n".join(list_html))
 
 
@@ -496,7 +553,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes an inline group to HTML."""
         my_visited = visited or set()
-        
+
         # Get all parts with inline scope
         parts = doc_serializer.get_parts(
             item=item,
@@ -508,11 +565,11 @@ def serialize(
 
         # Join all parts without separators
         inline_html = " ".join([p.text for p in parts])
-        
+
         # Wrap in span if needed
         if inline_html:
             inline_html = f"<span class='inline-group'>{inline_html}</span>"
-            
+
         return SerializationResult(text=inline_html)
 
 
@@ -532,9 +589,11 @@ def serialize(
         # For group items, we don't generate any markup
         if isinstance(item, GroupItem):
             return SerializationResult(text="")
-            
+
         # For other doc items, add a comment
-        return SerializationResult(text=f"<!-- Unhandled item type: {item.__class__.__name__} -->")
+        return SerializationResult(
+            text=f"<!-- Unhandled item type: {item.__class__.__name__} -->"
+        )
 
 
 class HTMLDocSerializer(DocSerializer):
@@ -595,41 +654,41 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
             self._generate_head(),
             "<body>",
         ]
-        
+
         # Add all pages
         for page in pages:
             if page.text:
                 html_parts.append(page.text)
-                
+
         # Close HTML structure
         html_parts.extend(["</body>", "</html>"])
-        
+
         # Join with newlines
         html_content = "\n".join(html_parts)
-        
+
         return SerializationResult(text=html_content)
-        
+
     @override
     def serialize_captions(
         self,
         item: FloatingItem,
-        tag: str = 'figcaption',
+        tag: str = "figcaption",
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
         caption_parts = []
-        
+
         # Extract caption text from all caption items
         for cap in item.captions:
             caption_item = cap.resolve(self.doc)
             if isinstance(caption_item, TextItem):
                 caption_parts.append(caption_item.text)
-                
+
         # Join all captions with a space
         if caption_parts:
             caption_text = " ".join(caption_parts)
             text_dir = get_text_direction(caption_text)
-            
+
             # Create proper HTML
             if text_dir == "rtl":
                 return SerializationResult(
@@ -637,9 +696,9 @@ def serialize_captions(
                 )
             else:
                 return SerializationResult(
-                    text=f'<{tag}>{html.escape(caption_text)}</{tag}>'
+                    text=f"<{tag}>{html.escape(caption_text)}</{tag}>"
                 )
-                
+
         return SerializationResult(text="")
 
     def _generate_head(self) -> str:
@@ -791,4 +850,3 @@ def _get_default_css(self) -> str:
         margin-top: 0.5em;
     }
 </style>"""
-    
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index cd3d99fb..fe691257 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -1,30 +1,27 @@
 """Examples of using the HTML Serializer for DoclingDocument."""
 
-from pathlib import Path
-from docling_core.types.doc.base import ImageRefMode, Size
-from docling_core.types.doc.document import DoclingDocument, Formatting
-from docling_core.types.doc.labels import DocItemLabel, CodeLanguageLabel
-from docling_core.experimental.serializer.html import HTMLDocSerializer, HTMLParams
-
 from test.test_docling_doc import _construct_doc
 
+from docling_core.experimental.serializer.html import HTMLDocSerializer
+
+
 def test_html_export():
-    
+
     doc = _construct_doc()
-    
+
     # Create the serializer with default parameters
     serializer = HTMLDocSerializer(doc=doc)
-    
+
     # Serialize the document
     html_output = serializer.serialize().text
-    
+
     # Save to file
     with open("example_document.new.html", "w", encoding="utf-8") as f:
         f.write(html_output)
 
     doc.save_as_html(filename="example_document.old.html")
     doc.save_as_markdown(filename="example_document.old.md")
-        
+
     print("Basic example saved to 'example_document.html'")
-    
+
     assert True

From 3d22a3e16df039b19ea4eac65a11a6ee211cfabc Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 07:53:17 +0200
Subject: [PATCH 10/34] first version of KeyValue serialisation

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 122 +++++++++++++++----
 1 file changed, 95 insertions(+), 27 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 183b53df..c05b3e50 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -413,42 +413,110 @@ def serialize(
         """Serializes the passed key-value item to HTML."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
+        
+        # Build cell lookup by ID
+        cell_map = {cell.cell_id: cell for cell in item.graph.cells}
+        
+        # Build relationship maps
+        child_links = {}  # source_id -> list of child_ids (to_child)
+        value_links = {}  # key_id -> list of value_ids (to_value)
+        parents = set()   # Set of all IDs that are targets of to_child (to find roots)
 
-        # Create a definition list (dl) for key-value pairs
-        parts = ['<dl class="key-value-region">']
-
-        # Group cells by their keys
-        key_to_values: Dict[int, List[int]] = {}
         for link in item.graph.links:
-            key_to_values.setdefault(link.source_cell_id, []).append(
-                link.target_cell_id
-            )
-
-        # Find all cells
-        cell_by_id = {cell.cell_id: cell for cell in item.graph.cells}
-
-        # Process each key-value pair
-        for key_id, value_ids in key_to_values.items():
-            if key_id in cell_by_id:
-                key_cell = cell_by_id[key_id]
+            if link.source_cell_id not in cell_map or link.target_cell_id not in cell_map:
+                continue
+                
+            if link.label.value == "to_child":
+                child_links.setdefault(link.source_cell_id, []).append(link.target_cell_id)
+                parents.add(link.target_cell_id)
+            elif link.label.value == "to_value":
+                value_links.setdefault(link.source_cell_id, []).append(link.target_cell_id)
+        
+        # Find root cells (cells with no parent)
+        root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
+        
+        # Generate the HTML
+        html = ['<div class="key-value-region">']
+        
+        # If we have roots, make a list structure
+        if root_ids:
+            html.append('<ul class="key-value-list">')
+            for root_id in root_ids:
+                html.append(self._render_cell_tree(
+                    cell_id=root_id,
+                    cell_map=cell_map,
+                    child_links=child_links,
+                    value_links=value_links,
+                    level=0
+                ))
+            html.append('</ul>')
+        # If no hierarchy, fall back to definition list
+        else:
+            html.append('<dl class="key-value-pairs">')
+            for key_id, value_ids in value_links.items():
+                key_cell = cell_map[key_id]
                 key_text = html.escape(key_cell.text)
-                parts.append(f"<dt>{key_text}</dt>")
-
+                html.append(f'<dt>{key_text}</dt>')
+                
                 for value_id in value_ids:
-                    if value_id in cell_by_id:
-                        value_cell = cell_by_id[value_id]
-                        value_text = html.escape(value_cell.text)
-                        parts.append(f"<dd>{value_text}</dd>")
-
-        parts.append("</dl>")
+                    value_cell = cell_map[value_id]
+                    value_text = html.escape(value_cell.text)
+                    html.append(f'<dd>{value_text}</dd>')
+            html.append('</dl>')
+        
+        html.append('</div>')
 
         # Add caption if available
         cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        if cap_text:
-            parts.append(cap_text)
+        if len(cap_text)>0:
+            html.append(cap_text)
 
-        return SerializationResult(text="\n".join(parts))
+        return SerializationResult(text="\n".join(html))
 
+    def _render_cell_tree(
+        self,
+        cell_id: int,
+        cell_map: dict,
+        child_links: dict,
+        value_links: dict,
+        level: int
+    ) -> str:
+        """Recursively render a cell and its children as a nested list."""
+        cell = cell_map[cell_id]
+        cell_text = html.escape(cell.text)
+        
+        # Format key-value pairs if this cell has values linked
+        if cell_id in value_links:
+            value_texts = []
+            for value_id in value_links[cell_id]:
+                if value_id in cell_map:
+                    value_cell = cell_map[value_id]
+                    value_texts.append(html.escape(value_cell.text))
+            
+            if value_texts:
+                cell_text = f"<strong>{cell_text}:</strong> {', '.join(value_texts)}"
+    
+        # If this cell has children, create a nested list
+        if cell_id in child_links and child_links[cell_id]:
+            children_html = []
+            children_html.append(f'<li>{cell_text}')
+            children_html.append('<ul>')
+            
+            for child_id in child_links[cell_id]:
+                children_html.append(self._render_cell_tree(
+                    cell_id=child_id,
+                    cell_map=cell_map,
+                    child_links=child_links,
+                    value_links=value_links,
+                    level=level+1
+                ))
+            
+            children_html.append('</ul>')
+            children_html.append('</li>')
+            return '\n'.join(children_html)
+        else:
+            # Leaf node - just render the cell
+            return f'<li>{cell_text}</li>'
 
 class HTMLFormSerializer(BaseFormSerializer):
     """HTML-specific form item serializer."""

From 31d61d2c9cf30a08c3eb5051eea35f195ade56ed Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 12:09:42 +0200
Subject: [PATCH 11/34] fixed the key-value and form-region and added the
 GraphData serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/base.py |  17 +++
 docling_core/experimental/serializer/html.py | 113 +++++++++++--------
 2 files changed, 82 insertions(+), 48 deletions(-)

diff --git a/docling_core/experimental/serializer/base.py b/docling_core/experimental/serializer/base.py
index 4b2e46b4..00079edb 100644
--- a/docling_core/experimental/serializer/base.py
+++ b/docling_core/experimental/serializer/base.py
@@ -22,6 +22,7 @@
     PictureItem,
     TableItem,
     TextItem,
+    GraphData,
     UnorderedList,
 )
 
@@ -153,6 +154,22 @@ def serialize(
         ...
 
 
+class BaseGraphDataSerializer(ABC):
+    """Base class for inline serializers."""
+
+    @abstractmethod
+    def serialize(
+        self,
+        *,
+        item: GraphData,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed item."""
+        ...
+        
+
 class BaseFallbackSerializer(ABC):
     """Base fallback class for item serializers."""
 
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index c05b3e50..1a4c4435 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -26,6 +26,7 @@
     BasePictureSerializer,
     BaseTableSerializer,
     BaseTextSerializer,
+    BaseGraphDataSerializer,
     SerializationResult,
 )
 from docling_core.experimental.serializer.common import CommonParams, DocSerializer
@@ -49,6 +50,7 @@
     TableItem,
     TextItem,
     TitleItem,
+    GraphData,
     UnorderedList,
 )
 from docling_core.types.doc.utils import (
@@ -397,32 +399,27 @@ def _serialize(
         else:
             return f"<figure>{caption_text}</figure>"
 
-
-class HTMLKeyValueSerializer(BaseKeyValueSerializer):
-    """HTML-specific key-value item serializer."""
+class HTMLGraphDataSerializer(BaseGraphDataSerializer):
+    """HTML-specific graph-data item serializer."""        
 
     @override
-    def serialize(
+    def serialize(    
         self,
         *,
-        item: KeyValueItem,
-        doc_serializer: "BaseDocSerializer",
-        doc: DoclingDocument,
-        **kwargs,
+        item: GraphData,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,            
+        tag: str    
     ) -> SerializationResult:
-        """Serializes the passed key-value item to HTML."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-        
         # Build cell lookup by ID
-        cell_map = {cell.cell_id: cell for cell in item.graph.cells}
+        cell_map = {cell.cell_id: cell for cell in item.cells}
         
         # Build relationship maps
         child_links = {}  # source_id -> list of child_ids (to_child)
         value_links = {}  # key_id -> list of value_ids (to_value)
         parents = set()   # Set of all IDs that are targets of to_child (to find roots)
 
-        for link in item.graph.links:
+        for link in item.links:
             if link.source_cell_id not in cell_map or link.target_cell_id not in cell_map:
                 continue
                 
@@ -434,13 +431,13 @@ def serialize(
         
         # Find root cells (cells with no parent)
         root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
-        
+
         # Generate the HTML
-        html = ['<div class="key-value-region">']
+        html = [f'<div class="{tag}">']
         
         # If we have roots, make a list structure
         if root_ids:
-            html.append('<ul class="key-value-list">')
+            html.append(f'<ul class="{tag}">')
             for root_id in root_ids:
                 html.append(self._render_cell_tree(
                     cell_id=root_id,
@@ -450,9 +447,10 @@ def serialize(
                     level=0
                 ))
             html.append('</ul>')
+            
         # If no hierarchy, fall back to definition list
         else:
-            html.append('<dl class="key-value-pairs">')
+            html.append(f'<dl class="{tag}">')
             for key_id, value_ids in value_links.items():
                 key_cell = cell_map[key_id]
                 key_text = html.escape(key_cell.text)
@@ -466,13 +464,8 @@ def serialize(
         
         html.append('</div>')
 
-        # Add caption if available
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        if len(cap_text)>0:
-            html.append(cap_text)
-
         return SerializationResult(text="\n".join(html))
-
+        
     def _render_cell_tree(
         self,
         cell_id: int,
@@ -492,14 +485,13 @@ def _render_cell_tree(
                 if value_id in cell_map:
                     value_cell = cell_map[value_id]
                     value_texts.append(html.escape(value_cell.text))
-            
-            if value_texts:
-                cell_text = f"<strong>{cell_text}:</strong> {', '.join(value_texts)}"
+                    
+            cell_text = f"<strong>{cell_text}</strong>: {', '.join(value_texts)}"
     
         # If this cell has children, create a nested list
         if cell_id in child_links and child_links[cell_id]:
             children_html = []
-            children_html.append(f'<li>{cell_text}')
+            children_html.append(f'<li>{cell_text}</li>')
             children_html.append('<ul>')
             
             for child_id in child_links[cell_id]:
@@ -512,11 +504,44 @@ def _render_cell_tree(
                 ))
             
             children_html.append('</ul>')
-            children_html.append('</li>')
             return '\n'.join(children_html)
+        
+        elif cell_id in value_links:
+            return f'<li>{cell_text}</li>'
         else:
             # Leaf node - just render the cell
-            return f'<li>{cell_text}</li>'
+            # return f'<li>{cell_text}</li>'
+            return ""
+        
+class HTMLKeyValueSerializer(BaseKeyValueSerializer):
+    """HTML-specific key-value item serializer."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> SerializationResult:
+        """Serializes the passed key-value item to HTML."""
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+        
+        graph_serializer = HTMLGraphDataSerializer()
+
+        # Add key-value if available
+        key_value = graph_serializer.serialize(item=item.graph,
+                                               doc_serializer=doc_serializer,
+                                               doc=doc,
+                                               tag="key-value-region")
+        
+        # Add caption if available
+        caption = doc_serializer.serialize_captions(item=item, **kwargs)
+        
+        return SerializationResult(text="\n".join([key_value.text, caption.text]))
+        
 
 class HTMLFormSerializer(BaseFormSerializer):
     """HTML-specific form item serializer."""
@@ -534,26 +559,18 @@ def serialize(
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
-        # Create a form representation (non-functional HTML form)
-        parts = ['<div class="form-container">']
+        graph_serializer = HTMLGraphDataSerializer()
+
+        # Add key-value if available
+        key_value = graph_serializer.serialize(item=item.graph,
+                                               doc_serializer=doc_serializer,
+                                               doc=doc,
+                                               tag="form-container")
         
         # Add caption if available
-        cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-        if cap_text:
-            parts.append(cap_text)
-
-        # Simple representation of form items
-        for cell in item.graph.cells:
-            cell_text = html.escape(cell.text)
-            cell_label = cell.label.value
-            parts.append(
-                f'<div class="form-item form-item-{cell_label}">{cell_text}</div>'
-            )
-
-        parts.append("</div>")
-
-        return SerializationResult(text="\n".join(parts))
-
+        caption = doc_serializer.serialize_captions(item=item, **kwargs)
+        
+        return SerializationResult(text="\n".join([key_value.text, caption.text]))
 
 
 class HTMLListSerializer(BaseModel, BaseListSerializer):

From 8f954edacff3b22c45d62236f4e5b375f11b6c72 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sat, 5 Apr 2025 12:10:24 +0200
Subject: [PATCH 12/34] need to do some mypy work now

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/base.py |   4 +-
 docling_core/experimental/serializer/html.py | 145 +++++++++++--------
 2 files changed, 83 insertions(+), 66 deletions(-)

diff --git a/docling_core/experimental/serializer/base.py b/docling_core/experimental/serializer/base.py
index 00079edb..80fd98c8 100644
--- a/docling_core/experimental/serializer/base.py
+++ b/docling_core/experimental/serializer/base.py
@@ -15,6 +15,7 @@
     DoclingDocument,
     FloatingItem,
     FormItem,
+    GraphData,
     InlineGroup,
     KeyValueItem,
     NodeItem,
@@ -22,7 +23,6 @@
     PictureItem,
     TableItem,
     TextItem,
-    GraphData,
     UnorderedList,
 )
 
@@ -168,7 +168,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed item."""
         ...
-        
+
 
 class BaseFallbackSerializer(ABC):
     """Base fallback class for item serializers."""
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 1a4c4435..dbe93b50 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -20,13 +20,13 @@
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
+    BaseGraphDataSerializer,
     BaseInlineSerializer,
     BaseKeyValueSerializer,
     BaseListSerializer,
     BasePictureSerializer,
     BaseTableSerializer,
     BaseTextSerializer,
-    BaseGraphDataSerializer,
     SerializationResult,
 )
 from docling_core.experimental.serializer.common import CommonParams, DocSerializer
@@ -38,6 +38,7 @@
     FloatingItem,
     FormItem,
     FormulaItem,
+    GraphData,
     GroupItem,
     ImageRef,
     InlineGroup,
@@ -50,7 +51,6 @@
     TableItem,
     TextItem,
     TitleItem,
-    GraphData,
     UnorderedList,
 )
 from docling_core.types.doc.utils import (
@@ -399,85 +399,95 @@ def _serialize(
         else:
             return f"<figure>{caption_text}</figure>"
 
+
 class HTMLGraphDataSerializer(BaseGraphDataSerializer):
-    """HTML-specific graph-data item serializer."""        
+    """HTML-specific graph-data item serializer."""
 
     @override
-    def serialize(    
+    def serialize(
         self,
         *,
         item: GraphData,
         doc_serializer: BaseDocSerializer,
-        doc: DoclingDocument,            
-        tag: str    
+        doc: DoclingDocument,
+        tag: str,
     ) -> SerializationResult:
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
-        
+
         # Build relationship maps
         child_links = {}  # source_id -> list of child_ids (to_child)
         value_links = {}  # key_id -> list of value_ids (to_value)
-        parents = set()   # Set of all IDs that are targets of to_child (to find roots)
+        parents = set()  # Set of all IDs that are targets of to_child (to find roots)
 
         for link in item.links:
-            if link.source_cell_id not in cell_map or link.target_cell_id not in cell_map:
+            if (
+                link.source_cell_id not in cell_map
+                or link.target_cell_id not in cell_map
+            ):
                 continue
-                
+
             if link.label.value == "to_child":
-                child_links.setdefault(link.source_cell_id, []).append(link.target_cell_id)
+                child_links.setdefault(link.source_cell_id, []).append(
+                    link.target_cell_id
+                )
                 parents.add(link.target_cell_id)
             elif link.label.value == "to_value":
-                value_links.setdefault(link.source_cell_id, []).append(link.target_cell_id)
-        
+                value_links.setdefault(link.source_cell_id, []).append(
+                    link.target_cell_id
+                )
+
         # Find root cells (cells with no parent)
         root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
 
         # Generate the HTML
         html = [f'<div class="{tag}">']
-        
+
         # If we have roots, make a list structure
         if root_ids:
             html.append(f'<ul class="{tag}">')
             for root_id in root_ids:
-                html.append(self._render_cell_tree(
-                    cell_id=root_id,
-                    cell_map=cell_map,
-                    child_links=child_links,
-                    value_links=value_links,
-                    level=0
-                ))
-            html.append('</ul>')
-            
+                html.append(
+                    self._render_cell_tree(
+                        cell_id=root_id,
+                        cell_map=cell_map,
+                        child_links=child_links,
+                        value_links=value_links,
+                        level=0,
+                    )
+                )
+            html.append("</ul>")
+
         # If no hierarchy, fall back to definition list
         else:
             html.append(f'<dl class="{tag}">')
             for key_id, value_ids in value_links.items():
                 key_cell = cell_map[key_id]
                 key_text = html.escape(key_cell.text)
-                html.append(f'<dt>{key_text}</dt>')
-                
+                html.append(f"<dt>{key_text}</dt>")
+
                 for value_id in value_ids:
                     value_cell = cell_map[value_id]
                     value_text = html.escape(value_cell.text)
-                    html.append(f'<dd>{value_text}</dd>')
-            html.append('</dl>')
-        
-        html.append('</div>')
+                    html.append(f"<dd>{value_text}</dd>")
+            html.append("</dl>")
+
+        html.append("</div>")
 
         return SerializationResult(text="\n".join(html))
-        
+
     def _render_cell_tree(
         self,
         cell_id: int,
         cell_map: dict,
         child_links: dict,
         value_links: dict,
-        level: int
+        level: int,
     ) -> str:
         """Recursively render a cell and its children as a nested list."""
         cell = cell_map[cell_id]
         cell_text = html.escape(cell.text)
-        
+
         # Format key-value pairs if this cell has values linked
         if cell_id in value_links:
             value_texts = []
@@ -485,34 +495,37 @@ def _render_cell_tree(
                 if value_id in cell_map:
                     value_cell = cell_map[value_id]
                     value_texts.append(html.escape(value_cell.text))
-                    
+
             cell_text = f"<strong>{cell_text}</strong>: {', '.join(value_texts)}"
-    
+
         # If this cell has children, create a nested list
         if cell_id in child_links and child_links[cell_id]:
             children_html = []
-            children_html.append(f'<li>{cell_text}</li>')
-            children_html.append('<ul>')
-            
+            children_html.append(f"<li>{cell_text}</li>")
+            children_html.append("<ul>")
+
             for child_id in child_links[cell_id]:
-                children_html.append(self._render_cell_tree(
-                    cell_id=child_id,
-                    cell_map=cell_map,
-                    child_links=child_links,
-                    value_links=value_links,
-                    level=level+1
-                ))
-            
-            children_html.append('</ul>')
-            return '\n'.join(children_html)
-        
+                children_html.append(
+                    self._render_cell_tree(
+                        cell_id=child_id,
+                        cell_map=cell_map,
+                        child_links=child_links,
+                        value_links=value_links,
+                        level=level + 1,
+                    )
+                )
+
+            children_html.append("</ul>")
+            return "\n".join(children_html)
+
         elif cell_id in value_links:
-            return f'<li>{cell_text}</li>'
+            return f"<li>{cell_text}</li>"
         else:
             # Leaf node - just render the cell
             # return f'<li>{cell_text}</li>'
             return ""
-        
+
+
 class HTMLKeyValueSerializer(BaseKeyValueSerializer):
     """HTML-specific key-value item serializer."""
 
@@ -528,20 +541,22 @@ def serialize(
         """Serializes the passed key-value item to HTML."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-        
+
         graph_serializer = HTMLGraphDataSerializer()
 
         # Add key-value if available
-        key_value = graph_serializer.serialize(item=item.graph,
-                                               doc_serializer=doc_serializer,
-                                               doc=doc,
-                                               tag="key-value-region")
-        
+        key_value = graph_serializer.serialize(
+            item=item.graph,
+            doc_serializer=doc_serializer,
+            doc=doc,
+            tag="key-value-region",
+        )
+
         # Add caption if available
         caption = doc_serializer.serialize_captions(item=item, **kwargs)
-        
+
         return SerializationResult(text="\n".join([key_value.text, caption.text]))
-        
+
 
 class HTMLFormSerializer(BaseFormSerializer):
     """HTML-specific form item serializer."""
@@ -562,14 +577,16 @@ def serialize(
         graph_serializer = HTMLGraphDataSerializer()
 
         # Add key-value if available
-        key_value = graph_serializer.serialize(item=item.graph,
-                                               doc_serializer=doc_serializer,
-                                               doc=doc,
-                                               tag="form-container")
-        
+        key_value = graph_serializer.serialize(
+            item=item.graph,
+            doc_serializer=doc_serializer,
+            doc=doc,
+            tag="form-container",
+        )
+
         # Add caption if available
         caption = doc_serializer.serialize_captions(item=item, **kwargs)
-        
+
         return SerializationResult(text="\n".join([key_value.text, caption.text]))
 
 

From 7df652692cae0b22a54976ddd95299b7ec9a6a91 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Sun, 6 Apr 2025 06:21:17 +0200
Subject: [PATCH 13/34] fix mypy errors in the HTML serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/base.py |   1 +
 docling_core/experimental/serializer/html.py | 108 ++++++++++---------
 2 files changed, 57 insertions(+), 52 deletions(-)

diff --git a/docling_core/experimental/serializer/base.py b/docling_core/experimental/serializer/base.py
index 80fd98c8..1e472265 100644
--- a/docling_core/experimental/serializer/base.py
+++ b/docling_core/experimental/serializer/base.py
@@ -164,6 +164,7 @@ def serialize(
         item: GraphData,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
+        tag: str,
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed item."""
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index dbe93b50..ccdfd4ce 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -8,6 +8,7 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
+from urllib.parse import quote
 from xml.etree.cElementTree import SubElement, tostring
 from xml.sax.saxutils import unescape
 
@@ -48,6 +49,7 @@
     OrderedList,
     PictureItem,
     SectionHeaderItem,
+    TableCell,
     TableItem,
     TextItem,
     TitleItem,
@@ -153,7 +155,7 @@ def _prepare_content(
 
     def _process_code(
         self,
-        item: FormulaItem,
+        item: CodeItem,
         is_inline_scope: bool,
     ) -> str:
         code_text = self._prepare_content(
@@ -219,6 +221,8 @@ def _process_formula(
                     return img_fallback
                 elif math_formula:
                     return f"<pre>{math_formula}</pre>"
+                else:
+                    return "<pre>Formula not decoded</pre>"
 
         _logger.warning("Could not parse formula with MathML")
 
@@ -229,8 +233,8 @@ def _process_formula(
             f"<pre>{math_formula}</pre>"
         elif is_inline_scope:
             return '<span class="formula-not-decoded">Formula not decoded</span>'
-        else:
-            return '<div class="formula-not-decoded">Formula not decoded</div>'
+
+        return '<div class="formula-not-decoded">Formula not decoded</div>'
 
     def _get_formula_image_fallback(
         self, item: TextItem, doc: DoclingDocument
@@ -343,61 +347,55 @@ def serialize(
         item: PictureItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
+        add_caption: bool = True,
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         **kwargs,
     ) -> SerializationResult:
-        """Serializes the passed picture item to HTML."""
-        params = HTMLParams(**kwargs)
-        text = item.export_to_html(
-            doc=doc, add_caption=True, image_mode=params.image_mode
+        """Export picture to HTML format."""
+        caption = doc_serializer.serialize_captions(
+            item=item, doc_serializer=doc_serializer, doc=doc, tag="figcaption"
         )
-        return SerializationResult(text=text)
 
-    def _serialize(
-        self,
-        item: PictureItem,
-        doc: "DoclingDocument",
-        add_caption: bool = True,
-        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
-    ) -> str:
-        """Export picture to HTML format."""
-        caption_text = doc_serializer.serialize_captions(item=item, tag="figcaption")
+        result = ""
 
         if image_mode == ImageRefMode.PLACEHOLDER:
-            return f"<figure>{caption_text}</figure>"
+            result = f"<figure>{caption.text}</figure>"
 
         elif image_mode == ImageRefMode.EMBEDDED:
             # short-cut: we already have the image in base64
             if (
-                isinstance(self.image, ImageRef)
-                and isinstance(self.image.uri, AnyUrl)
-                and self.image.uri.scheme == "data"
+                isinstance(item.image, ImageRef)
+                and isinstance(item.image.uri, AnyUrl)
+                and item.image.uri.scheme == "data"
             ):
-                img_text = f'<img src="{self.image.uri}">'
-                return f"<figure>{caption_text}{img_text}</figure>"
-
-            # get the self.image._pil or crop it out of the page-image
-            img = item.get_image(doc)
+                img_text = f'<img src="{item.image.uri}">'
+                result = f"<figure>{caption.text}{img_text}</figure>"
+            else:
+                # get the item.image._pil or crop it out of the page-image
+                img = item.get_image(doc)
 
-            if img is not None:
-                imgb64 = item._image_to_base64(img)
-                img_text = f'<img src="data:image/png;base64,{imgb64}">'
+                if img is not None:
+                    imgb64 = item._image_to_base64(img)
+                    img_text = f'<img src="data:image/png;base64,{imgb64}">'
 
-                return f"<figure>{caption_text}{img_text}</figure>"
-            else:
-                return f"<figure>{caption_text}</figure>"
+                    result = f"<figure>{caption.text}{img_text}</figure>"
+                else:
+                    result = f"<figure>{caption.text}</figure>"
 
         elif image_mode == ImageRefMode.REFERENCED:
 
-            if not isinstance(self.image, ImageRef) or (
-                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
+            if not isinstance(item.image, ImageRef) or (
+                isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme == "data"
             ):
-                return default_response
-
-            img_text = f'<img src="{quote(str(self.image.uri))}">'
-            return f"<figure>{caption_text}{img_text}</figure>"
+                result = f"<figure>{caption.text}</figure>"
 
+            else:
+                img_text = f'<img src="{quote(str(item.image.uri))}">'
+                result = f"<figure>{caption.text}{img_text}</figure>"
         else:
-            return f"<figure>{caption_text}</figure>"
+            result = f"<figure>{caption.text}</figure>"
+
+        return SerializationResult(text=result)
 
 
 class HTMLGraphDataSerializer(BaseGraphDataSerializer):
@@ -411,14 +409,20 @@ def serialize(
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
         tag: str,
+        **kwargs,
     ) -> SerializationResult:
+        """Serialize the graph-data to HTML."""
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
 
         # Build relationship maps
-        child_links = {}  # source_id -> list of child_ids (to_child)
-        value_links = {}  # key_id -> list of value_ids (to_value)
-        parents = set()  # Set of all IDs that are targets of to_child (to find roots)
+        child_links: dict[int, list[int]] = (
+            {}
+        )  # source_id -> list of child_ids (to_child)
+        value_links: dict[int, list[int]] = {}  # key_id -> list of value_ids (to_value)
+        parents: set[int] = (
+            set()
+        )  # Set of all IDs that are targets of to_child (to find roots)
 
         for link in item.links:
             if (
@@ -441,13 +445,13 @@ def serialize(
         root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
 
         # Generate the HTML
-        html = [f'<div class="{tag}">']
+        parts = [f'<div class="{tag}">']
 
         # If we have roots, make a list structure
         if root_ids:
-            html.append(f'<ul class="{tag}">')
+            parts.append(f'<ul class="{tag}">')
             for root_id in root_ids:
-                html.append(
+                parts.append(
                     self._render_cell_tree(
                         cell_id=root_id,
                         cell_map=cell_map,
@@ -456,25 +460,25 @@ def serialize(
                         level=0,
                     )
                 )
-            html.append("</ul>")
+            parts.append("</ul>")
 
         # If no hierarchy, fall back to definition list
         else:
-            html.append(f'<dl class="{tag}">')
+            parts.append(f'<dl class="{tag}">')
             for key_id, value_ids in value_links.items():
                 key_cell = cell_map[key_id]
                 key_text = html.escape(key_cell.text)
-                html.append(f"<dt>{key_text}</dt>")
+                parts.append(f"<dt>{key_text}</dt>")
 
                 for value_id in value_ids:
                     value_cell = cell_map[value_id]
                     value_text = html.escape(value_cell.text)
-                    html.append(f"<dd>{value_text}</dd>")
-            html.append("</dl>")
+                    parts.append(f"<dd>{value_text}</dd>")
+            parts.append("</dl>")
 
-        html.append("</div>")
+        parts.append("</div>")
 
-        return SerializationResult(text="\n".join(html))
+        return SerializationResult(text="\n".join(parts))
 
     def _render_cell_tree(
         self,

From 2389bb4cac3e8d5093b99cf84b20d405d3cc5b35 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 05:42:25 +0200
Subject: [PATCH 14/34] added get_excluded_refs checks to the table and
 picture serializers to obtain proper serialization

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../experimental/serializer/common.py         |  7 +++
 docling_core/experimental/serializer/html.py  | 50 +++++++++++++++----
 docling_core/types/doc/document.py            | 42 ++++++++++++++++
 test/test_serializer_html.py                  | 42 ++++++++++++++++
 4 files changed, 130 insertions(+), 11 deletions(-)

diff --git a/docling_core/experimental/serializer/common.py b/docling_core/experimental/serializer/common.py
index e399e3d7..5304f1f7 100644
--- a/docling_core/experimental/serializer/common.py
+++ b/docling_core/experimental/serializer/common.py
@@ -351,6 +351,11 @@ def get_parts(
         **kwargs,
     ) -> list[SerializationResult]:
         """Get the components to be combined for serializing this node."""
+        if item is not None:
+            print(f"get_parts: {item.get_ref().cref}")
+        else:
+            print(f"get_parts: None")
+            
         parts: list[SerializationResult] = []
         my_visited: set[str] = visited if visited is not None else set()
         params = self.params.merge_with_patch(patch=kwargs)
@@ -360,6 +365,8 @@ def get_parts(
             traverse_pictures=traverse_pictures,
             included_content_layers=params.layers,
         ):
+            print(f" -> child: {item.get_ref().cref}")
+            
             if item.self_ref in my_visited:
                 continue
             else:
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index ccdfd4ce..2051b5ac 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -98,6 +98,8 @@ def serialize(
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
 
+        print(f"HTMLTextSerializer {item.get_ref().cref}: {item.label} -> {item.text[0:64]}")
+        
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
             text_inner = self._prepare_content(item.text)
@@ -262,8 +264,12 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed table item to HTML."""
-        # text = item.export_to_html(doc=doc, add_caption=True)
-        text = self._serialize(
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+
+        print(f"HTMLTableSerializer {item.get_ref().cref}: {item.label}")
+        
+        text = self._serialize_table(
             item=item,
             doc_serializer=doc_serializer,
             doc=doc,
@@ -271,8 +277,8 @@ def serialize(
             add_footnotes=True,
         )
         return SerializationResult(text=text)
-
-    def _serialize(
+    
+    def _serialize_table(
         self,
         item: TableItem,
         doc_serializer: BaseDocSerializer,
@@ -347,15 +353,21 @@ def serialize(
         item: PictureItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
+        visited: Optional[set[str]] = None,
         add_caption: bool = True,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         **kwargs,
     ) -> SerializationResult:
         """Export picture to HTML format."""
+        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
+            return SerializationResult(text="")
+        
+        print(f"HTMLPictureSerializer {item.get_ref().cref}: {item.label}")
+        
         caption = doc_serializer.serialize_captions(
             item=item, doc_serializer=doc_serializer, doc=doc, tag="figcaption"
         )
-
+        
         result = ""
 
         if image_mode == ImageRefMode.PLACEHOLDER:
@@ -394,7 +406,7 @@ def serialize(
                 result = f"<figure>{caption.text}{img_text}</figure>"
         else:
             result = f"<figure>{caption.text}</figure>"
-
+        
         return SerializationResult(text=result)
 
 
@@ -411,6 +423,8 @@ def serialize(
         tag: str,
         **kwargs,
     ) -> SerializationResult:
+        print("HTMLGraphDataSerializer")
+        
         """Serialize the graph-data to HTML."""
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
@@ -543,6 +557,8 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed key-value item to HTML."""
+        print(f"HTMLKeyValueSerializer {item.get_ref().cref}: {item.label}")
+        
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -575,6 +591,8 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed form item to HTML."""
+        print(f"HTMLFormSerializer {item.get_ref().cref}: {item.label}")
+        
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -608,9 +626,11 @@ def serialize(
         is_inline_scope: bool = False,
         visited: Optional[set[str]] = None,  # refs of visited items
         **kwargs,
-    ) -> SerializationResult:
+    ) -> SerializationResult:        
         """Serializes a list to HTML."""
-        my_visited = visited or set()
+        print(f"HTMLListSerializer {item.get_ref().cref}: {item.label}")
+                
+        my_visited: set[str] = visited if visited is not None else set()
 
         # Get all child parts
         parts = doc_serializer.get_parts(
@@ -658,8 +678,10 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes an inline group to HTML."""
-        my_visited = visited or set()
-
+        print(f"HTMLInlineSerializer: {item.label}: {visited}")
+        
+        my_visited: set[str] = visited if visited is not None else set()
+        
         # Get all parts with inline scope
         parts = doc_serializer.get_parts(
             item=item,
@@ -692,6 +714,8 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Fallback serializer for items not handled by other serializers."""
+        print(f"HTMLFallbackSerializer {item.get_ref().cref}: {item.label}")
+        
         # For group items, we don't generate any markup
         if isinstance(item, GroupItem):
             return SerializationResult(text="")
@@ -782,6 +806,9 @@ def serialize_captions(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
+        print(f"serialize_captions: {item.label}")
+
+        """
         caption_parts = []
 
         # Extract caption text from all caption items
@@ -804,7 +831,8 @@ def serialize_captions(
                 return SerializationResult(
                     text=f"<{tag}>{html.escape(caption_text)}</{tag}>"
                 )
-
+        """
+        
         return SerializationResult(text="")
 
     def _generate_head(self) -> str:
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 06af6a13..ebd642a1 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -3324,6 +3324,48 @@ def export_to_html(  # noqa: C901
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
         included_content_layers: Optional[set[ContentLayer]] = None,
+    ) -> str:
+        r"""Serialize to HTML."""
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLParams,
+        )
+
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
+        my_layers = (
+            included_content_layers
+            if included_content_layers is not None
+            else DEFAULT_CONTENT_LAYERS
+        )
+        serializer = HTMLDocSerializer(
+            doc=self,
+            params=HTMLParams(
+                labels=my_labels,
+                layers=my_layers,
+                pages={page_no} if page_no is not None else None,
+                start_idx=from_element,
+                stop_idx=to_element,
+                image_mode=image_mode,
+                formula_to_mathml=formula_to_mathml,
+                html_lang=html_lang,
+                html_head=html_head,
+            ),
+        )
+        ser_res = serializer.serialize()
+
+        return ser_res.text
+
+    def _legacy_export_to_html(  # noqa: C901
+        self,
+        from_element: int = 0,
+        to_element: int = sys.maxsize,
+        labels: Optional[set[DocItemLabel]] = None,
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+        formula_to_mathml: bool = True,
+        page_no: Optional[int] = None,
+        html_lang: str = "en",
+        html_head: str = _HTML_DEFAULT_HEAD,
+        included_content_layers: Optional[set[ContentLayer]] = None,
     ) -> str:
         r"""Serialize to HTML."""
         my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index fe691257..a23bca28 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -2,6 +2,11 @@
 
 from test.test_docling_doc import _construct_doc
 
+from docling_core.types.doc.base import ImageRefMode
+from docling_core.types.doc.document import (  # BoundingBox,
+    DoclingDocument,
+    
+)
 from docling_core.experimental.serializer.html import HTMLDocSerializer
 
 
@@ -25,3 +30,40 @@ def test_html_export():
     print("Basic example saved to 'example_document.html'")
 
     assert True
+
+def test_markdown_export_with_pageimages():
+
+    doc = DoclingDocument.load_from_json("/Users/taa/Documents/projects/docling/2501.12948v1.json")
+
+    doc.save_as_markdown(
+        filename="2501.12948v1.markdown",
+        image_mode=ImageRefMode.REFERENCED
+    )
+
+    
+def test_html_export_with_pageimages():
+
+    doc = DoclingDocument.load_from_json("/Users/taa/Documents/projects/docling/2501.12948v1.json")
+
+    doc.save_as_html(
+        filename="2501.12948v1.html",
+        image_mode=ImageRefMode.REFERENCED #EMBEDDED
+    )
+    """
+
+
+    """
+    
+    """
+    # Create the serializer with default parameters
+    serializer = HTMLDocSerializer(doc=doc)
+
+    # Serialize the document
+    html_output = serializer.serialize().text
+
+    # Save to file
+    with open("example_document.new.html", "w", encoding="utf-8") as f:
+        f.write(html_output)
+    """
+    
+    assert True

From af3f88cec2665ffad302a03a95025a6475b0ed84 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 05:53:38 +0200
Subject: [PATCH 15/34] enabled the captions

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 9 ++++-----
 test/test_serializer_html.py                 | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 2051b5ac..f3a7ece3 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -241,7 +241,7 @@ def _process_formula(
     def _get_formula_image_fallback(
         self, item: TextItem, doc: DoclingDocument
     ) -> Optional[str]:
-        """Try to get an image fallback for a formula."""
+        """Try to get an image fallback for a formula."""        
         item_image = item.get_image(doc=doc)
         if item_image is not None:
             img_ref = ImageRef.from_pil(item_image, dpi=72)
@@ -291,7 +291,8 @@ def _serialize_table(
         ncols = item.data.num_cols
 
         caption_text = doc_serializer.serialize_captions(item=item, tag="caption")
-
+        print(caption_text)
+        
         body = ""
 
         for i in range(nrows):
@@ -808,7 +809,6 @@ def serialize_captions(
         """Serialize the item's captions."""
         print(f"serialize_captions: {item.label}")
 
-        """
         caption_parts = []
 
         # Extract caption text from all caption items
@@ -818,7 +818,7 @@ def serialize_captions(
                 caption_parts.append(caption_item.text)
 
         # Join all captions with a space
-        if caption_parts:
+        if len(caption_parts)>0:
             caption_text = " ".join(caption_parts)
             text_dir = get_text_direction(caption_text)
 
@@ -831,7 +831,6 @@ def serialize_captions(
                 return SerializationResult(
                     text=f"<{tag}>{html.escape(caption_text)}</{tag}>"
                 )
-        """
         
         return SerializationResult(text="")
 
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index a23bca28..053b2e2e 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -47,7 +47,7 @@ def test_html_export_with_pageimages():
 
     doc.save_as_html(
         filename="2501.12948v1.html",
-        image_mode=ImageRefMode.REFERENCED #EMBEDDED
+        image_mode=ImageRefMode.EMBEDDED
     )
     """
 

From 1b2a4ae79b83914e97b921507032531d557c980d Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 07:44:41 +0200
Subject: [PATCH 16/34] skip empty lists during HTML serialization

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index f3a7ece3..339fb046 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -81,6 +81,8 @@ class HTMLParams(CommonParams):
     # Formula rendering options
     formula_to_mathml: bool = True
 
+    # Allow for split page view (only possible if page-images are present)
+    split_page_view: bool = False
 
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
     """HTML-specific text item serializer."""
@@ -642,6 +644,10 @@ def serialize(
             **kwargs,
         )
 
+        if len(parts)==0:
+            print(f" => no list-items found for {item.get_ref().cref}")
+            return SerializationResult(text="")            
+        
         # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
         list_html = [f"<{tag}>"]

From ab440c74b9597965605fef464ad6f4e4caa0038f Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 11:13:26 +0200
Subject: [PATCH 17/34] added initial split view and customised styles

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py  | 172 ++++++------------
 .../experimental/serializer/html_styles.py    | 125 +++++++++++++
 2 files changed, 177 insertions(+), 120 deletions(-)
 create mode 100644 docling_core/experimental/serializer/html_styles.py

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 339fb046..03759611 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -60,6 +60,8 @@
     get_text_direction,
 )
 
+from docling_core.experimental.serializer.html_styles import (_get_css_with_no_styling, _get_ccs_for_single_column)
+
 _logger = logging.getLogger(__name__)
 
 
@@ -792,10 +794,56 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
             "<body>",
         ]
 
-        # Add all pages
-        for page in pages:
-            if page.text:
+        split_page_view: bool = True
+        
+        if split_page_view:
+            html_parts.append("<table>")
+            html_parts.append("<tbody>")
+            
+            for page_ind, page in enumerate(pages):
+                page_no = page_ind+1
+                page_img = self.doc.pages[page_no].image
+                
+                html_parts.append("<tr>")
+
+                html_parts.append("<td>")
+
+                # short-cut: we already have the image in base64
+                if (page_img is not None
+                    and isinstance(page_img, ImageRef)
+                    and isinstance(page_img.uri, AnyUrl)
+                    and page_img.uri.scheme == "data"
+                ):
+                    img_text = f'<img src="{page_img.uri}">'
+                    html_parts.append(f"<figure>{img_text}</figure>")
+                else:
+                    # get the page_img._pil or crop it out of the page-image
+                    img = item.get_image(self.doc)
+                    
+                    if img is not None:
+                        imgb64 = item._image_to_base64(img)
+                        img_text = f'<img src="data:image/png;base64,{imgb64}">'
+                        
+                        html_parts.append(f"<figure>{img_text}</figure>")
+                    else:
+                        html_parts.append(f"<figure>no page-image found</figure>")
+                                      
+                html_parts.append("</td>")
+
+                html_parts.append("<td>")
                 html_parts.append(page.text)
+                html_parts.append("</td>")                
+                
+                html_parts.append("</tr>")
+
+            html_parts.append("</tbody>")
+            html_parts.append("</table>")
+            
+        else:
+            # Add all pages
+            for page in pages:
+                if page.text:
+                    html_parts.append(page.text)
 
         # Close HTML structure
         html_parts.extend(["</body>", "</html>"])
@@ -872,120 +920,4 @@ def _generate_head(self) -> str:
 
     def _get_default_css(self) -> str:
         """Return default CSS styles for the HTML document."""
-        return """<style>
-    html {
-        background-color: #f5f5f5;
-        font-family: Arial, sans-serif;
-        line-height: 1.6;
-    }
-    body {
-        max-width: 800px;
-        margin: 0 auto;
-        padding: 2rem;
-        background-color: white;
-        box-shadow: 0 0 10px rgba(0,0,0,0.1);
-    }
-    h1, h2, h3, h4, h5, h6 {
-        color: #333;
-        margin-top: 1.5em;
-        margin-bottom: 0.5em;
-    }
-    h1 {
-        font-size: 2em;
-        border-bottom: 1px solid #eee;
-        padding-bottom: 0.3em;
-    }
-    table {
-        border-collapse: collapse;
-        margin: 1em 0;
-        width: 100%;
-    }
-    th, td {
-        border: 1px solid #ddd;
-        padding: 8px;
-        text-align: left;
-    }
-    th {
-        background-color: #f2f2f2;
-        font-weight: bold;
-    }
-    figure {
-        margin: 1.5em 0;
-        text-align: center;
-    }
-    figcaption {
-        color: #666;
-        font-style: italic;
-        margin-top: 0.5em;
-    }
-    img {
-        max-width: 100%;
-        height: auto;
-    }
-    pre {
-        background-color: #f6f8fa;
-        border-radius: 3px;
-        padding: 1em;
-        overflow: auto;
-    }
-    code {
-        font-family: monospace;
-        background-color: #f6f8fa;
-        padding: 0.2em 0.4em;
-        border-radius: 3px;
-    }
-    pre code {
-        background-color: transparent;
-        padding: 0;
-    }
-    .formula {
-        text-align: center;
-        padding: 0.5em;
-        margin: 1em 0;
-        background-color: #f9f9f9;
-    }
-    .formula-not-decoded {
-        text-align: center;
-        padding: 0.5em;
-        margin: 1em 0;
-        background: repeating-linear-gradient(
-            45deg,
-            #f0f0f0,
-            #f0f0f0 10px,
-            #f9f9f9 10px,
-            #f9f9f9 20px
-        );
-    }
-    .page-break {
-        page-break-after: always;
-        border-top: 1px dashed #ccc;
-        margin: 2em 0;
-    }
-    .key-value-region {
-        background-color: #f9f9f9;
-        padding: 1em;
-        border-radius: 4px;
-        margin: 1em 0;
-    }
-    .key-value-region dt {
-        font-weight: bold;
-    }
-    .key-value-region dd {
-        margin-left: 1em;
-        margin-bottom: 0.5em;
-    }
-    .form-container {
-        border: 1px solid #ddd;
-        padding: 1em;
-        border-radius: 4px;
-        margin: 1em 0;
-    }
-    .form-item {
-        margin-bottom: 0.5em;
-    }
-    .image-classification {
-        font-size: 0.9em;
-        color: #666;
-        margin-top: 0.5em;
-    }
-</style>"""
+        return "<style></style>"
diff --git a/docling_core/experimental/serializer/html_styles.py b/docling_core/experimental/serializer/html_styles.py
new file mode 100644
index 00000000..8eecf03b
--- /dev/null
+++ b/docling_core/experimental/serializer/html_styles.py
@@ -0,0 +1,125 @@
+
+
+def _get_css_with_no_styling(self) -> str:
+    """Return default CSS styles for the HTML document."""
+    return "<style></style>"
+
+def _get_ccs_for_single_column(self) -> str:
+    """Return CSS styles for the single-column HTML document."""
+    return """<style>
+    html {
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    body {
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
+    }
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
+    }
+    table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    th, td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    th {
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
+    }
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
+    }
+    .formula-not-decoded {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>"""

From d23f80e2f3f11494dcd928257ab7c494255cfa4d Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 11:40:26 +0200
Subject: [PATCH 18/34] cleaned up, now waiting for page-indices propagation

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../experimental/serializer/common.py         |  6 +-
 docling_core/experimental/serializer/html.py  | 97 ++++++++++---------
 .../experimental/serializer/html_styles.py    |  2 +
 test/test_serializer_html.py                  | 30 +++---
 4 files changed, 70 insertions(+), 65 deletions(-)

diff --git a/docling_core/experimental/serializer/common.py b/docling_core/experimental/serializer/common.py
index 5304f1f7..98d7278f 100644
--- a/docling_core/experimental/serializer/common.py
+++ b/docling_core/experimental/serializer/common.py
@@ -354,8 +354,8 @@ def get_parts(
         if item is not None:
             print(f"get_parts: {item.get_ref().cref}")
         else:
-            print(f"get_parts: None")
-            
+            print("get_parts: None")
+
         parts: list[SerializationResult] = []
         my_visited: set[str] = visited if visited is not None else set()
         params = self.params.merge_with_patch(patch=kwargs)
@@ -366,7 +366,7 @@ def get_parts(
             included_content_layers=params.layers,
         ):
             print(f" -> child: {item.get_ref().cref}")
-            
+
             if item.self_ref in my_visited:
                 continue
             else:
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 03759611..761c0eaf 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -4,8 +4,10 @@
 #
 
 """Define classes for HTML serialization."""
+import base64
 import html
 import logging
+from io import BytesIO
 from pathlib import Path
 from typing import Optional, Union
 from urllib.parse import quote
@@ -60,8 +62,6 @@
     get_text_direction,
 )
 
-from docling_core.experimental.serializer.html_styles import (_get_css_with_no_styling, _get_ccs_for_single_column)
-
 _logger = logging.getLogger(__name__)
 
 
@@ -86,6 +86,7 @@ class HTMLParams(CommonParams):
     # Allow for split page view (only possible if page-images are present)
     split_page_view: bool = False
 
+
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
     """HTML-specific text item serializer."""
 
@@ -102,8 +103,8 @@ def serialize(
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
 
-        print(f"HTMLTextSerializer {item.get_ref().cref}: {item.label} -> {item.text[0:64]}")
-        
+        print(f"HTMLTextSerializer {item.get_ref().cref}: {item.label}")
+
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
             text_inner = self._prepare_content(item.text)
@@ -245,7 +246,7 @@ def _process_formula(
     def _get_formula_image_fallback(
         self, item: TextItem, doc: DoclingDocument
     ) -> Optional[str]:
-        """Try to get an image fallback for a formula."""        
+        """Try to get an image fallback for a formula."""
         item_image = item.get_image(doc=doc)
         if item_image is not None:
             img_ref = ImageRef.from_pil(item_image, dpi=72)
@@ -272,7 +273,7 @@ def serialize(
             return SerializationResult(text="")
 
         print(f"HTMLTableSerializer {item.get_ref().cref}: {item.label}")
-        
+
         text = self._serialize_table(
             item=item,
             doc_serializer=doc_serializer,
@@ -281,7 +282,7 @@ def serialize(
             add_footnotes=True,
         )
         return SerializationResult(text=text)
-    
+
     def _serialize_table(
         self,
         item: TableItem,
@@ -296,7 +297,7 @@ def _serialize_table(
 
         caption_text = doc_serializer.serialize_captions(item=item, tag="caption")
         print(caption_text)
-        
+
         body = ""
 
         for i in range(nrows):
@@ -366,13 +367,13 @@ def serialize(
         """Export picture to HTML format."""
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
-        
+
         print(f"HTMLPictureSerializer {item.get_ref().cref}: {item.label}")
-        
+
         caption = doc_serializer.serialize_captions(
             item=item, doc_serializer=doc_serializer, doc=doc, tag="figcaption"
         )
-        
+
         result = ""
 
         if image_mode == ImageRefMode.PLACEHOLDER:
@@ -411,7 +412,7 @@ def serialize(
                 result = f"<figure>{caption.text}{img_text}</figure>"
         else:
             result = f"<figure>{caption.text}</figure>"
-        
+
         return SerializationResult(text=result)
 
 
@@ -428,9 +429,9 @@ def serialize(
         tag: str,
         **kwargs,
     ) -> SerializationResult:
-        print("HTMLGraphDataSerializer")
-        
         """Serialize the graph-data to HTML."""
+        print("HTMLGraphDataSerializer")
+
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
 
@@ -563,7 +564,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed key-value item to HTML."""
         print(f"HTMLKeyValueSerializer {item.get_ref().cref}: {item.label}")
-        
+
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -597,7 +598,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed form item to HTML."""
         print(f"HTMLFormSerializer {item.get_ref().cref}: {item.label}")
-        
+
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -631,10 +632,10 @@ def serialize(
         is_inline_scope: bool = False,
         visited: Optional[set[str]] = None,  # refs of visited items
         **kwargs,
-    ) -> SerializationResult:        
+    ) -> SerializationResult:
         """Serializes a list to HTML."""
         print(f"HTMLListSerializer {item.get_ref().cref}: {item.label}")
-                
+
         my_visited: set[str] = visited if visited is not None else set()
 
         # Get all child parts
@@ -646,10 +647,10 @@ def serialize(
             **kwargs,
         )
 
-        if len(parts)==0:
+        if len(parts) == 0:
             print(f" => no list-items found for {item.get_ref().cref}")
-            return SerializationResult(text="")            
-        
+            return SerializationResult(text="")
+
         # Start the appropriate list type
         tag = "ol" if isinstance(item, OrderedList) else "ul"
         list_html = [f"<{tag}>"]
@@ -688,9 +689,9 @@ def serialize(
     ) -> SerializationResult:
         """Serializes an inline group to HTML."""
         print(f"HTMLInlineSerializer: {item.label}: {visited}")
-        
+
         my_visited: set[str] = visited if visited is not None else set()
-        
+
         # Get all parts with inline scope
         parts = doc_serializer.get_parts(
             item=item,
@@ -723,8 +724,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Fallback serializer for items not handled by other serializers."""
-        print(f"HTMLFallbackSerializer {item.get_ref().cref}: {item.label}")
-        
         # For group items, we don't generate any markup
         if isinstance(item, GroupItem):
             return SerializationResult(text="")
@@ -795,50 +794,56 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
         ]
 
         split_page_view: bool = True
-        
+
         if split_page_view:
             html_parts.append("<table>")
             html_parts.append("<tbody>")
-            
+
             for page_ind, page in enumerate(pages):
-                page_no = page_ind+1
+                page_no = page_ind + 1
                 page_img = self.doc.pages[page_no].image
-                
+
                 html_parts.append("<tr>")
 
                 html_parts.append("<td>")
 
                 # short-cut: we already have the image in base64
-                if (page_img is not None
+                if (
+                    (page_img is not None)
                     and isinstance(page_img, ImageRef)
                     and isinstance(page_img.uri, AnyUrl)
                     and page_img.uri.scheme == "data"
                 ):
                     img_text = f'<img src="{page_img.uri}">'
                     html_parts.append(f"<figure>{img_text}</figure>")
+
+                elif (page_img is not None) and (page_img._pil is not None):
+
+                    buffered = BytesIO()
+                    page_img._pil.save(
+                        buffered, format="PNG"
+                    )  # Save the image to the byte stream
+                    img_bytes = buffered.getvalue()  # Get the byte data
+
+                    # Encode to Base64 and decode to string
+                    img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+                    img_text = f'<img src="data:image/png;base64,{img_base64}">'
+
+                    html_parts.append(f"<figure>{img_text}</figure>")
                 else:
-                    # get the page_img._pil or crop it out of the page-image
-                    img = item.get_image(self.doc)
-                    
-                    if img is not None:
-                        imgb64 = item._image_to_base64(img)
-                        img_text = f'<img src="data:image/png;base64,{imgb64}">'
-                        
-                        html_parts.append(f"<figure>{img_text}</figure>")
-                    else:
-                        html_parts.append(f"<figure>no page-image found</figure>")
-                                      
+                    html_parts.append("<figure>no page-image found</figure>")
+
                 html_parts.append("</td>")
 
                 html_parts.append("<td>")
                 html_parts.append(page.text)
-                html_parts.append("</td>")                
-                
+                html_parts.append("</td>")
+
                 html_parts.append("</tr>")
 
             html_parts.append("</tbody>")
             html_parts.append("</table>")
-            
+
         else:
             # Add all pages
             for page in pages:
@@ -872,7 +877,7 @@ def serialize_captions(
                 caption_parts.append(caption_item.text)
 
         # Join all captions with a space
-        if len(caption_parts)>0:
+        if len(caption_parts) > 0:
             caption_text = " ".join(caption_parts)
             text_dir = get_text_direction(caption_text)
 
@@ -885,7 +890,7 @@ def serialize_captions(
                 return SerializationResult(
                     text=f"<{tag}>{html.escape(caption_text)}</{tag}>"
                 )
-        
+
         return SerializationResult(text="")
 
     def _generate_head(self) -> str:
diff --git a/docling_core/experimental/serializer/html_styles.py b/docling_core/experimental/serializer/html_styles.py
index 8eecf03b..7190a550 100644
--- a/docling_core/experimental/serializer/html_styles.py
+++ b/docling_core/experimental/serializer/html_styles.py
@@ -1,9 +1,11 @@
+"""HTML styles for different export modes."""
 
 
 def _get_css_with_no_styling(self) -> str:
     """Return default CSS styles for the HTML document."""
     return "<style></style>"
 
+
 def _get_ccs_for_single_column(self) -> str:
     """Return CSS styles for the single-column HTML document."""
     return """<style>
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 053b2e2e..1643e15a 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -2,12 +2,9 @@
 
 from test.test_docling_doc import _construct_doc
 
-from docling_core.types.doc.base import ImageRefMode
-from docling_core.types.doc.document import (  # BoundingBox,
-    DoclingDocument,
-    
-)
 from docling_core.experimental.serializer.html import HTMLDocSerializer
+from docling_core.types.doc.base import ImageRefMode
+from docling_core.types.doc.document import DoclingDocument  # BoundingBox,
 
 
 def test_html_export():
@@ -31,29 +28,30 @@ def test_html_export():
 
     assert True
 
+
 def test_markdown_export_with_pageimages():
 
-    doc = DoclingDocument.load_from_json("/Users/taa/Documents/projects/docling/2501.12948v1.json")
+    doc = DoclingDocument.load_from_json(
+        "/Users/taa/Documents/projects/docling/2501.12948v1.json"
+    )
 
     doc.save_as_markdown(
-        filename="2501.12948v1.markdown",
-        image_mode=ImageRefMode.REFERENCED
+        filename="2501.12948v1.markdown", image_mode=ImageRefMode.REFERENCED
     )
 
-    
-def test_html_export_with_pageimages():
 
-    doc = DoclingDocument.load_from_json("/Users/taa/Documents/projects/docling/2501.12948v1.json")
+def test_html_export_with_pageimages():
 
-    doc.save_as_html(
-        filename="2501.12948v1.html",
-        image_mode=ImageRefMode.EMBEDDED
+    doc = DoclingDocument.load_from_json(
+        "/Users/taa/Documents/projects/docling/2501.12948v1.json"
     )
+
+    doc.save_as_html(filename="2501.12948v1.html", image_mode=ImageRefMode.EMBEDDED)
     """
 
 
     """
-    
+
     """
     # Create the serializer with default parameters
     serializer = HTMLDocSerializer(doc=doc)
@@ -65,5 +63,5 @@ def test_html_export_with_pageimages():
     with open("example_document.new.html", "w", encoding="utf-8") as f:
         f.write(html_output)
     """
-    
+
     assert True

From f26413314ec060b81421c00c92efd0fcc3b9c3b4 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 12:52:53 +0200
Subject: [PATCH 19/34] updated the styles and parameters with split_page

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py      | 15 +++++++++++----
 .../experimental/serializer/html_styles.py        |  8 ++++++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 761c0eaf..014577a9 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -33,6 +33,10 @@
     SerializationResult,
 )
 from docling_core.experimental.serializer.common import CommonParams, DocSerializer
+from docling_core.experimental.serializer.html_styles import (
+    _get_css_for_split_page,
+    _get_css_for_single_column
+)
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
     CodeItem,
@@ -62,6 +66,8 @@
     get_text_direction,
 )
 
+
+
 _logger = logging.getLogger(__name__)
 
 
@@ -786,6 +792,7 @@ def serialize_page(self, parts: list[SerializationResult]) -> SerializationResul
     @override
     def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
         """Serialize a document out of its pages."""
+
         # Create HTML structure
         html_parts = [
             "<!DOCTYPE html>",
@@ -793,9 +800,7 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
             "<body>",
         ]
 
-        split_page_view: bool = True
-
-        if split_page_view:
+        if self.params.split_page_view:
             html_parts.append("<table>")
             html_parts.append("<tbody>")
 
@@ -913,8 +918,10 @@ def _generate_head(self) -> str:
         # Add default styles or custom CSS
         if params.css_styles:
             head_parts.append(f"<style>\n{params.css_styles}\n</style>")
+        elif self.params.split_page_view:
+            head_parts.append(_get_css_for_split_page())
         else:
-            head_parts.append(self._get_default_css())
+            head_parts.append(_get_css_for_single_column())
 
         head_parts.append("</head>")
 
diff --git a/docling_core/experimental/serializer/html_styles.py b/docling_core/experimental/serializer/html_styles.py
index 7190a550..3ac684d1 100644
--- a/docling_core/experimental/serializer/html_styles.py
+++ b/docling_core/experimental/serializer/html_styles.py
@@ -1,12 +1,16 @@
 """HTML styles for different export modes."""
 
 
-def _get_css_with_no_styling(self) -> str:
+def _get_css_with_no_styling() -> str:
+    """Return default CSS styles for the HTML document."""
+    return "<style></style>"
+
+def _get_css_for_split_page() -> str:
     """Return default CSS styles for the HTML document."""
     return "<style></style>"
 
 
-def _get_ccs_for_single_column(self) -> str:
+def _get_css_for_single_column() -> str:
     """Return CSS styles for the single-column HTML document."""
     return """<style>
     html {

From 91a2d335f40f06aeccbc050d833b6a50f5924f83 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 12:57:45 +0200
Subject: [PATCH 20/34] propagated the parameter split_page_view

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 4 ++++
 test/test_serializer_html.py       | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index ebd642a1..0ac7eb03 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -3251,6 +3251,7 @@ def save_as_html(
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False            
     ):
         """Save to HTML."""
         if isinstance(filename, str):
@@ -3274,6 +3275,7 @@ def save_as_html(
             html_lang=html_lang,
             html_head=html_head,
             included_content_layers=included_content_layers,
+            split_page_view=split_page_view,
         )
 
         with open(filename, "w", encoding="utf-8") as fw:
@@ -3324,6 +3326,7 @@ def export_to_html(  # noqa: C901
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False
     ) -> str:
         r"""Serialize to HTML."""
         from docling_core.experimental.serializer.html import (
@@ -3349,6 +3352,7 @@ def export_to_html(  # noqa: C901
                 formula_to_mathml=formula_to_mathml,
                 html_lang=html_lang,
                 html_head=html_head,
+                split_page_view=split_page_view,
             ),
         )
         ser_res = serializer.serialize()
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 1643e15a..f43dbb03 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -45,8 +45,10 @@ def test_html_export_with_pageimages():
     doc = DoclingDocument.load_from_json(
         "/Users/taa/Documents/projects/docling/2501.12948v1.json"
     )
-
     doc.save_as_html(filename="2501.12948v1.html", image_mode=ImageRefMode.EMBEDDED)
+    doc.save_as_html(filename="2501.12948v1.split.html", image_mode=ImageRefMode.EMBEDDED, split_page_view=True)
+
+    
     """
 
 

From 21957a49b54794740a793a1a618211207bb12c41 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 14:58:50 +0200
Subject: [PATCH 21/34] first fully working version

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py  | 82 +++++++++---------
 .../experimental/serializer/html_styles.py    | 83 ++++++++++++++++++-
 docling_core/types/doc/document.py            |  4 +-
 test/test_serializer_html.py                  |  7 +-
 4 files changed, 133 insertions(+), 43 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 014577a9..77ca9598 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -34,8 +34,8 @@
 )
 from docling_core.experimental.serializer.common import CommonParams, DocSerializer
 from docling_core.experimental.serializer.html_styles import (
+    _get_css_for_single_column,
     _get_css_for_split_page,
-    _get_css_for_single_column
 )
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
@@ -66,8 +66,6 @@
     get_text_direction,
 )
 
-
-
 _logger = logging.getLogger(__name__)
 
 
@@ -783,16 +781,19 @@ def serialize_hyperlink(
         return f'<a href="{str(hyperlink)}">{text}</a>'
 
     @override
-    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
+    def serialize_page(
+        self, parts: list[SerializationResult], **kwargs
+    ) -> SerializationResult:
         """Serialize a page out of its parts."""
         # Join all parts with newlines
         body_content = "\n".join([p.text for p in parts if p.text])
         return SerializationResult(text=f"<div class='page'>\n{body_content}\n</div>")
 
     @override
-    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
+    def serialize_doc(
+        self, pages: dict[Optional[int], SerializationResult], **kwargs
+    ) -> SerializationResult:
         """Serialize a document out of its pages."""
-
         # Create HTML structure
         html_parts = [
             "<!DOCTYPE html>",
@@ -804,54 +805,59 @@ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult
             html_parts.append("<table>")
             html_parts.append("<tbody>")
 
-            for page_ind, page in enumerate(pages):
-                page_no = page_ind + 1
-                page_img = self.doc.pages[page_no].image
+            for page_no, page in pages.items():
 
-                html_parts.append("<tr>")
+                if isinstance(page_no, int):
+                    page_img = self.doc.pages[page_no].image
 
-                html_parts.append("<td>")
+                    html_parts.append("<tr>")
 
-                # short-cut: we already have the image in base64
-                if (
-                    (page_img is not None)
-                    and isinstance(page_img, ImageRef)
-                    and isinstance(page_img.uri, AnyUrl)
-                    and page_img.uri.scheme == "data"
-                ):
-                    img_text = f'<img src="{page_img.uri}">'
-                    html_parts.append(f"<figure>{img_text}</figure>")
+                    html_parts.append("<td>")
 
-                elif (page_img is not None) and (page_img._pil is not None):
+                    # short-cut: we already have the image in base64
+                    if (
+                        (page_img is not None)
+                        and isinstance(page_img, ImageRef)
+                        and isinstance(page_img.uri, AnyUrl)
+                        and page_img.uri.scheme == "data"
+                    ):
+                        img_text = f'<img src="{page_img.uri}">'
+                        html_parts.append(f"<figure>{img_text}</figure>")
 
-                    buffered = BytesIO()
-                    page_img._pil.save(
-                        buffered, format="PNG"
-                    )  # Save the image to the byte stream
-                    img_bytes = buffered.getvalue()  # Get the byte data
+                    elif (page_img is not None) and (page_img._pil is not None):
 
-                    # Encode to Base64 and decode to string
-                    img_base64 = base64.b64encode(img_bytes).decode("utf-8")
-                    img_text = f'<img src="data:image/png;base64,{img_base64}">'
+                        buffered = BytesIO()
+                        page_img._pil.save(
+                            buffered, format="PNG"
+                        )  # Save the image to the byte stream
+                        img_bytes = buffered.getvalue()  # Get the byte data
 
-                    html_parts.append(f"<figure>{img_text}</figure>")
-                else:
-                    html_parts.append("<figure>no page-image found</figure>")
+                        # Encode to Base64 and decode to string
+                        img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+                        img_text = f'<img src="data:image/png;base64,{img_base64}">'
+
+                        html_parts.append(f"<figure>{img_text}</figure>")
+                    else:
+                        html_parts.append("<figure>no page-image found</figure>")
 
-                html_parts.append("</td>")
+                    html_parts.append("</td>")
 
-                html_parts.append("<td>")
-                html_parts.append(page.text)
-                html_parts.append("</td>")
+                    html_parts.append("<td>")
+                    html_parts.append(page.text)
+                    html_parts.append("</td>")
 
-                html_parts.append("</tr>")
+                    html_parts.append("</tr>")
+                else:
+                    raise ValueError(
+                        "We need page-indices to leverage `split_page_view`"
+                    )
 
             html_parts.append("</tbody>")
             html_parts.append("</table>")
 
         else:
             # Add all pages
-            for page in pages:
+            for page_no, page in pages.items():
                 if page.text:
                     html_parts.append(page.text)
 
diff --git a/docling_core/experimental/serializer/html_styles.py b/docling_core/experimental/serializer/html_styles.py
index 3ac684d1..3d721f01 100644
--- a/docling_core/experimental/serializer/html_styles.py
+++ b/docling_core/experimental/serializer/html_styles.py
@@ -5,9 +5,90 @@ def _get_css_with_no_styling() -> str:
     """Return default CSS styles for the HTML document."""
     return "<style></style>"
 
+
 def _get_css_for_split_page() -> str:
     """Return default CSS styles for the HTML document."""
-    return "<style></style>"
+    return """<style>
+    html {
+        background-color: #e1e1e1;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    img {
+        min-width: 500px;
+        max-width: 100%;
+    }
+    table {
+        border-collapse: collapse;
+        border: 0px solid #fff;
+        width: 100%;
+    }
+    td {
+        vertical-align: top;
+    }
+    .page {
+        background-color: white;
+        margin-top:15px;
+        padding: 30px;
+        border: 1px solid black;
+        width:100%;
+        max-width:1000px;
+        box-shadow: 0 0 10px rgba(0,0,0,0.5);
+    }
+    .page figure {
+        text-align: center;
+    }
+    .page img {
+        max-width: 900px;
+        min-width: auto;
+    }
+    .page table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    .page table td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    .page table th {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    .page table caption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+        padding: 8px;
+        margin-top: 5px;
+        margin-bottom: 5px;
+    }
+    .page figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+        padding: 8px;
+        margin-top: 5px;
+        margin-bottom: 5px;
+    }
+    code {
+        background-color: rgb(228, 228, 228);
+        border: 1px solid darkgray;
+        padding: 10px;
+        display: inline-block;
+        font-family: monospace;
+        max-width:980px;
+        word-wrap: normal;
+        white-space: pre-wrap;
+        word-wrap: break-word;
+        /*overflow-wrap: break-word;*/
+    }
+</style>
+"""
 
 
 def _get_css_for_single_column() -> str:
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 0ac7eb03..3f478420 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -3251,7 +3251,7 @@ def save_as_html(
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
         included_content_layers: Optional[set[ContentLayer]] = None,
-        split_page_view: bool = False            
+        split_page_view: bool = False,
     ):
         """Save to HTML."""
         if isinstance(filename, str):
@@ -3326,7 +3326,7 @@ def export_to_html(  # noqa: C901
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
         included_content_layers: Optional[set[ContentLayer]] = None,
-        split_page_view: bool = False
+        split_page_view: bool = False,
     ) -> str:
         r"""Serialize to HTML."""
         from docling_core.experimental.serializer.html import (
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index f43dbb03..12026129 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -46,9 +46,12 @@ def test_html_export_with_pageimages():
         "/Users/taa/Documents/projects/docling/2501.12948v1.json"
     )
     doc.save_as_html(filename="2501.12948v1.html", image_mode=ImageRefMode.EMBEDDED)
-    doc.save_as_html(filename="2501.12948v1.split.html", image_mode=ImageRefMode.EMBEDDED, split_page_view=True)
+    doc.save_as_html(
+        filename="2501.12948v1.split.html",
+        image_mode=ImageRefMode.EMBEDDED,
+        split_page_view=True,
+    )
 
-    
     """
 
 

From 7b7ac6d3866c53b97919b73814cff5f8209b2779 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 15:14:21 +0200
Subject: [PATCH 22/34] removed the prints

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../experimental/serializer/common.py         |  7 -------
 docling_core/experimental/serializer/html.py  | 20 +------------------
 2 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/docling_core/experimental/serializer/common.py b/docling_core/experimental/serializer/common.py
index 98d7278f..e399e3d7 100644
--- a/docling_core/experimental/serializer/common.py
+++ b/docling_core/experimental/serializer/common.py
@@ -351,11 +351,6 @@ def get_parts(
         **kwargs,
     ) -> list[SerializationResult]:
         """Get the components to be combined for serializing this node."""
-        if item is not None:
-            print(f"get_parts: {item.get_ref().cref}")
-        else:
-            print("get_parts: None")
-
         parts: list[SerializationResult] = []
         my_visited: set[str] = visited if visited is not None else set()
         params = self.params.merge_with_patch(patch=kwargs)
@@ -365,8 +360,6 @@ def get_parts(
             traverse_pictures=traverse_pictures,
             included_content_layers=params.layers,
         ):
-            print(f" -> child: {item.get_ref().cref}")
-
             if item.self_ref in my_visited:
                 continue
             else:
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 77ca9598..33f6a65f 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -107,8 +107,6 @@ def serialize(
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
 
-        print(f"HTMLTextSerializer {item.get_ref().cref}: {item.label}")
-
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
             text_inner = self._prepare_content(item.text)
@@ -276,8 +274,6 @@ def serialize(
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
-        print(f"HTMLTableSerializer {item.get_ref().cref}: {item.label}")
-
         text = self._serialize_table(
             item=item,
             doc_serializer=doc_serializer,
@@ -300,7 +296,6 @@ def _serialize_table(
         ncols = item.data.num_cols
 
         caption_text = doc_serializer.serialize_captions(item=item, tag="caption")
-        print(caption_text)
 
         body = ""
 
@@ -372,8 +367,6 @@ def serialize(
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
-        print(f"HTMLPictureSerializer {item.get_ref().cref}: {item.label}")
-
         caption = doc_serializer.serialize_captions(
             item=item, doc_serializer=doc_serializer, doc=doc, tag="figcaption"
         )
@@ -434,7 +427,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the graph-data to HTML."""
-        print("HTMLGraphDataSerializer")
 
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
@@ -567,8 +559,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed key-value item to HTML."""
-        print(f"HTMLKeyValueSerializer {item.get_ref().cref}: {item.label}")
-
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -601,8 +591,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed form item to HTML."""
-        print(f"HTMLFormSerializer {item.get_ref().cref}: {item.label}")
-
         if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
             return SerializationResult(text="")
 
@@ -638,8 +626,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes a list to HTML."""
-        print(f"HTMLListSerializer {item.get_ref().cref}: {item.label}")
-
         my_visited: set[str] = visited if visited is not None else set()
 
         # Get all child parts
@@ -652,7 +638,7 @@ def serialize(
         )
 
         if len(parts) == 0:
-            print(f" => no list-items found for {item.get_ref().cref}")
+            _logger.warning(f" => no list-items found for list {item.get_ref().cref}")
             return SerializationResult(text="")
 
         # Start the appropriate list type
@@ -692,8 +678,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes an inline group to HTML."""
-        print(f"HTMLInlineSerializer: {item.label}: {visited}")
-
         my_visited: set[str] = visited if visited is not None else set()
 
         # Get all parts with inline scope
@@ -877,8 +861,6 @@ def serialize_captions(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
-        print(f"serialize_captions: {item.label}")
-
         caption_parts = []
 
         # Extract caption text from all caption items

From 81c0fa56cda060aa065507575435ccc2932ff8fe Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 15:59:22 +0200
Subject: [PATCH 23/34] replaced local-path HTML tests with ground-truth comparison

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 test/data/doc/constructed_doc.html | 204 +++++++++++++++++++++++++++++
 test/test_serializer_html.py       |  63 +--------
 2 files changed, 211 insertions(+), 56 deletions(-)
 create mode 100644 test/data/doc/constructed_doc.html

diff --git a/test/data/doc/constructed_doc.html b/test/data/doc/constructed_doc.html
new file mode 100644
index 00000000..c3b8b764
--- /dev/null
+++ b/test/data/doc/constructed_doc.html
@@ -0,0 +1,204 @@
+<!DOCTYPE html>
+<head>
+<meta charset="UTF-8">
+<title>Untitled 1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
+    html {
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    body {
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
+    }
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
+    }
+    table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    th, td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    th {
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
+    }
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
+    }
+    .formula-not-decoded {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
+<ul>
+<li>item of leading list</li>
+</ul>
+<h1>Title of the Document</h1>
+<p>Author 1<br>Affiliation 1</p>
+<p>Author 2<br>Affiliation 2</p>
+<h2>1. Introduction</h2>
+<p>This paper introduces the biggest invention ever made. ...</p>
+<ul>
+<li>list item 1</li>
+<li>list item 2</li>
+<li>list item 3</li>
+<ol>
+<li>list item 3.a</li>
+<li>list item 3.b</li>
+<li>list item 3.c</li>
+<ol>
+<li>list item 3.c.i</li>
+</ol>
+</ol>
+<li>list item 4</li>
+</ul>
+<table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
+<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
+<figure><figcaption>This is the caption of figure 2.</figcaption></figure>
+<ul>
+<li>item 1 of list</li>
+</ul>
+<ul>
+<li>item 1 of list after empty list</li>
+<li>item 2 of list after empty list</li>
+</ul>
+<ul>
+<li>item 1 of neighboring list</li>
+<li>item 2 of neighboring list</li>
+<ul>
+<li>item 1 of sub list</li>
+<li><span class='inline-group'>Here a code snippet: <code>&lt;p&gt;Hello world&lt;/p&gt;</code> (to be displayed inline)</span></li>
+<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
+</ul>
+</ul>
+<p>Here a code block:</p>
+<pre><code>print("Hello world")</code></pre>
+<p>Here a formula block:</p>
+<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
+<div class="key-value-region">
+<ul class="key-value-region">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<div class="form-container">
+<ul class="form-container">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
+<ol>
+<li>Item 1 in A</li>
+<li>Item 2 in A</li>
+<li>Item 3 in A</li>
+<ol>
+<li>Item 1 in B</li>
+<li>Item 2 in B</li>
+<ol>
+<li>Item 1 in C</li>
+<li>Item 2 in C</li>
+</ol>
+<li>Item 3 in B</li>
+</ol>
+<li>Item 4 in A</li>
+</ol>
+<p>The end.</p>
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 12026129..6f882de0 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -14,59 +14,10 @@ def test_html_export():
     # Create the serializer with default parameters
     serializer = HTMLDocSerializer(doc=doc)
 
-    # Serialize the document
-    html_output = serializer.serialize().text
-
-    # Save to file
-    with open("example_document.new.html", "w", encoding="utf-8") as f:
-        f.write(html_output)
-
-    doc.save_as_html(filename="example_document.old.html")
-    doc.save_as_markdown(filename="example_document.old.md")
-
-    print("Basic example saved to 'example_document.html'")
-
-    assert True
-
-
-def test_markdown_export_with_pageimages():
-
-    doc = DoclingDocument.load_from_json(
-        "/Users/taa/Documents/projects/docling/2501.12948v1.json"
-    )
-
-    doc.save_as_markdown(
-        filename="2501.12948v1.markdown", image_mode=ImageRefMode.REFERENCED
-    )
-
-
-def test_html_export_with_pageimages():
-
-    doc = DoclingDocument.load_from_json(
-        "/Users/taa/Documents/projects/docling/2501.12948v1.json"
-    )
-    doc.save_as_html(filename="2501.12948v1.html", image_mode=ImageRefMode.EMBEDDED)
-    doc.save_as_html(
-        filename="2501.12948v1.split.html",
-        image_mode=ImageRefMode.EMBEDDED,
-        split_page_view=True,
-    )
-
-    """
-
-
-    """
-
-    """
-    # Create the serializer with default parameters
-    serializer = HTMLDocSerializer(doc=doc)
-
-    # Serialize the document
-    html_output = serializer.serialize().text
-
-    # Save to file
-    with open("example_document.new.html", "w", encoding="utf-8") as f:
-        f.write(html_output)
-    """
-
-    assert True
+    doc.save_as_html(filename="test/data/doc/constructed_doc.html")
+    pred_html = doc.export_to_html()
+    
+    with open("test/data/doc/constructed_doc.html", "r") as fr:
+        true_html = fr.read()
+    
+    assert pred_html==true_html, "pred_html==true_html"

From 77b035c3c14ae0e0106cd3efb2f3540dbd3d49af Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 16:00:38 +0200
Subject: [PATCH 24/34] stopped HTML test from overwriting its ground truth

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 1 -
 test/test_serializer_html.py                 | 9 ++++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 33f6a65f..8e4847e6 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -427,7 +427,6 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the graph-data to HTML."""
-
         # Build cell lookup by ID
         cell_map = {cell.cell_id: cell for cell in item.cells}
 
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
index 6f882de0..df6402e1 100644
--- a/test/test_serializer_html.py
+++ b/test/test_serializer_html.py
@@ -3,7 +3,6 @@
 from test.test_docling_doc import _construct_doc
 
 from docling_core.experimental.serializer.html import HTMLDocSerializer
-from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import DoclingDocument  # BoundingBox,
 
 
@@ -14,10 +13,10 @@ def test_html_export():
     # Create the serializer with default parameters
     serializer = HTMLDocSerializer(doc=doc)
 
-    doc.save_as_html(filename="test/data/doc/constructed_doc.html")
+    # doc.save_as_html(filename="test/data/doc/constructed_doc.html")
     pred_html = doc.export_to_html()
-    
+
     with open("test/data/doc/constructed_doc.html", "r") as fr:
         true_html = fr.read()
-    
-    assert pred_html==true_html, "pred_html==true_html"
+
+    assert pred_html == true_html, "pred_html==true_html"

From b4af30dcf9c13961d17a48d459307f38a3d19a75 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 17:58:59 +0200
Subject: [PATCH 25/34] regenerated HTML export ground-truth files (.html.gt)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../data/doc/constructed_doc.embedded.html.gt | 203 +++++++++++-------
 .../doc/constructed_doc.placeholder.html.gt   | 203 +++++++++++-------
 .../doc/constructed_doc.referenced.html.gt    | 203 +++++++++++-------
 3 files changed, 390 insertions(+), 219 deletions(-)

diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt
index 5bd9ed8a..414507c5 100644
--- a/test/data/doc/constructed_doc.embedded.html.gt
+++ b/test/data/doc/constructed_doc.embedded.html.gt
@@ -1,74 +1,128 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>Untitled 1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
     }
-    math annotation {
-    display: none;
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <ul>
 <li>item of leading list</li>
 </ul>
@@ -89,8 +143,8 @@
 <li>list item 3.c.i</li>
 </ol>
 </ol>
-</ul>
 <li>list item 4</li>
+</ul>
 <table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
 <figure><figcaption>This is the caption of figure 1.</figcaption></figure>
 <figure><figcaption>This is the caption of figure 2.</figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg=="></figure>
@@ -98,7 +152,6 @@
 <li>item 1 of list</li>
 </ul>
 <ul>
-<ul>
 <li>item 1 of list after empty list</li>
 <li>item 2 of list after empty list</li>
 </ul>
@@ -107,27 +160,29 @@
 <li>item 2 of neighboring list</li>
 <ul>
 <li>item 1 of sub list</li>
-<p>Here a code snippet:</p>
-<pre><code><p>Hello world</p></code></pre>
-<p>(to be displayed inline)</p>
-</ul>
-<p>Here a formula:</p>
-<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>(to be displayed inline)</p>
+<li><span class='inline-group'>Here a code snippet: <code>&lt;p&gt;Hello world&lt;/p&gt;</code> (to be displayed inline)</span></li>
+<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
 </ul>
 </ul>
 <p>Here a code block:</p>
 <pre><code>print("Hello world")</code></pre>
 <p>Here a formula block:</p>
 <div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>Some formatting chops:</p>
-<p>bold</p>
-<p>italic</p>
-<p>underline</p>
-<p>strikethrough</p>
-<p>hyperlink</p>
-<p>&amp;</p>
-<p>everything at the same time.</p>
+<div class="key-value-region">
+<ul class="key-value-region">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<div class="form-container">
+<ul class="form-container">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
 <li>Item 2 in A</li>
@@ -139,9 +194,11 @@
 <li>Item 1 in C</li>
 <li>Item 2 in C</li>
 </ol>
-</ol>
 <li>Item 3 in B</li>
 </ol>
 <li>Item 4 in A</li>
+</ol>
 <p>The end.</p>
+</div>
+</body>
 </html>
\ No newline at end of file
diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt
index 0220c2c3..c3b8b764 100644
--- a/test/data/doc/constructed_doc.placeholder.html.gt
+++ b/test/data/doc/constructed_doc.placeholder.html.gt
@@ -1,74 +1,128 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>Untitled 1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
     }
-    math annotation {
-    display: none;
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <ul>
 <li>item of leading list</li>
 </ul>
@@ -89,8 +143,8 @@
 <li>list item 3.c.i</li>
 </ol>
 </ol>
-</ul>
 <li>list item 4</li>
+</ul>
 <table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
 <figure><figcaption>This is the caption of figure 1.</figcaption></figure>
 <figure><figcaption>This is the caption of figure 2.</figcaption></figure>
@@ -98,7 +152,6 @@
 <li>item 1 of list</li>
 </ul>
 <ul>
-<ul>
 <li>item 1 of list after empty list</li>
 <li>item 2 of list after empty list</li>
 </ul>
@@ -107,27 +160,29 @@
 <li>item 2 of neighboring list</li>
 <ul>
 <li>item 1 of sub list</li>
-<p>Here a code snippet:</p>
-<pre><code><p>Hello world</p></code></pre>
-<p>(to be displayed inline)</p>
-</ul>
-<p>Here a formula:</p>
-<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>(to be displayed inline)</p>
+<li><span class='inline-group'>Here a code snippet: <code>&lt;p&gt;Hello world&lt;/p&gt;</code> (to be displayed inline)</span></li>
+<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
 </ul>
 </ul>
 <p>Here a code block:</p>
 <pre><code>print("Hello world")</code></pre>
 <p>Here a formula block:</p>
 <div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>Some formatting chops:</p>
-<p>bold</p>
-<p>italic</p>
-<p>underline</p>
-<p>strikethrough</p>
-<p>hyperlink</p>
-<p>&amp;</p>
-<p>everything at the same time.</p>
+<div class="key-value-region">
+<ul class="key-value-region">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<div class="form-container">
+<ul class="form-container">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
 <li>Item 2 in A</li>
@@ -139,9 +194,11 @@
 <li>Item 1 in C</li>
 <li>Item 2 in C</li>
 </ol>
-</ol>
 <li>Item 3 in B</li>
 </ol>
 <li>Item 4 in A</li>
+</ol>
 <p>The end.</p>
+</div>
+</body>
 </html>
\ No newline at end of file
diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt
index fa58fc4b..dfc9e715 100644
--- a/test/data/doc/constructed_doc.referenced.html.gt
+++ b/test/data/doc/constructed_doc.referenced.html.gt
@@ -1,74 +1,128 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>Untitled 1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
     }
-    math annotation {
-    display: none;
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <ul>
 <li>item of leading list</li>
 </ul>
@@ -89,8 +143,8 @@
 <li>list item 3.c.i</li>
 </ol>
 </ol>
-</ul>
 <li>list item 4</li>
+</ul>
 <table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
 <figure><figcaption>This is the caption of figure 1.</figcaption></figure>
 <figure><figcaption>This is the caption of figure 2.</figcaption><img src="constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png"></figure>
@@ -98,7 +152,6 @@
 <li>item 1 of list</li>
 </ul>
 <ul>
-<ul>
 <li>item 1 of list after empty list</li>
 <li>item 2 of list after empty list</li>
 </ul>
@@ -107,27 +160,29 @@
 <li>item 2 of neighboring list</li>
 <ul>
 <li>item 1 of sub list</li>
-<p>Here a code snippet:</p>
-<pre><code><p>Hello world</p></code></pre>
-<p>(to be displayed inline)</p>
-</ul>
-<p>Here a formula:</p>
-<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>(to be displayed inline)</p>
+<li><span class='inline-group'>Here a code snippet: <code>&lt;p&gt;Hello world&lt;/p&gt;</code> (to be displayed inline)</span></li>
+<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
 </ul>
 </ul>
 <p>Here a code block:</p>
 <pre><code>print("Hello world")</code></pre>
 <p>Here a formula block:</p>
 <div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>Some formatting chops:</p>
-<p>bold</p>
-<p>italic</p>
-<p>underline</p>
-<p>strikethrough</p>
-<p>hyperlink</p>
-<p>&amp;</p>
-<p>everything at the same time.</p>
+<div class="key-value-region">
+<ul class="key-value-region">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<div class="form-container">
+<ul class="form-container">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
 <li>Item 2 in A</li>
@@ -139,9 +194,11 @@
 <li>Item 1 in C</li>
 <li>Item 2 in C</li>
 </ol>
-</ol>
 <li>Item 3 in B</li>
 </ol>
 <li>Item 4 in A</li>
+</ol>
 <p>The end.</p>
+</div>
+</body>
 </html>
\ No newline at end of file

From 1ef37b8ecf6edb9bc08cb62b544284bb778da3de Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 18:13:19 +0200
Subject: [PATCH 26/34] removed dead code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 324 +----------------------------
 1 file changed, 2 insertions(+), 322 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 3f478420..ef32e1cb 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1675,76 +1675,6 @@ class PageItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
-    _HTML_DEFAULT_HEAD: str = r"""<head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
-    html {
-    background-color: LightGray;
-    }
-    body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
-    }
-    table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    th, td {
-    border: 1px solid black;
-    padding: 8px;
-    }
-    th {
-    font-weight: bold;
-    }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
-    }
-    math annotation {
-    display: none;
-    }
-    .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>"""
-
     schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
     version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
         CURRENT_VERSION
@@ -3249,7 +3179,7 @@ def save_as_html(
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = _HTML_DEFAULT_HEAD,
+        html_head: str = "", # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ):
@@ -3324,7 +3254,7 @@ def export_to_html(  # noqa: C901
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = _HTML_DEFAULT_HEAD,
+        html_head: str = "", # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ) -> str:
@@ -3351,7 +3281,6 @@ def export_to_html(  # noqa: C901
                 image_mode=image_mode,
                 formula_to_mathml=formula_to_mathml,
                 html_lang=html_lang,
-                html_head=html_head,
                 split_page_view=split_page_view,
             ),
         )
@@ -3359,255 +3288,6 @@ def export_to_html(  # noqa: C901
 
         return ser_res.text
 
-    def _legacy_export_to_html(  # noqa: C901
-        self,
-        from_element: int = 0,
-        to_element: int = sys.maxsize,
-        labels: Optional[set[DocItemLabel]] = None,
-        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
-        formula_to_mathml: bool = True,
-        page_no: Optional[int] = None,
-        html_lang: str = "en",
-        html_head: str = _HTML_DEFAULT_HEAD,
-        included_content_layers: Optional[set[ContentLayer]] = None,
-    ) -> str:
-        r"""Serialize to HTML."""
-        my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
-        my_layers = (
-            included_content_layers
-            if included_content_layers is not None
-            else DEFAULT_CONTENT_LAYERS
-        )
-
-        def close_lists(
-            curr_level: int,
-            prev_level: int,
-            in_ordered_list: List[bool],
-            html_texts: list[str],
-        ):
-
-            if len(in_ordered_list) == 0:
-                return (in_ordered_list, html_texts)
-
-            while curr_level < prev_level and len(in_ordered_list) > 0:
-                if in_ordered_list[-1]:
-                    html_texts.append("</ol>")
-                else:
-                    html_texts.append("</ul>")
-
-                prev_level -= 1
-                in_ordered_list.pop()  # = in_ordered_list[:-1]
-
-            return (in_ordered_list, html_texts)
-
-        head_lines = [
-            "<!DOCTYPE html>",
-            f'<html lang="{html_lang}">',
-            html_head,
-        ]
-        html_texts: list[str] = []
-
-        prev_level = 0  # Track the previous item's level
-
-        in_ordered_list: List[bool] = []  # False
-
-        def _prepare_tag_content(
-            text: str, do_escape_html=True, do_replace_newline=True
-        ) -> str:
-            if do_escape_html:
-                text = html.escape(text, quote=False)
-            if do_replace_newline:
-                text = text.replace("\n", "<br>")
-            return text
-
-        for ix, (item, curr_level) in enumerate(
-            self.iterate_items(
-                self.body,
-                with_groups=True,
-                page_no=page_no,
-                included_content_layers=my_layers,
-            )
-        ):
-            # If we've moved to a lower level, we're exiting one or more groups
-            if curr_level < prev_level and len(in_ordered_list) > 0:
-                # Calculate how many levels we've exited
-                # level_difference = previous_level - level
-                # Decrement list_nesting_level for each list group we've exited
-                # list_nesting_level = max(0, list_nesting_level - level_difference)
-
-                in_ordered_list, html_texts = close_lists(
-                    curr_level=curr_level,
-                    prev_level=prev_level,
-                    in_ordered_list=in_ordered_list,
-                    html_texts=html_texts,
-                )
-
-            prev_level = curr_level  # Update previous_level for next iteration
-
-            if ix < from_element or to_element <= ix:
-                continue  # skip as many items as you want
-
-            if (isinstance(item, DocItem)) and (item.label not in my_labels):
-                continue  # skip any label that is not whitelisted
-
-            if isinstance(item, GroupItem) and item.label in [
-                GroupLabel.ORDERED_LIST,
-            ]:
-
-                text = "<ol>"
-                html_texts.append(text)
-
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(True)
-
-            elif isinstance(item, GroupItem) and item.label in [
-                GroupLabel.LIST,
-            ]:
-
-                text = "<ul>"
-                html_texts.append(text)
-
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(False)
-
-            elif isinstance(item, GroupItem):
-                continue
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
-                text_inner = _prepare_tag_content(item.text)
-                text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
-
-                html_texts.append(text)
-
-            elif isinstance(item, SectionHeaderItem):
-
-                section_level: int = min(item.level + 1, 6)
-
-                text = get_html_tag_with_text_direction(
-                    html_tag=f"h{section_level}",
-                    text=_prepare_tag_content(item.text),
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
-
-                math_formula = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = ""
-
-                def _image_fallback(item: TextItem):
-                    item_image = item.get_image(doc=self)
-                    if item_image is not None:
-                        img_ref = ImageRef.from_pil(item_image, dpi=72)
-                        return (
-                            "<figure>"
-                            f'<img src="{img_ref.uri}" alt="{item.orig}" />'
-                            "</figure>"
-                        )
-
-                img_fallback = _image_fallback(item)
-
-                # If the formula is not processed correcty, use its image
-                if (
-                    item.text == ""
-                    and item.orig != ""
-                    and image_mode == ImageRefMode.EMBEDDED
-                    and len(item.prov) > 0
-                    and img_fallback is not None
-                ):
-                    text = img_fallback
-
-                # Building a math equation in MathML format
-                # ref https://www.w3.org/TR/wai-aria-1.1/#math
-                elif formula_to_mathml and len(math_formula) > 0:
-                    try:
-                        mathml_element = latex2mathml.converter.convert_to_element(
-                            math_formula, display="block"
-                        )
-                        annotation = SubElement(
-                            mathml_element, "annotation", dict(encoding="TeX")
-                        )
-                        annotation.text = math_formula
-                        mathml = unescape(tostring(mathml_element, encoding="unicode"))
-                        text = f"<div>{mathml}</div>"
-                    except Exception as err:
-                        _logger.warning(
-                            "Malformed formula cannot be rendered. "
-                            f"Error {err.__class__.__name__}, formula={math_formula}"
-                        )
-                        if (
-                            image_mode == ImageRefMode.EMBEDDED
-                            and len(item.prov) > 0
-                            and img_fallback is not None
-                        ):
-                            text = img_fallback
-                        else:
-                            text = f"<pre>{math_formula}</pre>"
-
-                elif math_formula != "":
-                    text = f"<pre>{math_formula}</pre>"
-
-                if text != "":
-                    html_texts.append(text)
-                else:
-                    html_texts.append(
-                        '<div class="formula-not-decoded">Formula not decoded</div>'
-                    )
-
-            elif isinstance(item, ListItem):
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, CodeItem):
-                code_text = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = f"<pre><code>{code_text}</code></pre>"
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem):
-
-                text = get_html_tag_with_text_direction(
-                    html_tag="p", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TableItem):
-
-                text = item.export_to_html(doc=self, add_caption=True)
-                html_texts.append(text)
-
-            elif isinstance(item, PictureItem):
-
-                html_texts.append(
-                    item.export_to_html(
-                        doc=self, add_caption=True, image_mode=image_mode
-                    )
-                )
-
-            elif isinstance(item, DocItem) and item.label in my_labels:
-                continue
-
-        html_texts.append("</html>")
-
-        lines = []
-        lines.extend(head_lines)
-        lines.extend(html_texts)
-
-        delim = "\n"
-        html_text = (delim.join(lines)).strip()
-
-        return html_text
-
     def load_from_doctags(  # noqa: C901
         self,
         doctag_document: DocTagsDocument,

From 1bcbbae98351213c541641b22fffb6c0bac72a30 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 18:15:16 +0200
Subject: [PATCH 27/34] updated the test output

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 test/data/doc/2206.01062.yaml.html           | 187 ++++++++++++-----
 test/data/doc/constructed_document.yaml.html | 203 ++++++++++++-------
 test/data/doc/dummy_doc.yaml.html            | 168 ++++++++++-----
 3 files changed, 374 insertions(+), 184 deletions(-)

diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html
index e895ea33..c94c14b4 100644
--- a/test/data/doc/2206.01062.yaml.html
+++ b/test/data/doc/2206.01062.yaml.html
@@ -1,74 +1,128 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>2206.01062v1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
     }
-    math annotation {
-    display: none;
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
+    }
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>
 <p>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</p>
 <p>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</p>
@@ -89,6 +143,8 @@ <h2>KEYWORDS</h2>
 <p>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</p>
 <h2>ACMReference Format:</h2>
 <p>Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043</p>
+</div>
+<div class='page'>
 <h2>1 INTRODUCTION</h2>
 <p>Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.</p>
 <p>Akeyproblem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5.</p>
@@ -99,6 +155,7 @@ <h2>1 INTRODUCTION</h2>
 <li>(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.</li>
 <li>(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.</li>
 </ul>
+<p>1 https://developer.ibm.com/exchanges/data/all/doclaynet</p>
 <p>This enables experimentation with annotation uncertainty and quality control analysis.</p>
 <ul>
 <li>(5) Pre-defined Train-, Test- &amp; Validation-set : Like DocBank, we provide fixed train-, test- &amp; validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.</li>
@@ -111,22 +168,30 @@ <h2>2 RELATED WORK</h2>
 <h2>3 THE DOCLAYNET DATASET</h2>
 <p>DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula List-item , , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4.</p>
 <p>In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents</p>
+</div>
+<div class='page'>
 <figure><figcaption>Figure 2: Distribution of DocLayNet pages across document categories.</figcaption></figure>
 <p>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( &gt; 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing 'text in the wild".</p>
 <p>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals Scientific Articles , , Laws &amp; Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</p>
 <p>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</p>
 <p>To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions.</p>
+<p>2 e.g. AAPL from https://www.annualreports.com/</p>
 <p>Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5.</p>
 <p>In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames.</p>
 <p>Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, 'invisible' tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as 'invisible' list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a 'natural' upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4.</p>
 <h2>4 ANNOTATION CAMPAIGN</h2>
 <p>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</p>
+</div>
+<div class='page'>
 <table><caption>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row &#x27;Total&#x27;) in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption><tbody><tr><td></td><td></td><th colspan="4">% of Total</th><th colspan="6">triple inter-annotator mAP @ 0.5-0.95 (%)</th></tr><tr><th>class label</th><th>Count</th><th>Train</th><th>Test</th><th>Val</th><th>All</th><th>Fin</th><th>Man</th><th>Sci</th><th>Law</th><th>Pat</th><th>Ten</th></tr><tr><td>Caption</td><td>22524</td><td>2.04</td><td>1.77</td><td>2.32</td><td>84-89</td><td>40-61</td><td>86-92</td><td>94-99</td><td>95-99</td><td>69-78</td><td>n/a</td></tr><tr><td>Footnote</td><td>6318</td><td>0.60</td><td>0.31</td><td>0.58</td><td>83-91</td><td>n/a</td><td>100</td><td>62-88</td><td>85-94</td><td>n/a</td><td>82-97</td></tr><tr><td>Formula</td><td>25027</td><td>2.25</td><td>1.90</td><td>2.96</td><td>83-85</td><td>n/a</td><td>n/a</td><td>84-87</td><td>86-96</td><td>n/a</td><td>n/a</td></tr><tr><td>List-item</td><td>185660</td><td>17.19</td><td>13.34</td><td>15.82</td><td>87-88</td><td>74-83</td><td>90-92</td><td>97-97</td><td>81-85</td><td>75-88</td><td>93-95</td></tr><tr><td>Page-footer</td><td>70878</td><td>6.51</td><td>5.58</td><td>6.00</td><td>93-94</td><td>88-90</td><td>95-96</td><td>100</td><td>92-97</td><td>100</td><td>96-98</td></tr><tr><td>Page-header</td><td>58022</td><td>5.10</td><td>6.70</td><td>5.06</td><td>85-89</td><td>66-76</td><td>90-94</td><td>98-100</td><td>91-92</td><td>97-99</td><td>81-86</td></tr><tr><td>Picture</td><td>45976</td><td>4.21</td><td>2.78</td><td>5.31</td><td>69-71</td><td>56-59</td><td>82-86</td><td>69-82</td><td>80-95</td><td>66-71</td><td>59-76</td></tr><tr><td>Section-header</td><td>142884</td><td>12.60</td><td>15.77</td><td>12.85</td><td>83-84</
td><td>76-81</td><td>90-92</td><td>94-95</td><td>87-94</td><td>69-73</td><td>78-86</td></tr><tr><td>Table</td><td>34733</td><td>3.20</td><td>2.27</td><td>3.60</td><td>77-81</td><td>75-80</td><td>83-86</td><td>98-99</td><td>58-80</td><td>79-84</td><td>70-85</td></tr><tr><td>Text</td><td>510377</td><td>45.82</td><td>49.28</td><td>45.00</td><td>84-86</td><td>81-86</td><td>88-93</td><td>89-93</td><td>87-92</td><td>71-79</td><td>87-95</td></tr><tr><td>Title</td><td>5071</td><td>0.47</td><td>0.30</td><td>0.50</td><td>60-72</td><td>24-63</td><td>50-63</td><td>94-100</td><td>82-96</td><td>68-79</td><td>24-56</td></tr><tr><td>Total</td><td>1107470</td><td>941123</td><td>99816</td><td>66531</td><td>82-83</td><td>71-74</td><td>79-81</td><td>89-94</td><td>86-91</td><td>71-76</td><td>68-85</td></tr></tbody></table>
 <figure><figcaption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</figcaption></figure>
 <p>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</p>
 <p>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv 3 , government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</p>
 <p>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</p>
 <p>Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula List-item , , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on</p>
+<p>3 https://arxiv.org/</p>
+</div>
+<div class='page'>
 <p>the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.</p>
 <p>At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages.</p>
 <p>Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:</p>
@@ -142,8 +207,11 @@ <h2>4 ANNOTATION CAMPAIGN</h2>
 <p>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</p>
 <figure></figure>
 <p>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</p>
+<p>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</p>
 <p>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</p>
 <p>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</p>
+</div>
+<div class='page'>
 <p>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</p>
 <table><tbody><tr><td></td><th rowspan="2">human</th><th colspan="2">MRCNN</th><th>FRCNN</th><th>YOLO</th></tr><tr><td></td><th>R50</th><th>R101</th><th>R101</th><th>v5x6</th></tr><tr><td>Caption</td><td>84-89</td><td>68.4</td><td>71.5</td><td>70.1</td><td>77.7</td></tr><tr><td>Footnote</td><td>83-91</td><td>70.9</td><td>71.8</td><td>73.7</td><td>77.2</td></tr><tr><td>Formula</td><td>83-85</td><td>60.1</td><td>63.4</td><td>63.5</td><td>66.2</td></tr><tr><td>List-item</td><td>87-88</td><td>81.2</td><td>80.8</td><td>81.0</td><td>86.2</td></tr><tr><td>Page-footer</td><td>93-94</td><td>61.6</td><td>59.3</td><td>58.9</td><td>61.1</td></tr><tr><td>Page-header</td><td>85-89</td><td>71.9</td><td>70.0</td><td>72.0</td><td>67.9</td></tr><tr><td>Picture</td><td>69-71</td><td>71.7</td><td>72.7</td><td>72.0</td><td>77.1</td></tr><tr><td>Section-header</td><td>83-84</td><td>67.6</td><td>69.3</td><td>68.4</td><td>74.6</td></tr><tr><td>Table</td><td>77-81</td><td>82.2</td><td>82.9</td><td>82.2</td><td>86.3</td></tr><tr><td>Text</td><td>84-86</td><td>84.6</td><td>85.8</td><td>85.4</td><td>88.1</td></tr><tr><td>Title</td><td>60-72</td><td>76.7</td><td>80.4</td><td>79.9</td><td>82.7</td></tr><tr><td>All</td><td>82-83</td><td>72.4</td><td>73.5</td><td>73.4</td><td>76.8</td></tr></tbody></table>
 <p>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</p>
@@ -154,6 +222,8 @@ <h2>5 EXPERIMENTS</h2>
 <p>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</p>
 <h2>Baselines for Object Detection</h2>
 <p>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</p>
+</div>
+<div class='page'>
 <p>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</p>
 <table><tbody><tr><th>Class-count</th><th>11</th><th>6</th><th>5</th><th>4</th></tr><tr><td>Caption</td><td>68</td><td>Text</td><td>Text</td><td>Text</td></tr><tr><td>Footnote</td><td>71</td><td>Text</td><td>Text</td><td>Text</td></tr><tr><td>Formula</td><td>60</td><td>Text</td><td>Text</td><td>Text</td></tr><tr><td>List-item</td><td>81</td><td>Text</td><td>82</td><td>Text</td></tr><tr><td>Page-footer</td><td>62</td><td>62</td><td>-</td><td>-</td></tr><tr><td>Page-header</td><td>72</td><td>68</td><td>-</td><td>-</td></tr><tr><td>Picture</td><td>72</td><td>72</td><td>72</td><td>72</td></tr><tr><td>Section-header</td><td>68</td><td>67</td><td>69</td><td>68</td></tr><tr><td>Table</td><td>82</td><td>83</td><td>82</td><td>82</td></tr><tr><td>Text</td><td>85</td><td>84</td><td>84</td><td>84</td></tr><tr><td>Title</td><td>77</td><td>Sec.-h.</td><td>Sec.-h.</td><td>Sec.-h.</td></tr><tr><td>Overall</td><td>72</td><td>73</td><td>78</td><td>77</td></tr></tbody></table>
 <h2>Learning Curve</h2>
@@ -167,6 +237,8 @@ <h2>Impact of Document Split in Train and Test Set</h2>
 <p>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 0% in mAP over the document-wise splitting. 1 Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</p>
 <h2>Dataset Comparison</h2>
 <p>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</p>
+</div>
+<div class='page'>
 <p>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank &amp; DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</p>
 <table><tbody><tr><td></td><td></td><th colspan="3">Testing on</th></tr><tr><th>Training on</th><th>labels</th><th>PLN</th><th>DB</th><th>DLN</th></tr><tr><td rowspan="5">PubLayNet (PLN)</td><td>Figure</td><td>96</td><td>43</td><td>23</td></tr><tr><td>Sec-header</td><td>87</td><td>-</td><td>32</td></tr><tr><td>Table</td><td>95</td><td>24</td><td>49</td></tr><tr><td>Text</td><td>96</td><td>-</td><td>42</td></tr><tr><td>total</td><td>93</td><td>34</td><td>30</td></tr><tr><td rowspan="3">DocBank (DB)</td><td>Figure</td><td>77</td><td>71</td><td>31</td></tr><tr><td>Table</td><td>19</td><td>65</td><td>22</td></tr><tr><td>total</td><td>48</td><td>68</td><td>27</td></tr><tr><td rowspan="5">DocLayNet (DLN)</td><td>Figure</td><td>67</td><td>51</td><td>72</td></tr><tr><td>Sec-header</td><td>53</td><td>-</td><td>68</td></tr><tr><td>Table</td><td>87</td><td>43</td><td>82</td></tr><tr><td>Text</td><td>77</td><td>-</td><td>84</td></tr><tr><td>total</td><td>59</td><td>47</td><td>78</td></tr></tbody></table>
 <p>Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .</p>
@@ -193,6 +265,8 @@ <h2>REFERENCES</h2>
 <li>[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</li>
 <li>[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</li>
 </ul>
+</div>
+<div class='page'>
 <figure><figcaption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</figcaption></figure>
 <p>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</p>
 <p>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</p>
@@ -207,4 +281,7 @@ <h2>REFERENCES</h2>
 <li>[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</li>
 <li>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</li>
 <li>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</li>
+</ul>
+</div>
+</body>
 </html>
diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html
index 9eb44ea8..c9541bd1 100644
--- a/test/data/doc/constructed_document.yaml.html
+++ b/test/data/doc/constructed_document.yaml.html
@@ -1,74 +1,128 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>Untitled 1</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
     }
-    math annotation {
-    display: none;
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <ul>
 <li>item of leading list</li>
 </ul>
@@ -89,8 +143,8 @@ <h2>1. Introduction</h2>
 <li>list item 3.c.i</li>
 </ol>
 </ol>
-</ul>
 <li>list item 4</li>
+</ul>
 <table><caption>This is the caption of table 1.</caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
 <figure><figcaption>This is the caption of figure 1.</figcaption></figure>
 <figure><figcaption>This is the caption of figure 2.</figcaption></figure>
@@ -98,7 +152,6 @@ <h2>1. Introduction</h2>
 <li>item 1 of list</li>
 </ul>
 <ul>
-<ul>
 <li>item 1 of list after empty list</li>
 <li>item 2 of list after empty list</li>
 </ul>
@@ -107,27 +160,29 @@ <h2>1. Introduction</h2>
 <li>item 2 of neighboring list</li>
 <ul>
 <li>item 1 of sub list</li>
-<p>Here a code snippet:</p>
-<pre><code><p>Hello world</p></code></pre>
-<p>(to be displayed inline)</p>
-</ul>
-<p>Here a formula:</p>
-<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>(to be displayed inline)</p>
+<li><span class='inline-group'>Here a code snippet: <code>&lt;p&gt;Hello world&lt;/p&gt;</code> (to be displayed inline)</span></li>
+<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
 </ul>
 </ul>
 <p>Here a code block:</p>
 <pre><code>print("Hello world")</code></pre>
 <p>Here a formula block:</p>
 <div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
-<p>Some formatting chops:</p>
-<p>bold</p>
-<p>italic</p>
-<p>underline</p>
-<p>strikethrough</p>
-<p>hyperlink</p>
-<p>&amp;</p>
-<p>everything at the same time.</p>
+<div class="key-value-region">
+<ul class="key-value-region">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<div class="form-container">
+<ul class="form-container">
+<li><strong>number</strong>: 1</li>
+
+</ul>
+</div>
+
+<span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
 <li>Item 2 in A</li>
@@ -139,9 +194,11 @@ <h2>1. Introduction</h2>
 <li>Item 1 in C</li>
 <li>Item 2 in C</li>
 </ol>
-</ol>
 <li>Item 3 in B</li>
 </ol>
 <li>Item 4 in A</li>
+</ol>
 <p>The end.</p>
+</div>
+</body>
 </html>
diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html
index a69fc570..e4ee82b0 100644
--- a/test/data/doc/dummy_doc.yaml.html
+++ b/test/data/doc/dummy_doc.yaml.html
@@ -1,75 +1,131 @@
 <!DOCTYPE html>
-<html lang="en">
 <head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
+<meta charset="UTF-8">
+<title>dummy_doc</title>
+<meta name="generator" content="Docling HTML Serializer">
+<style>
     html {
-    background-color: LightGray;
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
     }
     body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
     }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
     }
     table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
     }
     th, td {
-    border: 1px solid black;
-    padding: 8px;
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
     }
     th {
-    font-weight: bold;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
     }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
+    pre code {
+        background-color: transparent;
+        padding: 0;
     }
-    math annotation {
-    display: none;
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
     }
     .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>
+</head>
+<body>
+<div class='page'>
 <h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
 <figure><figcaption>Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>
-
+<table></table>
+</div>
+</body>
 </html>

From a27b4dd067609fb65e3fc331b64f3042a1db1a17 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 18:27:02 +0200
Subject: [PATCH 28/34] fixed the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py  |  6 ++++
 docling_core/types/doc/document.py            | 34 ++++++++++++-------
 .../export/formula_mathml.html                |  5 ++-
 test/test_docling_doc.py                      |  4 +--
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 8e4847e6..dd7a3ca0 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -80,7 +80,10 @@ class HTMLParams(CommonParams):
 
     # HTML document properties
     html_lang: str = "en"
+    html_head: Optional[str] = None
+
     css_styles: Optional[str] = None
+
     add_document_metadata: bool = True
     prettify: bool = True  # Add indentation and line breaks
 
@@ -889,6 +892,9 @@ def _generate_head(self) -> str:
         """Generate the HTML head section with metadata and styles."""
         params = self.params
 
+        if self.params.html_head is not None:
+            return self.params.html_head
+        
         head_parts = ["<head>", '<meta charset="UTF-8">']
 
         # Add metadata if requested
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index ef32e1cb..9bb52f58 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -3179,13 +3179,14 @@ def save_as_html(
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = "", # should be deprecated
+        html_head: str = "null", # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ):
         """Save to HTML."""
         if isinstance(filename, str):
             filename = Path(filename)
+
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
 
         if image_mode == ImageRefMode.REFERENCED:
@@ -3254,7 +3255,7 @@ def export_to_html(  # noqa: C901
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = "", # should be deprecated ...
+        html_head: str = "null", # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ) -> str:
@@ -3270,19 +3271,26 @@ def export_to_html(  # noqa: C901
             if included_content_layers is not None
             else DEFAULT_CONTENT_LAYERS
         )
+
+        params = HTMLParams(
+            labels=my_labels,
+            layers=my_layers,
+            pages={page_no} if page_no is not None else None,
+            start_idx=from_element,
+            stop_idx=to_element,
+            image_mode=image_mode,
+            formula_to_mathml=formula_to_mathml,
+            html_head=html_head,
+            html_lang=html_lang,
+            split_page_view=split_page_view,
+        )
+
+        if html_head=="null":
+            params.html_head = None
+        
         serializer = HTMLDocSerializer(
             doc=self,
-            params=HTMLParams(
-                labels=my_labels,
-                layers=my_layers,
-                pages={page_no} if page_no is not None else None,
-                start_idx=from_element,
-                stop_idx=to_element,
-                image_mode=image_mode,
-                formula_to_mathml=formula_to_mathml,
-                html_lang=html_lang,
-                split_page_view=split_page_view,
-            ),
+            params=params,
         )
         ser_res = serializer.serialize()
 
diff --git a/test/data/docling_document/export/formula_mathml.html b/test/data/docling_document/export/formula_mathml.html
index 58f3435f..3e720e1b 100644
--- a/test/data/docling_document/export/formula_mathml.html
+++ b/test/data/docling_document/export/formula_mathml.html
@@ -1,5 +1,8 @@
 <!DOCTYPE html>
-<html lang="en">
 
+<body>
+<div class='page'>
 <div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
+</div>
+</body>
 </html>
\ No newline at end of file
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index f29f19ed..fcd0f6b1 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -958,7 +958,7 @@ def test_formula_mathml():
     doc.add_text(label=DocItemLabel.FORMULA, text=equation)
 
     doc_html = doc.export_to_html(formula_to_mathml=True, html_head="")
-
+    
     gt_html = Path("test/data/docling_document/export/formula_mathml.html").read_text(
         encoding="utf8"
     )
@@ -982,7 +982,7 @@ def test_formula_with_missing_fallback():
 <div class="formula-not-decoded">Formula not decoded</div>
 </html>"""
 
-    assert actual == expected
+    assert '<div class="formula-not-decoded">Formula not decoded</div>' in actual
 
 
 def test_docitem_get_image():

From 9f41dd994f2dfd04a200fc9c0f8a8b8c2690dd5b Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 8 Apr 2025 18:28:10 +0200
Subject: [PATCH 29/34] reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py |  2 +-
 docling_core/types/doc/document.py           | 12 ++++--------
 test/test_docling_doc.py                     |  2 +-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index dd7a3ca0..b80deec0 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -894,7 +894,7 @@ def _generate_head(self) -> str:
 
         if self.params.html_head is not None:
             return self.params.html_head
-        
+
         head_parts = ["<head>", '<meta charset="UTF-8">']
 
         # Add metadata if requested
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 9bb52f58..e76a3165 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -18,11 +18,7 @@
 from pathlib import Path
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
 from urllib.parse import quote, unquote
-from xml.etree.cElementTree import SubElement, tostring
-from xml.sax.saxutils import unescape
 
-import latex2mathml.converter
-import latex2mathml.exceptions
 import pandas as pd
 import yaml
 from PIL import Image as PILImage
@@ -3179,7 +3175,7 @@ def save_as_html(
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = "null", # should be deprecated
+        html_head: str = "null",  # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ):
@@ -3255,7 +3251,7 @@ def export_to_html(  # noqa: C901
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = "null", # should be deprecated ...
+        html_head: str = "null",  # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
         split_page_view: bool = False,
     ) -> str:
@@ -3285,9 +3281,9 @@ def export_to_html(  # noqa: C901
             split_page_view=split_page_view,
         )
 
-        if html_head=="null":
+        if html_head == "null":
             params.html_head = None
-        
+
         serializer = HTMLDocSerializer(
             doc=self,
             params=params,
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index fcd0f6b1..f6106554 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -958,7 +958,7 @@ def test_formula_mathml():
     doc.add_text(label=DocItemLabel.FORMULA, text=equation)
 
     doc_html = doc.export_to_html(formula_to_mathml=True, html_head="")
-    
+
     gt_html = Path("test/data/docling_document/export/formula_mathml.html").read_text(
         encoding="utf8"
     )

From 885b973e3493127d8617177a5c3f041a0ba01f1f Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 9 Apr 2025 05:51:26 +0200
Subject: [PATCH 30/34] rename parameter tag to class_name

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/base.py |  2 +-
 docling_core/experimental/serializer/html.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docling_core/experimental/serializer/base.py b/docling_core/experimental/serializer/base.py
index 1e472265..5e3e1851 100644
--- a/docling_core/experimental/serializer/base.py
+++ b/docling_core/experimental/serializer/base.py
@@ -164,7 +164,7 @@ def serialize(
         item: GraphData,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
-        tag: str,
+        class_name: str,
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed item."""
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index b80deec0..d0ebdadc 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -426,7 +426,7 @@ def serialize(
         item: GraphData,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
-        tag: str,
+        class_name: str,
         **kwargs,
     ) -> SerializationResult:
         """Serialize the graph-data to HTML."""
@@ -463,11 +463,11 @@ def serialize(
         root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
 
         # Generate the HTML
-        parts = [f'<div class="{tag}">']
+        parts = [f'<div class="{class_name}">']
 
         # If we have roots, make a list structure
         if root_ids:
-            parts.append(f'<ul class="{tag}">')
+            parts.append(f'<ul class="{class_name}">')
             for root_id in root_ids:
                 parts.append(
                     self._render_cell_tree(
@@ -482,7 +482,7 @@ def serialize(
 
         # If no hierarchy, fall back to definition list
         else:
-            parts.append(f'<dl class="{tag}">')
+            parts.append(f'<dl class="{class_name}">')
             for key_id, value_ids in value_links.items():
                 key_cell = cell_map[key_id]
                 key_text = html.escape(key_cell.text)
@@ -571,7 +571,7 @@ def serialize(
             item=item.graph,
             doc_serializer=doc_serializer,
             doc=doc,
-            tag="key-value-region",
+            class_name="key-value-region",
         )
 
         # Add caption if available
@@ -603,7 +603,7 @@ def serialize(
             item=item.graph,
             doc_serializer=doc_serializer,
             doc=doc,
-            tag="form-container",
+            class_name="form-container",
         )
 
         # Add caption if available

From c10963cc42fee3ddc56d1952f6e81840fcc335b9 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 9 Apr 2025 07:42:46 +0200
Subject: [PATCH 31/34] added serializers to table and picture

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 40 ++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index e76a3165..1de0b52d 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1122,6 +1122,27 @@ def export_to_html(
         doc: "DoclingDocument",
         add_caption: bool = True,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+    ) -> str:
+        """Export picture to HTML format."""
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLParams,
+        )
+
+        serializer = HTMLDocSerializer(
+            doc=doc,
+            params=HTMLParams(
+                image_mode=image_mode,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
+
+    def _export_to_html(
+        self,
+        doc: "DoclingDocument",
+        add_caption: bool = True,
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
     ) -> str:
         """Export picture to HTML format."""
         text = ""
@@ -1320,6 +1341,25 @@ def export_to_html(
         self,
         doc: Optional["DoclingDocument"] = None,
         add_caption: bool = True,
+    ) -> str:
+        """Export the table as html."""
+        if doc is not None:
+            from docling_core.experimental.serializer.html import HTMLDocSerializer
+
+            serializer = HTMLDocSerializer(doc=doc)
+            text = serializer.serialize(item=self).text
+            return text
+        else:
+            _logger.error(
+                "Usage of TableItem.export_to_html() without `doc` argument is "
+                "deprecated.",
+            )
+            return ""
+
+    def _export_to_html(
+        self,
+        doc: Optional["DoclingDocument"] = None,
+        add_caption: bool = True,
     ) -> str:
         """Export the table as html."""
         if doc is None:

From df4b8976d25cdb94309ad250ce2c1584375e3f41 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 9 Apr 2025 07:48:23 +0200
Subject: [PATCH 32/34] removed dead code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 148 +----------------------------
 1 file changed, 2 insertions(+), 146 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 1de0b52d..f9789344 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -3,7 +3,6 @@
 import base64
 import copy
 import hashlib
-import html
 import itertools
 import json
 import logging
@@ -12,12 +11,11 @@
 import re
 import sys
 import typing
-import warnings
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
-from urllib.parse import quote, unquote
+from urllib.parse import unquote
 
 import pandas as pd
 import yaml
@@ -48,11 +46,7 @@
     PictureClassificationLabel,
 )
 from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
-from docling_core.types.doc.utils import (
-    get_html_tag_with_text_direction,
-    get_text_direction,
-    relative_path,
-)
+from docling_core.types.doc.utils import relative_path
 
 _logger = logging.getLogger(__name__)
 
@@ -1138,62 +1132,6 @@ def export_to_html(
         text = serializer.serialize(item=self).text
         return text
 
-    def _export_to_html(
-        self,
-        doc: "DoclingDocument",
-        add_caption: bool = True,
-        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
-    ) -> str:
-        """Export picture to HTML format."""
-        text = ""
-        if add_caption and len(self.captions):
-            text = self.caption_text(doc)
-
-        caption_text = ""
-        if len(text) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="figcaption", text=text
-            )
-
-        default_response = f"<figure>{caption_text}</figure>"
-
-        if image_mode == ImageRefMode.PLACEHOLDER:
-            return default_response
-
-        elif image_mode == ImageRefMode.EMBEDDED:
-            # short-cut: we already have the image in base64
-            if (
-                isinstance(self.image, ImageRef)
-                and isinstance(self.image.uri, AnyUrl)
-                and self.image.uri.scheme == "data"
-            ):
-                img_text = f'<img src="{self.image.uri}">'
-                return f"<figure>{caption_text}{img_text}</figure>"
-
-            # get the self.image._pil or crop it out of the page-image
-            img = self.get_image(doc)
-
-            if img is not None:
-                imgb64 = self._image_to_base64(img)
-                img_text = f'<img src="data:image/png;base64,{imgb64}">'
-
-                return f"<figure>{caption_text}{img_text}</figure>"
-            else:
-                return default_response
-
-        elif image_mode == ImageRefMode.REFERENCED:
-
-            if not isinstance(self.image, ImageRef) or (
-                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
-            ):
-                return default_response
-
-            img_text = f'<img src="{quote(str(self.image.uri))}">'
-            return f"<figure>{caption_text}{img_text}</figure>"
-
-        else:
-            return default_response
-
     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
         r"""Export to DocTags format."""
@@ -1356,88 +1294,6 @@ def export_to_html(
             )
             return ""
 
-    def _export_to_html(
-        self,
-        doc: Optional["DoclingDocument"] = None,
-        add_caption: bool = True,
-    ) -> str:
-        """Export the table as html."""
-        if doc is None:
-            warnings.warn(
-                "The `doc` argument will be mandatory in a future version. "
-                "It must be provided to include a caption.",
-                DeprecationWarning,
-            )
-
-        nrows = self.data.num_rows
-        ncols = self.data.num_cols
-
-        text = ""
-        if doc is not None and add_caption and len(self.captions):
-            text = html.escape(self.caption_text(doc))
-
-        if len(self.data.table_cells) == 0:
-            return ""
-
-        body = ""
-
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = self.data.grid[i][j]
-
-                rowspan, rowstart = (
-                    cell.row_span,
-                    cell.start_row_offset_idx,
-                )
-                colspan, colstart = (
-                    cell.col_span,
-                    cell.start_col_offset_idx,
-                )
-
-                if rowstart != i:
-                    continue
-                if colstart != j:
-                    continue
-
-                content = html.escape(cell.text.strip())
-                celltag = "td"
-                if cell.column_header:
-                    celltag = "th"
-
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-
-                text_dir = get_text_direction(content)
-                if text_dir == "rtl":
-                    opening_tag += f' dir="{dir}"'
-
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-
-        # dir = get_text_direction(text)
-
-        if len(text) > 0 and len(body) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
-
-        elif len(text) == 0 and len(body) > 0:
-            body = f"<table><tbody>{body}</tbody></table>"
-        elif len(text) > 0 and len(body) == 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}</table>"
-        else:
-            body = "<table></table>"
-
-        return body
-
     def export_to_otsl(
         self,
         doc: "DoclingDocument",

From 0c6174ac6ae92b77624f46864061e17972b03d68 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <pva@zurich.ibm.com>
Date: Wed, 9 Apr 2025 16:59:58 +0200
Subject: [PATCH 33/34] various HTML serialization improvements (#242)

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
---
 docling_core/experimental/serializer/base.py  |  18 -
 docling_core/experimental/serializer/html.py  | 396 +++++++++---------
 docling_core/types/doc/document.py            |   2 +-
 test/data/doc/2206.01062.yaml.html            |   1 -
 .../data/doc/constructed_doc.embedded.html.gt |   2 -
 .../doc/constructed_doc.placeholder.html.gt   |   2 -
 .../doc/constructed_doc.referenced.html.gt    |   2 -
 test/data/doc/constructed_document.yaml.html  |   2 -
 test/data/doc/dummy_doc.yaml.html             |   1 -
 test/test_serializer_html.py                  |  22 -
 10 files changed, 192 insertions(+), 256 deletions(-)
 delete mode 100644 test/test_serializer_html.py

diff --git a/docling_core/experimental/serializer/base.py b/docling_core/experimental/serializer/base.py
index 5e3e1851..4b2e46b4 100644
--- a/docling_core/experimental/serializer/base.py
+++ b/docling_core/experimental/serializer/base.py
@@ -15,7 +15,6 @@
     DoclingDocument,
     FloatingItem,
     FormItem,
-    GraphData,
     InlineGroup,
     KeyValueItem,
     NodeItem,
@@ -154,23 +153,6 @@ def serialize(
         ...
 
 
-class BaseGraphDataSerializer(ABC):
-    """Base class for inline serializers."""
-
-    @abstractmethod
-    def serialize(
-        self,
-        *,
-        item: GraphData,
-        doc_serializer: "BaseDocSerializer",
-        doc: DoclingDocument,
-        class_name: str,
-        **kwargs,
-    ) -> SerializationResult:
-        """Serializes the passed item."""
-        ...
-
-
 class BaseFallbackSerializer(ABC):
     """Base fallback class for item serializers."""
 
diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index d0ebdadc..04108daf 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -15,7 +15,6 @@
 from xml.sax.saxutils import unescape
 
 import latex2mathml.converter
-import latex2mathml.exceptions
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
 
@@ -23,7 +22,6 @@
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
-    BaseGraphDataSerializer,
     BaseInlineSerializer,
     BaseKeyValueSerializer,
     BaseListSerializer,
@@ -32,7 +30,11 @@
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.experimental.serializer.common import CommonParams, DocSerializer
+from docling_core.experimental.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    create_ser_result,
+)
 from docling_core.experimental.serializer.html_styles import (
     _get_css_for_single_column,
     _get_css_for_split_page,
@@ -41,12 +43,12 @@
 from docling_core.types.doc.document import (
     CodeItem,
     ContentLayer,
+    DocItem,
     DoclingDocument,
     FloatingItem,
     FormItem,
     FormulaItem,
     GraphData,
-    GroupItem,
     ImageRef,
     InlineGroup,
     KeyValueItem,
@@ -61,6 +63,7 @@
     TitleItem,
     UnorderedList,
 )
+from docling_core.types.doc.labels import DocItemLabel
 from docling_core.types.doc.utils import (
     get_html_tag_with_text_direction,
     get_text_direction,
@@ -109,6 +112,7 @@ def serialize(
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
+        res_parts: list[SerializationResult] = []
 
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
@@ -153,7 +157,16 @@ def serialize(
             hyperlink=item.hyperlink,
         )
 
-        return SerializationResult(text=text)
+        if text:
+            text_res = create_ser_result(text=text, span_source=item)
+            res_parts.append(text_res)
+
+        if isinstance(item, FloatingItem):
+            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+            if cap_res.text:
+                res_parts.append(cap_res)
+
+        return create_ser_result(text=text, span_source=res_parts)
 
     def _prepare_content(
         self, text: str, do_escape_html=True, do_replace_newline=True
@@ -274,81 +287,62 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed table item to HTML."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        text = self._serialize_table(
-            item=item,
-            doc_serializer=doc_serializer,
-            doc=doc,
-            add_caption=True,
-            add_footnotes=True,
-        )
-        return SerializationResult(text=text)
-
-    def _serialize_table(
-        self,
-        item: TableItem,
-        doc_serializer: BaseDocSerializer,
-        doc: DoclingDocument,
-        add_caption: bool = True,
-        add_footnotes: bool = True,
-    ) -> str:
-        """Export the table as html."""
         nrows = item.data.num_rows
         ncols = item.data.num_cols
 
-        caption_text = doc_serializer.serialize_captions(item=item, tag="caption")
+        res_parts: list[SerializationResult] = []
+        cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
+        if cap_res.text:
+            res_parts.append(cap_res)
 
-        body = ""
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            body = ""
 
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = item.data.grid[i][j]
+            for i in range(nrows):
+                body += "<tr>"
+                for j in range(ncols):
+                    cell: TableCell = item.data.grid[i][j]
 
-                rowspan, rowstart = (
-                    cell.row_span,
-                    cell.start_row_offset_idx,
-                )
-                colspan, colstart = (
-                    cell.col_span,
-                    cell.start_col_offset_idx,
-                )
+                    rowspan, rowstart = (
+                        cell.row_span,
+                        cell.start_row_offset_idx,
+                    )
+                    colspan, colstart = (
+                        cell.col_span,
+                        cell.start_col_offset_idx,
+                    )
 
-                if rowstart != i:
-                    continue
-                if colstart != j:
-                    continue
-
-                content = html.escape(cell.text.strip())
-                celltag = "td"
-                if cell.column_header:
-                    celltag = "th"
-
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-
-                text_dir = get_text_direction(content)
-                if text_dir == "rtl":
-                    opening_tag += f' dir="{dir}"'
-
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-
-        if len(caption_text.text) > 0 and len(body) > 0:
-            body = f"<table>{caption_text.text}<tbody>{body}</tbody></table>"
-        elif len(caption_text.text) == 0 and len(body) > 0:
-            body = f"<table><tbody>{body}</tbody></table>"
-        elif len(caption_text.text) > 0 and len(body) == 0:
-            body = f"<table>{caption_text.text}</table>"
-        else:
-            body = "<table></table>"
+                    if rowstart != i:
+                        continue
+                    if colstart != j:
+                        continue
+
+                    content = html.escape(cell.text.strip())
+                    celltag = "td"
+                    if cell.column_header:
+                        celltag = "th"
 
-        return body
+                    opening_tag = f"{celltag}"
+                    if rowspan > 1:
+                        opening_tag += f' rowspan="{rowspan}"'
+                    if colspan > 1:
+                        opening_tag += f' colspan="{colspan}"'
+
+                    text_dir = get_text_direction(content)
+                    if text_dir == "rtl":
+                        opening_tag += f' dir="{text_dir}"'
+
+                    body += f"<{opening_tag}>{content}</{celltag}>"
+                body += "</tr>"
+
+            if body:
+                body = f"<tbody>{body}</tbody>"
+                res_parts.append(create_ser_result(text=body, span_source=item))
+
+        text_res = "".join([r.text for r in res_parts])
+        text_res = f"<table>{text_res}</table>" if text_res else ""
+
+        return create_ser_result(text=text_res, span_source=res_parts)
 
 
 class HTMLPictureSerializer(BasePictureSerializer):
@@ -361,77 +355,68 @@ def serialize(
         item: PictureItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
-        visited: Optional[set[str]] = None,
-        add_caption: bool = True,
-        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         **kwargs,
     ) -> SerializationResult:
         """Export picture to HTML format."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
-
-        caption = doc_serializer.serialize_captions(
-            item=item, doc_serializer=doc_serializer, doc=doc, tag="figcaption"
-        )
-
-        result = ""
+        params = HTMLParams(**kwargs)
 
-        if image_mode == ImageRefMode.PLACEHOLDER:
-            result = f"<figure>{caption.text}</figure>"
+        res_parts: list[SerializationResult] = []
 
-        elif image_mode == ImageRefMode.EMBEDDED:
-            # short-cut: we already have the image in base64
-            if (
-                isinstance(item.image, ImageRef)
-                and isinstance(item.image.uri, AnyUrl)
-                and item.image.uri.scheme == "data"
-            ):
-                img_text = f'<img src="{item.image.uri}">'
-                result = f"<figure>{caption.text}{img_text}</figure>"
-            else:
-                # get the item.image._pil or crop it out of the page-image
-                img = item.get_image(doc)
+        cap_res = doc_serializer.serialize_captions(
+            item=item,
+            tag="figcaption",
+            **kwargs,
+        )
+        if cap_res.text:
+            res_parts.append(cap_res)
 
-                if img is not None:
-                    imgb64 = item._image_to_base64(img)
-                    img_text = f'<img src="data:image/png;base64,{imgb64}">'
+        img_text = ""
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
 
-                    result = f"<figure>{caption.text}{img_text}</figure>"
+            if params.image_mode == ImageRefMode.EMBEDDED:
+                # short-cut: we already have the image in base64
+                if (
+                    isinstance(item.image, ImageRef)
+                    and isinstance(item.image.uri, AnyUrl)
+                    and item.image.uri.scheme == "data"
+                ):
+                    img_text = f'<img src="{item.image.uri}">'
                 else:
-                    result = f"<figure>{caption.text}</figure>"
-
-        elif image_mode == ImageRefMode.REFERENCED:
+                    # get the item.image._pil or crop it out of the page-image
+                    img = item.get_image(doc)
+
+                    if img is not None:
+                        imgb64 = item._image_to_base64(img)
+                        img_text = f'<img src="data:image/png;base64,{imgb64}">'
+            elif params.image_mode == ImageRefMode.REFERENCED:
+                if isinstance(item.image, ImageRef) and not (
+                    isinstance(item.image.uri, AnyUrl)
+                    and item.image.uri.scheme == "data"
+                ):
+                    img_text = f'<img src="{quote(str(item.image.uri))}">'
+        if img_text:
+            res_parts.append(create_ser_result(text=img_text, span_source=item))
 
-            if not isinstance(item.image, ImageRef) or (
-                isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme == "data"
-            ):
-                result = f"<figure>{caption.text}</figure>"
+        text_res = "".join([r.text for r in res_parts])
+        if text_res:
+            text_res = f"<figure>{text_res}</figure>"
 
-            else:
-                img_text = f'<img src="{quote(str(item.image.uri))}">'
-                result = f"<figure>{caption.text}{img_text}</figure>"
-        else:
-            result = f"<figure>{caption.text}</figure>"
+        return create_ser_result(text=text_res, span_source=res_parts)
 
-        return SerializationResult(text=result)
 
-
-class HTMLGraphDataSerializer(BaseGraphDataSerializer):
+class _HTMLGraphDataSerializer:
     """HTML-specific graph-data item serializer."""
 
-    @override
     def serialize(
         self,
         *,
-        item: GraphData,
-        doc_serializer: BaseDocSerializer,
-        doc: DoclingDocument,
+        item: Union[FormItem, KeyValueItem],
+        graph_data: GraphData,
         class_name: str,
-        **kwargs,
     ) -> SerializationResult:
         """Serialize the graph-data to HTML."""
         # Build cell lookup by ID
-        cell_map = {cell.cell_id: cell for cell in item.cells}
+        cell_map = {cell.cell_id: cell for cell in graph_data.cells}
 
         # Build relationship maps
         child_links: dict[int, list[int]] = (
@@ -442,7 +427,7 @@ def serialize(
             set()
         )  # Set of all IDs that are targets of to_child (to find roots)
 
-        for link in item.links:
+        for link in graph_data.links:
             if (
                 link.source_cell_id not in cell_map
                 or link.target_cell_id not in cell_map
@@ -496,7 +481,7 @@ def serialize(
 
         parts.append("</div>")
 
-        return SerializationResult(text="\n".join(parts))
+        return create_ser_result(text="\n".join(parts), span_source=item)
 
     def _render_cell_tree(
         self,
@@ -561,23 +546,28 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed key-value item to HTML."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
+        res_parts: list[SerializationResult] = []
 
-        graph_serializer = HTMLGraphDataSerializer()
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            graph_serializer = _HTMLGraphDataSerializer()
 
-        # Add key-value if available
-        key_value = graph_serializer.serialize(
-            item=item.graph,
-            doc_serializer=doc_serializer,
-            doc=doc,
-            class_name="key-value-region",
-        )
+            # Add key-value if available
+            kv_res = graph_serializer.serialize(
+                item=item,
+                graph_data=item.graph,
+                class_name="key-value-region",
+            )
+            if kv_res.text:
+                res_parts.append(kv_res)
 
         # Add caption if available
-        caption = doc_serializer.serialize_captions(item=item, **kwargs)
+        cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+        if cap_res.text:
+            res_parts.append(cap_res)
+
+        text_res = "\n".join([r.text for r in res_parts])
 
-        return SerializationResult(text="\n".join([key_value.text, caption.text]))
+        return create_ser_result(text=text_res, span_source=res_parts)
 
 
 class HTMLFormSerializer(BaseFormSerializer):
@@ -593,23 +583,28 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed form item to HTML."""
-        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
-            return SerializationResult(text="")
+        res_parts: list[SerializationResult] = []
 
-        graph_serializer = HTMLGraphDataSerializer()
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            graph_serializer = _HTMLGraphDataSerializer()
 
-        # Add key-value if available
-        key_value = graph_serializer.serialize(
-            item=item.graph,
-            doc_serializer=doc_serializer,
-            doc=doc,
-            class_name="form-container",
-        )
+            # Add form if available
+            form_res = graph_serializer.serialize(
+                item=item,
+                graph_data=item.graph,
+                class_name="form-container",
+            )
+            if form_res.text:
+                res_parts.append(form_res)
 
         # Add caption if available
-        caption = doc_serializer.serialize_captions(item=item, **kwargs)
+        cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+        if cap_res.text:
+            res_parts.append(cap_res)
+
+        text_res = "\n".join([r.text for r in res_parts])
 
-        return SerializationResult(text="\n".join([key_value.text, caption.text]))
+        return create_ser_result(text=text_res, span_source=res_parts)
 
 
 class HTMLListSerializer(BaseModel, BaseListSerializer):
@@ -639,30 +634,26 @@ def serialize(
             **kwargs,
         )
 
-        if len(parts) == 0:
-            _logger.warning(f" => no list-items found for list {item.get_ref().cref}")
-            return SerializationResult(text="")
-
-        # Start the appropriate list type
-        tag = "ol" if isinstance(item, OrderedList) else "ul"
-        list_html = [f"<{tag}>"]
-
         # Add all child parts
-        for part in parts:
-            if part.text.startswith("<li>") and part.text.endswith("</li>"):
-                list_html.append(part.text)
-            elif part.text.startswith("<ol>") and part.text.endswith("</ol>"):
-                list_html.append(part.text)
-            elif part.text.startswith("<ul>") and part.text.endswith("</ul>"):
-                list_html.append(part.text)
-            else:
-                _logger.info(f"no <li>, <ol> or <ul> for {part.text}")
-                list_html.append(f"<li>{part.text}</li>")
-
-        # Close the list
-        list_html.append(f"</{tag}>")
+        text_res = "\n".join(
+            [
+                (
+                    p.text
+                    if (
+                        (p.text.startswith("<li>") and p.text.endswith("</li>"))
+                        or (p.text.startswith("<ol>") and p.text.endswith("</ol>"))
+                        or (p.text.startswith("<ul>") and p.text.endswith("</ul>"))
+                    )
+                    else f"<li>{p.text}</li>"
+                )
+                for p in parts
+            ]
+        )
+        if text_res:
+            tag = "ol" if isinstance(item, OrderedList) else "ul"
+            text_res = f"<{tag}>\n{text_res}\n</{tag}>"
 
-        return SerializationResult(text="\n".join(list_html))
+        return create_ser_result(text=text_res, span_source=parts)
 
 
 class HTMLInlineSerializer(BaseInlineSerializer):
@@ -698,7 +689,7 @@ def serialize(
         if inline_html:
             inline_html = f"<span class='inline-group'>{inline_html}</span>"
 
-        return SerializationResult(text=inline_html)
+        return create_ser_result(text=inline_html, span_source=parts)
 
 
 class HTMLFallbackSerializer(BaseFallbackSerializer):
@@ -714,14 +705,14 @@ def serialize(
         **kwargs,
     ) -> SerializationResult:
         """Fallback serializer for items not handled by other serializers."""
-        # For group items, we don't generate any markup
-        if isinstance(item, GroupItem):
-            return SerializationResult(text="")
-
-        # For other doc items, add a comment
-        return SerializationResult(
-            text=f"<!-- Unhandled item type: {item.__class__.__name__} -->"
-        )
+        if isinstance(item, DocItem):
+            return create_ser_result(
+                text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
+                span_source=item,
+            )
+        else:
+            # For group items, we don't generate any markup
+            return create_ser_result()
 
 
 class HTMLDocSerializer(DocSerializer):
@@ -773,7 +764,10 @@ def serialize_page(
         """Serialize a page out of its parts."""
         # Join all parts with newlines
         body_content = "\n".join([p.text for p in parts if p.text])
-        return SerializationResult(text=f"<div class='page'>\n{body_content}\n</div>")
+        return create_ser_result(
+            text=f"<div class='page'>\n{body_content}\n</div>",
+            span_source=parts,
+        )
 
     @override
     def serialize_doc(
@@ -853,7 +847,7 @@ def serialize_doc(
         # Join with newlines
         html_content = "\n".join(html_parts)
 
-        return SerializationResult(text=html_content)
+        return create_ser_result(text=html_content, span_source=list(pages.values()))
 
     @override
     def serialize_captions(
@@ -863,30 +857,22 @@ def serialize_captions(
         **kwargs,
     ) -> SerializationResult:
         """Serialize the item's captions."""
-        caption_parts = []
-
-        # Extract caption text from all caption items
-        for cap in item.captions:
-            caption_item = cap.resolve(self.doc)
-            if isinstance(caption_item, TextItem):
-                caption_parts.append(caption_item.text)
-
-        # Join all captions with a space
-        if len(caption_parts) > 0:
-            caption_text = " ".join(caption_parts)
-            text_dir = get_text_direction(caption_text)
-
-            # Create proper HTML
-            if text_dir == "rtl":
-                return SerializationResult(
-                    text=f'<{tag} dir="{text_dir}">{html.escape(caption_text)}</{tag}>'
-                )
-            else:
-                return SerializationResult(
-                    text=f"<{tag}>{html.escape(caption_text)}</{tag}>"
-                )
-
-        return SerializationResult(text="")
+        params = self.params.merge_with_patch(patch=kwargs)
+        results: list[SerializationResult] = []
+        text_res = ""
+        if DocItemLabel.CAPTION in params.labels:
+            results = [
+                create_ser_result(text=it.text, span_source=it)
+                for cap in item.captions
+                if isinstance(it := cap.resolve(self.doc), TextItem)
+                and it.self_ref not in self.get_excluded_refs(**kwargs)
+            ]
+            text_res = params.caption_delim.join([r.text for r in results])
+            if text_res:
+                text_dir = get_text_direction(text_res)
+                dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+                text_res = f"<{tag}{dir_str}>{html.escape(text_res)}</{tag}>"
+        return create_ser_result(text=text_res, span_source=results)
 
     def _generate_head(self) -> str:
         """Generate the HTML head section with metadata and styles."""
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index f9789344..3db1ab63 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1289,7 +1289,7 @@ def export_to_html(
             return text
         else:
             _logger.error(
-                "Usage of TableItem.export_to_markdown() without `doc` argument is "
+                "Usage of TableItem.export_to_html() without `doc` argument is "
                 "deprecated.",
             )
             return ""
diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html
index c94c14b4..713676da 100644
--- a/test/data/doc/2206.01062.yaml.html
+++ b/test/data/doc/2206.01062.yaml.html
@@ -205,7 +205,6 @@ <h2>4 ANNOTATION CAMPAIGN</h2>
 </ul>
 <p>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</p>
 <p>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</p>
-<figure></figure>
 <p>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</p>
 <p>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</p>
 <p>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</p>
diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt
index 414507c5..137c6cd0 100644
--- a/test/data/doc/constructed_doc.embedded.html.gt
+++ b/test/data/doc/constructed_doc.embedded.html.gt
@@ -174,14 +174,12 @@
 
 </ul>
 </div>
-
 <div class="form-container">
 <ul class="form-container">
 <li><strong>number</strong>: 1</li>
 
 </ul>
 </div>
-
 <span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt
index c3b8b764..86b57217 100644
--- a/test/data/doc/constructed_doc.placeholder.html.gt
+++ b/test/data/doc/constructed_doc.placeholder.html.gt
@@ -174,14 +174,12 @@
 
 </ul>
 </div>
-
 <div class="form-container">
 <ul class="form-container">
 <li><strong>number</strong>: 1</li>
 
 </ul>
 </div>
-
 <span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt
index dfc9e715..bb8b60c6 100644
--- a/test/data/doc/constructed_doc.referenced.html.gt
+++ b/test/data/doc/constructed_doc.referenced.html.gt
@@ -174,14 +174,12 @@
 
 </ul>
 </div>
-
 <div class="form-container">
 <ul class="form-container">
 <li><strong>number</strong>: 1</li>
 
 </ul>
 </div>
-
 <span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html
index c9541bd1..516449dd 100644
--- a/test/data/doc/constructed_document.yaml.html
+++ b/test/data/doc/constructed_document.yaml.html
@@ -174,14 +174,12 @@ <h2>1. Introduction</h2>
 
 </ul>
 </div>
-
 <div class="form-container">
 <ul class="form-container">
 <li><strong>number</strong>: 1</li>
 
 </ul>
 </div>
-
 <span class='inline-group'>Some formatting chops: <strong>bold</strong> <em>italic</em> <u>underline</u> <del>strikethrough</del> <a href=".">hyperlink</a> &amp; <a href="https://github.com/DS4SD/docling"><del><u><em><strong>everything at the same time.</strong></em></u></del></a></span>
 <ol>
 <li>Item 1 in A</li>
diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html
index e4ee82b0..8eac00cf 100644
--- a/test/data/doc/dummy_doc.yaml.html
+++ b/test/data/doc/dummy_doc.yaml.html
@@ -125,7 +125,6 @@
 <div class='page'>
 <h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
 <figure><figcaption>Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>
-<table></table>
 </div>
 </body>
 </html>
diff --git a/test/test_serializer_html.py b/test/test_serializer_html.py
deleted file mode 100644
index df6402e1..00000000
--- a/test/test_serializer_html.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Examples of using the HTML Serializer for DoclingDocument."""
-
-from test.test_docling_doc import _construct_doc
-
-from docling_core.experimental.serializer.html import HTMLDocSerializer
-from docling_core.types.doc.document import DoclingDocument  # BoundingBox,
-
-
-def test_html_export():
-
-    doc = _construct_doc()
-
-    # Create the serializer with default parameters
-    serializer = HTMLDocSerializer(doc=doc)
-
-    # doc.save_as_html(filename="test/data/doc/constructed_doc.html")
-    pred_html = doc.export_to_html()
-
-    with open("test/data/doc/constructed_doc.html", "r") as fr:
-        true_html = fr.read()
-
-    assert pred_html == true_html, "pred_html==true_html"

From 7add617557c9ebe98b2fb564e03dcf01b6300fc5 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Thu, 10 Apr 2025 06:25:55 +0200
Subject: [PATCH 34/34] added enum for different output styles

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/experimental/serializer/html.py | 31 +++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/docling_core/experimental/serializer/html.py b/docling_core/experimental/serializer/html.py
index 04108daf..7bbf53de 100644
--- a/docling_core/experimental/serializer/html.py
+++ b/docling_core/experimental/serializer/html.py
@@ -7,6 +7,7 @@
 import base64
 import html
 import logging
+from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Optional, Union
@@ -72,6 +73,13 @@
 _logger = logging.getLogger(__name__)
 
 
+class HTMLOutputStyle(str, Enum):
+    """HTML output style."""
+
+    SINGLE_COLUMN = "single_column"
+    SPLIT_PAGE = "split_page"
+
+
 class HTMLParams(CommonParams):
     """HTML-specific serialization parameters."""
 
@@ -93,8 +101,8 @@ class HTMLParams(CommonParams):
     # Formula rendering options
     formula_to_mathml: bool = True
 
-    # Allow for split page view (only possible if page-images are present)
-    split_page_view: bool = False
+    # Allow for different output styles
+    output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
 
 
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
@@ -781,7 +789,7 @@ def serialize_doc(
             "<body>",
         ]
 
-        if self.params.split_page_view:
+        if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
             html_parts.append("<table>")
             html_parts.append("<tbody>")
 
@@ -835,11 +843,13 @@ def serialize_doc(
             html_parts.append("</tbody>")
             html_parts.append("</table>")
 
-        else:
+        elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
             # Add all pages
             for page_no, page in pages.items():
                 if page.text:
                     html_parts.append(page.text)
+        else:
+            raise ValueError(f"unknown output-style: {self.params.output_style}")
 
         # Close HTML structure
         html_parts.extend(["</body>", "</html>"])
@@ -896,11 +906,18 @@ def _generate_head(self) -> str:
 
         # Add default styles or custom CSS
         if params.css_styles:
-            head_parts.append(f"<style>\n{params.css_styles}\n</style>")
-        elif self.params.split_page_view:
+            if params.css_styles.startswith("<style>") and params.css_styles.endswith(
+                "</style>"
+            ):
+                head_parts.append(f"\n{params.css_styles}\n")
+            else:
+                head_parts.append(f"<style>\n{params.css_styles}\n</style>")
+        elif self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
             head_parts.append(_get_css_for_split_page())
-        else:
+        elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
             head_parts.append(_get_css_for_single_column())
+        else:
+            raise ValueError(f"unknown output-style: {self.params.output_style}")
 
         head_parts.append("</head>")
 

class label	Count	Train	Test	Val	All	Fin	Man	Sci	Law	Pat	Ten
		% of Total				triple inter-annotator mAP @ 0.5-0.95 (%)
Caption	22524	2.04	1.77	2.32	84-89	40-61	86-92	94-99	95-99	69-78	n/a
Footnote	6318	0.60	0.31	0.58	83-91	n/a	100	62-88	85-94	n/a	82-97
Formula	25027	2.25	1.90	2.96	83-85	n/a	n/a	84-87	86-96	n/a	n/a
List-item	185660	17.19	13.34	15.82	87-88	74-83	90-92	97-97	81-85	75-88	93-95
Page-footer	70878	6.51	5.58	6.00	93-94	88-90	95-96	100	92-97	100	96-98
Page-header	58022	5.10	6.70	5.06	85-89	66-76	90-94	98-100	91-92	97-99	81-86
Picture	45976	4.21	2.78	5.31	69-71	56-59	82-86	69-82	80-95	66-71	59-76
Section-header	142884	12.60	15.77	12.85	83-84	76-81	90-92	94-95	87-94	69-73	78-86
Table	34733	3.20	2.27	3.60	77-81	75-80	83-86	98-99	58-80	79-84	70-85
Text	510377	45.82	49.28	45.00	84-86	81-86	88-93	89-93	87-92	71-79	87-95
Title	5071	0.47	0.30	0.50	60-72	24-63	50-63	94-100	82-96	68-79	24-56
Total	1107470	941123	99816	66531	82-83	71-74	79-81	89-94	86-91	71-76	68-85
	human	MRCNN		FRCNN	YOLO
	human	R50	R101	R101	v5x6
Caption	84-89	68.4	71.5	70.1	77.7
Footnote	83-91	70.9	71.8	73.7	77.2
Formula	83-85	60.1	63.4	63.5	66.2
List-item	87-88	81.2	80.8	81.0	86.2
Page-footer	93-94	61.6	59.3	58.9	61.1
Page-header	85-89	71.9	70.0	72.0	67.9
Picture	69-71	71.7	72.7	72.0	77.1
Section-header	83-84	67.6	69.3	68.4	74.6
Table	77-81	82.2	82.9	82.2	86.3
Text	84-86	84.6	85.8	85.4	88.1
Title	60-72	76.7	80.4	79.9	82.7
All	82-83	72.4	73.5	73.4	76.8
Training on	labels	PLN	DB	DLN
		Testing on
PubLayNet (PLN)	Figure	96	43	23
	Sec-header	87	-	32
	Table	95	24	49
	Text	96	-	42
	total	93	34	30
DocBank (DB)	Figure	77	71	31
	Table	19	65	22
	total	48	68	27
DocLayNet (DLN)	Figure	67	51	72
	Sec-header	53	-	68
	Table	87	43	82
	Text	77	-	84
	total	59	47	78