"
+
+ _logger.warning("Could not parse formula with MathML")
+
+ # Fallback options if we got here
+ if math_formula and is_inline_scope:
+ return f"{math_formula}"
+ elif math_formula and (not is_inline_scope):
+ f"
'
+
def _get_formula_image_fallback(
    self, item: TextItem, doc: DoclingDocument
) -> Optional[str]:
    """Try to get an image fallback for a formula.

    Args:
        item: The formula text item to render as an image.
        doc: The document passed to ``item.get_image`` to obtain the image.

    Returns:
        A ``<figure>`` wrapping an ``<img>`` element, or ``None`` when
        ``item.get_image`` yields no image.
    """
    item_image = item.get_image(doc=doc)
    if item_image is None:
        return None

    img_ref = ImageRef.from_pil(item_image, dpi=72)
    # Formula source routinely contains quotes and angle brackets, so it must
    # be escaped before it is placed inside an attribute value.
    # NOTE(review): the markup and the use of ``item.orig`` as alt text were
    # restored after tag stripping -- confirm against the intended output.
    alt_text = html.escape(item.orig, quote=True)
    return f'<figure><img src="{img_ref.uri}" alt="{alt_text}" /></figure>'
+
+
class HTMLTableSerializer(BaseTableSerializer):
    """HTML-specific table item serializer."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the passed table item to an HTML ``<table>``.

        Args:
            item: The table whose ``data.grid`` is rendered.
            doc_serializer: Serializer used for captions and for the set of
                excluded references.
            doc: The document the table belongs to (not read here).
            **kwargs: Forwarded to ``doc_serializer``.

        Returns:
            A result whose text is ``<table>...</table>`` holding the caption
            (if any) and a ``<tbody>`` (if the table is not excluded and has
            rows), or an empty string when there is nothing to emit.
        """
        nrows = item.data.num_rows
        ncols = item.data.num_cols

        res_parts: list[SerializationResult] = []
        cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
        if cap_res.text:
            res_parts.append(cap_res)

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            rows: list[str] = []

            for i in range(nrows):
                cells: list[str] = []
                for j in range(ncols):
                    cell: TableCell = item.data.grid[i][j]

                    # A spanning cell is visited at every grid position it
                    # covers; emit it only once, at its top-left anchor.
                    if cell.start_row_offset_idx != i:
                        continue
                    if cell.start_col_offset_idx != j:
                        continue

                    content = html.escape(cell.text.strip())
                    celltag = "th" if cell.column_header else "td"

                    opening_tag = celltag
                    if cell.row_span > 1:
                        opening_tag += f' rowspan="{cell.row_span}"'
                    if cell.col_span > 1:
                        opening_tag += f' colspan="{cell.col_span}"'

                    text_dir = get_text_direction(content)
                    if text_dir == "rtl":
                        # Must interpolate the detected direction; the bare
                        # name ``dir`` would render the builtin function.
                        opening_tag += f' dir="{text_dir}"'

                    cells.append(f"<{opening_tag}>{content}</{celltag}>")

                row_html = "".join(cells)
                rows.append(f"<tr>{row_html}</tr>")

            body = "".join(rows)
            if body:
                body = f"<tbody>{body}</tbody>"
                res_parts.append(create_ser_result(text=body, span_source=item))

        text_res = "".join(r.text for r in res_parts)
        if text_res:
            text_res = f"<table>{text_res}</table>"

        return create_ser_result(text=text_res, span_source=res_parts)
+
+
class HTMLPictureSerializer(BasePictureSerializer):
    """HTML-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Export a picture item to an HTML ``<figure>``.

        Args:
            item: The picture to serialize.
            doc_serializer: Serializer used for captions and for the set of
                excluded references.
            doc: The document, used to obtain the image when it has to be
                embedded and is not already a data URI.
            **kwargs: Used to build ``HTMLParams`` and forwarded to
                ``doc_serializer``.

        Returns:
            A result whose text is ``<figure>...</figure>`` containing the
            caption and/or the image, or an empty string if neither exists.
        """
        params = HTMLParams(**kwargs)

        res_parts: list[SerializationResult] = []

        cap_res = doc_serializer.serialize_captions(
            item=item,
            tag="figcaption",
            **kwargs,
        )
        if cap_res.text:
            res_parts.append(cap_res)

        # NOTE(review): the <img>/<figure> markup below was restored after tag
        # stripping -- confirm attribute layout against the intended output.
        img_text = ""
        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

            if params.image_mode == ImageRefMode.EMBEDDED:
                # short-cut: we already have the image in base64
                if (
                    isinstance(item.image, ImageRef)
                    and isinstance(item.image.uri, AnyUrl)
                    and item.image.uri.scheme == "data"
                ):
                    img_text = f'<img src="{item.image.uri}">'
                else:
                    # get the item.image._pil or crop it out of the page-image
                    img = item.get_image(doc)

                    if img is not None:
                        imgb64 = item._image_to_base64(img)
                        img_text = f'<img src="data:image/png;base64,{imgb64}">'
            elif params.image_mode == ImageRefMode.REFERENCED:
                # Only link real references; a data URI is not a reference.
                if isinstance(item.image, ImageRef) and not (
                    isinstance(item.image.uri, AnyUrl)
                    and item.image.uri.scheme == "data"
                ):
                    img_text = f'<img src="{item.image.uri}">'

        if img_text:
            res_parts.append(create_ser_result(text=img_text, span_source=item))

        text_res = "".join(r.text for r in res_parts)
        if text_res:
            text_res = f"<figure>{text_res}</figure>"

        return create_ser_result(text=text_res, span_source=res_parts)
+
+
+class _HTMLGraphDataSerializer:
+ """HTML-specific graph-data item serializer."""
+
+ def serialize(
+ self,
+ *,
+ item: Union[FormItem, KeyValueItem],
+ graph_data: GraphData,
+ class_name: str,
+ ) -> SerializationResult:
+ """Serialize the graph-data to HTML."""
+ # Build cell lookup by ID
+ cell_map = {cell.cell_id: cell for cell in graph_data.cells}
+
+ # Build relationship maps
+ child_links: dict[int, list[int]] = (
+ {}
+ ) # source_id -> list of child_ids (to_child)
+ value_links: dict[int, list[int]] = {} # key_id -> list of value_ids (to_value)
+ parents: set[int] = (
+ set()
+ ) # Set of all IDs that are targets of to_child (to find roots)
+
+ for link in graph_data.links:
+ if (
+ link.source_cell_id not in cell_map
+ or link.target_cell_id not in cell_map
+ ):
+ continue
+
+ if link.label.value == "to_child":
+ child_links.setdefault(link.source_cell_id, []).append(
+ link.target_cell_id
+ )
+ parents.add(link.target_cell_id)
+ elif link.label.value == "to_value":
+ value_links.setdefault(link.source_cell_id, []).append(
+ link.target_cell_id
+ )
+
+ # Find root cells (cells with no parent)
+ root_ids = [cell_id for cell_id in cell_map.keys() if cell_id not in parents]
+
+ # Generate the HTML
+ parts = [f'
']
+
+ # If we have roots, make a list structure
+ if root_ids:
+ parts.append(f'
")
+
+ # If no hierarchy, fall back to definition list
+ else:
+ parts.append(f'
')
+ for key_id, value_ids in value_links.items():
+ key_cell = cell_map[key_id]
+ key_text = html.escape(key_cell.text)
+ parts.append(f"
{key_text}
")
+
+ for value_id in value_ids:
+ value_cell = cell_map[value_id]
+ value_text = html.escape(value_cell.text)
+ parts.append(f"
{value_text}
")
+ parts.append("
")
+
+ parts.append("
")
+
+ return create_ser_result(text="\n".join(parts), span_source=item)
+
+ def _render_cell_tree(
+ self,
+ cell_id: int,
+ cell_map: dict,
+ child_links: dict,
+ value_links: dict,
+ level: int,
+ ) -> str:
+ """Recursively render a cell and its children as a nested list."""
+ cell = cell_map[cell_id]
+ cell_text = html.escape(cell.text)
+
+ # Format key-value pairs if this cell has values linked
+ if cell_id in value_links:
+ value_texts = []
+ for value_id in value_links[cell_id]:
+ if value_id in cell_map:
+ value_cell = cell_map[value_id]
+ value_texts.append(html.escape(value_cell.text))
+
+ cell_text = f"{cell_text}: {', '.join(value_texts)}"
+
+ # If this cell has children, create a nested list
+ if cell_id in child_links and child_links[cell_id]:
+ children_html = []
+ children_html.append(f"
"
+ else:
+ # Leaf node - just render the cell
+ # return f'
{cell_text}
'
+ return ""
+
+
class HTMLKeyValueSerializer(BaseKeyValueSerializer):
    """HTML-specific key-value item serializer."""

    @override
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a key-value item, followed by its caption, as HTML.

        The graph markup is produced only when the item is not among the
        serializer's excluded references; the caption is always attempted.
        Non-empty pieces are joined with a newline.
        """
        collected: list[SerializationResult] = []

        is_excluded = item.self_ref in doc_serializer.get_excluded_refs(**kwargs)
        if not is_excluded:
            # Key-value graph rendered inside a "key-value-region" container.
            graph_result = _HTMLGraphDataSerializer().serialize(
                item=item,
                graph_data=item.graph,
                class_name="key-value-region",
            )
            if graph_result.text:
                collected.append(graph_result)

        caption_result = doc_serializer.serialize_captions(item=item, **kwargs)
        if caption_result.text:
            collected.append(caption_result)

        joined = "\n".join(piece.text for piece in collected)
        return create_ser_result(text=joined, span_source=collected)
+
+
class HTMLFormSerializer(BaseFormSerializer):
    """HTML-specific form item serializer."""

    @override
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a form item, followed by its caption, as HTML.

        The graph markup is produced only when the item is not among the
        serializer's excluded references; the caption is always attempted.
        Non-empty pieces are joined with a newline.
        """
        collected: list[SerializationResult] = []

        is_excluded = item.self_ref in doc_serializer.get_excluded_refs(**kwargs)
        if not is_excluded:
            # Form graph rendered inside a "form-container" container.
            graph_result = _HTMLGraphDataSerializer().serialize(
                item=item,
                graph_data=item.graph,
                class_name="form-container",
            )
            if graph_result.text:
                collected.append(graph_result)

        caption_result = doc_serializer.serialize_captions(item=item, **kwargs)
        if caption_result.text:
            collected.append(caption_result)

        joined = "\n".join(piece.text for piece in collected)
        return create_ser_result(text=joined, span_source=collected)
+
+
class HTMLListSerializer(BaseModel, BaseListSerializer):
    """HTML-specific list serializer."""

    @override
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize a list group to an HTML ``<ol>`` or ``<ul>``.

        Args:
            item: The list group; an ``OrderedList`` yields ``<ol>``,
                anything else ``<ul>``.
            doc_serializer: Serializer used to obtain the child parts.
            doc: The document the list belongs to (not read here).
            list_level: Current nesting depth; children get ``list_level + 1``.
            is_inline_scope: Forwarded unchanged to the child parts.
            visited: Refs of already-visited items, shared with children.
            **kwargs: Forwarded to ``doc_serializer.get_parts``.

        Returns:
            A result whose text is the wrapped list, or an empty string when
            the list has no serialized children.
        """
        my_visited: set[str] = visited if visited is not None else set()

        # Get all child parts
        parts = doc_serializer.get_parts(
            item=item,
            list_level=list_level + 1,
            is_inline_scope=is_inline_scope,
            visited=my_visited,
            **kwargs,
        )

        # Children that already are a list item or a nested list are kept
        # verbatim; anything else is wrapped in <li> so the list stays valid.
        wrappers = (("<li>", "</li>"), ("<ol>", "</ol>"), ("<ul>", "</ul>"))

        def as_list_entry(text: str) -> str:
            for opening, closing in wrappers:
                if text.startswith(opening) and text.endswith(closing):
                    return text
            return f"<li>{text}</li>"

        text_res = "\n".join(as_list_entry(p.text) for p in parts)
        if text_res:
            tag = "ol" if isinstance(item, OrderedList) else "ul"
            text_res = f"<{tag}>\n{text_res}\n</{tag}>"

        return create_ser_result(text=text_res, span_source=parts)
+
+
class HTMLInlineSerializer(BaseInlineSerializer):
    """HTML-specific inline group serializer."""

    @override
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize an inline group to a single HTML ``<span>``.

        Args:
            item: The inline group to serialize.
            doc_serializer: Serializer used to obtain the child parts.
            doc: The document the group belongs to (not read here).
            list_level: Current list nesting depth, forwarded unchanged.
            visited: Refs of already-visited items, shared with children.
            **kwargs: Forwarded to ``doc_serializer.get_parts``.

        Returns:
            A result whose text is the children joined by single spaces and
            wrapped in a span, or an empty string when there is no content.
        """
        my_visited: set[str] = visited if visited is not None else set()

        # Get all parts with inline scope
        parts = doc_serializer.get_parts(
            item=item,
            list_level=list_level,
            is_inline_scope=True,
            visited=my_visited,
            **kwargs,
        )

        # Join all parts with a single space between them
        inline_html = " ".join(p.text for p in parts)

        # Wrap in a span only when there is something to wrap.
        # NOTE(review): the span markup and its class name were restored after
        # tag stripping -- confirm against the stylesheet in use.
        if inline_html:
            inline_html = f"<span class='inline-group'>{inline_html}</span>"

        return create_ser_result(text=inline_html, span_source=parts)
+
+
class HTMLFallbackSerializer(BaseFallbackSerializer):
    """HTML-specific fallback serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Fallback serializer for items not handled by other serializers.

        Returns:
            For a ``DocItem``, a result holding an HTML comment that names the
            unhandled item class; for any other node, an empty result.
        """
        if not isinstance(item, DocItem):
            # For group items, we don't generate any markup
            return create_ser_result()

        # Leave a visible trace in the output instead of dropping the item
        # silently. NOTE(review): the comment text was restored after markup
        # stripping -- confirm the exact wording.
        return create_ser_result(
            text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
            span_source=item,
        )
+
+
+class HTMLDocSerializer(DocSerializer):
+ """HTML-specific document serializer."""
+
+ text_serializer: BaseTextSerializer = HTMLTextSerializer()
+ table_serializer: BaseTableSerializer = HTMLTableSerializer()
+ picture_serializer: BasePictureSerializer = HTMLPictureSerializer()
+ key_value_serializer: BaseKeyValueSerializer = HTMLKeyValueSerializer()
+ form_serializer: BaseFormSerializer = HTMLFormSerializer()
+ fallback_serializer: BaseFallbackSerializer = HTMLFallbackSerializer()
+
+ list_serializer: BaseListSerializer = HTMLListSerializer()
+ inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
+
+ params: HTMLParams = HTMLParams()
+
@override
def serialize_bold(self, text: str, **kwargs) -> str:
    """Apply HTML-specific bold serialization.

    Args:
        text: Already-serialized content to emphasize; inserted as-is.

    Returns:
        ``text`` wrapped in a ``<strong>`` element.
    """
    return f"<strong>{text}</strong>"
+
@override
def serialize_italic(self, text: str, **kwargs) -> str:
    """Apply HTML-specific italic serialization.

    Args:
        text: Already-serialized content to emphasize; inserted as-is.

    Returns:
        ``text`` wrapped in an ``<em>`` element.
    """
    return f"<em>{text}</em>"
+
@override
def serialize_underline(self, text: str, **kwargs) -> str:
    """Apply HTML-specific underline serialization.

    Args:
        text: Already-serialized content to underline; inserted as-is.

    Returns:
        ``text`` wrapped in a ``<u>`` element.
    """
    return f"<u>{text}</u>"
+
@override
def serialize_strikethrough(self, text: str, **kwargs) -> str:
    """Apply HTML-specific strikethrough serialization.

    Args:
        text: Already-serialized content to strike through; inserted as-is.

    Returns:
        ``text`` wrapped in a ``<del>`` element.
    """
    return f"<del>{text}</del>"
+
@override
def serialize_hyperlink(
    self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
) -> str:
    """Apply HTML-specific hyperlink serialization.

    Args:
        text: Already-serialized link content; inserted as-is.
        hyperlink: Link target; its string form becomes the ``href``.

    Returns:
        ``text`` wrapped in an ``<a>`` element pointing at ``hyperlink``.
    """
    return f'<a href="{str(hyperlink)}">{text}</a>'
+
@override
def serialize_page(
    self, parts: list[SerializationResult], **kwargs
) -> SerializationResult:
    """Serialize a page out of its parts.

    Args:
        parts: Serialized items of the page; parts with empty text are
            skipped.

    Returns:
        A result whose text is the remaining parts, newline-separated, inside
        a page container element.
    """
    # Join all non-empty parts with newlines
    body_content = "\n".join(p.text for p in parts if p.text)
    # NOTE(review): the container markup was restored after tag stripping --
    # confirm the element and class name against the stylesheet in use.
    return create_ser_result(
        text=f"<div class='page'>\n{body_content}\n</div>",
        span_source=parts,
    )
+
+ @override
+ def serialize_doc(
+ self, pages: dict[Optional[int], SerializationResult], **kwargs
+ ) -> SerializationResult:
+ """Serialize a document out of its pages."""
+ # Create HTML structure
+ html_parts = [
+ "",
+ self._generate_head(),
+ "",
+ ]
+
+ if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
+ html_parts.append("
")
+ html_parts.append("")
+
+ for page_no, page in pages.items():
+
+ if isinstance(page_no, int):
+ page_img = self.doc.pages[page_no].image
+
+ html_parts.append("
")
+
+ html_parts.append("
")
+
+ # short-cut: we already have the image in base64
+ if (
+ (page_img is not None)
+ and isinstance(page_img, ImageRef)
+ and isinstance(page_img.uri, AnyUrl)
+ and page_img.uri.scheme == "data"
+ ):
+ img_text = f''
+ html_parts.append(f"{img_text}")
+
+ elif (page_img is not None) and (page_img._pil is not None):
+
+ buffered = BytesIO()
+ page_img._pil.save(
+ buffered, format="PNG"
+ ) # Save the image to the byte stream
+ img_bytes = buffered.getvalue() # Get the byte data
+
+ # Encode to Base64 and decode to string
+ img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+ img_text = f''
+
+ html_parts.append(f"{img_text}")
+ else:
+ html_parts.append("no page-image found")
+
+ html_parts.append("