From 6d236d83f60ab49dd47a010325e28372b404a8ba Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:52:18 +0200 Subject: [PATCH 1/2] fix(serializer): the HTML serializer should append nested lists to list items For a valid HTML document, the serializer should ensure that sub-list items are indented under their respective main list items. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling_core/transforms/serializer/html.py | 17 +++++++++++++++++ test/data/doc/constructed_doc.embedded.html.gt | 15 ++++++++++----- test/data/doc/constructed_doc.html | 15 ++++++++++----- .../doc/constructed_doc.placeholder.html.gt | 15 ++++++++++----- .../data/doc/constructed_doc.referenced.html.gt | 15 ++++++++++----- test/data/doc/constructed_document.yaml.html | 15 ++++++++++----- test/data/doc/constructed_orig_false.gt.html | 15 ++++++++++----- test/data/doc/constructed_orig_true.gt.html | 15 ++++++++++----- 8 files changed, 87 insertions(+), 35 deletions(-) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 73a2fac7..7ef64847 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -713,6 +713,23 @@ def serialize( **kwargs, ) + # Append nested list to parent list item: + i = 0 + while i < len(parts): + prt = parts[i] + if prt.text.startswith(("

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@
  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

Here a code block:

print("Hello world")
@@ -186,16 +189,18 @@

1. Introduction

  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

The end.

diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt index 55a9cee8..a4f0130a 100644 --- a/test/data/doc/constructed_doc.placeholder.html.gt +++ b/test/data/doc/constructed_doc.placeholder.html.gt @@ -135,15 +135,17 @@
This is the caption of table 1.
ProductYears
20162017
Apple49823695944
@@ -158,12 +160,13 @@

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@
  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@
  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@

1. Introduction

  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@

1. Introduction

  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A

Here a code block:

print("Hello world")
@@ -185,16 +188,18 @@

1. Introduction

  1. Item 1 in A
  2. Item 2 in A
  3. -
  4. Item 3 in A
  5. +
  6. Item 3 in A
    1. Item 1 in B
    2. -
    3. Item 2 in B
    4. +
    5. Item 2 in B
      1. Item 1 in C
      2. Item 2 in C
      +
    6. Item 3 in B
    +
  7. Item 4 in A