From 6d236d83f60ab49dd47a010325e28372b404a8ba Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:52:18 +0200
Subject: [PATCH 1/2] fix(serializer): the HTML serializer should append nested
lists to list items
For a valid HTML document, the serializer should ensure that sub-lists are nested
inside their respective parent list items rather than emitted as siblings of them.
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
docling_core/transforms/serializer/html.py | 17 +++++++++++++++++
test/data/doc/constructed_doc.embedded.html.gt | 15 ++++++++++-----
test/data/doc/constructed_doc.html | 15 ++++++++++-----
.../doc/constructed_doc.placeholder.html.gt | 15 ++++++++++-----
.../data/doc/constructed_doc.referenced.html.gt | 15 ++++++++++-----
test/data/doc/constructed_document.yaml.html | 15 ++++++++++-----
test/data/doc/constructed_orig_false.gt.html | 15 ++++++++++-----
test/data/doc/constructed_orig_true.gt.html | 15 ++++++++++-----
8 files changed, 87 insertions(+), 35 deletions(-)
diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py
index 73a2fac7..7ef64847 100644
--- a/docling_core/transforms/serializer/html.py
+++ b/docling_core/transforms/serializer/html.py
@@ -713,6 +713,23 @@ def serialize(
**kwargs,
)
+ # Append nested list to parent list item:
+ i = 0
+ while i < len(parts):
+ prt = parts[i]
+ if prt.text.startswith(("
", "")):
+ for j in range(i - 1, -1, -1):
+ if parts[j].text.startswith(("- ", "
- "):
+ before, _, _ = parts[j].text.rpartition("
")
+ parts[j].text = f"{before}\n{prt.text}\n"
+ break
+ if j > -1:
+ parts.pop(i)
+ else:
+ i += 1
+
# Add all child parts
text_res = "\n".join(
[
diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt
index 22c85c92..e87ad306 100644
--- a/test/data/doc/constructed_doc.embedded.html.gt
+++ b/test/data/doc/constructed_doc.embedded.html.gt
@@ -135,15 +135,17 @@
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
diff --git a/test/data/doc/constructed_doc.html b/test/data/doc/constructed_doc.html
index c3b8b764..ff14a594 100644
--- a/test/data/doc/constructed_doc.html
+++ b/test/data/doc/constructed_doc.html
@@ -134,15 +134,17 @@ 1. Introduction
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -157,12 +159,13 @@ 1. Introduction
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
<p>Hello world</p> (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -186,16 +189,18 @@ 1. Introduction
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
The end.
diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt
index 55a9cee8..a4f0130a 100644
--- a/test/data/doc/constructed_doc.placeholder.html.gt
+++ b/test/data/doc/constructed_doc.placeholder.html.gt
@@ -135,15 +135,17 @@
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt
index 0ccda7cf..43649aee 100644
--- a/test/data/doc/constructed_doc.referenced.html.gt
+++ b/test/data/doc/constructed_doc.referenced.html.gt
@@ -135,15 +135,17 @@
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html
index 04754103..bc3e2333 100644
--- a/test/data/doc/constructed_document.yaml.html
+++ b/test/data/doc/constructed_document.yaml.html
@@ -135,15 +135,17 @@ 1. Introduction
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@ 1. Introduction
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@ 1. Introduction
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
diff --git a/test/data/doc/constructed_orig_false.gt.html b/test/data/doc/constructed_orig_false.gt.html
index 19f0398a..1f35636a 100644
--- a/test/data/doc/constructed_orig_false.gt.html
+++ b/test/data/doc/constructed_orig_false.gt.html
@@ -135,15 +135,17 @@ 1. Introduction
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@ 1. Introduction
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@ 1. Introduction
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
diff --git a/test/data/doc/constructed_orig_true.gt.html b/test/data/doc/constructed_orig_true.gt.html
index 04754103..bc3e2333 100644
--- a/test/data/doc/constructed_orig_true.gt.html
+++ b/test/data/doc/constructed_orig_true.gt.html
@@ -135,15 +135,17 @@ 1. Introduction
- list item 1
- list item 2
-- list item 3
+- list item 3
- list item 3.a
- list item 3.b
-- list item 3.c
+- list item 3.c
- list item 3.c.i
+
+
- list item 4
This is the caption of table 1.
| Product | Years |
| 2016 | 2017 |
| Apple | 49823 | 695944 |
@@ -158,12 +160,13 @@ 1. Introduction
- item 1 of neighboring list
-- item 2 of neighboring list
+- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet:
print("Hello world") (to be displayed inline)
- Here a formula: (to be displayed inline)
+
Here a code block:
print("Hello world")
@@ -185,16 +188,18 @@ 1. Introduction
- Item 1 in A
- Item 2 in A
-- Item 3 in A
+- Item 3 in A
- Item 1 in B
-- Item 2 in B
+- Item 2 in B
- Item 1 in C
- Item 2 in C
+
- Item 3 in B
+
- Item 4 in A
From 175e5f6018d823630559a287b82c2ef1f90d9feb Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Tue, 22 Jul 2025 17:08:27 +0200
Subject: [PATCH 2/2] test: address several warning messages
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
test/test_base.py | 4 ++--
test/test_docling_doc.py | 20 +++++++++++---------
test/test_doctags_load.py | 12 ++++++++----
3 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/test/test_base.py b/test/test_base.py
index 0fe09ab5..d6f07f51 100644
--- a/test/test_base.py
+++ b/test/test_base.py
@@ -36,8 +36,8 @@ def test_identifier():
)
# schema_json(): no need to set by_alias since it is True by the default
- tf = open("test/data/json_schemas/base_identifier.json", encoding="utf-8")
- gold_json = json.load(tf)
+ with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf:
+ gold_json = json.load(tf)
assert Identifier.model_json_schema() == gold_json
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 1858f5f6..e0aa622f 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1819,9 +1819,10 @@ def _verify(
# Test the handling of list items in insert_* methods, both with and without parent groups
- li_sibling = doc.insert_list_item(
- sibling=node, text="Inserted List Item, Incorrect Parent", after=False
- )
+ with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
+ li_sibling = doc.insert_list_item(
+ sibling=node, text="Inserted List Item, Incorrect Parent", after=False
+ )
doc.insert_list_item(
sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True
)
@@ -1831,12 +1832,13 @@ def _verify(
text="Inserted Text with LIST_ITEM Label, Correct Parent",
after=False,
)
- doc.insert_text(
- sibling=node,
- label=DocItemLabel.LIST_ITEM,
- text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
- after=True,
- )
+ with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
+ doc.insert_text(
+ sibling=node,
+ label=DocItemLabel.LIST_ITEM,
+ text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
+ after=True,
+ )
filename = Path(
"test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json"
diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py
index 5f1d5733..a6bf5bea 100644
--- a/test/test_doctags_load.py
+++ b/test/test_doctags_load.py
@@ -60,7 +60,8 @@ def test_doctags_load_from_files():
def test_doctags_load_from_memory():
- doctags = Path("test/data/doc/page_with_pic.dt").open("r").read()
+ with Path("test/data/doc/page_with_pic.dt").open() as file:
+ doctags = file.read()
image = PILImage.open(Path("test/data/doc/page_with_pic.png"))
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
@@ -75,7 +76,8 @@ def test_doctags_load_from_memory():
def test_doctags_load_without_image():
- doctags = Path("test/data/doc/page_with_pic.dt").open("r").read()
+ with Path("test/data/doc/page_with_pic.dt").open() as file:
+ doctags = file.read()
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], None)
doc = DoclingDocument.load_from_doctags(doctags_doc)
exp = "test/data/doc/page_without_pic.dt.json"
@@ -86,7 +88,8 @@ def test_doctags_load_without_image():
def test_doctags_load_for_kv_region():
- doctags = Path("test/data/doc/doc_with_kv.dt").open("r").read()
+ with Path("test/data/doc/doc_with_kv.dt").open() as file:
+ doctags = file.read()
image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
doc = DoclingDocument.load_from_doctags(doctags_doc)
@@ -98,7 +101,8 @@ def test_doctags_load_for_kv_region():
def test_multipage_doctags_load():
- doctags = Path("test/data/doc/2206.01062.yaml.dt").open("r").read()
+ with Path("test/data/doc/2206.01062.yaml.dt").open() as file:
+ doctags = file.read()
doctags_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, None)
doc = DoclingDocument.load_from_doctags(doctags_doc)
exp = "test/data/doc/2206.01062.yaml.dt.json"