Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docling_core/experimental/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
NodeItem,
OrderedList,
PictureItem,
PictureTabularChartData,
SectionHeaderItem,
TableCell,
TableItem,
Expand Down Expand Up @@ -104,6 +105,9 @@ class HTMLParams(CommonParams):
# Allow for different output styles
output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN

# Enable charts to be printed into HTML as tables
enable_chart_tables: bool = True


class HTMLTextSerializer(BaseModel, BaseTextSerializer):
"""HTML-specific text item serializer."""
Expand Down Expand Up @@ -402,9 +406,28 @@ def serialize(
and item.image.uri.scheme == "data"
):
img_text = f'<img src="{quote(str(item.image.uri))}">'

if img_text:
res_parts.append(create_ser_result(text=img_text, span_source=item))

if params.enable_chart_tables:
# Check if picture has attached PictureTabularChartData
tabular_chart_annotations = [
ann
for ann in item.annotations
if isinstance(ann, PictureTabularChartData)
]
if len(tabular_chart_annotations) > 0:
temp_doc = DoclingDocument(name="temp")
temp_table = temp_doc.add_table(
data=tabular_chart_annotations[0].chart_data
)
html_table_content = temp_table.export_to_html(temp_doc)
if len(html_table_content) > 0:
res_parts.append(
create_ser_result(text=html_table_content, span_source=item)
)

text_res = "".join([r.text for r in res_parts])
if text_res:
text_res = f"<figure>{text_res}</figure>"
Expand Down
2 changes: 2 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3143,6 +3143,7 @@ def export_to_html( # noqa: C901
from_element: int = 0,
to_element: int = sys.maxsize,
labels: Optional[set[DocItemLabel]] = None,
enable_chart_tables: bool = True,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = True,
page_no: Optional[int] = None,
Expand Down Expand Up @@ -3176,6 +3177,7 @@ def export_to_html( # noqa: C901
start_idx=from_element,
stop_idx=to_element,
image_mode=image_mode,
enable_chart_tables=enable_chart_tables,
formula_to_mathml=formula_to_mathml,
html_head=html_head,
html_lang=html_lang,
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/barchart.dt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<doctag><page_header><loc_71><loc_14><loc_217><loc_20>Probability, Combinatorics and Control</page_header>
<chart><loc_102><loc_37><loc_392><loc_148><bar_chart><ched>Number of impellers<ched>single-frequency<ched>multi-frequency<nl><fcel>1<fcel>0.06<fcel>0.16<nl><fcel>2<fcel>0.12<fcel>0.26<nl><fcel>3<fcel>0.16<fcel>0.27<nl><fcel>4<fcel>0.14<fcel>0.26<nl><fcel>5<fcel>0.16<fcel>0.25<nl><fcel>6<fcel>0.24<fcel>0.24<nl></chart>
</doctag>
130 changes: 130 additions & 0 deletions test/data/doc/barchart.gt.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Document</title>
<meta name="generator" content="Docling HTML Serializer">
<style>
html {
background-color: #f5f5f5;
font-family: Arial, sans-serif;
line-height: 1.6;
}
body {
max-width: 800px;
margin: 0 auto;
padding: 2rem;
background-color: white;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1, h2, h3, h4, h5, h6 {
color: #333;
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 {
font-size: 2em;
border-bottom: 1px solid #eee;
padding-bottom: 0.3em;
}
table {
border-collapse: collapse;
margin: 1em 0;
width: 100%;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
th {
background-color: #f2f2f2;
font-weight: bold;
}
figure {
margin: 1.5em 0;
text-align: center;
}
figcaption {
color: #666;
font-style: italic;
margin-top: 0.5em;
}
img {
max-width: 100%;
height: auto;
}
pre {
background-color: #f6f8fa;
border-radius: 3px;
padding: 1em;
overflow: auto;
}
code {
font-family: monospace;
background-color: #f6f8fa;
padding: 0.2em 0.4em;
border-radius: 3px;
}
pre code {
background-color: transparent;
padding: 0;
}
.formula {
text-align: center;
padding: 0.5em;
margin: 1em 0;
background-color: #f9f9f9;
}
.formula-not-decoded {
text-align: center;
padding: 0.5em;
margin: 1em 0;
background: repeating-linear-gradient(
45deg,
#f0f0f0,
#f0f0f0 10px,
#f9f9f9 10px,
#f9f9f9 20px
);
}
.page-break {
page-break-after: always;
border-top: 1px dashed #ccc;
margin: 2em 0;
}
.key-value-region {
background-color: #f9f9f9;
padding: 1em;
border-radius: 4px;
margin: 1em 0;
}
.key-value-region dt {
font-weight: bold;
}
.key-value-region dd {
margin-left: 1em;
margin-bottom: 0.5em;
}
.form-container {
border: 1px solid #ddd;
padding: 1em;
border-radius: 4px;
margin: 1em 0;
}
.form-item {
margin-bottom: 0.5em;
}
.image-classification {
font-size: 0.9em;
color: #666;
margin-top: 0.5em;
}
</style>
</head>
<body>
<div class='page'>
<figure><table><tbody><tr><td>Number of impellers</td><td>single-frequency</td><td>multi-frequency</td></tr><tr><td>1</td><td>0.06</td><td>0.16</td></tr><tr><td>2</td><td>0.12</td><td>0.26</td></tr><tr><td>3</td><td>0.16</td><td>0.27</td></tr><tr><td>4</td><td>0.14</td><td>0.26</td></tr><tr><td>5</td><td>0.16</td><td>0.25</td></tr><tr><td>6</td><td>0.24</td><td>0.24</td></tr></tbody></table></figure>
</div>
</body>
</html>
10 changes: 10 additions & 0 deletions test/data/doc/barchart.gt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!-- image -->

| Number of impellers | single-frequency | multi-frequency |
|-----------------------|--------------------|-------------------|
| 1 | 0.06 | 0.16 |
| 2 | 0.12 | 0.26 |
| 3 | 0.16 | 0.27 |
| 4 | 0.14 | 0.26 |
| 5 | 0.16 | 0.25 |
| 6 | 0.24 | 0.24 |
1 change: 1 addition & 0 deletions test/data/doc/barchart.json

Large diffs are not rendered by default.

Binary file added test/data/doc/barchart.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 15 additions & 1 deletion test/test_doctags_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from PIL import Image as PILImage

from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from docling_core.types.doc.document import DocTagsDocument, PictureTabularChartData


def test_doctags_load_from_files():
Expand Down Expand Up @@ -55,6 +55,20 @@ def test_multipage_doctags_load():
# print(doc.export_to_html())


def test_doctags_chart():
    """Check that a chart loaded from DocTags carries tabular chart data.

    Builds a document from the ``barchart.dt`` / ``barchart.png`` pair and
    asserts that every resulting picture has at least one
    ``PictureTabularChartData`` annotation.
    """
    doc = DoclingDocument(name="Document")
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
        [Path("test/data/doc/barchart.dt")],
        [Path("test/data/doc/barchart.png")],
    )
    doc.load_from_doctags(doctags_doc)

    # Guard against a vacuous pass: if no picture was produced at all, the
    # per-picture loop below would never execute a single assertion.
    assert len(doc.pictures) > 0, "expected at least one picture from barchart.dt"

    for pic in doc.pictures:
        assert any(
            isinstance(ann, PictureTabularChartData) for ann in pic.annotations
        ), "picture is missing a PictureTabularChartData annotation"


def test_doctags_table_provenances_and_captions():
doc = DoclingDocument(name="Document")
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
Expand Down
28 changes: 28 additions & 0 deletions test/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,34 @@ def test_md_cross_page_list_page_break_p2():
verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)


def test_html_charts():
    """Serialize the bar-chart fixture to HTML and verify the output.

    The serialized text is passed to ``verify`` together with the path of the
    sibling ``barchart.gt.html`` file.
    """
    chart_json = Path("./test/data/doc/barchart.json")
    chart_doc = DoclingDocument.load_from_json(chart_json)

    html_params = HTMLParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = HTMLDocSerializer(doc=chart_doc, params=html_params)
    html_text = serializer.serialize().text

    expected_path = chart_json.parent / f"{chart_json.stem}.gt.html"
    verify(exp_file=expected_path, actual=html_text)


def test_md_charts():
    """Serialize the bar-chart fixture to Markdown and verify the output.

    The serialized text is passed to ``verify`` together with the path of the
    sibling ``barchart.gt.md`` file.
    """
    chart_json = Path("./test/data/doc/barchart.json")
    chart_doc = DoclingDocument.load_from_json(chart_json)

    md_params = MarkdownParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = MarkdownDocSerializer(doc=chart_doc, params=md_params)
    md_text = serializer.serialize().text

    expected_path = chart_json.parent / f"{chart_json.stem}.gt.md"
    verify(exp_file=expected_path, actual=md_text)


def test_html_cross_page_list_page_break():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
Expand Down