Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
573 changes: 293 additions & 280 deletions docling_core/types/doc/document.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docling_core/types/doc/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
SIGNATURE = "signature"
STAMP = "stamp"
QR_CODE = "qr_code"
BAR_CODE = "bat_code"
BAR_CODE = "bar_code"
SCREENSHOT = "screenshot"

# Geology/Geography
Expand Down
59 changes: 28 additions & 31 deletions docling_core/types/doc/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
from enum import Enum
from typing import Tuple

from docling_core.types.doc.labels import PictureClassificationLabel


class TableToken(Enum):
"""Class to represent an LLM friendly representation of a Table."""

CELL_LABEL_COLUMN_HEADER = "<column_header>"
CELL_LABEL_ROW_HEADER = "<row_header>"
CELL_LABEL_SECTION_HEADERE = "<section_header>"
CELL_LABEL_SECTION_HEADER = "<shed>"
CELL_LABEL_DATA = "<data>"

OTSL_ECEL = "<ecel>" # empty cell
Expand Down Expand Up @@ -42,8 +44,8 @@ def is_known_token(label):
class DocumentToken(Enum):
"""Class to represent an LLM friendly representation of a Document."""

BEG_DOCUMENT = "<document>"
END_DOCUMENT = "</document>"
BEG_DOCUMENT = "<doctag>"
END_DOCUMENT = "</doctag>"

BEG_TITLE = "<title>"
END_TITLE = "</title>"
Expand All @@ -65,31 +67,35 @@ class DocumentToken(Enum):
END_AFFILIATIONS = "</affiliations>"
BEG_AFFILIATION = "<affiliation>"
END_AFFILIATION = "</affiliation>"

BEG_HEADER = "<section-header>"
END_HEADER = "</section-header>"
BEG_TEXT = "<text>"
END_TEXT = "</text>"
BEG_PARAGRAPH = "<paragraph>"
END_PARAGRAPH = "</paragraph>"
BEG_TABLE = "<table>"
END_TABLE = "</table>"
BEG_FIGURE = "<figure>"
END_FIGURE = "</figure>"
BEG_OTSL = "<otsl>"
END_OTSL = "</otsl>"
BEG_PICTURE = "<picture>"
END_PICTURE = "</picture>"
BEG_CAPTION = "<caption>"
END_CAPTION = "</caption>"
BEG_EQUATION = "<equation>"
END_EQUATION = "</equation>"
BEG_EQUATION = "<formula>"
END_EQUATION = "</formula>"
BEG_CODE = "<code>"
END_CODE = "</code>"
BEG_LIST = "<list>"
END_LIST = "</list>"
BEG_LISTITEM = "<list-item>"
END_LISTITEM = "</list-item>"

BEG_LINE_NUMBER = "<line_number>"
END_LINE_NUMBER = "</line_number>"
BEG_LOCATION = "<location>"
END_LOCATION = "</location>"
BEG_GROUP = "<group>"
END_GROUP = "</group>"

PAGE_BREAK = "<page_break>"

@classmethod
def get_special_tokens(
cls,
Expand All @@ -109,16 +115,14 @@ def get_special_tokens(
special_tokens += [f"<col_{i}>", f"</col_{i}>"]

for i in range(6):
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
special_tokens += [
f"<section_header_level_{i}>",
f"</section_header_level_{i}>",
]

# FIXME: this is a synonym of section header
for i in range(6):
special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

# Adding dynamically generated page-tokens
for i in range(0, max_pages + 1):
special_tokens.append(f"<page_{i}>")
special_tokens.append(f"</page_{i}>")
# Dynamically add picture classification tokens
for _, member in PictureClassificationLabel.__members__.items():
special_tokens.append(f"<{member}>")

# Adding dynamically generated location-tokens
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
Expand Down Expand Up @@ -148,9 +152,9 @@ def get_col_token(col: int, beg=bool) -> str:
return f"</col_{col}>"

@staticmethod
def get_page_token(page: int):
"""Function to get page tokens."""
return f"<page_{page}>"
def get_picture_classification_token(classification: str) -> str:
"""Function to get picture classification tokens."""
return f"<{classification}>"

@staticmethod
def get_location_token(val: float, rnorm: int = 100):
Expand All @@ -172,7 +176,6 @@ def get_location(
page_h: float,
xsize: int = 100,
ysize: int = 100,
page_i: int = -1,
):
"""Get the location string given the bbox and page dimensions."""
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
Expand All @@ -183,17 +186,11 @@ def get_location(
x1 = bbox[2] / page_w
y1 = bbox[3] / page_h

page_tok = ""
if page_i != -1:
page_tok = DocumentToken.get_page_token(page=page_i)

x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

loc_str = f"{DocumentToken.BEG_LOCATION.value}"
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
loc_str += f"{DocumentToken.END_LOCATION.value}"
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"

return loc_str
354 changes: 137 additions & 217 deletions test/data/doc/2206.01062.yaml.dt

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions test/data/doc/bad_doc.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<document>
<title>This is the title</title>
<doctag><title>This is the title</title>
<section_header_level_1>This is the first section</section_header_level_1>
</document>
</doctag>
26 changes: 7 additions & 19 deletions test/data/doc/constructed_doc.dt
Original file line number Diff line number Diff line change
@@ -1,32 +1,20 @@
<document>
<title>Title of the Document</title>
<doctag><title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
<text>Author 2
Affiliation 2</text>
<section_header_level_1>1. Introduction</section_header_level_1>
<text>This paper introduces the biggest invention ever made. ...</text>
<unordered_list>
<list_item>list item 1</list_item>
<unordered_list><list_item>list item 1</list_item>
<list_item>list item 2</list_item>
<list_item>list item 3</list_item>
<ordered_list>
<list_item>list item 3.a</list_item>
<ordered_list><list_item>list item 3.a</list_item>
<list_item>list item 3.b</list_item>
<list_item>list item 3.c</list_item>
</ordered_list>
<list_item>list item 4</list_item>
</unordered_list>
<table>
<caption>This is the caption of table 1.</caption>
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
<picture><caption>This is the caption of figure 1.</caption></picture>
<picture><caption>This is the caption of figure 2.</caption></picture>
</doctag>
26 changes: 7 additions & 19 deletions test/data/doc/constructed_doc.dt.gt
Original file line number Diff line number Diff line change
@@ -1,32 +1,20 @@
<document>
<title>Title of the Document</title>
<doctag><title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
<text>Author 2
Affiliation 2</text>
<section_header_level_1>1. Introduction</section_header_level_1>
<text>This paper introduces the biggest invention ever made. ...</text>
<unordered_list>
<list_item>list item 1</list_item>
<unordered_list><list_item>list item 1</list_item>
<list_item>list item 2</list_item>
<list_item>list item 3</list_item>
<ordered_list>
<list_item>list item 3.a</list_item>
<ordered_list><list_item>list item 3.a</list_item>
<list_item>list item 3.b</list_item>
<list_item>list item 3.c</list_item>
</ordered_list>
<list_item>list item 4</list_item>
</unordered_list>
<table>
<caption>This is the caption of table 1.</caption>
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
<picture><caption>This is the caption of figure 1.</caption></picture>
<picture><caption>This is the caption of figure 2.</caption></picture>
</doctag>
26 changes: 7 additions & 19 deletions test/data/doc/constructed_document.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,32 +1,20 @@
<document>
<title>Title of the Document</title>
<doctag><title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
<text>Author 2
Affiliation 2</text>
<section_header_level_1>1. Introduction</section_header_level_1>
<text>This paper introduces the biggest invention ever made. ...</text>
<unordered_list>
<list_item>list item 1</list_item>
<unordered_list><list_item>list item 1</list_item>
<list_item>list item 2</list_item>
<list_item>list item 3</list_item>
<ordered_list>
<list_item>list item 3.a</list_item>
<ordered_list><list_item>list item 3.a</list_item>
<list_item>list item 3.b</list_item>
<list_item>list item 3.c</list_item>
</ordered_list>
<list_item>list item 4</list_item>
</unordered_list>
<table>
<caption>This is the caption of table 1.</caption>
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
<picture><caption>This is the caption of figure 1.</caption></picture>
<picture><caption>This is the caption of figure 2.</caption></picture>
</doctag>
14 changes: 4 additions & 10 deletions test/data/doc/dummy_doc.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
<document>
<title><location><page_1><loc_8><loc_91><loc_81><loc_95></location>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
<figure>
<location><page_1><loc_59><loc_0><loc_91><loc_75></location>
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
</figure>
<table>
<location><page_1><loc_42><loc_57><loc_49><loc_61></location>
</table>
</document>
<doctag><title><loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
<picture><loc_297><loc_125><loc_457><loc_500><illustration><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
<otsl><loc_210><loc_196><loc_245><loc_213></otsl>
</doctag>
7 changes: 3 additions & 4 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,6 @@ def _test_serialize_and_reload(doc):


def _verify_regression_test(pred: str, filename: str, ext: str):

if os.path.exists(filename + f".{ext}") and not GENERATE:
with open(filename + f".{ext}", "r", encoding="utf-8") as fr:
gt_true = fr.read()
Expand All @@ -389,15 +388,15 @@ def _verify_regression_test(pred: str, filename: str, ext: str):


def _test_export_methods(doc: DoclingDocument, filename: str):
### Iterate all elements
# Iterate all elements
et_pred = doc.export_to_element_tree()
_verify_regression_test(et_pred, filename=filename, ext="et")

## Export stuff
# Export stuff
md_pred = doc.export_to_markdown()
_verify_regression_test(md_pred, filename=filename, ext="md")

# Test HTML export ...
# Test HTML export ...
html_pred = doc.export_to_html()
_verify_regression_test(html_pred, filename=filename, ext="html")

Expand Down