Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3017,6 +3017,7 @@ def load_from_doctags( # noqa: C901
"list_item": DocItemLabel.LIST_ITEM,
"footnote": DocItemLabel.FOOTNOTE,
"code": DocItemLabel.CODE,
"key_value_region": DocItemLabel.KEY_VALUE_REGION,
}

def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
Expand Down Expand Up @@ -3189,6 +3190,95 @@ def parse_table_content(otsl_content: str) -> TableData:
table_cells=table_cells,
)

def parse_key_value_item(
    tokens: str, image: Optional[PILImage.Image] = None
) -> Tuple[GraphData, Optional[ProvenanceItem]]:
    """Parse one ``<key_value_region>`` chunk into graph data.

    Every ``<key_N>...</key_N>`` / ``<value_N>...</value_N>`` element becomes
    a ``GraphCell`` whose ``cell_id`` is ``N``. Every ``<link_M>`` token found
    inside a cell becomes a ``GraphLink`` from that cell to cell ``M``.

    Args:
        tokens: The doctags text of the region, including its wrapping tags.
        image: Page image. When given, its width/height are passed to
            ``resize_by_scale`` for every bounding box; when ``None`` no
            provenance is produced at all.

    Returns:
        A tuple ``(graph, region_prov)``. ``region_prov`` is built from the
        text that precedes the first cell and is ``None`` when there is no
        image or no bounding box could be extracted from that text.
    """
    if image is not None:
        pg_width, pg_height = image.width, image.height
    else:
        pg_width, pg_height = 1, 1

    def _make_prov(chunk: str) -> Optional[ProvenanceItem]:
        # Provenance is only meaningful when there is a page image to scale to.
        if image is None:
            return None
        bbox = extract_bounding_box(chunk)
        if bbox is None:
            return None
        # NOTE(review): page_no is hard-coded to 1, so regions coming from
        # later pages of a multi-page document get the wrong page number --
        # confirm with the caller whether the real page number should be
        # threaded through.
        return ProvenanceItem(
            bbox=bbox.resize_by_scale(pg_width, pg_height),
            charspan=(0, 0),
            page_no=1,
        )

    # The region's own location tokens sit between the opening tag and the
    # first cell. Stop at the first key OR value cell (or at the closing tag
    # of an empty region) so a region that starts with a value cell does not
    # leak that cell's tokens into the region chunk. DOTALL keeps this in
    # line with the cell pattern below when the text contains line breaks.
    region_head = re.search(
        r"<key_value_region>(.*?)<(?:(?:key|value)_\d+|/key_value_region)>",
        tokens,
        re.DOTALL,
    )
    overall_prov = _make_prov(region_head.group(1)) if region_head else None

    # Only "key" and "value" labels are recognised for now; other kinds
    # (unspecified, checkbox, ...) can be added to the alternation later.
    cell_pattern = re.compile(
        r"<(?P<label>key|value)_(?P<id>\d+)>"
        r"(?P<content>.*?)"
        r"</(?P=label)_(?P=id)>",
        re.DOTALL,
    )
    link_pattern = re.compile(r"<link_(\d+)>")
    loc_pattern = re.compile(r"<loc_\d+>")

    cells: List["GraphCell"] = []
    links: List["GraphLink"] = []
    raw_link_predictions: List[Tuple[int, int]] = []

    for cell_match in cell_pattern.finditer(tokens):
        cell_id = int(cell_match.group("id"))
        raw_content = cell_match.group("content")

        # Strip location and link tokens; what remains is the cell text.
        cleaned_text = link_pattern.sub(
            "", loc_pattern.sub("", raw_content)
        ).strip()

        cells.append(
            GraphCell(
                label=GraphCellLabel(cell_match.group("label")),
                cell_id=cell_id,
                text=cleaned_text,
                orig=cleaned_text,
                prov=_make_prov(raw_content),
                item_ref=None,
            )
        )

        # Links may point at cells that appear later in the text, so they
        # are only collected here and validated once all cells are known.
        raw_link_predictions.extend(
            (cell_id, int(target_str))
            for target_str in link_pattern.findall(raw_content)
        )

    # Built once, after the loop: it is only needed for link validation.
    cell_ids = {cell.cell_id for cell in cells}

    for source_id, target_id in raw_link_predictions:
        # Basic sanity check: drop links whose target cell was never parsed.
        if target_id not in cell_ids:
            continue
        links.append(
            GraphLink(
                label=GraphLinkLabel.TO_VALUE,
                source_cell_id=source_id,
                target_cell_id=target_id,
            )
        )

    return (GraphData(cells=cells, links=links), overall_prov)

# doc = DoclingDocument(name="Document")
for pg_idx, doctag_page in enumerate(doctag_document.pages):
page_doctags = doctag_page.tokens
Expand All @@ -3204,6 +3294,12 @@ def parse_table_content(otsl_content: str) -> TableData:
pg_width = 1
pg_height = 1

self.add_page(
page_no=page_no,
size=Size(width=pg_width, height=pg_height),
image=ImageRef.from_pil(image=image, dpi=72) if image else None,
)

"""
1. Finds all <tag>...</tag>
blocks in the entire string (multi-line friendly)
Expand All @@ -3224,6 +3320,7 @@ def parse_table_content(otsl_content: str) -> TableData:
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
rf"{DocumentToken.ORDERED_LIST.value}|"
rf"{DocumentToken.UNORDERED_LIST.value}|"
rf"{DocItemLabel.KEY_VALUE_REGION}|"
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
)

Expand Down Expand Up @@ -3301,6 +3398,11 @@ def parse_table_content(otsl_content: str) -> TableData:
parent=None,
)
pic.captions.append(caption_item.get_ref())
elif tag_name == DocItemLabel.KEY_VALUE_REGION:
key_value_data, kv_item_prov = parse_key_value_item(
full_chunk, image
)
self.add_key_values(graph=key_value_data, prov=kv_item_prov)
elif tag_name in [
DocumentToken.ORDERED_LIST.value,
DocumentToken.UNORDERED_LIST.value,
Expand Down
1 change: 1 addition & 0 deletions test/data/doc/doc_with_kv.dt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<doctag><key_value_region><loc_30><loc_50><loc_434><loc_444><key_0><loc_31><loc_51><loc_49><loc_60>TO:<link_4></key_0><key_1><loc_31><loc_70><loc_64><loc_80>FROM:<link_5></key_1><key_2><loc_453><loc_400><loc_469><loc_456>8623474</key_2><value_3><loc_408><loc_69><loc_423><loc_78>☑</value_3><value_4><loc_82><loc_51><loc_162><loc_61>Mrs. K. A. Sparrow</value_4><value_5><loc_84><loc_69><loc_130><loc_79>R. G. Ryan</value_5><value_6><loc_339><loc_70><loc_371><loc_78>JUNE7<link_3></value_6><value_7><loc_338><loc_78><loc_373><loc_87>AUG.2</value_7><value_8><loc_339><loc_88><loc_372><loc_96>OCT.7</value_8><key_9><loc_344><loc_50><loc_434><loc_60>SUBMISSION DATE:<link_6><link_8><link_7></key_9><key_10><loc_112><loc_106><loc_361><loc_117>NEWPORT LIGHTS HEAVY UP PROGRESS REPORT</key_10><key_11><loc_31><loc_134><loc_276><loc_144>EFFECTIVENESS OF DISTRIBUTION ALLOWANCE:<link_16><link_14><link_12></key_11><value_12><loc_30><loc_154><loc_190><loc_164>DIRECT ACCOUNT/ WHOLESALERS:<link_13></value_12><value_13><loc_32><loc_164><loc_397><loc_182>Distribution allowance was very effective in accomplishing our objectives. 
All accounts have purchased introductory products.</value_13><value_14><loc_31><loc_218><loc_156><loc_227>DIRECT ACCOUNT CHAINS:<link_15></value_14><value_15><loc_31><loc_228><loc_156><loc_238>Eagle Foods is the only Void.</value_15><value_16><loc_31><loc_276><loc_186><loc_285>NON- DIRECT ACCOUNT CHAINS:<link_17></value_16><value_17><loc_31><loc_286><loc_381><loc_295>Reception from these accounts is most positive with a solid incentitive to purchase.</value_17><key_18><loc_31><loc_331><loc_161><loc_360>EFFECTIVENESS OF THE RETAIL (1 00 OFF CARTON) DISTRIBUTION ALLOWANCE:<link_19></key_18><value_19><loc_185><loc_350><loc_429><loc_370>Has been most helpful in acquiring desireable distribution when needed by Sales Reps.</value_19><key_20><loc_31><loc_398><loc_155><loc_408>PROMOTIONAL ACTIVITY<link_21></key_20><value_21><loc_31><loc_417><loc_120><loc_436>40c OFF PACK- GENERAL MARKET:<link_22></value_21><value_22><loc_135><loc_426><loc_401><loc_444>The 40c off promotions continue to be well received at the retail stores and by consumers, as well.</value_22></key_value_region></doctag>
Binary file added test/data/doc/doc_with_kv.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes.
8 changes: 4 additions & 4 deletions test/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_table_export_to_tokens():
page_w=pagedims[page][0], page_h=pagedims[page][1]
)

fname = f"{filename}_table_{i}.doctags.txt"
fname = f"{filename}_table_{i}.dt.txt"
if GENERATE:
print(f"writing {fname}")
with open(fname, "w", encoding="utf-8") as gold_obj:
Expand All @@ -93,7 +93,7 @@ def test_table_export_to_tokens():
add_table_location=False, add_cell_location=False
)

fname = f"{filename}_table_{i}.doctags.txt"
fname = f"{filename}_table_{i}.dt.txt"
if GENERATE:
print(f"writing {fname}")
with open(fname, "w", encoding="utf-8") as gold_obj:
Expand Down Expand Up @@ -138,12 +138,12 @@ def test_document_export_to_tokens():

if GENERATE:
with open(
"test/data/legacy_doc/doc-export.doctags.txt", "w", encoding="utf-8"
"test/data/legacy_doc/doc-export.dt.txt", "w", encoding="utf-8"
) as gold_obj:
gold_obj.write(xml)

with open(
"test/data/legacy_doc/doc-export.doctags.txt", "r", encoding="utf-8"
"test/data/legacy_doc/doc-export.dt.txt", "r", encoding="utf-8"
) as gold_obj:
gold_data = gold_obj.read().strip()

Expand Down
13 changes: 11 additions & 2 deletions test/test_doctags_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_doctags_load_from_files():
doc = DoclingDocument(name="Document")

doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
[Path("test/data/doc/page_with_pic.doctags")],
[Path("test/data/doc/page_with_pic.dt")],
[Path("test/data/doc/page_with_pic.png")],
)

Expand All @@ -21,10 +21,19 @@ def test_doctags_load_from_files():
def test_doctags_load_from_memory():
    """Load a doctags page from an in-memory string and an in-memory image."""
    doc = DoclingDocument(name="Document")

    # read_text closes the file handle and pins the encoding instead of
    # relying on the platform default.
    doctags = Path("test/data/doc/page_with_pic.dt").read_text(encoding="utf-8")
    image = PILImage.open(Path("test/data/doc/page_with_pic.png"))

    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])

    doc.load_from_doctags(doctags_doc)


def test_doctags_load_for_kv_region():
    """Load a doctags page that contains a ``<key_value_region>`` element."""
    doc = DoclingDocument(name="Document")

    # The fixture contains non-ASCII text, so decode it explicitly as UTF-8;
    # read_text also closes the file handle.
    doctags = Path("test/data/doc/doc_with_kv.dt").read_text(encoding="utf-8")
    image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))

    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    doc.load_from_doctags(doctags_doc)