diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 094edda6..6353f12d 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -3017,6 +3017,7 @@ def load_from_doctags( # noqa: C901 "list_item": DocItemLabel.LIST_ITEM, "footnote": DocItemLabel.FOOTNOTE, "code": DocItemLabel.CODE, + "key_value_region": DocItemLabel.KEY_VALUE_REGION, } def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]: @@ -3189,6 +3190,95 @@ def parse_table_content(otsl_content: str) -> TableData: table_cells=table_cells, ) + def parse_key_value_item( + tokens: str, image: Optional[PILImage.Image] = None + ) -> Tuple[GraphData, Optional[ProvenanceItem]]: + if image is not None: + pg_width = image.width + pg_height = image.height + else: + pg_width = 1 + pg_height = 1 + + start_locs_match = re.search(r"(.*?)key|value)_(?P\d+)>" + r"(?P.*?)" + r"", + re.DOTALL, + ) + + cells: List["GraphCell"] = [] + links: List["GraphLink"] = [] + raw_link_predictions = [] + + for cell_match in cell_pattern.finditer(tokens): + cell_label_str = cell_match.group("label") # "key" or "value" + cell_id = int(cell_match.group("id")) + raw_content = cell_match.group("content") + + # link tokens + link_matches = re.findall(r"", raw_content) + + cell_bbox = extract_bounding_box(raw_content) if image else None + cell_prov = None + if cell_bbox is not None: + cell_prov = ProvenanceItem( + bbox=cell_bbox.resize_by_scale(pg_width, pg_height), + charspan=(0, 0), + page_no=1, + ) + + cleaned_text = re.sub(r"", "", raw_content) + cleaned_text = re.sub(r"", "", cleaned_text).strip() + + cell_obj = GraphCell( + label=GraphCellLabel(cell_label_str), + cell_id=cell_id, + text=cleaned_text, + orig=cleaned_text, + prov=cell_prov, + item_ref=None, + ) + cells.append(cell_obj) + + cell_ids = {cell.cell_id for cell in cells} + + for target_str in link_matches: + raw_link_predictions.append((cell_id, int(target_str))) + + cell_ids = {cell.cell_id for cell in cells} + + for source_id, target_id in raw_link_predictions: + # basic check to validate the prediction + if target_id not in cell_ids: + continue + link_obj = GraphLink( + label=GraphLinkLabel.TO_VALUE, + source_cell_id=source_id, + target_cell_id=target_id, + ) + links.append(link_obj) + + return (GraphData(cells=cells, links=links), overall_prov) + # doc = DoclingDocument(name="Document") for pg_idx, doctag_page in enumerate(doctag_document.pages): page_doctags = doctag_page.tokens @@ -3204,6 +3294,12 @@ def parse_table_content(otsl_content: str) -> TableData: pg_width = 1 pg_height = 1 + self.add_page( + page_no=page_no, + size=Size(width=pg_width, height=pg_height), + image=ImageRef.from_pil(image=image, dpi=72) if image else None, + ) + """ 1. Finds all ... blocks in the entire string (multi-line friendly) @@ -3224,6 +3320,7 @@ def parse_table_content(otsl_content: str) -> TableData: rf"{DocItemLabel.SECTION_HEADER}_level_1|" rf"{DocumentToken.ORDERED_LIST.value}|" rf"{DocumentToken.UNORDERED_LIST.value}|" + rf"{DocItemLabel.KEY_VALUE_REGION}|" rf"{DocumentToken.OTSL.value})>.*?" ) @@ -3301,6 +3398,11 @@ def parse_table_content(otsl_content: str) -> TableData: parent=None, ) pic.captions.append(caption_item.get_ref()) + elif tag_name == DocItemLabel.KEY_VALUE_REGION: + key_value_data, kv_item_prov = parse_key_value_item( + full_chunk, image + ) + self.add_key_values(graph=key_value_data, prov=kv_item_prov) elif tag_name in [ DocumentToken.ORDERED_LIST.value, DocumentToken.UNORDERED_LIST.value, diff --git a/test/data/doc/doc_with_kv.dt b/test/data/doc/doc_with_kv.dt new file mode 100644 index 00000000..d67e99d3 --- /dev/null +++ b/test/data/doc/doc_with_kv.dt @@ -0,0 +1 @@ +TO:FROM:8623474Mrs. K. A. SparrowR. G. RyanJUNE7AUG.2OCT.7SUBMISSION DATE:NEWPORT LIGHTS HEAVY UP PROGRESS REPORTEFFECTIVENESS OF DISTRIBUTION ALLOWANCE:DIRECT ACCOUNT/ WHOLESALERS:Distribution allowance was very effective in accomplishing our objectives. All accounts have purchased introductory products.DIRECT ACCOUNT CHAINS:Eagle Foods is the only Void.NON- DIRECT ACCOUNT CHAINS:Reception from these accounts is most positive with a solid incentitive to purchase.EFFECTIVENESS OF THE RETAIL (1 00 OFF CARTON) DISTRIBUTION ALLOWANCE:Has been most helpful in acquiring desireable distribution when needed by Sales Reps.PROMOTIONAL ACTIVITY40c OFF PACK- GENERAL MARKET:The 40c off promotions continue to be well received at the retail stores and by consumers, as well. \ No newline at end of file diff --git a/test/data/doc/doc_with_kv.png b/test/data/doc/doc_with_kv.png new file mode 100644 index 00000000..d24ee2ec Binary files /dev/null and b/test/data/doc/doc_with_kv.png differ diff --git a/test/data/doc/page_with_pic.doctags b/test/data/doc/page_with_pic.dt similarity index 100% rename from test/data/doc/page_with_pic.doctags rename to test/data/doc/page_with_pic.dt diff --git a/test/data/legacy_doc/doc-1.json_table_0.doctags.txt b/test/data/legacy_doc/doc-1.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-1.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-1.json_table_0.dt.txt diff --git a/test/data/legacy_doc/doc-2.json_table_0.doctags.txt b/test/data/legacy_doc/doc-2.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-2.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-2.json_table_0.dt.txt diff --git a/test/data/legacy_doc/doc-6.json_table_0.doctags.txt b/test/data/legacy_doc/doc-6.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-6.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-6.json_table_0.dt.txt diff --git a/test/data/legacy_doc/doc-7.json_table_0.doctags.txt b/test/data/legacy_doc/doc-7.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-7.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-7.json_table_0.dt.txt diff --git a/test/data/legacy_doc/doc-8.json_table_0.doctags.txt b/test/data/legacy_doc/doc-8.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-8.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-8.json_table_0.dt.txt diff --git a/test/data/legacy_doc/doc-export.doctags.txt b/test/data/legacy_doc/doc-export.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-export.doctags.txt rename to test/data/legacy_doc/doc-export.dt.txt diff --git a/test/data/legacy_doc/doc-export.json_table_0.doctags.txt b/test/data/legacy_doc/doc-export.json_table_0.dt.txt similarity index 100% rename from test/data/legacy_doc/doc-export.json_table_0.doctags.txt rename to test/data/legacy_doc/doc-export.json_table_0.dt.txt diff --git a/test/test_collection.py b/test/test_collection.py index 6fc18aed..843c9abc 100644 --- a/test/test_collection.py +++ b/test/test_collection.py @@ -70,7 +70,7 @@ def test_table_export_to_tokens(): page_w=pagedims[page][0], page_h=pagedims[page][1] ) - fname = f"{filename}_table_{i}.doctags.txt" + fname = f"{filename}_table_{i}.dt.txt" if GENERATE: print(f"writing {fname}") with open(fname, "w", encoding="utf-8") as gold_obj: @@ -93,7 +93,7 @@ def test_table_export_to_tokens(): add_table_location=False, add_cell_location=False ) - fname = f"{filename}_table_{i}.doctags.txt" + fname = f"{filename}_table_{i}.dt.txt" if GENERATE: print(f"writing {fname}") with open(fname, "w", encoding="utf-8") as gold_obj: @@ -138,12 +138,12 @@ def test_document_export_to_tokens(): if GENERATE: with open( - "test/data/legacy_doc/doc-export.doctags.txt", "w", encoding="utf-8" + "test/data/legacy_doc/doc-export.dt.txt", "w", encoding="utf-8" ) as gold_obj: gold_obj.write(xml) with open( - "test/data/legacy_doc/doc-export.doctags.txt", "r", encoding="utf-8" + "test/data/legacy_doc/doc-export.dt.txt", "r", encoding="utf-8" ) as gold_obj: gold_data = gold_obj.read().strip() diff --git a/test/test_doctags_load.py b/test/test_doctags_load.py index 5dd2146c..a570b769 100644 --- a/test/test_doctags_load.py +++ b/test/test_doctags_load.py @@ -10,7 +10,7 @@ def test_doctags_load_from_files(): doc = DoclingDocument(name="Document") doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( - [Path("test/data/doc/page_with_pic.doctags")], + [Path("test/data/doc/page_with_pic.dt")], [Path("test/data/doc/page_with_pic.png")], ) @@ -21,10 +21,19 @@ def test_doctags_load_from_files(): def test_doctags_load_from_memory(): doc = DoclingDocument(name="Document") - doctags = Path("test/data/doc/page_with_pic.doctags").open("r").read() + doctags = Path("test/data/doc/page_with_pic.dt").open("r").read() image = PILImage.open(Path("test/data/doc/page_with_pic.png")) doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) doc.load_from_doctags(doctags_doc) # print(doc.export_to_html()) + + +def test_doctags_load_for_kv_region(): + doc = DoclingDocument(name="Document") + doctags = Path("test/data/doc/doc_with_kv.dt").open("r").read() + image = PILImage.open(Path("test/data/doc/doc_with_kv.png")) + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) + doc.load_from_doctags(doctags_doc) + # print(doc.export_to_html())