From 28a984612b4a6a4f5a0700ef48c5570ae24bf88e Mon Sep 17 00:00:00 2001 From: Saidgurbuz Date: Wed, 12 Mar 2025 16:51:57 +0100 Subject: [PATCH 1/5] add kv_item support for doctag to docling_document Signed-off-by: Saidgurbuz --- docling_core/types/doc/document.py | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 094edda6..2b8dac2f 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -3189,6 +3189,82 @@ def parse_table_content(otsl_content: str) -> TableData: table_cells=table_cells, ) + def parse_key_value_item( + tokens: str, + ) -> Tuple[GraphData, Optional[ProvenanceItem]]: + start_locs_match = re.match(r"^\s*(\s*)+", tokens) + if start_locs_match: + overall_locs = start_locs_match.group(0) + overall_bbox = extract_bounding_box(overall_locs) + overall_prov = ( + ProvenanceItem(bbox=overall_bbox, charspan=(0, 0), page_no=1) + if overall_bbox + else None + ) + else: + overall_prov = None + + # here we assumed the labels as only key or value, later on we can update + # it to have unspecified, checkbox etc. + cell_pattern = re.compile( + r"<(?P