From d506650982f5b1dce981c47e25799d1f13cbc27c Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 18 Nov 2025 10:57:50 +0100 Subject: [PATCH 1/5] Introduction of regions.py - convert regions into TableData Signed-off-by: Maksym Lysak --- docling_core/types/doc/regions.py | 337 ++++++++++++++++++++++++++++++ test/test_regions_to_table.py | 79 +++++++ 2 files changed, 416 insertions(+) create mode 100644 docling_core/types/doc/regions.py create mode 100644 test/test_regions_to_table.py diff --git a/docling_core/types/doc/regions.py b/docling_core/types/doc/regions.py new file mode 100644 index 00000000..e8a7f37b --- /dev/null +++ b/docling_core/types/doc/regions.py @@ -0,0 +1,337 @@ +# +# Copyright IBM Corp. 2025 - 2025 +# SPDX-License-Identifier: MIT +# + +"""Utils to work with region-defined tables.""" + +from typing import List, Optional, Protocol, Sequence, Set, Tuple + +from docling_core.types.doc.base import BoundingBox, CoordOrigin +from docling_core.types.doc.document import TableCell, TableData + + +def bbox_fraction_inside( + inner: BoundingBox, outer: BoundingBox, *, eps: float = 1.0e-9 +) -> float: + """Return the fraction of ``inner`` area that lies inside ``outer``.""" + area = inner.area() + if area <= eps: + return 0.0 + intersection = inner.intersection_area_with(outer) + return intersection / max(area, eps) + + +def bbox_contains( + inner: BoundingBox, outer: BoundingBox, *, threshold: float, eps: float = 1.0e-9 +) -> bool: + """Return ``True`` when ``inner`` is contained in ``outer`` above ``threshold``.""" + return bbox_fraction_inside(inner, outer, eps=eps) >= threshold + + +def bbox_iou(a: BoundingBox, b: BoundingBox, *, eps: float = 1.0e-6) -> float: + """Return the intersection over union between two bounding boxes.""" + return a.intersection_over_union(b, eps=eps) + + +class HasBoundingBox(Protocol): + """Protocol for objects exposing a bounding box.""" + + bbox: BoundingBox + + +def dedupe_bboxes( + elements: Sequence[BoundingBox], + *, + iou_threshold: float = 0.9, +) -> list[BoundingBox]: + """Return elements whose bounding boxes are unique within ``iou_threshold``.""" + deduped: list[BoundingBox] = [] + for element in elements: + if all(bbox_iou(element, kept) < iou_threshold for kept in deduped): + deduped.append(element) + return deduped + + +def is_bbox_within( + bbox_a: BoundingBox, bbox_b: BoundingBox, threshold: float = 0.5 +) -> bool: + """Return ``True`` when ``bbox_b`` lies within ``bbox_a`` above ``threshold``.""" + return bbox_contains(bbox_b, bbox_a, threshold=threshold) + + +def _process_table_headers( + bbox: BoundingBox, + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], +) -> Tuple[bool, bool, bool]: + c_column_header = False + c_row_header = False + c_row_section = False + + for col_header in col_headers: + if is_bbox_within(col_header, bbox): + c_column_header = True + for row_header in row_headers: + if is_bbox_within(row_header, bbox): + c_row_header = True + for row_section in row_sections: + if is_bbox_within(row_section, bbox): + c_row_section = True + return c_column_header, c_row_header, c_row_section + + +def bbox_intersection(a: BoundingBox, b: BoundingBox) -> Optional[BoundingBox]: + """Return the intersection of two bounding boxes or ``None`` when disjoint.""" + if a.coord_origin != b.coord_origin: + raise ValueError("BoundingBoxes have different CoordOrigin") + + left = max(a.l, b.l) + right = min(a.r, b.r) + + if a.coord_origin == CoordOrigin.TOPLEFT: + top = max(a.t, b.t) + bottom = min(a.b, b.b) + if right <= left or bottom <= top: + return None + return BoundingBox( + l=left, t=top, r=right, b=bottom, coord_origin=a.coord_origin + ) + + top = min(a.t, b.t) + bottom = max(a.b, b.b) + if right <= left or top <= bottom: + return None + return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=a.coord_origin) + + +def compute_cells( + rows: List[BoundingBox], + columns: List[BoundingBox], + merges: List[BoundingBox], + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], + row_overlap_threshold: float = 0.5, # how much of a row a merge must cover vertically + col_overlap_threshold: float = 0.5, # how much of a column a merge must cover horizontally +) -> List[TableCell]: + """Returns TableCell. Merged cells are aligned to grid boundaries. + + rows, columns, merges are lists of BoundingBox(l,t,r,b). + """ + rows.sort(key=lambda r: (r.t + r.b) / 2.0) + columns.sort(key=lambda c: (c.l + c.r) / 2.0) + + # n_rows, n_cols = len(rows), len(columns) + + def span_from_merge( + m: BoundingBox, lines: List[BoundingBox], axis: str, frac_threshold: float + ) -> Optional[Tuple[int, int]]: + """Map a merge bbox to an inclusive index span over rows or columns. + + axis='row' uses vertical overlap vs row height; axis='col' uses horizontal overlap vs col width. + If nothing meets threshold, pick the single best-overlapping line if overlap>0; else return None. + """ + idxs = [] + best_i, best_len = None, 0.0 + for i, elem in enumerate(lines): + inter = bbox_intersection(m, elem) + if not inter: + continue + if axis == "row": + overlap_len = inter.height + base = max(1e-9, elem.height) + else: + overlap_len = inter.width + base = max(1e-9, elem.width) + + frac = overlap_len / base + if frac >= frac_threshold: + idxs.append(i) + + if overlap_len > best_len: + best_len, best_i = overlap_len, i + + if idxs: + return min(idxs), max(idxs) + if best_i is not None and best_len > 0.0: + return best_i, best_i + return None + + cells: List[TableCell] = [] + covered: Set[Tuple[int, int]] = set() + seen_merge_rects: Set[Tuple[int, int, int, int]] = set() + + # 1) Add merged cells first (and mark their covered simple cells) + for m in merges: + rspan = span_from_merge( + m, rows, axis="row", frac_threshold=row_overlap_threshold + ) + cspan = span_from_merge( + m, columns, axis="col", frac_threshold=col_overlap_threshold + ) + if rspan is None or cspan is None: + # Can't confidently map this merge to grid -> skip it + continue + + sr, er = rspan + sc, ec = cspan + rect_key = (sr, er, sc, ec) + if rect_key in seen_merge_rects: + continue + seen_merge_rects.add(rect_key) + + # Grid-aligned bbox for the merged cell + grid_bbox = BoundingBox( + l=columns[sc].l, + t=rows[sr].t, + r=columns[ec].r, + b=rows[er].b, + ) + c_column_header, c_row_header, c_row_section = _process_table_headers( + grid_bbox, col_headers, row_headers, row_sections + ) + + cells.append( + TableCell( + text="", + row_span=er - sr + 1, + col_span=ec - sc + 1, + start_row_offset_idx=sr, + end_row_offset_idx=er + 1, + start_col_offset_idx=sc, + end_col_offset_idx=ec + 1, + bbox=grid_bbox, + column_header=c_column_header, + row_header=c_row_header, + row_section=c_row_section, + ) + ) + for ri in range(sr, er + 1): + for ci in range(sc, ec + 1): + covered.add((ri, ci)) + + # 2) Add simple (1x1) cells where not covered by merges + for ri, row in enumerate(rows): + for ci, col in enumerate(columns): + if (ri, ci) in covered: + continue + inter = bbox_intersection(row, col) + if not inter: + # In degenerate cases (big gaps), there might be no intersection; skip. + continue + c_column_header, c_row_header, c_row_section = _process_table_headers( + inter, col_headers, row_headers, row_sections + ) + cells.append( + TableCell( + text="", + row_span=1, + col_span=1, + start_row_offset_idx=ri, + end_row_offset_idx=ri + 1, + start_col_offset_idx=ci, + end_col_offset_idx=ci + 1, + bbox=inter, + column_header=c_column_header, + row_header=c_row_header, + row_section=c_row_section, + ) + ) + return cells + + +def regions_to_table( + table_bbox: BoundingBox, + rows: List[BoundingBox], + cols: List[BoundingBox], + merges: List[BoundingBox], + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], +) -> Optional[TableData]: + """Converts regions: rows, columns, merged cells into table_data structure. + + Adds semantics for regions of row_headers, col_headers, row_section + """ + default_containment_thresh = 0.50 + rows.extend(row_sections) # use row sections to compensate for missing rows + rows = dedupe_bboxes( + [ + e + for e in rows + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + cols = dedupe_bboxes( + [ + e + for e in cols + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + merges = dedupe_bboxes( + [ + e + for e in merges + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + + col_headers = dedupe_bboxes( + [ + e + for e in col_headers + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + row_headers = dedupe_bboxes( + [ + e + for e in row_headers + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + row_sections = dedupe_bboxes( + [ + e + for e in row_sections + if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + ] + ) + + # Compute table cells from CVAT elements: rows, cols, merges + computed_table_cells = compute_cells( + rows, + cols, + merges, + col_headers, + row_headers, + row_sections, + ) + + # If no table structure found, create single fake cell for content + if not rows or not cols: + computed_table_cells = [ + TableCell( + text="", + row_span=1, + col_span=1, + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + bbox=table_bbox, + column_header=False, + row_header=False, + row_section=False, + ) + ] + table_data = TableData(num_rows=1, num_cols=1) + else: + table_data = TableData(num_rows=len(rows), num_cols=len(cols)) + + table_data.table_cells = computed_table_cells + + return table_data diff --git a/test/test_regions_to_table.py b/test/test_regions_to_table.py new file mode 100644 index 00000000..1e1dac53 --- /dev/null +++ b/test/test_regions_to_table.py @@ -0,0 +1,79 @@ +from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.regions import regions_to_table + +# Table bbox - defines region of a table, everything outside will be ignored +table_bbox: BoundingBox = BoundingBox(l=0, t=0, r=100, b=175) + +# List of regions that defines rows for table structure +rows: list[BoundingBox] = [ + BoundingBox(l=1, t=1, r=99, b=25), + BoundingBox(l=1, t=25, r=99, b=50), + BoundingBox(l=1, t=50, r=99, b=75), + BoundingBox(l=1, t=75, r=99, b=99), + BoundingBox(l=1, t=100, r=99, b=149), + BoundingBox(l=1, t=150, r=99, b=175), +] + +# List of regions that defines columns for table structure +cols: list[BoundingBox] = [ + BoundingBox(l=1, t=1, r=25, b=149), + BoundingBox(l=25, t=1, r=50, b=149), + BoundingBox(l=50, t=1, r=75, b=149), + BoundingBox(l=75, t=1, r=99, b=149), +] + +# List of regions that defines merged cells on top of row/clumn grid (spans) +merges: list[BoundingBox] = [ + BoundingBox(l=0, t=0, r=50, b=25), + BoundingBox(l=50, t=0, r=99, b=25), +] + +# (OPTIONAL) Semantic of a table - region that cover column headers +col_headers: list[BoundingBox] = [ + BoundingBox(l=0, t=0, r=99, b=25), +] + +# (OPTIONAL) Semantic of a table - region that cover row headers +row_headers: list[BoundingBox] = [ + BoundingBox(l=0, t=0, r=50, b=150), +] + +# (OPTIONAL) Semantic of a table - region that cover section rows +row_section: list[BoundingBox] = [ + BoundingBox(l=1, t=75, r=99, b=99), +] + + +def test_regions_to_table_convert(): + # Converts regions: rows, columns, merged cells + # into table_data structure, + # Adds semantics for regions of row_headers, col_headers, row_section + table_data = regions_to_table( + table_bbox=table_bbox, + rows=rows, + cols=cols, + merges=merges, + row_headers=row_headers, + col_headers=col_headers, + row_sections=row_section, + ) + + assert table_data.num_cols == 4 + assert table_data.num_rows == 6 + + assert table_data.table_cells[0].bbox.l == 1.0 + assert table_data.table_cells[0].bbox.t == 1.0 + assert table_data.table_cells[0].bbox.r == 50.0 + assert table_data.table_cells[0].bbox.b == 25.0 + + assert table_data.table_cells[0].col_span == 2 + assert table_data.table_cells[0].column_header == True + assert table_data.table_cells[1].column_header == True + + assert table_data.table_cells[10].row_header == True + assert table_data.table_cells[12].row_section == True + + assert table_data.table_cells[17].bbox.l == 75.0 + assert table_data.table_cells[17].bbox.t == 100.0 + assert table_data.table_cells[17].bbox.r == 99.0 + assert table_data.table_cells[17].bbox.b == 149.0 From ba4d1ea7d944f73824b10570d49d07ff9cea7fac Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 18 Nov 2025 12:44:55 +0100 Subject: [PATCH 2/5] Cleaned up bbox helper functions, reusing more of the existing code Signed-off-by: Maksym Lysak --- docling_core/types/doc/regions.py | 86 ++++++++++--------------------- 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/docling_core/types/doc/regions.py b/docling_core/types/doc/regions.py index e8a7f37b..5e72c86d 100644 --- a/docling_core/types/doc/regions.py +++ b/docling_core/types/doc/regions.py @@ -5,42 +5,13 @@ """Utils to work with region-defined tables.""" -from typing import List, Optional, Protocol, Sequence, Set, Tuple +from typing import List, Optional, Sequence, Set, Tuple from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_core.types.doc.document import TableCell, TableData -def bbox_fraction_inside( - inner: BoundingBox, outer: BoundingBox, *, eps: float = 1.0e-9 -) -> float: - """Return the fraction of ``inner`` area that lies inside ``outer``.""" - area = inner.area() - if area <= eps: - return 0.0 - intersection = inner.intersection_area_with(outer) - return intersection / max(area, eps) - - -def bbox_contains( - inner: BoundingBox, outer: BoundingBox, *, threshold: float, eps: float = 1.0e-9 -) -> bool: - """Return ``True`` when ``inner`` is contained in ``outer`` above ``threshold``.""" - return bbox_fraction_inside(inner, outer, eps=eps) >= threshold - - -def bbox_iou(a: BoundingBox, b: BoundingBox, *, eps: float = 1.0e-6) -> float: - """Return the intersection over union between two bounding boxes.""" - return a.intersection_over_union(b, eps=eps) - - -class HasBoundingBox(Protocol): - """Protocol for objects exposing a bounding box.""" - - bbox: BoundingBox - - -def dedupe_bboxes( +def _dedupe_bboxes( elements: Sequence[BoundingBox], *, iou_threshold: float = 0.9, @@ -48,18 +19,13 @@ def dedupe_bboxes( """Return elements whose bounding boxes are unique within ``iou_threshold``.""" deduped: list[BoundingBox] = [] for element in elements: - if all(bbox_iou(element, kept) < iou_threshold for kept in deduped): + if all( + element.intersection_over_union(kept) < iou_threshold for kept in deduped + ): deduped.append(element) return deduped -def is_bbox_within( - bbox_a: BoundingBox, bbox_b: BoundingBox, threshold: float = 0.5 -) -> bool: - """Return ``True`` when ``bbox_b`` lies within ``bbox_a`` above ``threshold``.""" - return bbox_contains(bbox_b, bbox_a, threshold=threshold) - - def _process_table_headers( bbox: BoundingBox, row_headers: List[BoundingBox] = [], @@ -71,18 +37,18 @@ def _process_table_headers( c_row_section = False for col_header in col_headers: - if is_bbox_within(col_header, bbox): + if bbox.intersection_over_self(col_header) >= 0.5: c_column_header = True for row_header in row_headers: - if is_bbox_within(row_header, bbox): + if bbox.intersection_over_self(row_header) >= 0.5: c_row_header = True for row_section in row_sections: - if is_bbox_within(row_section, bbox): + if bbox.intersection_over_self(row_section) >= 0.5: c_row_section = True return c_column_header, c_row_header, c_row_section -def bbox_intersection(a: BoundingBox, b: BoundingBox) -> Optional[BoundingBox]: +def _bbox_intersection(a: BoundingBox, b: BoundingBox) -> Optional[BoundingBox]: """Return the intersection of two bounding boxes or ``None`` when disjoint.""" if a.coord_origin != b.coord_origin: raise ValueError("BoundingBoxes have different CoordOrigin") @@ -106,7 +72,7 @@ def bbox_intersection(a: BoundingBox, b: BoundingBox) -> Optional[BoundingBox]: return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=a.coord_origin) -def compute_cells( +def _compute_cells( rows: List[BoundingBox], columns: List[BoundingBox], merges: List[BoundingBox], @@ -136,7 +102,7 @@ def span_from_merge( idxs = [] best_i, best_len = None, 0.0 for i, elem in enumerate(lines): - inter = bbox_intersection(m, elem) + inter = _bbox_intersection(m, elem) if not inter: continue if axis == "row": @@ -217,7 +183,7 @@ def span_from_merge( for ci, col in enumerate(columns): if (ri, ci) in covered: continue - inter = bbox_intersection(row, col) + inter = _bbox_intersection(row, col) if not inter: # In degenerate cases (big gaps), there might be no intersection; skip. continue @@ -255,54 +221,54 @@ def regions_to_table( Adds semantics for regions of row_headers, col_headers, row_section """ - default_containment_thresh = 0.50 + default_containment_thresh = 0.5 rows.extend(row_sections) # use row sections to compensate for missing rows - rows = dedupe_bboxes( + rows = _dedupe_bboxes( [ e for e in rows - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) - cols = dedupe_bboxes( + cols = _dedupe_bboxes( [ e for e in cols - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) - merges = dedupe_bboxes( + merges = _dedupe_bboxes( [ e for e in merges - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) - col_headers = dedupe_bboxes( + col_headers = _dedupe_bboxes( [ e for e in col_headers - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) - row_headers = dedupe_bboxes( + row_headers = _dedupe_bboxes( [ e for e in row_headers - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) - row_sections = dedupe_bboxes( + row_sections = _dedupe_bboxes( [ e for e in row_sections - if bbox_contains(e, table_bbox, threshold=default_containment_thresh) + if e.intersection_over_self(table_bbox) >= default_containment_thresh ] ) # Compute table cells from CVAT elements: rows, cols, merges - computed_table_cells = compute_cells( + computed_table_cells = _compute_cells( rows, cols, merges, From a6d10b7630058575fce15593cf5f8ab368ce5c2a Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 18 Nov 2025 15:32:55 +0100 Subject: [PATCH 3/5] Small fixes Signed-off-by: Maksym Lysak --- docling_core/types/doc/regions.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/docling_core/types/doc/regions.py b/docling_core/types/doc/regions.py index 5e72c86d..b6357aaf 100644 --- a/docling_core/types/doc/regions.py +++ b/docling_core/types/doc/regions.py @@ -1,8 +1,3 @@ -# -# Copyright IBM Corp. 2025 - 2025 -# SPDX-License-Identifier: MIT -# - """Utils to work with region-defined tables.""" from typing import List, Optional, Sequence, Set, Tuple @@ -89,8 +84,6 @@ def _compute_cells( rows.sort(key=lambda r: (r.t + r.b) / 2.0) columns.sort(key=lambda c: (c.l + c.r) / 2.0) - # n_rows, n_cols = len(rows), len(columns) - def span_from_merge( m: BoundingBox, lines: List[BoundingBox], axis: str, frac_threshold: float ) -> Optional[Tuple[int, int]]: @@ -216,7 +209,7 @@ def regions_to_table( row_headers: List[BoundingBox] = [], col_headers: List[BoundingBox] = [], row_sections: List[BoundingBox] = [], -) -> Optional[TableData]: +) -> TableData: """Converts regions: rows, columns, merged cells into table_data structure. Adds semantics for regions of row_headers, col_headers, row_section From 15d17fb57c7814fad483ee62eee1cbe96710c6ac Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 18 Nov 2025 16:01:46 +0100 Subject: [PATCH 4/5] refactored _bbox_intersection from regions.py into a method of BoundingBox.get_intersection_bbox Signed-off-by: Maksym Lysak --- docling_core/types/doc/base.py | 27 ++++++++++++++++++++++++++- docling_core/types/doc/regions.py | 30 +++--------------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py index 673b0e9c..f4a020e4 100644 --- a/docling_core/types/doc/base.py +++ b/docling_core/types/doc/base.py @@ -1,7 +1,7 @@ """Models for the base data types.""" from enum import Enum -from typing import Any, List, Tuple +from typing import Any, List, Optional, Tuple from pydantic import BaseModel, FieldSerializationInfo, field_serializer @@ -231,6 +231,31 @@ def to_bottom_left_origin(self, page_height: float) -> "BoundingBox": coord_origin=CoordOrigin.BOTTOMLEFT, ) + def get_intersection_bbox(self, other: "BoundingBox") -> Optional["BoundingBox"]: + """Return the intersection bounding box with another bounding box or ``None`` when disjoint.""" + if self.coord_origin != other.coord_origin: + raise ValueError("BoundingBoxes have different CoordOrigin") + + left = max(self.l, other.l) + right = min(self.r, other.r) + + if self.coord_origin == CoordOrigin.TOPLEFT: + top = max(self.t, other.t) + bottom = min(self.b, other.b) + if right <= left or bottom <= top: + return None + return BoundingBox( + l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin + ) + + top = min(self.t, other.t) + bottom = max(self.b, other.b) + if right <= left or top <= bottom: + return None + return BoundingBox( + l=left, t=top, r=right, b=bottom, coord_origin=self.coord_origin + ) + def to_top_left_origin(self, page_height: float) -> "BoundingBox": """to_top_left_origin. diff --git a/docling_core/types/doc/regions.py b/docling_core/types/doc/regions.py index b6357aaf..c66701a3 100644 --- a/docling_core/types/doc/regions.py +++ b/docling_core/types/doc/regions.py @@ -2,7 +2,7 @@ from typing import List, Optional, Sequence, Set, Tuple -from docling_core.types.doc.base import BoundingBox, CoordOrigin +from docling_core.types.doc.base import BoundingBox from docling_core.types.doc.document import TableCell, TableData @@ -43,30 +43,6 @@ def _process_table_headers( return c_column_header, c_row_header, c_row_section -def _bbox_intersection(a: BoundingBox, b: BoundingBox) -> Optional[BoundingBox]: - """Return the intersection of two bounding boxes or ``None`` when disjoint.""" - if a.coord_origin != b.coord_origin: - raise ValueError("BoundingBoxes have different CoordOrigin") - - left = max(a.l, b.l) - right = min(a.r, b.r) - - if a.coord_origin == CoordOrigin.TOPLEFT: - top = max(a.t, b.t) - bottom = min(a.b, b.b) - if right <= left or bottom <= top: - return None - return BoundingBox( - l=left, t=top, r=right, b=bottom, coord_origin=a.coord_origin - ) - - top = min(a.t, b.t) - bottom = max(a.b, b.b) - if right <= left or top <= bottom: - return None - return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=a.coord_origin) - - def _compute_cells( rows: List[BoundingBox], columns: List[BoundingBox], @@ -95,7 +71,7 @@ def span_from_merge( idxs = [] best_i, best_len = None, 0.0 for i, elem in enumerate(lines): - inter = _bbox_intersection(m, elem) + inter = m.get_intersection_bbox(elem) if not inter: continue if axis == "row": @@ -176,7 +152,7 @@ def span_from_merge( for ci, col in enumerate(columns): if (ri, ci) in covered: continue - inter = _bbox_intersection(row, col) + inter = row.get_intersection_bbox(col) if not inter: # In degenerate cases (big gaps), there might be no intersection; skip. continue From c1d534eefe396fe7cd3fd796b29b2b783f3da5f2 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Wed, 19 Nov 2025 07:45:56 +0100 Subject: [PATCH 5/5] move region-based construction into `TableData` class Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 286 ++++++++++++++++++++++++++++- docling_core/types/doc/regions.py | 272 --------------------------- test/test_regions_to_table.py | 4 +- 3 files changed, 287 insertions(+), 275 deletions(-) delete mode 100644 docling_core/types/doc/regions.py diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 626a9734..660c8c59 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -14,7 +14,18 @@ from enum import Enum from io import BytesIO from pathlib import Path -from typing import Any, Dict, Final, List, Literal, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Dict, + Final, + List, + Literal, + Optional, + Sequence, + Set, + Tuple, + Union, +) from urllib.parse import unquote import pandas as pd @@ -681,6 +692,279 @@ def get_column_bounding_boxes(self) -> dict[int, BoundingBox]: return col_bboxes + @classmethod + def _dedupe_bboxes( + cls, + elements: Sequence[BoundingBox], + *, + iou_threshold: float = 0.9, + ) -> list[BoundingBox]: + """Return elements whose bounding boxes are unique within ``iou_threshold``.""" + deduped: list[BoundingBox] = [] + for element in elements: + if all( + element.intersection_over_union(kept) < iou_threshold + for kept in deduped + ): + deduped.append(element) + return deduped + + @classmethod + def _process_table_headers( + cls, + bbox: BoundingBox, + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], + ) -> Tuple[bool, bool, bool]: + c_column_header = False + c_row_header = False + c_row_section = False + + for col_header in col_headers: + if bbox.intersection_over_self(col_header) >= 0.5: + c_column_header = True + for row_header in row_headers: + if bbox.intersection_over_self(row_header) >= 0.5: + c_row_header = True + for row_section in row_sections: + if bbox.intersection_over_self(row_section) >= 0.5: + c_row_section = True + return c_column_header, c_row_header, c_row_section + + @classmethod + def _compute_cells( + cls, + rows: List[BoundingBox], + columns: List[BoundingBox], + merges: List[BoundingBox], + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], + row_overlap_threshold: float = 0.5, # how much of a row a merge must cover vertically + col_overlap_threshold: float = 0.5, # how much of a column a merge must cover horizontally + ) -> List[TableCell]: + """Returns TableCell. Merged cells are aligned to grid boundaries. + + rows, columns, merges are lists of BoundingBox(l,t,r,b). + """ + rows.sort(key=lambda r: (r.t + r.b) / 2.0) + columns.sort(key=lambda c: (c.l + c.r) / 2.0) + + def span_from_merge( + m: BoundingBox, lines: List[BoundingBox], axis: str, frac_threshold: float + ) -> Optional[Tuple[int, int]]: + """Map a merge bbox to an inclusive index span over rows or columns. + + axis='row' uses vertical overlap vs row height; axis='col' uses horizontal overlap vs col width. + If nothing meets threshold, pick the single best-overlapping line if overlap>0; else return None. + """ + idxs = [] + best_i, best_len = None, 0.0 + for i, elem in enumerate(lines): + inter = m.get_intersection_bbox(elem) + if not inter: + continue + if axis == "row": + overlap_len = inter.height + base = max(1e-9, elem.height) + else: + overlap_len = inter.width + base = max(1e-9, elem.width) + + frac = overlap_len / base + if frac >= frac_threshold: + idxs.append(i) + + if overlap_len > best_len: + best_len, best_i = overlap_len, i + + if idxs: + return min(idxs), max(idxs) + if best_i is not None and best_len > 0.0: + return best_i, best_i + return None + + cells: List[TableCell] = [] + covered: Set[Tuple[int, int]] = set() + seen_merge_rects: Set[Tuple[int, int, int, int]] = set() + + # 1) Add merged cells first (and mark their covered simple cells) + for m in merges: + rspan = span_from_merge( + m, rows, axis="row", frac_threshold=row_overlap_threshold + ) + cspan = span_from_merge( + m, columns, axis="col", frac_threshold=col_overlap_threshold + ) + if rspan is None or cspan is None: + # Can't confidently map this merge to grid -> skip it + continue + + sr, er = rspan + sc, ec = cspan + rect_key = (sr, er, sc, ec) + if rect_key in seen_merge_rects: + continue + seen_merge_rects.add(rect_key) + + # Grid-aligned bbox for the merged cell + grid_bbox = BoundingBox( + l=columns[sc].l, + t=rows[sr].t, + r=columns[ec].r, + b=rows[er].b, + ) + c_column_header, c_row_header, c_row_section = cls._process_table_headers( + grid_bbox, col_headers, row_headers, row_sections + ) + + cells.append( + TableCell( + text="", + row_span=er - sr + 1, + col_span=ec - sc + 1, + start_row_offset_idx=sr, + end_row_offset_idx=er + 1, + start_col_offset_idx=sc, + end_col_offset_idx=ec + 1, + bbox=grid_bbox, + column_header=c_column_header, + row_header=c_row_header, + row_section=c_row_section, + ) + ) + for ri in range(sr, er + 1): + for ci in range(sc, ec + 1): + covered.add((ri, ci)) + + # 2) Add simple (1x1) cells where not covered by merges + for ri, row in enumerate(rows): + for ci, col in enumerate(columns): + if (ri, ci) in covered: + continue + inter = row.get_intersection_bbox(col) + if not inter: + # In degenerate cases (big gaps), there might be no intersection; skip. + continue + c_column_header, c_row_header, c_row_section = ( + cls._process_table_headers( + inter, col_headers, row_headers, row_sections + ) + ) + cells.append( + TableCell( + text="", + row_span=1, + col_span=1, + start_row_offset_idx=ri, + end_row_offset_idx=ri + 1, + start_col_offset_idx=ci, + end_col_offset_idx=ci + 1, + bbox=inter, + column_header=c_column_header, + row_header=c_row_header, + row_section=c_row_section, + ) + ) + return cells + + @classmethod + def from_regions( + cls, + table_bbox: BoundingBox, + rows: List[BoundingBox], + cols: List[BoundingBox], + merges: List[BoundingBox], + row_headers: List[BoundingBox] = [], + col_headers: List[BoundingBox] = [], + row_sections: List[BoundingBox] = [], + ) -> Self: + """Converts regions: rows, columns, merged cells into table_data structure. + + Adds semantics for regions of row_headers, col_headers, row_section + """ + default_containment_thresh = 0.5 + rows.extend(row_sections) # use row sections to compensate for missing rows + rows = cls._dedupe_bboxes( + [ + e + for e in rows + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + cols = cls._dedupe_bboxes( + [ + e + for e in cols + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + merges = cls._dedupe_bboxes( + [ + e + for e in merges + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + + col_headers = cls._dedupe_bboxes( + [ + e + for e in col_headers + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + row_headers = cls._dedupe_bboxes( + [ + e + for e in row_headers + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + row_sections = cls._dedupe_bboxes( + [ + e + for e in row_sections + if e.intersection_over_self(table_bbox) >= default_containment_thresh + ] + ) + + # Compute table cells from CVAT elements: rows, cols, merges + computed_table_cells = cls._compute_cells( + rows, + cols, + merges, + col_headers, + row_headers, + row_sections, + ) + + # If no table structure found, create single fake cell for content + if not rows or not cols: + computed_table_cells = [ + TableCell( + text="", + row_span=1, + col_span=1, + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + bbox=table_bbox, + column_header=False, + row_header=False, + row_section=False, + ) + ] + table_data = cls(num_rows=1, num_cols=1) + else: + table_data = cls(num_rows=len(rows), num_cols=len(cols)) + + table_data.table_cells = computed_table_cells + + return table_data + class PictureTabularChartData(PictureChartData): """Base class for picture chart data. diff --git a/docling_core/types/doc/regions.py b/docling_core/types/doc/regions.py deleted file mode 100644 index c66701a3..00000000 --- a/docling_core/types/doc/regions.py +++ /dev/null @@ -1,272 +0,0 @@ -"""Utils to work with region-defined tables.""" - -from typing import List, Optional, Sequence, Set, Tuple - -from docling_core.types.doc.base import BoundingBox -from docling_core.types.doc.document import TableCell, TableData - - -def _dedupe_bboxes( - elements: Sequence[BoundingBox], - *, - iou_threshold: float = 0.9, -) -> list[BoundingBox]: - """Return elements whose bounding boxes are unique within ``iou_threshold``.""" - deduped: list[BoundingBox] = [] - for element in elements: - if all( - element.intersection_over_union(kept) < iou_threshold for kept in deduped - ): - deduped.append(element) - return deduped - - -def _process_table_headers( - bbox: BoundingBox, - row_headers: List[BoundingBox] = [], - col_headers: List[BoundingBox] = [], - row_sections: List[BoundingBox] = [], -) -> Tuple[bool, bool, bool]: - c_column_header = False - c_row_header = False - c_row_section = False - - for col_header in col_headers: - if bbox.intersection_over_self(col_header) >= 0.5: - c_column_header = True - for row_header in row_headers: - if bbox.intersection_over_self(row_header) >= 0.5: - c_row_header = True - for row_section in row_sections: - if bbox.intersection_over_self(row_section) >= 0.5: - c_row_section = True - return c_column_header, c_row_header, c_row_section - - -def _compute_cells( - rows: List[BoundingBox], - columns: List[BoundingBox], - merges: List[BoundingBox], - row_headers: List[BoundingBox] = [], - col_headers: List[BoundingBox] = [], - row_sections: List[BoundingBox] = [], - row_overlap_threshold: float = 0.5, # how much of a row a merge must cover vertically - col_overlap_threshold: float = 0.5, # how much of a column a merge must cover horizontally -) -> List[TableCell]: - """Returns TableCell. Merged cells are aligned to grid boundaries. - - rows, columns, merges are lists of BoundingBox(l,t,r,b). - """ - rows.sort(key=lambda r: (r.t + r.b) / 2.0) - columns.sort(key=lambda c: (c.l + c.r) / 2.0) - - def span_from_merge( - m: BoundingBox, lines: List[BoundingBox], axis: str, frac_threshold: float - ) -> Optional[Tuple[int, int]]: - """Map a merge bbox to an inclusive index span over rows or columns. - - axis='row' uses vertical overlap vs row height; axis='col' uses horizontal overlap vs col width. - If nothing meets threshold, pick the single best-overlapping line if overlap>0; else return None. - """ - idxs = [] - best_i, best_len = None, 0.0 - for i, elem in enumerate(lines): - inter = m.get_intersection_bbox(elem) - if not inter: - continue - if axis == "row": - overlap_len = inter.height - base = max(1e-9, elem.height) - else: - overlap_len = inter.width - base = max(1e-9, elem.width) - - frac = overlap_len / base - if frac >= frac_threshold: - idxs.append(i) - - if overlap_len > best_len: - best_len, best_i = overlap_len, i - - if idxs: - return min(idxs), max(idxs) - if best_i is not None and best_len > 0.0: - return best_i, best_i - return None - - cells: List[TableCell] = [] - covered: Set[Tuple[int, int]] = set() - seen_merge_rects: Set[Tuple[int, int, int, int]] = set() - - # 1) Add merged cells first (and mark their covered simple cells) - for m in merges: - rspan = span_from_merge( - m, rows, axis="row", frac_threshold=row_overlap_threshold - ) - cspan = span_from_merge( - m, columns, axis="col", frac_threshold=col_overlap_threshold - ) - if rspan is None or cspan is None: - # Can't confidently map this merge to grid -> skip it - continue - - sr, er = rspan - sc, ec = cspan - rect_key = (sr, er, sc, ec) - if rect_key in seen_merge_rects: - continue - seen_merge_rects.add(rect_key) - - # Grid-aligned bbox for the merged cell - grid_bbox = BoundingBox( - l=columns[sc].l, - t=rows[sr].t, - r=columns[ec].r, - b=rows[er].b, - ) - c_column_header, c_row_header, c_row_section = _process_table_headers( - grid_bbox, col_headers, row_headers, row_sections - ) - - cells.append( - TableCell( - text="", - row_span=er - sr + 1, - col_span=ec - sc + 1, - start_row_offset_idx=sr, - end_row_offset_idx=er + 1, - start_col_offset_idx=sc, - end_col_offset_idx=ec + 1, - bbox=grid_bbox, - column_header=c_column_header, - row_header=c_row_header, - row_section=c_row_section, - ) - ) - for ri in range(sr, er + 1): - for ci in range(sc, ec + 1): - covered.add((ri, ci)) - - # 2) Add simple (1x1) cells where not covered by merges - for ri, row in enumerate(rows): - for ci, col in enumerate(columns): - if (ri, ci) in covered: - continue - inter = row.get_intersection_bbox(col) - if not inter: - # In degenerate cases (big gaps), there might be no intersection; skip. - continue - c_column_header, c_row_header, c_row_section = _process_table_headers( - inter, col_headers, row_headers, row_sections - ) - cells.append( - TableCell( - text="", - row_span=1, - col_span=1, - start_row_offset_idx=ri, - end_row_offset_idx=ri + 1, - start_col_offset_idx=ci, - end_col_offset_idx=ci + 1, - bbox=inter, - column_header=c_column_header, - row_header=c_row_header, - row_section=c_row_section, - ) - ) - return cells - - -def regions_to_table( - table_bbox: BoundingBox, - rows: List[BoundingBox], - cols: List[BoundingBox], - merges: List[BoundingBox], - row_headers: List[BoundingBox] = [], - col_headers: List[BoundingBox] = [], - row_sections: List[BoundingBox] = [], -) -> TableData: - """Converts regions: rows, columns, merged cells into table_data structure. - - Adds semantics for regions of row_headers, col_headers, row_section - """ - default_containment_thresh = 0.5 - rows.extend(row_sections) # use row sections to compensate for missing rows - rows = _dedupe_bboxes( - [ - e - for e in rows - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - cols = _dedupe_bboxes( - [ - e - for e in cols - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - merges = _dedupe_bboxes( - [ - e - for e in merges - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - - col_headers = _dedupe_bboxes( - [ - e - for e in col_headers - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - row_headers = _dedupe_bboxes( - [ - e - for e in row_headers - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - row_sections = _dedupe_bboxes( - [ - e - for e in row_sections - if e.intersection_over_self(table_bbox) >= default_containment_thresh - ] - ) - - # Compute table cells from CVAT elements: rows, cols, merges - computed_table_cells = _compute_cells( - rows, - cols, - merges, - col_headers, - row_headers, - row_sections, - ) - - # If no table structure found, create single fake cell for content - if not rows or not cols: - computed_table_cells = [ - TableCell( - text="", - row_span=1, - col_span=1, - start_row_offset_idx=0, - end_row_offset_idx=1, - start_col_offset_idx=0, - end_col_offset_idx=1, - bbox=table_bbox, - column_header=False, - row_header=False, - row_section=False, - ) - ] - table_data = TableData(num_rows=1, num_cols=1) - else: - table_data = TableData(num_rows=len(rows), num_cols=len(cols)) - - table_data.table_cells = computed_table_cells - - return table_data diff --git a/test/test_regions_to_table.py b/test/test_regions_to_table.py index 1e1dac53..309e39fd 100644 --- a/test/test_regions_to_table.py +++ b/test/test_regions_to_table.py @@ -1,5 +1,5 @@ from docling_core.types.doc.base import BoundingBox -from docling_core.types.doc.regions import regions_to_table +from docling_core.types.doc.document import TableData # Table bbox - defines region of a table, everything outside will be ignored table_bbox: BoundingBox = BoundingBox(l=0, t=0, r=100, b=175) @@ -48,7 +48,7 @@ def test_regions_to_table_convert(): # Converts regions: rows, columns, merged cells # into table_data structure, # Adds semantics for regions of row_headers, col_headers, row_section - table_data = regions_to_table( + table_data = TableData.from_regions( table_bbox=table_bbox, rows=rows, cols=cols,