From ed98c2dc9c2b8d96ce11e9087a2b9967603c647a Mon Sep 17 00:00:00 2001 From: dev360 Date: Thu, 21 May 2026 17:23:23 -0400 Subject: [PATCH] feat(template): add column and nth to Anchor for label disambiguation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A cover sheet that stacks two side-by-side blocks — for example a ``REPORTING`` block in column A and a ``BILLING`` block in column D, each carrying its own ``Company:`` / ``Email:`` rows — used to force both anchored fields to bind to the first occurrence of the shared label. The canonical output would then carry the reporting block's company under both ``reporting_company`` and ``billing_company`` keys with no signal that the bind had collapsed. Adds two optional fields on ``Anchor``: - ``column: int`` — restrict the label scan to a single 0-indexed column. Skipping cells outside that column means the label_match in one block can't accidentally hit the parallel block. - ``nth: int`` (default ``1``) — pick the N-th match (1-indexed) when the same label legitimately appears multiple times. Both default to no-op behavior (``column=None`` scans the whole sheet, ``nth=1`` picks the first match), so existing templates continue to extract identically. Graduates the two P1-1 xfail tests in ``tests/test_field_scan_gaps.py`` and documents the disambiguators in ``docs/guides/templates.md``. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/guides/templates.md | 28 ++++++++++++++++++++++++++++ src/crease/extractor.py | 18 ++++++++++++++---- src/crease/template_model.py | 2 ++ tests/test_field_scan_gaps.py | 11 ++--------- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/docs/guides/templates.md b/docs/guides/templates.md index 4c09fd9..118214a 100644 --- a/docs/guides/templates.md +++ b/docs/guides/templates.md @@ -105,6 +105,34 @@ by OR across rules. 
`value_pattern` is a regex full-matched against the stringified cell value; combine it with `column:` to pin a single column. +## Disambiguating anchored labels + +When a worksheet stacks two cover-sheet-style blocks side by side and +both carry the same labels (a "REPORTING" block in column A and a +"BILLING" block in column D, each with its own `Company:` / +`Email:` rows), an `anchor` with `label_match: "Company:"` would +default to the first hit. Two optional fields scope the search: + +- `column: int` — restrict the scan to a single 0-indexed column. +- `nth: int` — pick the Nth match (1-indexed; default 1), counting row by row, left to right within each row. + +```yaml +fields: + - name: reporting_company + type: string + anchor: { label_match: "Company:", column: 0, value_at: right, offset: 1 } + - name: billing_company + type: string + anchor: { label_match: "Company:", column: 3, value_at: right, offset: 1 } + - name: section_two_carrier + type: string + anchor: + label_match: "SHIPPING INFORMATION" + nth: 2 # the second occurrence of the label + value_at: right + offset: 2 +``` + ## Templates that pin the read backend Crease reads spreadsheets through two interchangeable backends — calamine diff --git a/src/crease/extractor.py b/src/crease/extractor.py index 3f72f56..d993fbf 100644 --- a/src/crease/extractor.py +++ b/src/crease/extractor.py @@ -866,16 +866,26 @@ def _extract_anchored( def _find_anchor(grid: list[list[Any]], anchor) -> tuple[int, int] | None: target = anchor.label_match mode = anchor.match_mode + pinned_col = anchor.column + nth = max(1, anchor.nth) + seen = 0 for r, row in enumerate(grid): for c, val in enumerate(row): + if pinned_col is not None and c != pinned_col: + continue if val is None: continue s = str(val).strip() if mode == "exact" and s == target: - return (r, c) - if mode == "contains" and target in s: - return (r, c) - if mode == "regex" and re.search(target, s): + pass + elif mode == "contains" and target in s: + pass + elif mode == "regex" and re.search(target, s): + pass + 
else: + continue + seen += 1 + if seen == nth: return (r, c) return None diff --git a/src/crease/template_model.py b/src/crease/template_model.py index 6bace50..f3f9c96 100644 --- a/src/crease/template_model.py +++ b/src/crease/template_model.py @@ -51,6 +51,8 @@ class Anchor(BaseModel): match_mode: MatchMode = "contains" value_at: Direction = "right" offset: int = 1 + column: int | None = None # restrict label search to a single column; None = any column + nth: int = 1 # 1-indexed match to return when the label appears more than once class DataEnd(BaseModel): diff --git a/tests/test_field_scan_gaps.py b/tests/test_field_scan_gaps.py index df07300..9a1ff85 100644 --- a/tests/test_field_scan_gaps.py +++ b/tests/test_field_scan_gaps.py @@ -356,11 +356,6 @@ def build(wb): # ====================================================================== -@pytest.mark.xfail( - strict=True, - reason="P1-1: Anchor.column not yet implemented; duplicated labels in side-by-side " - "blocks always match the first occurrence.", -) def test_anchor_column_scopes_match_to_one_column(tmp_path): """Two side-by-side blocks (REPORTING in col A, BILLING in col D) carry the same labels. ``anchor.column: 3`` should restrict the search to the @@ -379,6 +374,7 @@ def build(wb): """ template_id: anchor_column_scope version: 1 + description: P1-1 fixture - anchor.column scopes label search entities: - name: cover cardinality: one @@ -400,10 +396,6 @@ def build(wb): assert result.canonical["cover"]["billing_company"] == "Globex Corp" -@pytest.mark.xfail( - strict=True, - reason="P1-1: Anchor.nth not yet implemented; cannot pick the Nth occurrence " "of an ambiguous label.", -) def test_anchor_nth_picks_second_match(tmp_path): """A label ``SHIPPING INFORMATION`` appears twice on the sheet (a header label at row 0 and a sub-section label at row 4). 
``nth: 2`` should pick @@ -424,6 +416,7 @@ def build(wb): """ template_id: anchor_nth version: 1 + description: P1-1 fixture - anchor.nth picks second match entities: - name: cover cardinality: one