Skip to content

Commit fad6462

Browse files
committed
fix(scripts,review): gate skipped-cell rendering on status != "ok"
Addresses both Copilot inline comments on PR #235. The previous fix used `if head_status or base_status:` to detect skipped matrix cells, but `benchmarks/benchmark.py::MatrixCell.status` defaults to `"ok"` for every real cell — making `"ok"` truthy and causing the delta script to render `_skipped_ (ok)` for every backend/size cell. The benchmark-delta bot comment on this PR caught the bug live: all 9 real cells (bm25/fuzzy/tfidf × 100/500/1000) were marked as skipped. The gate is now `status != "ok"`, mirroring the existing convention in `scripts/render_scorecard.py:150-151`. Cells with no `status` field are treated as "ok" for backwards compat. Test updates: - `test_matrix_delta_ok_cells_render_metrics` (renamed from `test_matrix_delta_real_cells_still_render_metrics`) now explicitly sets `status: "ok"` on the fixtures and asserts metrics render — this is the regression test that would have caught the original bug. - `test_matrix_delta_missing_status_treated_as_ok` added for backwards compatibility on cells that omit `status` entirely. Verification: - ruff format/lint/mypy clean - pytest -q: 1012 passed, 5 skipped (one more test than before the fix)
1 parent ce9d146 commit fad6462

2 files changed

Lines changed: 52 additions & 10 deletions

File tree

scripts/benchmark_delta.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,16 @@ def _render_matrix_section(base: dict[str, Any], head: dict[str, Any]) -> str:
137137
b = base_idx.get((backend, size), {})
138138
h = head_idx.get((backend, size), b)
139139
# Skip-cells carry zeroed metrics by design (e.g. fuzzy with no
140-
# rapidfuzz). Treating them as accuracy/latency regressions would
141-
# produce false-positive ⚠️ markers on every PR; emit a single
142-
# "skipped" row that surfaces the reason text instead.
143-
head_status = str(h.get("status", ""))
144-
base_status = str(b.get("status", ""))
145-
if head_status or base_status:
146-
reason = head_status or base_status
140+
# rapidfuzz, status="skipped: ..."). Treating them as accuracy/
141+
# latency regressions would produce false-positive ⚠️ markers on
142+
# every PR; emit a single "skipped" row that surfaces the reason
143+
# instead. Real cells carry status="ok" (the MatrixCell default in
144+
# benchmarks/benchmark.py), so the gate is status != "ok",
145+
# aligning with scripts/render_scorecard.py's existing convention.
146+
head_status = str(h.get("status", "ok"))
147+
base_status = str(b.get("status", "ok"))
148+
if head_status != "ok" or base_status != "ok":
149+
reason = head_status if head_status != "ok" else base_status
147150
lines.append(f"| {backend} | {size} | _skipped_ ({reason}) | — | — |")
148151
continue
149152
br = float(b.get("recall_at_k", 0.0))

tests/test_benchmark_delta.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,16 @@ def test_matrix_delta_renders_skipped_row_for_status_bearing_cell() -> None:
5252
assert "⚠️" not in out
5353

5454

55-
def test_matrix_delta_real_cells_still_render_metrics() -> None:
56-
"""Status-less cells still produce a full recall/MRR/latency row."""
55+
def test_matrix_delta_ok_cells_render_metrics() -> None:
56+
"""Cells with the canonical ``status="ok"`` must render full metrics.
57+
58+
Regression: an earlier version of `_render_matrix_section` checked
59+
`if head_status or base_status`, which matched every real cell
60+
(``MatrixCell.status`` defaults to ``"ok"``) and suppressed every
61+
metric row. The benchmark-delta bot comment on PR #235 caught this
62+
in production. The gate is now `status != "ok"`, aligned with
63+
`scripts/render_scorecard.py`.
64+
"""
5765
base = _payload(
5866
[
5967
{
@@ -62,6 +70,7 @@ def test_matrix_delta_real_cells_still_render_metrics() -> None:
6270
"recall_at_k": 0.5,
6371
"mrr": 0.4,
6472
"latency_ms_p99": 1.0,
73+
"status": "ok",
6574
}
6675
]
6776
)
@@ -73,15 +82,45 @@ def test_matrix_delta_real_cells_still_render_metrics() -> None:
7382
"recall_at_k": 0.5,
7483
"mrr": 0.4,
7584
"latency_ms_p99": 1.0,
85+
"status": "ok",
7686
}
7787
]
7888
)
7989
out = _render_matrix_section(base, head)
80-
# Numeric markers present, no skipped marker, no regression marker.
90+
# Numeric markers present; no skipped marker for "ok" cells.
8191
assert "_skipped_" not in out
8292
assert "0.5000" in out
8393

8494

95+
def test_matrix_delta_missing_status_treated_as_ok() -> None:
96+
"""Backwards-compatibility: a cell with no `status` field still renders metrics."""
97+
base = _payload(
98+
[
99+
{
100+
"backend": "bm25",
101+
"catalog_size": 100,
102+
"recall_at_k": 0.3,
103+
"mrr": 0.2,
104+
"latency_ms_p99": 5.0,
105+
}
106+
]
107+
)
108+
head = _payload(
109+
[
110+
{
111+
"backend": "bm25",
112+
"catalog_size": 100,
113+
"recall_at_k": 0.3,
114+
"mrr": 0.2,
115+
"latency_ms_p99": 5.0,
116+
}
117+
]
118+
)
119+
out = _render_matrix_section(base, head)
120+
assert "_skipped_" not in out
121+
assert "0.3000" in out
122+
123+
85124
def test_matrix_delta_empty_payload_returns_empty_string() -> None:
86125
"""No matrix rows in either side → no section emitted."""
87126
assert _render_matrix_section({}, {}) == ""

0 commit comments

Comments
 (0)