diff --git a/CHANGELOG.md b/CHANGELOG.md index 94251d2..cf4d0d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ This project follows [Semantic Versioning](https://semver.org/). From **v1.0.0** - **`.github/workflows/release-pypi.yml`:** on push of **`vMAJOR.MINOR.PATCH`**, verify tag matches **`pyproject.toml`** and **`src/flightdeck/__init__.py`**, run **ruff** / **pytest** / schema drift, **`uv build`**, publish to **PyPI** via **OIDC** trusted publishing (**publish attestations**), and create a **GitHub Release** with **`dist/*`** assets (**`softprops/action-gh-release`**). - **`tests/test_version_consistency.py`:** assert **`pyproject.toml`** **`version`** matches **`flightdeck.__version__`** (same invariant as the release workflow). +### Fixed + +- **`diff_releases` zero policy sample thresholds:** `Policy.min_candidate_runs`, `Policy.min_baseline_runs`, and `Policy.min_low_runs` set to **`0`** now correctly override the workspace config defaults instead of being silently ignored. Previously, the `or`-based fallback treated `0` as falsy and fell back to the config value (typically `500` / `50`). Fixed by using explicit `is not None` checks. A policy can now unconditionally accept any sample size by setting thresholds to `0` — for example, to allow diffs over empty event windows without a confidence downgrade. + ### Changed - **`tests/conftest.py`:** create repo **`.tmp/`** at import time so **`pytest --basetemp=.tmp/pytest`** works on fresh checkouts and **Linux** CI (parent dir is no longer Windows-only). diff --git a/schemas/v1/policy.schema.json b/schemas/v1/policy.schema.json index 7a1465e..33df8be 100644 --- a/schemas/v1/policy.schema.json +++ b/schemas/v1/policy.schema.json @@ -1,4 +1,5 @@ { + "description": "Promotion-gate policy for a release diff.\n\n**Constraint fields** (``max_*``) \u2014 when ``None`` the constraint is\ndisabled. 
When set, the candidate rollup must not exceed the limit for the\npolicy to pass.\n\n**Sample threshold fields** (``min_*``) \u2014 control the confidence label\nassigned by ``diff_releases``:\n\n- ``None`` (default) \u2014 defer to ``WorkspaceConfig.diff`` defaults\n (typically ``min_candidate_runs=500``, ``min_baseline_runs=500``,\n ``min_low_runs=50``).\n- ``0`` \u2014 unconditionally accept any sample size for that threshold,\n including an empty event list. All three set to ``0`` means any diff\n window, even an empty one, can reach HIGH confidence.\n\nThe ``None`` / ``0`` distinction is intentional: ``None`` means \"inherit\nfrom config\", not \"zero runs required\". ``diff_releases`` uses\n``is not None`` checks to respect an explicit ``0`` override.", "properties": { "max_cost_per_run_usd": { "anyOf": [ diff --git a/src/flightdeck/ledger.py b/src/flightdeck/ledger.py index 3b90793..4ed0874 100644 --- a/src/flightdeck/ledger.py +++ b/src/flightdeck/ledger.py @@ -41,6 +41,19 @@ def confidence_label( min_candidate_runs: int, min_low_runs: int, ) -> Literal["HIGH", "MEDIUM", "LOW"]: + """Return a three-tier confidence label for a diff comparison. + + Thresholds come from the resolved policy (which can set them to 0 to + unconditionally accept any sample size): + + - HIGH — both sides meet their minimum run counts. + - LOW — otherwise, when either side falls below the floor (``min_low_runs``). + - MEDIUM — in between: at least one side misses its target but neither + is below the floor. + + A threshold of 0 is valid and means "no minimum required"; for example, + setting all three to 0 lets an empty-window diff still return HIGH. 
+ """ if baseline_runs >= min_baseline_runs and candidate_runs >= min_candidate_runs: return "HIGH" if baseline_runs < min_low_runs or candidate_runs < min_low_runs: @@ -128,6 +141,20 @@ def evaluate_policy( diff_confidence: Literal["HIGH", "MEDIUM", "LOW"], diff_confidence_reason: str | None, ) -> PolicyResult: + """Evaluate promotion-gate policy against a computed diff. + + Each violated constraint appends a human-readable failure reason; an empty + reasons list means the policy passed. + + Constraints checked (only when the corresponding policy field is not None): + + - ``max_cost_per_run_usd`` — candidate average cost must not exceed limit. + - ``max_latency_ms`` — candidate average latency must not exceed limit + (skipped when candidate has no latency data). + - ``max_error_rate`` — candidate error rate must not exceed limit. + - ``require_high_diff_confidence`` — when ``True``, the diff must reach + HIGH confidence (based on sample thresholds) before promotion is allowed. + """ reasons: list[str] = [] # Cost @@ -167,6 +194,45 @@ def diff_releases( candidate_pricing_table: PricingTable, window: str, ) -> DiffResult: + """Compute a trusted diff between a baseline and candidate release. + + Each side is costed independently against its own pricing table, then + rolled up into a :class:`Rollup` (cost, latency, error rate). Confidence + is determined by sample size relative to the resolved thresholds, and + policy constraints are evaluated on top. + + **Threshold resolution** — policy fields use ``is not None`` to distinguish + "not set" from "explicitly zero": + + - If ``Policy.min_candidate_runs`` (/ ``min_baseline_runs`` / ``min_low_runs``) + is ``None``, the workspace config default is used (typically 500 / 500 / 50). + - If the policy sets a threshold to ``0``, that override is respected and + the config default is *not* used — any sample size satisfies the threshold. 
+ + **Agent-id invariant** — when both sides are non-empty, every run event on + each side must share the same ``agent_id``, and baseline and candidate must + use the same ``agent_id``. Cross-agent diffs are rejected with + ``ValueError``. + + Args: + cfg: Workspace configuration supplying diff defaults. + policy: Active promotion policy (thresholds and constraints). + baseline_events: Run events for the baseline release. + candidate_events: Run events for the candidate release. + baseline_pricing_table: Pricing used to cost baseline events. + candidate_pricing_table: Pricing used to cost candidate events. + window: Human-readable label for the time window (e.g. ``"7d"``); + stored on the result for display only — filtering is the caller's + responsibility. + + Returns: + A :class:`DiffResult` containing rollups, deltas, confidence label, + and the policy evaluation outcome. + + Raises: + ValueError: If events span multiple agent IDs on one side, or if + baseline and candidate use different agent IDs. + """ if baseline_events and candidate_events: b_agents = {e.agent_id for e in baseline_events} c_agents = {e.agent_id for e in candidate_events} diff --git a/src/flightdeck/models.py b/src/flightdeck/models.py index aee6ebe..37af999 100644 --- a/src/flightdeck/models.py +++ b/src/flightdeck/models.py @@ -165,6 +165,27 @@ class RunEvent(BaseModel): class Policy(BaseModel): + """Promotion-gate policy for a release diff. + + **Constraint fields** (``max_*``) — when ``None`` the constraint is + disabled. When set, the candidate rollup must not exceed the limit for the + policy to pass. + + **Sample threshold fields** (``min_*``) — control the confidence label + assigned by ``diff_releases``: + + - ``None`` (default) — defer to ``WorkspaceConfig.diff`` defaults + (typically ``min_candidate_runs=500``, ``min_baseline_runs=500``, + ``min_low_runs=50``). + - ``0`` — unconditionally accept any sample size for that threshold, + including an empty event list. 
All three set to ``0`` means any diff + window, even an empty one, can reach HIGH confidence. + + The ``None`` / ``0`` distinction is intentional: ``None`` means "inherit + from config", not "zero runs required". ``diff_releases`` uses + ``is not None`` checks to respect an explicit ``0`` override. + """ + policy_id: str = "default" max_cost_per_run_usd: float | None = Field(default=None, ge=0) max_latency_ms: int | None = Field(default=None, ge=0)