flightdeckdev · Gsbreddy · May 1, 2026 · May 1, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,10 @@ This project follows [Semantic Versioning](https://semver.org/). From **v1.0.0**
 - **`.github/workflows/release-pypi.yml`:** on push of **`vMAJOR.MINOR.PATCH`**, verify tag matches **`pyproject.toml`** and **`src/flightdeck/__init__.py`**, run **ruff** / **pytest** / schema drift, **`uv build`**, publish to **PyPI** via **OIDC** trusted publishing (**publish attestations**), and create a **GitHub Release** with **`dist/*`** assets (**`softprops/action-gh-release`**).
 - **`tests/test_version_consistency.py`:** assert **`pyproject.toml`** **`version`** matches **`flightdeck.__version__`** (same invariant as the release workflow).
 
+### Fixed
+
+- **`diff_releases` zero policy sample thresholds:** `Policy.min_candidate_runs`, `Policy.min_baseline_runs`, and `Policy.min_low_runs` set to **`0`** are now honored as explicit overrides of the workspace config defaults instead of being silently ignored. Previously, an `or`-based fallback treated `0` as falsy and fell back to the config value (typically `500` / `50`). Fixed by using explicit `is not None` checks. A policy can now unconditionally accept any sample size by setting thresholds to `0` — for example, to allow diffs over empty event windows without a confidence downgrade.
+
 ### Changed
 
 - **`tests/conftest.py`:** create repo **`.tmp/`** at import time so **`pytest --basetemp=.tmp/pytest`** works on fresh checkouts and **Linux** CI (parent dir is no longer Windows-only).

diff --git a/schemas/v1/policy.schema.json b/schemas/v1/policy.schema.json
@@ -1,4 +1,5 @@
 {
+  "description": "Promotion-gate policy for a release diff.\n\n**Constraint fields** (``max_*``) \u2014 when ``None`` the constraint is\ndisabled.  When set, the candidate rollup must not exceed the limit for the\npolicy to pass.\n\n**Sample threshold fields** (``min_*``) \u2014 control the confidence label\nassigned by ``diff_releases``:\n\n- ``None`` (default) \u2014 defer to ``WorkspaceConfig.diff`` defaults\n  (typically ``min_candidate_runs=500``, ``min_baseline_runs=500``,\n  ``min_low_runs=50``).\n- ``0`` \u2014 unconditionally accept any sample size for that threshold,\n  including an empty event list.  All three set to ``0`` means any diff\n  window, even an empty one, can reach HIGH confidence.\n\nThe ``None`` / ``0`` distinction is intentional: ``None`` means \"inherit\nfrom config\", not \"zero runs required\".  ``diff_releases`` uses\n``is not None`` checks to respect an explicit ``0`` override.",
   "properties": {
     "max_cost_per_run_usd": {
       "anyOf": [

diff --git a/src/flightdeck/ledger.py b/src/flightdeck/ledger.py
@@ -41,6 +41,19 @@ def confidence_label(
     min_candidate_runs: int,
     min_low_runs: int,
 ) -> Literal["HIGH", "MEDIUM", "LOW"]:
+    """Return a three-tier confidence label for a diff comparison.
+
+    Thresholds come from the resolved policy (which can set them to 0 to
+    unconditionally accept any sample size):
+
+    - HIGH  — both sides meet their minimum run counts.
+    - LOW   — not HIGH, and either side is below the floor (``min_low_runs``).
+    - MEDIUM — in between: at least one side misses its target but neither
+               is below the floor.
+
+    A threshold of 0 is valid and means "no minimum required"; for example,
+    setting both run minimums to 0 lets an empty-window diff still return HIGH.
+    """
     if baseline_runs >= min_baseline_runs and candidate_runs >= min_candidate_runs:
         return "HIGH"
     if baseline_runs < min_low_runs or candidate_runs < min_low_runs:
@@ -128,6 +141,20 @@ def evaluate_policy(
     diff_confidence: Literal["HIGH", "MEDIUM", "LOW"],
     diff_confidence_reason: str | None,
 ) -> PolicyResult:
+    """Evaluate promotion-gate policy against a computed diff.
+
+    Each active constraint appends a human-readable failure reason; an empty
+    reasons list means the policy passed.
+
+    Constraints checked (only when the corresponding policy field is not None):
+
+    - ``max_cost_per_run_usd`` — candidate average cost must not exceed limit.
+    - ``max_latency_ms`` — candidate average latency must not exceed limit
+      (skipped when candidate has no latency data).
+    - ``max_error_rate`` — candidate error rate must not exceed limit.
+    - ``require_high_diff_confidence`` — when ``True``, the diff must reach
+      HIGH confidence (based on sample thresholds) before promotion is allowed.
+    """
     reasons: list[str] = []
 
     # Cost
@@ -167,6 +194,45 @@ def diff_releases(
     candidate_pricing_table: PricingTable,
     window: str,
 ) -> DiffResult:
+    """Compute a trusted diff between a baseline and candidate release.
+
+    Each side is costed independently against its own pricing table, then
+    rolled up into a :class:`Rollup` (cost, latency, error rate).  Confidence
+    is determined by sample size relative to the resolved thresholds, and
+    policy constraints are evaluated on top.
+
+    **Threshold resolution** — each policy threshold is tested with
+    ``is not None`` to distinguish "not set" from "explicitly zero":
+
+    - If ``Policy.min_candidate_runs`` (/ ``min_baseline_runs`` / ``min_low_runs``)
+      is ``None``, the workspace config default is used (typically 500 / 500 / 50).
+    - If the policy sets a threshold to ``0``, that override is respected and
+      the config default is *not* used — any sample size satisfies the threshold.
+
+    **Agent-id invariant** — when both sides are non-empty, every run event on
+    each side must share the same ``agent_id``, and baseline and candidate must
+    use the same ``agent_id``.  Cross-agent diffs are rejected with
+    ``ValueError``.
+
+    Args:
+        cfg: Workspace configuration supplying diff defaults.
+        policy: Active promotion policy (thresholds and constraints).
+        baseline_events: Run events for the baseline release.
+        candidate_events: Run events for the candidate release.
+        baseline_pricing_table: Pricing used to cost baseline events.
+        candidate_pricing_table: Pricing used to cost candidate events.
+        window: Human-readable label for the time window (e.g. ``"7d"``);
+            stored on the result for display only — filtering is the caller's
+            responsibility.
+
+    Returns:
+        A :class:`DiffResult` containing rollups, deltas, confidence label,
+        and the policy evaluation outcome.
+
+    Raises:
+        ValueError: If events span multiple agent IDs on one side, or if
+            baseline and candidate use different agent IDs.
+    """
     if baseline_events and candidate_events:
         b_agents = {e.agent_id for e in baseline_events}
         c_agents = {e.agent_id for e in candidate_events}

diff --git a/src/flightdeck/models.py b/src/flightdeck/models.py
@@ -165,6 +165,27 @@ class RunEvent(BaseModel):
 
 
 class Policy(BaseModel):
+    """Promotion-gate policy for a release diff.
+
+    **Constraint fields** (``max_*``) — when ``None`` the constraint is
+    disabled.  When set, the candidate rollup must not exceed the limit for the
+    policy to pass.
+
+    **Sample threshold fields** (``min_*``) — control the confidence label
+    assigned by ``diff_releases``:
+
+    - ``None`` (default) — defer to ``WorkspaceConfig.diff`` defaults
+      (typically ``min_candidate_runs=500``, ``min_baseline_runs=500``,
+      ``min_low_runs=50``).
+    - ``0`` — unconditionally accept any sample size for that threshold,
+      including an empty event list.  All three set to ``0`` means any diff
+      window, even an empty one, can reach HIGH confidence.
+
+    The ``None`` / ``0`` distinction is intentional: ``None`` means "inherit
+    from config", not "zero runs required".  ``diff_releases`` uses
+    ``is not None`` checks to respect an explicit ``0`` override.
+    """
+
     policy_id: str = "default"
     max_cost_per_run_usd: float | None = Field(default=None, ge=0)
     max_latency_ms: int | None = Field(default=None, ge=0)