cipher813 · cipher813 · May 6, 2026 · May 6, 2026
diff --git a/infrastructure/deploy_step_function.sh b/infrastructure/deploy_step_function.sh
@@ -106,6 +106,7 @@ POLICY='{
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-rolling-mean*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-rationale-clustering*",
+        "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-replay-concordance*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
         "arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*"
       ]

diff --git a/infrastructure/iam/github-actions-lambda-deploy.json b/infrastructure/iam/github-actions-lambda-deploy.json
@@ -47,6 +47,7 @@
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-judge",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check"
@@ -69,6 +70,8 @@
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean:*",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering:*",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance",
+        "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance:*",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference:*",
         "arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",

diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json
@@ -645,7 +645,7 @@
             {"Variable": "$.skip_rationale_clustering", "IsPresent": true},
             {"Variable": "$.skip_rationale_clustering", "BooleanEquals": true}
           ],
-          "Next": "SaturdayHealthCheck"
+          "Next": "CheckSkipReplayConcordance"
         }
       ],
       "Default": "RationaleClustering"
@@ -674,11 +674,60 @@
         {
           "ErrorEquals": ["States.ALL"],
           "Comment": "Eval observability layer — failures must NOT halt the pipeline.",
-          "Next": "SaturdayHealthCheck",
+          "Next": "CheckSkipReplayConcordance",
           "ResultPath": "$.rationale_clustering_error"
         }
       ],
       "ResultPath": "$.rationale_clustering_result",
+      "Next": "CheckSkipReplayConcordance"
+    },
+
+    "CheckSkipReplayConcordance": {
+      "Type": "Choice",
+      "Comment": "Skip-gate. {\"skip_replay_concordance\": true} bypasses the cheap-model concordance Lambda (used for ad-hoc reruns where Anthropic spend is unwanted, or while iterating on the comparison scorers in alpha-engine-backtester replay/comparison.py). Independent of skip_rationale_clustering — the two are different agent-justification signals (clustering = cross-week templating; concordance = same-input cross-model agreement). Standalone invocation of the Lambda is always available regardless of this flag.",
+      "Choices": [
+        {
+          "And": [
+            {"Variable": "$.skip_replay_concordance", "IsPresent": true},
+            {"Variable": "$.skip_replay_concordance", "BooleanEquals": true}
+          ],
+          "Next": "SaturdayHealthCheck"
+        }
+      ],
+      "Default": "ReplayConcordance"
+    },
+
+    "ReplayConcordance": {
+      "Type": "Task",
+      "Comment": "Weekly cheap-model concordance Lambda (ROADMAP P0, agent-justification gate signal #3). Replays each captured DecisionArtifact in the trailing 8-week window under the configured target model(s) via langchain_anthropic.with_structured_output against the canonical agent_schemas Pydantic contract; aggregates agreement_score per (agent_id_base, target_model); emits CW metric agent_cheap_model_concordance; persists per-target summary JSON to decision_artifacts/_replay_summary/{YYYY-MM-DD}/{target_model}.json. Default target: claude-haiku-4-5 (Sonnet→Haiku concordance ≈ \"is the larger model earning its cost?\"). Runs every Saturday after RationaleClustering.",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "Parameters": {
+        "FunctionName": "alpha-engine-replay-concordance:live",
+        "Payload": {
+          "end_time_iso.$": "$$.Execution.StartTime",
+          "target_models": ["claude-haiku-4-5"],
+          "window_days": 56,
+          "max_artifacts": 150
+        }
+      },
+      "TimeoutSeconds": 900,
+      "Retry": [
+        {
+          "ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"],
+          "MaxAttempts": 1,
+          "IntervalSeconds": 60,
+          "BackoffRate": 1.0
+        }
+      ],
+      "Catch": [
+        {
+          "ErrorEquals": ["States.ALL"],
+          "Comment": "Eval observability layer — concordance failure must NOT halt the pipeline. Same Catch pattern as eval-judge / eval-rolling-mean / rationale-clustering.",
+          "Next": "SaturdayHealthCheck",
+          "ResultPath": "$.replay_concordance_error"
+        }
+      ],
+      "ResultPath": "$.replay_concordance_result",
       "Next": "SaturdayHealthCheck"
     },
 

diff --git a/tests/test_sf_eval_judge_wiring.py b/tests/test_sf_eval_judge_wiring.py
@@ -46,6 +46,8 @@ def test_all_eval_judge_states_exist(self, states):
             "EvalRollingMean",
             "CheckSkipRationaleClustering",
             "RationaleClustering",
+            "CheckSkipReplayConcordance",
+            "ReplayConcordance",
         ):
             assert name in states, f"missing SF state: {name}"
 
@@ -276,7 +278,13 @@ def test_retries_on_transient_lambda_errors(self, states):
 
 
 class TestSkipRationaleClustering:
-    def test_skip_flag_bypasses_to_health_check(self, states):
+    def test_skip_flag_bypasses_to_concordance_gate(self, states):
+        """Skipping clustering must NOT also skip concordance — they
+        are independent agent-justification signals (clustering = cross-
+        week templating; concordance = same-input cross-model agreement).
+        The skip path lands on CheckSkipReplayConcordance rather than
+        SaturdayHealthCheck so the concordance Lambda still fires
+        unless its own skip flag is set."""
         skip = states["CheckSkipRationaleClustering"]
         choice = skip["Choices"][0]
         and_clauses = choice["And"]
@@ -285,7 +293,10 @@ def test_skip_flag_bypasses_to_health_check(self, states):
             and c.get("BooleanEquals") is True
             for c in and_clauses
         )
-        assert choice["Next"] == "SaturdayHealthCheck"
+        assert choice["Next"] == "CheckSkipReplayConcordance"
+        # Critically NOT routed directly to SaturdayHealthCheck — that
+        # would bundle-skip both observability paths.
+        assert choice["Next"] != "SaturdayHealthCheck"
 
     def test_default_runs_clustering(self, states):
         assert states["CheckSkipRationaleClustering"]["Default"] == "RationaleClustering"
@@ -297,31 +308,88 @@ def test_invokes_live_alias(self, states):
         assert params["FunctionName"] == "alpha-engine-research-rationale-clustering:live"
 
     def test_payload_passes_execution_start_time(self, states):
-        # SF passes its own start time so the clustering window aligns
-        # with the SF execution date — same alignment principle as the
-        # rolling-mean state above.
         payload = states["RationaleClustering"]["Parameters"]["Payload"]
         assert payload["end_time_iso.$"] == "$$.Execution.StartTime"
 
     def test_timeout_matches_lambda_cap(self, states):
-        # Clustering Lambda is configured with timeout=600s
-        # (alpha-engine-research infrastructure/deploy.sh) — SF state
-        # TimeoutSeconds must equal that ceiling.
         assert states["RationaleClustering"]["TimeoutSeconds"] == 600
 
+    def test_success_continues_to_concordance_gate(self, states):
+        # Clustering converges to CheckSkipReplayConcordance (the gate
+        # in front of the cheap-model concordance Lambda) rather than
+        # directly to SaturdayHealthCheck.
+        assert states["RationaleClustering"]["Next"] == "CheckSkipReplayConcordance"
+
+    def test_catch_routes_to_concordance_gate_not_failure(self, states):
+        catch = states["RationaleClustering"]["Catch"][0]
+        assert catch["ErrorEquals"] == ["States.ALL"]
+        assert catch["Next"] == "CheckSkipReplayConcordance"
+        assert catch["Next"] != "HandleFailure"
+
+    def test_retries_on_transient_lambda_errors(self, states):
+        retry = states["RationaleClustering"]["Retry"][0]
+        assert "Lambda.ServiceException" in retry["ErrorEquals"]
+        assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"]
+        assert retry["MaxAttempts"] == 1
+
+
+# ── Replay concordance skip-gate + state ─────────────────────────────────
+
+
+class TestSkipReplayConcordance:
+    def test_skip_flag_bypasses_to_health_check(self, states):
+        skip = states["CheckSkipReplayConcordance"]
+        choice = skip["Choices"][0]
+        and_clauses = choice["And"]
+        assert any(
+            c.get("Variable") == "$.skip_replay_concordance"
+            and c.get("BooleanEquals") is True
+            for c in and_clauses
+        )
+        assert choice["Next"] == "SaturdayHealthCheck"
+
+    def test_default_runs_concordance(self, states):
+        assert states["CheckSkipReplayConcordance"]["Default"] == "ReplayConcordance"
+
+
+class TestReplayConcordance:
+    def test_invokes_live_alias(self, states):
+        params = states["ReplayConcordance"]["Parameters"]
+        assert params["FunctionName"] == "alpha-engine-replay-concordance:live"
+
+    def test_payload_carries_required_fields(self, states):
+        payload = states["ReplayConcordance"]["Parameters"]["Payload"]
+        # SF execution start time aligns the window with the SF run date.
+        assert payload["end_time_iso.$"] == "$$.Execution.StartTime"
+        # Default target_models pinned at the Saturday SF level so the
+        # production cadence is reproducible — operator overrides via
+        # SF input parameters when running ad-hoc against a different
+        # target model.
+        assert payload["target_models"] == ["claude-haiku-4-5"]
+        assert payload["window_days"] == 56  # 8 weeks
+        # max_artifacts cap fits the 900s Lambda timeout at ~3-5 sec
+        # per replay call (150 × 5 = 750s, comfortable).
+        assert payload["max_artifacts"] == 150
+
+    def test_timeout_matches_lambda_cap(self, states):
+        # Concordance Lambda is configured with timeout=900s
+        # (alpha-engine-backtester infrastructure/deploy_concordance.sh).
+        assert states["ReplayConcordance"]["TimeoutSeconds"] == 900
+
     def test_success_continues_to_health_check(self, states):
-        assert states["RationaleClustering"]["Next"] == "SaturdayHealthCheck"
+        assert states["ReplayConcordance"]["Next"] == "SaturdayHealthCheck"
 
     def test_catch_routes_to_health_check_not_failure(self, states):
-        # Eval is observability — clustering failures must NOT halt
-        # the pipeline (matches eval-judge + eval-rolling-mean posture).
-        catch = states["RationaleClustering"]["Catch"][0]
+        # Eval is observability — concordance failures must NOT halt
+        # the pipeline (matches eval-judge + eval-rolling-mean +
+        # rationale-clustering posture).
+        catch = states["ReplayConcordance"]["Catch"][0]
         assert catch["ErrorEquals"] == ["States.ALL"]
         assert catch["Next"] == "SaturdayHealthCheck"
         assert catch["Next"] != "HandleFailure"
 
     def test_retries_on_transient_lambda_errors(self, states):
-        retry = states["RationaleClustering"]["Retry"][0]
+        retry = states["ReplayConcordance"]["Retry"][0]
         assert "Lambda.ServiceException" in retry["ErrorEquals"]
         assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"]
         assert retry["MaxAttempts"] == 1