Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions infrastructure/deploy_step_function.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ POLICY='{
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-judge*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-eval-rolling-mean*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-research-rationale-clustering*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-replay-concordance*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-data-collector*",
"arn:aws:lambda:'"$REGION"':'"$ACCOUNT_ID"':function:alpha-engine-predictor-inference*"
]
Expand Down
3 changes: 3 additions & 0 deletions infrastructure/iam/github-actions-lambda-deploy.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-judge",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-health-check"
Expand All @@ -69,6 +70,8 @@
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-eval-rolling-mean:*",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-research-rationale-clustering:*",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-replay-concordance:*",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-predictor-inference:*",
"arn:aws:lambda:us-east-1:711398986525:function:alpha-engine-data-collector",
Expand Down
53 changes: 51 additions & 2 deletions infrastructure/step_function.json
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@
{"Variable": "$.skip_rationale_clustering", "IsPresent": true},
{"Variable": "$.skip_rationale_clustering", "BooleanEquals": true}
],
"Next": "SaturdayHealthCheck"
"Next": "CheckSkipReplayConcordance"
}
],
"Default": "RationaleClustering"
Expand Down Expand Up @@ -674,11 +674,60 @@
{
"ErrorEquals": ["States.ALL"],
"Comment": "Eval observability layer — failures must NOT halt the pipeline.",
"Next": "SaturdayHealthCheck",
"Next": "CheckSkipReplayConcordance",
"ResultPath": "$.rationale_clustering_error"
}
],
"ResultPath": "$.rationale_clustering_result",
"Next": "CheckSkipReplayConcordance"
},

"CheckSkipReplayConcordance": {
"Type": "Choice",
"Comment": "Skip-gate. {\"skip_replay_concordance\": true} bypasses the cheap-model concordance Lambda (used for ad-hoc reruns where Anthropic spend is unwanted, or while iterating on the comparison scorers in alpha-engine-backtester replay/comparison.py). Independent of skip_rationale_clustering — the two are different agent-justification signals (clustering = cross-week templating; concordance = same-input cross-model agreement). Standalone invocation of the Lambda is always available regardless of this flag.",
"Choices": [
{
"And": [
{"Variable": "$.skip_replay_concordance", "IsPresent": true},
{"Variable": "$.skip_replay_concordance", "BooleanEquals": true}
],
"Next": "SaturdayHealthCheck"
}
],
"Default": "ReplayConcordance"
},

"ReplayConcordance": {
"Type": "Task",
"Comment": "Weekly cheap-model concordance Lambda (ROADMAP P0, agent-justification gate signal #3). Replays each captured DecisionArtifact in the trailing 8-week window under the configured target model(s) via langchain_anthropic.with_structured_output against the canonical agent_schemas Pydantic contract; aggregates agreement_score per (agent_id_base, target_model); emits CW metric agent_cheap_model_concordance; persists per-target summary JSON to decision_artifacts/_replay_summary/{YYYY-MM-DD}/{target_model}.json. Default target: claude-haiku-4-5 (Sonnet→Haiku concordance ≈ \"is the larger model earning its cost?\"). Runs every Saturday after RationaleClustering.",
"Resource": "arn:aws:states:::lambda:invoke",
"Parameters": {
"FunctionName": "alpha-engine-replay-concordance:live",
"Payload": {
"end_time_iso.$": "$$.Execution.StartTime",
"target_models": ["claude-haiku-4-5"],
"window_days": 56,
"max_artifacts": 150
}
},
"TimeoutSeconds": 900,
"Retry": [
{
"ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"],
"MaxAttempts": 1,
"IntervalSeconds": 60,
"BackoffRate": 1.0
}
],
"Catch": [
{
"ErrorEquals": ["States.ALL"],
"Comment": "Eval observability layer — concordance failure must NOT halt the pipeline. Same Catch pattern as eval-judge / eval-rolling-mean / rationale-clustering.",
"Next": "SaturdayHealthCheck",
"ResultPath": "$.replay_concordance_error"
}
],
"ResultPath": "$.replay_concordance_result",
"Next": "SaturdayHealthCheck"
},

Expand Down
94 changes: 81 additions & 13 deletions tests/test_sf_eval_judge_wiring.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def test_all_eval_judge_states_exist(self, states):
"EvalRollingMean",
"CheckSkipRationaleClustering",
"RationaleClustering",
"CheckSkipReplayConcordance",
"ReplayConcordance",
):
assert name in states, f"missing SF state: {name}"

Expand Down Expand Up @@ -276,7 +278,13 @@ def test_retries_on_transient_lambda_errors(self, states):


class TestSkipRationaleClustering:
def test_skip_flag_bypasses_to_health_check(self, states):
def test_skip_flag_bypasses_to_concordance_gate(self, states):
"""Skipping clustering must NOT also skip concordance — they
are independent agent-justification signals (clustering = cross-
week templating; concordance = same-input cross-model agreement).
The skip path lands on CheckSkipReplayConcordance rather than
SaturdayHealthCheck so the concordance Lambda still fires
unless its own skip flag is set."""
skip = states["CheckSkipRationaleClustering"]
choice = skip["Choices"][0]
and_clauses = choice["And"]
Expand All @@ -285,7 +293,10 @@ def test_skip_flag_bypasses_to_health_check(self, states):
and c.get("BooleanEquals") is True
for c in and_clauses
)
assert choice["Next"] == "SaturdayHealthCheck"
assert choice["Next"] == "CheckSkipReplayConcordance"
# Critically NOT routed directly to SaturdayHealthCheck — that
# would bundle-skip both observability paths.
assert choice["Next"] != "SaturdayHealthCheck"

def test_default_runs_clustering(self, states):
assert states["CheckSkipRationaleClustering"]["Default"] == "RationaleClustering"
Expand All @@ -297,31 +308,88 @@ def test_invokes_live_alias(self, states):
assert params["FunctionName"] == "alpha-engine-research-rationale-clustering:live"

def test_payload_passes_execution_start_time(self, states):
# SF passes its own start time so the clustering window aligns
# with the SF execution date — same alignment principle as the
# rolling-mean state above.
payload = states["RationaleClustering"]["Parameters"]["Payload"]
assert payload["end_time_iso.$"] == "$$.Execution.StartTime"

def test_timeout_matches_lambda_cap(self, states):
# Clustering Lambda is configured with timeout=600s
# (alpha-engine-research infrastructure/deploy.sh) — SF state
# TimeoutSeconds must equal that ceiling.
assert states["RationaleClustering"]["TimeoutSeconds"] == 600

def test_success_continues_to_concordance_gate(self, states):
# Clustering converges to CheckSkipReplayConcordance (the gate
# in front of the cheap-model concordance Lambda) rather than
# directly to SaturdayHealthCheck.
assert states["RationaleClustering"]["Next"] == "CheckSkipReplayConcordance"

def test_catch_routes_to_concordance_gate_not_failure(self, states):
catch = states["RationaleClustering"]["Catch"][0]
assert catch["ErrorEquals"] == ["States.ALL"]
assert catch["Next"] == "CheckSkipReplayConcordance"
assert catch["Next"] != "HandleFailure"

def test_retries_on_transient_lambda_errors(self, states):
retry = states["RationaleClustering"]["Retry"][0]
assert "Lambda.ServiceException" in retry["ErrorEquals"]
assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"]
assert retry["MaxAttempts"] == 1


# ── Replay concordance skip-gate + state ─────────────────────────────────


class TestSkipReplayConcordance:
def test_skip_flag_bypasses_to_health_check(self, states):
skip = states["CheckSkipReplayConcordance"]
choice = skip["Choices"][0]
and_clauses = choice["And"]
assert any(
c.get("Variable") == "$.skip_replay_concordance"
and c.get("BooleanEquals") is True
for c in and_clauses
)
assert choice["Next"] == "SaturdayHealthCheck"

def test_default_runs_concordance(self, states):
assert states["CheckSkipReplayConcordance"]["Default"] == "ReplayConcordance"


class TestReplayConcordance:
def test_invokes_live_alias(self, states):
params = states["ReplayConcordance"]["Parameters"]
assert params["FunctionName"] == "alpha-engine-replay-concordance:live"

def test_payload_carries_required_fields(self, states):
payload = states["ReplayConcordance"]["Parameters"]["Payload"]
# SF execution start time aligns the window with the SF run date.
assert payload["end_time_iso.$"] == "$$.Execution.StartTime"
# Default target_models pinned at the Saturday SF level so the
# production cadence is reproducible — operator overrides via
# SF input parameters when running ad-hoc against a different
# target model.
assert payload["target_models"] == ["claude-haiku-4-5"]
assert payload["window_days"] == 56 # 8 weeks
# max_artifacts cap fits the 900s Lambda timeout at ~3-5 sec
# per replay call (150 × 5 = 750s, comfortable).
assert payload["max_artifacts"] == 150

def test_timeout_matches_lambda_cap(self, states):
# Concordance Lambda is configured with timeout=900s
# (alpha-engine-backtester infrastructure/deploy_concordance.sh).
assert states["ReplayConcordance"]["TimeoutSeconds"] == 900

def test_success_continues_to_health_check(self, states):
assert states["RationaleClustering"]["Next"] == "SaturdayHealthCheck"
assert states["ReplayConcordance"]["Next"] == "SaturdayHealthCheck"

def test_catch_routes_to_health_check_not_failure(self, states):
# Eval is observability — clustering failures must NOT halt
# the pipeline (matches eval-judge + eval-rolling-mean posture).
catch = states["RationaleClustering"]["Catch"][0]
# Eval is observability — concordance failures must NOT halt
# the pipeline (matches eval-judge + eval-rolling-mean +
# rationale-clustering posture).
catch = states["ReplayConcordance"]["Catch"][0]
assert catch["ErrorEquals"] == ["States.ALL"]
assert catch["Next"] == "SaturdayHealthCheck"
assert catch["Next"] != "HandleFailure"

def test_retries_on_transient_lambda_errors(self, states):
retry = states["RationaleClustering"]["Retry"][0]
retry = states["ReplayConcordance"]["Retry"][0]
assert "Lambda.ServiceException" in retry["ErrorEquals"]
assert "Lambda.TooManyRequestsException" in retry["ErrorEquals"]
assert retry["MaxAttempts"] == 1
Loading