Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions infrastructure/cloudformation/alpha-engine-orchestration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,29 @@ Resources:
AlarmActions:
- !Ref AlertsTopic

# Alarm on a Research↔Predictor coverage gap.
#
# Metric source: the executor's signal_reader emits this metric on every run
# (value 0 on success, >0 when buy_candidates have no prediction row).
#
# Why a positive value matters: it means the weekday Step Function's
# coverage-gap self-heal (CheckPredictorCoverage → ReinvokePredictor) also
# failed to close the gap before the executor read predictions.json, which
# indicates a regression of that orchestration.
#
# History: first observed as a silent veto bypass on 2026-04-20 (4 of 5
# entries bypassed the GBM veto).
UnscoredBuyCandidatesGap:
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: alpha-engine-predictor-unscored-buy-candidates
    AlarmDescription: >-
      Executor saw buy_candidates without a prediction row — Step Function
      coverage-gap self-heal may have regressed
    # Metric under watch.
    Namespace: AlphaEngine/Predictor
    MetricName: unscored_buy_candidates_count
    # Look at the largest value reported within a single 1-hour window.
    Statistic: Maximum
    Period: 3600
    EvaluationPeriods: 1
    # Alarm as soon as that value reaches 1.
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 1
    # The executor not running off-hours is expected, so missing data must
    # not be treated as a breach.
    TreatMissingData: notBreaching
    AlarmActions:
      - !Ref AlertsTopic

# Heartbeat alarms for EC2-based processes
ExecutorMorningHeartbeat:
Type: AWS::CloudWatch::Alarm
Expand Down
119 changes: 118 additions & 1 deletion infrastructure/step_function_daily.json
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,124 @@
}
],
"ResultPath": "$.predictor_result",
"Next": "PredictorHealthCheck"
"Next": "CheckPredictorCoverage"
},

"CheckPredictorCoverage": {
  "Comment": "Self-healing coverage gate: detect buy_candidates in signals.json that lack a prediction row. Primary guard against the 2026-04-20 silent-veto-bypass bug. If any are missing, ReinvokePredictor scores them and RecheckCoverage verifies before we proceed to executor.",
  "Type": "Task",
  "Resource": "arn:aws:states:::lambda:invoke",
  "TimeoutSeconds": 60,
  "Parameters": {
    "FunctionName": "alpha-engine-predictor-inference:live",
    "Payload": { "action": "check_coverage" }
  },
  "ResultPath": "$.coverage_result",
  "Retry": [
    {
      "ErrorEquals": [
        "Lambda.ServiceException",
        "Lambda.TooManyRequestsException"
      ],
      "IntervalSeconds": 15,
      "BackoffRate": 1.0,
      "MaxAttempts": 1
    }
  ],
  "Catch": [
    {
      "ErrorEquals": ["States.ALL"],
      "ResultPath": "$.error",
      "Next": "HandleFailure"
    }
  ],
  "Next": "CoverageGapChoice"
},

"CoverageGapChoice": {
  "Type": "Choice",
  "Comment": "Fail-closed routing on the check_coverage result. If has_gap is boolean true and missing_tickers is present, re-invoke the predictor with --tickers. If has_gap is boolean false, advance to PredictorHealthCheck. Any other shape (has_gap missing or not a boolean, or a gap reported without missing_tickers) goes to HandleFailure so a malformed response is alerted on instead of skipping the gate or aborting the execution inside this Choice state.",
  "Choices": [
    {
      "And": [
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "IsPresent": true
        },
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "IsBoolean": true
        },
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "BooleanEquals": true
        },
        {
          "Variable": "$.coverage_result.Payload.missing_tickers",
          "IsPresent": true
        }
      ],
      "Next": "ReinvokePredictor"
    },
    {
      "And": [
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "IsPresent": true
        },
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "IsBoolean": true
        },
        {
          "Variable": "$.coverage_result.Payload.has_gap",
          "BooleanEquals": false
        }
      ],
      "Next": "PredictorHealthCheck"
    }
  ],
  "Default": "HandleFailure"
},

"ReinvokePredictor": {
  "Comment": "Supplemental-scoring re-invoke for the tickers CheckPredictorCoverage flagged as missing. Predictor merges the new predictions into predictions/{date}.json (re-ranks the union).",
  "Type": "Task",
  "Resource": "arn:aws:states:::lambda:invoke",
  "TimeoutSeconds": 900,
  "Parameters": {
    "FunctionName": "alpha-engine-predictor-inference:live",
    "Payload": {
      "tickers.$": "$.coverage_result.Payload.missing_tickers",
      "action": "predict"
    }
  },
  "ResultPath": "$.reinvoke_result",
  "Retry": [
    {
      "ErrorEquals": [
        "Lambda.ServiceException",
        "Lambda.TooManyRequestsException"
      ],
      "IntervalSeconds": 60,
      "BackoffRate": 1.0,
      "MaxAttempts": 1
    }
  ],
  "Catch": [
    {
      "ErrorEquals": ["States.ALL"],
      "ResultPath": "$.error",
      "Next": "HandleFailure"
    }
  ],
  "Next": "RecheckCoverage"
},

"RecheckCoverage": {
  "Comment": "Second coverage check after ReinvokePredictor. If the gap persists (e.g. some ticker genuinely cannot be scored), bail out instead of looping — HandleFailure fires the SNS alert.",
  "Type": "Task",
  "Resource": "arn:aws:states:::lambda:invoke",
  "TimeoutSeconds": 60,
  "Parameters": {
    "FunctionName": "alpha-engine-predictor-inference:live",
    "Payload": { "action": "check_coverage" }
  },
  "ResultPath": "$.coverage_recheck_result",
  "Retry": [
    {
      "ErrorEquals": [
        "Lambda.ServiceException",
        "Lambda.TooManyRequestsException"
      ],
      "IntervalSeconds": 15,
      "BackoffRate": 1.0,
      "MaxAttempts": 1
    }
  ],
  "Catch": [
    {
      "ErrorEquals": ["States.ALL"],
      "ResultPath": "$.error",
      "Next": "HandleFailure"
    }
  ],
  "Next": "FinalCoverageGate"
},

"FinalCoverageGate": {
  "Type": "Choice",
  "Comment": "After one re-invocation attempt, either coverage is confirmed complete (has_gap is present, is a boolean, and is false: proceed) or we have a deeper problem (fail). Anything other than an explicit has_gap=false, including a persisting gap, a missing field or a non-boolean value, goes to HandleFailure, so the gate fails closed. No further retry loop — prevents runaway invocations if a ticker is un-scorable by the model.",
  "Choices": [
    {
      "And": [
        {
          "Variable": "$.coverage_recheck_result.Payload.has_gap",
          "IsPresent": true
        },
        {
          "Variable": "$.coverage_recheck_result.Payload.has_gap",
          "IsBoolean": true
        },
        {
          "Variable": "$.coverage_recheck_result.Payload.has_gap",
          "BooleanEquals": false
        }
      ],
      "Next": "PredictorHealthCheck"
    }
  ],
  "Default": "HandleFailure"
},

"PredictorHealthCheck": {
Expand Down
Loading