From e7ab635685d51d80e320cbc96323491b71f22a07 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Mon, 20 Apr 2026 14:28:08 -0700 Subject: [PATCH 1/2] feat(sf): coverage-gap self-heal between Predictor and executor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the Research↔Predictor coverage gap at the orchestration layer (Phase 2). Pairs with alpha-engine-predictor PR #42 (--tickers flag + check_coverage action) and alpha-engine PR #72 (executor read-time guard). Problem ------- 2026-04-20: executor daemon bought SNDK/WDC/BIIB/XEL at market open despite 7 buy_candidates having no prediction row. GBM veto gate was structurally unreachable for those tickers (no prediction → no veto). 4 of 5 live entries (~80% of capital) routed around a risk control. Architecture ------------ The invariant is "every buy_candidate must have a prediction before the executor sees signals.json". Previously enforced nowhere. Now enforced in two layers: - **Self-heal (this PR):** PredictorInference → CheckPredictorCoverage → (unless has_gap=false) ReinvokePredictor with tickers=missing → RecheckCoverage → (unless has_gap=false) HandleFailure. Single retry — no infinite loop. Both Choice gates fail closed: only a literal has_gap=false advances to PredictorHealthCheck, so a malformed check_coverage payload is never treated as full coverage. - **Defense-in-depth (predictor #42 + executor #72):** both predictor write-time and executor read-time refuse to proceed on a coverage gap. These fire if the self-heal mechanism above ever regresses. State graph added ----------------- PredictorInference └→ CheckPredictorCoverage (new, Lambda action=check_coverage) └→ CoverageGapChoice (new) ├─ has_gap=false → PredictorHealthCheck └─ default → ReinvokePredictor (new, Lambda action=predict + tickers=$.coverage_result.Payload.missing_tickers) └→ RecheckCoverage (new) └→ FinalCoverageGate (new) ├─ has_gap=false → PredictorHealthCheck └─ default → HandleFailure All state references validated: 24 states total, no missing Next targets, no unreachable states. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- infrastructure/step_function_daily.json | 119 +++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/infrastructure/step_function_daily.json b/infrastructure/step_function_daily.json index ad28eef..6ecddf8 100644 --- a/infrastructure/step_function_daily.json +++ b/infrastructure/step_function_daily.json @@ -218,7 +218,124 @@ } ], "ResultPath": "$.predictor_result", - "Next": "PredictorHealthCheck" + "Next": "CheckPredictorCoverage" + }, + + "CheckPredictorCoverage": { + "Type": "Task", + "Comment": "Self-healing coverage gate: detect buy_candidates in signals.json that lack a prediction row. Primary guard against the 2026-04-20 silent-veto-bypass bug. If any are missing, ReinvokePredictor scores them and RecheckCoverage verifies before we proceed to executor.", + "Resource": "arn:aws:states:::lambda:invoke", + "Parameters": { + "FunctionName": "alpha-engine-predictor-inference:live", + "Payload": { + "action": "check_coverage" + } + }, + "TimeoutSeconds": 60, + "Retry": [ + { + "ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"], + "MaxAttempts": 1, + "IntervalSeconds": 15, + "BackoffRate": 1.0 + } + ], + "Catch": [ + { + "ErrorEquals": ["States.ALL"], + "Next": "HandleFailure", + "ResultPath": "$.error" + } + ], + "ResultPath": "$.coverage_result", + "Next": "CoverageGapChoice" + }, + + "CoverageGapChoice": { + "Type": "Choice", + "Comment": "Fail-closed gate: only an explicit has_gap=false advances to PredictorHealthCheck. Any other value (true, or a malformed non-boolean) re-invokes the predictor with the missing tickers; FinalCoverageGate re-verifies afterwards.", + "Choices": [ + { + "Variable": "$.coverage_result.Payload.has_gap", + "BooleanEquals": false, + "Next": "PredictorHealthCheck" + } + ], + "Default": "ReinvokePredictor" + }, + + "ReinvokePredictor": { + "Type": "Task", + "Comment": "Supplemental-scoring re-invoke for the tickers CheckPredictorCoverage flagged as missing. 
Predictor merges the new predictions into predictions/{date}.json (re-ranks the union).", + "Resource": "arn:aws:states:::lambda:invoke", + "Parameters": { + "FunctionName": "alpha-engine-predictor-inference:live", + "Payload": { + "action": "predict", + "tickers.$": "$.coverage_result.Payload.missing_tickers" + } + }, + "TimeoutSeconds": 900, + "Retry": [ + { + "ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"], + "MaxAttempts": 1, + "IntervalSeconds": 60, + "BackoffRate": 1.0 + } + ], + "Catch": [ + { + "ErrorEquals": ["States.ALL"], + "Next": "HandleFailure", + "ResultPath": "$.error" + } + ], + "ResultPath": "$.reinvoke_result", + "Next": "RecheckCoverage" + }, + + "RecheckCoverage": { + "Type": "Task", + "Comment": "Second coverage check after ReinvokePredictor. If the gap persists (e.g. some ticker genuinely cannot be scored), bail out instead of looping — HandleFailure fires the SNS alert.", + "Resource": "arn:aws:states:::lambda:invoke", + "Parameters": { + "FunctionName": "alpha-engine-predictor-inference:live", + "Payload": { + "action": "check_coverage" + } + }, + "TimeoutSeconds": 60, + "Retry": [ + { + "ErrorEquals": ["Lambda.ServiceException", "Lambda.TooManyRequestsException"], + "MaxAttempts": 1, + "IntervalSeconds": 15, + "BackoffRate": 1.0 + } + ], + "Catch": [ + { + "ErrorEquals": ["States.ALL"], + "Next": "HandleFailure", + "ResultPath": "$.error" + } + ], + "ResultPath": "$.coverage_recheck_result", + "Next": "FinalCoverageGate" + }, + + "FinalCoverageGate": { + "Type": "Choice", + "Comment": "After one re-invocation attempt, either coverage is complete (proceed) or we have a deeper problem (fail). 
No further retry loop — prevents runaway invocations if a ticker is un-scorable by the model. Fail-closed: only an explicit has_gap=false proceeds to PredictorHealthCheck; any other value routes to HandleFailure.", + "Choices": [ + { + "Variable": "$.coverage_recheck_result.Payload.has_gap", + "BooleanEquals": false, + "Next": "PredictorHealthCheck" + } + ], + "Default": "HandleFailure" }, "PredictorHealthCheck": { From 71e4116b17fddea0ef630226a1c8aa11d61736c0 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Mon, 20 Apr 2026 14:39:04 -0700 Subject: [PATCH 2/2] feat(cf): CloudWatch alarm on unscored_buy_candidates_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 (CW alarm) of the Research↔Predictor coverage-gap closure. Bundled into the same PR as Phase 2 (SF self-heal) since both are infra living in this repo. New alarm --------- - Namespace: AlphaEngine/Predictor - MetricName: unscored_buy_candidates_count - Emitted by executor's signal_reader on every run (0 on success, >0 on gap) - Threshold: Maximum ≥ 1 over any 1-hour window - Action: existing alpha-engine-alerts SNS topic - TreatMissingData: notBreaching (the executor runs M-F only and not off-hours, so missing datapoints are expected) Semantics --------- A positive value means the SF self-heal (CheckPredictorCoverage → ReinvokePredictor) failed to close the gap before the executor read predictions.json — either orchestration regressed or a ticker is genuinely un-scorable. Long-term regression guard for the coverage invariant. cfn-lint clean (only pre-existing W2001 warnings on unused parameters). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../alpha-engine-orchestration.yaml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/infrastructure/cloudformation/alpha-engine-orchestration.yaml b/infrastructure/cloudformation/alpha-engine-orchestration.yaml index 96b6bf8..6b7cbbc 100644 --- a/infrastructure/cloudformation/alpha-engine-orchestration.yaml +++ b/infrastructure/cloudformation/alpha-engine-orchestration.yaml @@ -274,6 +274,29 @@ Resources: AlarmActions: - !Ref AlertsTopic + # Research↔Predictor coverage gap. Emitted by the executor's signal_reader + # on every run (value 0 on success, >0 when buy_candidates have no + # prediction row). A positive value means the weekday Step Function's + # coverage-gap self-heal (CheckPredictorCoverage → ReinvokePredictor) also + # failed to close the gap before the executor read predictions.json — + # indicates a regression of that orchestration. First observed as a silent + # veto bypass on 2026-04-20 (4 of 5 entries bypassed the GBM veto). + UnscoredBuyCandidatesGap: + Type: AWS::CloudWatch::Alarm + Properties: # NOTE(review): no Dimensions are declared — assumes the metric is published without dimensions; confirm against the emitter + AlarmName: alpha-engine-predictor-unscored-buy-candidates + AlarmDescription: Executor saw buy_candidates without a prediction row — Step Function coverage-gap self-heal may have regressed + Namespace: AlphaEngine/Predictor + MetricName: unscored_buy_candidates_count + Statistic: Maximum # one datapoint >= 1 inside the period is enough to alarm + Period: 3600 # 1 hour + EvaluationPeriods: 1 + Threshold: 1 # paired with GreaterThanOrEqualToThreshold: alarm when Maximum >= 1 + ComparisonOperator: GreaterThanOrEqualToThreshold + TreatMissingData: notBreaching # executor not running off-hours is expected + AlarmActions: + - !Ref AlertsTopic + # Heartbeat alarms for EC2-based processes ExecutorMorningHeartbeat: Type: AWS::CloudWatch::Alarm