From da8ed4d808b2c2c260e3c7df40375ec7805b4d38 Mon Sep 17 00:00:00 2001 From: doquanghuy Date: Thu, 21 May 2026 21:34:03 +0700 Subject: [PATCH] feat(workflows): add `continue_on_error` step field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2591. Adds an optional `continue_on_error: bool` field on every step. When set to `true` and the step fails, the engine records the result (exit_code, stderr, status) into `steps.<id>.output` and continues to the next sibling step instead of halting the run. Downstream `if`, `switch`, or `gate` steps can then branch on `{{ steps.<id>.output.exit_code }}` to route the recovery path. This composes with primitives that already exist (the exit code is already captured, the expression engine already resolves it, and `if`/`switch`/`gate` are already available) — the only gap was that a non-zero exit hard-stopped the pipeline before any downstream step could evaluate it. ### Engine `WorkflowEngine._execute_steps` now consults the step config when a step returns `StepStatus.FAILED`: - Gate aborts (`output.aborted`) always halt the run — operator decisions take precedence over the flag. - Otherwise, if `continue_on_error: true`, log a `step_continue_on_error` event and proceed to the next sibling. - Otherwise, behave as before: set `RunStatus.FAILED` and return. ### Validation `_validate_steps` rejects non-bool values for `continue_on_error`. Coerced strings like `"true"` are not accepted so authoring mistakes surface at validation time rather than silently changing run semantics. ### Default behaviour preserved When `continue_on_error` is omitted, every code path is byte-equivalent to before this change. Existing workflows see no difference. ### Tests New `TestContinueOnError` class in `tests/test_workflows.py` covers all four scenarios from the issue's acceptance criteria plus two extras: - undeclared (default) failure halts the run. - declared-and-fired continues past the failure. 
- declared-but-step-succeeded is a no-op (flag only matters on FAILED). - if-branch end-to-end exercising the canonical recovery pattern from the issue discussion. - gate abort still halts even with `continue_on_error: true` set. - validation rejects non-bool values; accepts both `true` and `false` cleanly. ### Docs Adds an "Error Handling" section to `workflows/README.md` documenting the field, the gate-abort precedence rule, and the canonical recovery pattern. ### Follow-on Auto-retry-on-transient (e.g. retry a 429 at 3 AM without operator attendance) is intentionally out of scope. The current proposal covers the **skip** and **abort** verdicts from the original discussion; the **retry** verdict still pauses for an operator at the gate step. A future loop/retry-count primitive or an auto-approving gate could close that gap on top of this mechanism without further engine changes. --- src/specify_cli/workflows/engine.py | 45 ++++- tests/test_workflows.py | 250 ++++++++++++++++++++++++++++ workflows/README.md | 39 +++++ 3 files changed, 330 insertions(+), 4 deletions(-) diff --git a/src/specify_cli/workflows/engine.py b/src/specify_cli/workflows/engine.py index 934cfbe5ee..dd5b58eeaf 100644 --- a/src/specify_cli/workflows/engine.py +++ b/src/specify_cli/workflows/engine.py @@ -231,6 +231,20 @@ def _validate_steps( step_errors = step_impl.validate(step_config) errors.extend(step_errors) + # Validate optional `continue_on_error` field. The engine honours + # this on any step that returns FAILED so the pipeline can route + # around the failure via downstream `if`/`switch`/`gate`. The + # field must be a literal boolean — coercion from truthy strings + # is deliberately not supported so authoring mistakes surface + # at validation time rather than silently changing run semantics. 
+ if "continue_on_error" in step_config: + coe = step_config["continue_on_error"] + if not isinstance(coe, bool): + errors.append( + f"Step {step_id!r}: 'continue_on_error' must be a " + f"boolean, got {type(coe).__name__}." + ) + # Recursively validate nested steps for nested_key in ("then", "else", "steps"): nested = step_config.get(nested_key) @@ -622,7 +636,10 @@ def _execute_steps( # Handle failures if result.status == StepStatus.FAILED: - # Gate abort (output.aborted) maps to ABORTED status + # Gate abort (output.aborted) maps to ABORTED status. + # Aborts are deliberate operator decisions, so + # `continue_on_error` does NOT override them — that flag + # is for transient/expected step failures only. if result.output.get("aborted"): state.status = RunStatus.ABORTED state.append_log( @@ -631,15 +648,35 @@ def _execute_steps( "step_id": step_id, } ) - else: - state.status = RunStatus.FAILED + state.save() + return + + # `continue_on_error: true` lets the pipeline route + # around the failure instead of halting. The step + # result (including exit_code, stderr, status) is + # still recorded so downstream `if`/`switch`/`gate` + # steps can branch on it. Log a single, unambiguous + # event per failure resolution — either the run + # continued past it, or it halted. 
+ if step_config.get("continue_on_error"): state.append_log( { - "event": "step_failed", + "event": "step_continue_on_error", "step_id": step_id, "error": result.error, } ) + state.save() + continue + + state.status = RunStatus.FAILED + state.append_log( + { + "event": "step_failed", + "step_id": step_id, + "error": result.error, + } + ) state.save() return diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 3fa71f3404..7cd9cdbb0e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1890,6 +1890,256 @@ def test_validate_workflow_rejects_non_string_default_for_string_type(self): assert any("invalid default" in e for e in errors), errors +# ===== continue_on_error Tests ===== +# +# Locks the contract documented in workflows/README.md "Error Handling" +# section: when an executable step fails and `continue_on_error: true` +# is declared, the engine records the result (exit_code, stderr, status) +# and continues to the next sibling step instead of halting the run. +# Gate aborts (`output.aborted`) still halt regardless of the flag. + + +class TestContinueOnError: + """Test the `continue_on_error` step-level field.""" + + def test_undeclared_failure_halts_run(self, project_dir): + """Default behaviour (no `continue_on_error`): a failing step + halts the workflow run with `status == FAILED`. + + Locks the byte-equivalent default — workflows that do not + declare the flag must behave exactly as before this feature. 
+ """ + from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine + from specify_cli.workflows.base import RunStatus + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "halt-on-fail" + name: "Halt On Fail" + version: "1.0.0" +steps: + - id: fail-step + type: shell + run: "exit 7" + - id: after + type: shell + run: "echo should-not-run" +""") + engine = WorkflowEngine(project_dir) + state = engine.execute(definition) + + assert state.status == RunStatus.FAILED + assert "fail-step" in state.step_results + assert state.step_results["fail-step"]["output"]["exit_code"] == 7 + # Subsequent step never executes when the flag is absent. + assert "after" not in state.step_results + + def test_declared_and_fired_continues_run(self, project_dir): + """`continue_on_error: true` + failing step: the run keeps + going, the failed step's result is recorded, and the + downstream step runs. + """ + from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine + from specify_cli.workflows.base import RunStatus + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "continue-past-fail" + name: "Continue Past Fail" + version: "1.0.0" +steps: + - id: flaky-step + type: shell + run: "exit 42" + continue_on_error: true + - id: after + type: shell + run: "echo did-run" +""") + engine = WorkflowEngine(project_dir) + state = engine.execute(definition) + + assert state.status == RunStatus.COMPLETED + # Failed step's exit_code is preserved so downstream branching + # can inspect it. + assert state.step_results["flaky-step"]["output"]["exit_code"] == 42 + assert state.step_results["flaky-step"]["status"] == "failed" + # Downstream step ran successfully. 
+ assert state.step_results["after"]["output"]["exit_code"] == 0 + + def test_declared_but_step_succeeded_is_noop(self, project_dir): + """`continue_on_error: true` on a step that succeeds is a + no-op — the flag only changes behaviour on FAILED status. + """ + from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine + from specify_cli.workflows.base import RunStatus + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "flag-but-success" + name: "Flag But Success" + version: "1.0.0" +steps: + - id: ok-step + type: shell + run: "echo ok" + continue_on_error: true + - id: after + type: shell + run: "echo done" +""") + engine = WorkflowEngine(project_dir) + state = engine.execute(definition) + + assert state.status == RunStatus.COMPLETED + assert state.step_results["ok-step"]["status"] == "completed" + assert state.step_results["ok-step"]["output"]["exit_code"] == 0 + assert state.step_results["after"]["output"]["exit_code"] == 0 + + def test_if_branch_routes_around_failure(self, project_dir): + """End-to-end: `continue_on_error` + `if` cleanly routes around + a failure. The recovery branch runs; the success branch does + not. + + Mirrors the canonical usage pattern from the original feature + discussion in issue #2591. 
+ """ + from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine + from specify_cli.workflows.base import RunStatus + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "route-around" + name: "Route Around Failure" + version: "1.0.0" +steps: + - id: heavy-thing + type: shell + run: "exit 1" + continue_on_error: true + - id: check-result + type: if + condition: "{{ steps.heavy-thing.output.exit_code != 0 }}" + then: + - id: recovery + type: shell + run: "echo recovery-ran" + else: + - id: happy-path + type: shell + run: "echo happy-path-ran" +""") + engine = WorkflowEngine(project_dir) + state = engine.execute(definition) + + assert state.status == RunStatus.COMPLETED + assert "recovery" in state.step_results + assert "happy-path" not in state.step_results + + def test_gate_abort_still_halts_with_continue_on_error( + self, project_dir, monkeypatch + ): + """`continue_on_error` does NOT override a deliberate gate + abort. `output.aborted` always halts the run with + `status == ABORTED`. + + Aborts are explicit operator decisions; continue_on_error + is for transient/expected step failures only. + """ + from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine + from specify_cli.workflows.base import RunStatus + from specify_cli.workflows.steps.gate import GateStep + from specify_cli.workflows.steps import gate as gate_module + + # Force the gate step into interactive mode and feed a "reject" + # choice so the abort path actually runs in the test env + # (default behaviour returns PAUSED when stdin is not a TTY). + monkeypatch.setattr(gate_module.sys.stdin, "isatty", lambda: True) + monkeypatch.setattr( + GateStep, "_prompt", staticmethod(lambda _msg, _opts: "reject") + ) + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "gate-abort-halts" + name: "Gate Abort Halts" + version: "1.0.0" +steps: + - id: gate-step + type: gate + message: "Approve?" 
+ options: [approve, reject] + on_reject: abort + continue_on_error: true + - id: should-not-run + type: shell + run: "echo nope" +""") + engine = WorkflowEngine(project_dir) + state = engine.execute(definition) + + assert state.status == RunStatus.ABORTED + assert "should-not-run" not in state.step_results + + def test_validation_rejects_non_bool_continue_on_error(self): + """`continue_on_error` must be a literal boolean; coerced + strings like `"true"` are rejected at validation time so + authoring mistakes surface before execution. + """ + from specify_cli.workflows.engine import ( + WorkflowDefinition, + validate_workflow, + ) + + definition = WorkflowDefinition.from_string(""" +schema_version: "1.0" +workflow: + id: "bad-coe" + name: "Bad COE" + version: "1.0.0" +steps: + - id: step-one + type: shell + run: "true" + continue_on_error: "true" +""") + errors = validate_workflow(definition) + assert any( + "continue_on_error" in e and "boolean" in e for e in errors + ), errors + + def test_validation_accepts_bool_continue_on_error(self): + """Boolean values pass validation cleanly.""" + from specify_cli.workflows.engine import ( + WorkflowDefinition, + validate_workflow, + ) + + for value in (True, False): + yaml_value = "true" if value else "false" + definition = WorkflowDefinition.from_string(f""" +schema_version: "1.0" +workflow: + id: "good-coe" + name: "Good COE" + version: "1.0.0" +steps: + - id: step-one + type: shell + run: "true" + continue_on_error: {yaml_value} +""") + errors = validate_workflow(definition) + assert not any( + "continue_on_error" in e for e in errors + ), errors + + # ===== State Persistence Tests ===== class TestRunState: diff --git a/workflows/README.md b/workflows/README.md index 31f736ff76..ff0812ea62 100644 --- a/workflows/README.md +++ b/workflows/README.md @@ -219,6 +219,45 @@ Aggregate results from fan-out steps: output: {} ``` +## Error Handling + +By default, a non-zero exit code from any step halts the entire run. 
+Set `continue_on_error: true` on a step to record its result and +continue to the next sibling step instead. The exit code remains +available on `steps.<id>.output.exit_code` so downstream `if`, +`switch`, or `gate` steps can branch on it: + +```yaml +- id: heavy-thing + type: command + integration: claude + command: speckit.heavy-thing + continue_on_error: true + +- id: check-result + type: if + condition: "{{ steps.heavy-thing.output.exit_code != 0 }}" + then: + - id: review + type: gate + message: "Step failed (exit {{ steps.heavy-thing.output.exit_code }}). Retry or skip?" + on_reject: skip + else: + - id: next-thing + command: speckit.next-thing +``` + +**Notes:** + +- The field must be a literal boolean (`true` / `false`); coerced + strings like `"true"` are rejected at validation time. +- Gate aborts (`on_reject: abort` chosen by the operator) always halt + the run — `continue_on_error` does not override them. The flag is + for transient/expected step failures, not for overriding deliberate + operator decisions. +- When the flag is omitted, behaviour is byte-equivalent to before + this feature. + ## Expressions Workflow definitions use `{{ expression }}` syntax for dynamic values: