feat: add allowFailure option for workflow steps (#6114)

* feat: add an allowFailure field to WorkflowStepSpec * do not fail the entire workflow if a step labeled with allowFailure throws an error Signed-off-by: Alex Johnson <hello@alex-johnson.net> * improvement: improved error logging and standardised output on failure Co-authored-by: Anna Mager <anna@garden.io> --------- Signed-off-by: Alex Johnson <hello@alex-johnson.net> Co-authored-by: Steffen Neubauer <steffen@garden.io> Co-authored-by: Anna Mager <anna@garden.io>
garden-io · Jun 18, 2024 · 70b8ca8 · 70b8ca8
1 parent 7e8aefd
commit 70b8ca8
Show file tree

Hide file tree

Showing 9 changed files with 189 additions and 49 deletions.
diff --git a/core/src/commands/workflow.ts b/core/src/commands/workflow.ts
@@ -146,8 +146,6 @@ export class WorkflowCommand extends Command<Args, {}> {
         bodyLog: stepBodyLog,
       }
 
-      let stepResult: CommandResult
-
       garden.events.emit("workflowStepProcessing", { index })
       const stepTemplateContext = new WorkflowStepConfigContext({
         allStepNames,
@@ -161,6 +159,17 @@ export class WorkflowCommand extends Command<Args, {}> {
 
       const initSaveLogState = stepBodyLog.root.storeEntries
       stepBodyLog.root.storeEntries = true
+
+      if ((!step.command && !step.script) || (step.command && step.script)) {
+        garden.events.emit("workflowStepError", getStepEndEvent(index, stepStartedAt))
+        // This should be caught by the validation layer
+        throw new InternalError({
+          message: `Workflow steps must specify either a command or a script. Got: ${JSON.stringify(step)}`,
+        })
+      }
+
+      let stepResult: CommandResult | undefined
+
       try {
         if (step.command) {
           step.command = resolveTemplateStrings({
@@ -174,25 +183,31 @@ export class WorkflowCommand extends Command<Args, {}> {
           step.script = resolveTemplateString({ string: step.script, context: stepTemplateContext })
           stepResult = await runStepScript(stepParams)
         } else {
-          garden.events.emit("workflowStepError", getStepEndEvent(index, stepStartedAt))
-          // This should be caught by the validation layer
-          throw new InternalError({
-            message: `Workflow steps must specify either a command or a script. Got: ${JSON.stringify(step)}`,
-          })
+          stepResult = undefined
         }
       } catch (rawErr) {
         const err = toGardenError(rawErr)
+
         garden.events.emit("workflowStepError", getStepEndEvent(index, stepStartedAt))
         stepErrors[index] = [err]
         printStepDuration({ ...stepParams, success: false })
-        logErrors(stepBodyLog, [err], index, steps.length, step.description)
-        // There may be succeeding steps with `when: onError` or `when: always`, so we continue.
-        continue
+        // runStepCommand and runStepScript should not throw. If that happens it's either a bug (e.g. InternalError) or a user-error (e.g. TemplateError)
+        // In these cases we do not continue workflow execution, even when continueOnError is true and even when the following steps declared to run `when: onError` or `when: always`.
+        const continueOnError = false
+        logErrors(stepBodyLog, [err], index, steps.length, continueOnError, step.description)
+        break
+      }
+
+      if (stepResult === undefined) {
+        throw new InternalError({
+          message: `Workflow step did not return stepResult. Step: ${JSON.stringify(step)}`,
+        })
       }
 
       // Extract the text from the body log entry, info-level and higher
       const stepLog = stepBodyLog.toString((entry) => entry.level <= LogLevel.info)
 
+      // TODO: add step conclusion, so following steps can be aware of the error if step.continueOnError is true.
       result.steps[stepName] = {
         number: index + 1,
         outputs: stepResult.result || {},
@@ -201,9 +216,15 @@ export class WorkflowCommand extends Command<Args, {}> {
       stepBodyLog.root.storeEntries = initSaveLogState
 
       if (stepResult.errors && stepResult.errors.length > 0) {
+        logErrors(outerLog, stepResult.errors, index, steps.length, step.continueOnError || false, step.description)
+
+        // If we ignore errors
         garden.events.emit("workflowStepError", getStepEndEvent(index, stepStartedAt))
-        logErrors(outerLog, stepResult.errors, index, steps.length, step.description)
-        stepErrors[index] = stepResult.errors
+
+        if (!step.continueOnError) {
+          stepErrors[index] = stepResult.errors
+        }
+
         // There may be succeeding steps with `when: onError` or `when: always`, so we continue.
         continue
       }
@@ -383,10 +404,22 @@ export async function runStepCommand(params: RunStepCommandParams): Promise<Comm
   return await command.action(commandParams)
 }
 
-export async function runStepScript({ garden, bodyLog, step }: RunStepParams): Promise<CommandResult<any>> {
+export async function runStepScript({
+  garden,
+  bodyLog,
+  step,
+}: RunStepParams): Promise<
+  CommandResult<{ exitCode: number | undefined; stdout: string | undefined; stderr: string | undefined }>
+> {
   try {
-    await runScript({ log: bodyLog, cwd: garden.projectRoot, script: step.script!, envVars: step.envVars })
-    return { result: {} }
+    const res = await runScript({ log: bodyLog, cwd: garden.projectRoot, script: step.script!, envVars: step.envVars })
+    return {
+      result: {
+        exitCode: res.exitCode,
+        stdout: res.stdout,
+        stderr: res.stderr,
+      },
+    }
   } catch (err) {
     // Unexpected error (failed to execute script, as opposed to script returning an error code)
     if (!(err instanceof ChildProcessError)) {
@@ -400,12 +433,15 @@ export async function runStepScript({ garden, bodyLog, step }: RunStepParams): P
       stderr: err.details.stderr,
     })
 
-    bodyLog.error("")
-    bodyLog.error({ msg: `Script failed with the following error:`, error: scriptError })
-    bodyLog.error("")
-    bodyLog.error(err.details.stderr)
-
-    throw scriptError
+    // We return the error here because we want to separately handle unexpected internal errors (like syntax errors) and user error (like script failure).
+    return {
+      result: {
+        exitCode: err.details.code,
+        stdout: err.details.stdout,
+        stderr: err.details.stderr,
+      },
+      errors: [scriptError],
+    }
   }
 }
 
@@ -459,24 +495,31 @@ export function logErrors(
   errors: GardenError[],
   stepIndex: number,
   stepCount: number,
+  continueOnError: boolean,
   stepDescription?: string
 ) {
   const description = formattedStepDescription(stepIndex, stepCount, stepDescription)
-  const errMsg = `An error occurred while running step ${styles.accent(description)}.\n`
-  log.error(errMsg)
-  log.debug("")
+  const allowedToFailMessage = `Because ${styles.bold("continueOnError")} is ${styles.bold(true)}, the workflow will continue as if the step succeeded.`
+  const errMsg = `\nAn error occurred while running step ${styles.accent(description)}.${continueOnError ? ` ${allowedToFailMessage}` : ``}\n`
+
+  const logFn: typeof log.warn | typeof log.error = (...args) =>
+    continueOnError ? log.warn(...args) : log.error(...args)
+
+  logFn(errMsg)
   for (const error of errors) {
     if (error instanceof WorkflowScriptError) {
-      const scriptErrMsg = renderMessageWithDivider({
-        prefix: `Script exited with code ${error.details.exitCode} ${renderDuration(log.getDuration())}`,
-        msg: error.explain(),
-        isError: true,
-      })
-      log.error(scriptErrMsg)
+      logFn(
+        renderMessageWithDivider({
+          prefix: `Script exited with code ${error.details.exitCode} ${renderDuration(log.getDuration())}. This is the stderr output:`,
+          msg: error.details.stderr || error.details.output,
+          isError: !continueOnError,
+        })
+      )
+      logFn("")
     } else {
-      const taskDetailErrMsg = error.toString(true)
+      const taskDetailErrMsg = error.toString(false)
       log.debug(taskDetailErrMsg)
-      log.error(error.explain() + "\n")
+      logFn(error.message + "\n\n")
     }
   }
 }

diff --git a/core/src/config/template-contexts/workflow.ts b/core/src/config/template-contexts/workflow.ts
@@ -29,7 +29,7 @@ class WorkflowStepContext extends ConfigContext {
       // TODO: populate and then link to command docs
       .description(
         dedent`
-        The outputs returned by the step, as a mapping. Script steps will always have \`stdout\` and \`stderr\` keys.
+        The outputs returned by the step, as a mapping. Script steps will always have \`stdout\`, \`stderr\` and \`exitCode\` keys.
         Command steps return different keys, including potentially nested maps and arrays. Please refer to each command
         for its output schema.
         `

diff --git a/core/src/config/workflow.ts b/core/src/config/workflow.ts
@@ -198,8 +198,9 @@ export interface WorkflowStepSpec {
   description?: string
   envVars?: PrimitiveMap
   script?: string
-  skip?: boolean
   when?: workflowStepModifier
+  skip?: boolean
+  continueOnError?: boolean
 }
 
 export const workflowStepSchema = createSchema({
@@ -265,6 +266,7 @@ export const workflowStepSchema = createSchema({
       See the [workflows guide](${DOCS_BASE_URL}/using-garden/workflows#the-skip-and-when-options) for details
       and examples.
       `),
+    continueOnError: joi.boolean().description(`Set to true to continue if the step errors.`).default(false),
   }),
   xor: [["command", "script"]],
 })

diff --git a/core/src/exceptions.ts b/core/src/exceptions.ts
@@ -287,7 +287,7 @@ export class WorkflowScriptError extends GardenError {
   constructor(details: WorkflowScriptErrorDetails) {
     super({
       message: dedent`
-      Script exited with code ${details.exitCode}. This is the output:
+      Script exited with code ${details.exitCode}. This is the stderr output:
 
       ${details.stderr || details.output}`,
     })

diff --git a/core/test/unit/src/commands/workflow.ts b/core/test/unit/src/commands/workflow.ts
@@ -82,6 +82,49 @@ describe("RunWorkflowCommand", () => {
     expect(result.errors || []).to.eql([])
   })
 
+  it("should continue on error if continueOnError = true", async () => {
+    garden.setWorkflowConfigs([
+      {
+        apiVersion: GardenApiVersion.v0,
+        name: "workflow-a",
+        kind: "Workflow",
+        internal: {
+          basePath: garden.projectRoot,
+        },
+        files: [],
+        envVars: {},
+        resources: defaultWorkflowResources,
+        steps: [
+          {
+            script: dedent`
+              echo stdout
+              echo stderr 1>&2
+              exit 1
+            `, // <-- error thrown here
+            continueOnError: true,
+          },
+          { command: ["echo", "success!"] },
+        ],
+      },
+    ])
+
+    const { result, errors } = await cmd.action({ ...defaultParams, args: { workflow: "workflow-a" } })
+
+    expect(result).to.exist
+    expect(errors).to.not.exist
+
+    // step 1 is a script command
+    expect(result?.steps).to.have.property("step-1")
+    expect(result?.steps["step-1"].log).to.equal("stdout\nstderr")
+    expect(result?.steps["step-1"].outputs["stderr"]).to.equal("stderr")
+    expect(result?.steps["step-1"].outputs["stdout"]).to.equal("stdout")
+    expect(result?.steps["step-1"].outputs["exitCode"]).to.equal(1)
+
+    // we should have executed step 2, which is a Garden command, because continueOnError is true in step-1.
+    expect(result?.steps).to.have.property("step-2")
+    expect(result?.steps["step-2"].outputs).to.not.have.property("stderr")
+  })
+
   it("should add workflowStep metadata to log entries provided to steps", async () => {
     const _garden = await makeTestGardenA(undefined)
     // Ensure log entries are empty
@@ -796,6 +839,9 @@ describe("RunWorkflowCommand", () => {
     expect(result).to.exist
     expect(errors).to.not.exist
     expect(result?.steps["step-1"].log).to.equal("stdout\nstderr")
+    expect(result?.steps["step-1"].outputs["stderr"]).to.equal("stderr")
+    expect(result?.steps["step-1"].outputs["stdout"]).to.equal("stdout")
+    expect(result?.steps["step-1"].outputs["exitCode"]).to.equal(0)
   })
 
   it("should throw if a script step fails", async () => {
@@ -845,7 +891,7 @@ describe("RunWorkflowCommand", () => {
     if (!(error instanceof WorkflowScriptError)) {
       expect.fail("Expected error to be a WorkflowScriptError")
     }
-    expect(error.message).to.equal("Script exited with code 1. This is the output:\n\nboo!")
+    expect(error.message).to.equal("Script exited with code 1. This is the stderr output:\n\nboo!")
     expect(error.details.stdout).to.equal("boo!")
   })
 

diff --git a/core/test/unit/src/config/workflow.ts b/core/test/unit/src/config/workflow.ts
@@ -9,7 +9,7 @@
 import { expect } from "chai"
 import type { TestGarden } from "../../../helpers.js"
 import { expectError, getDataDir, makeTestGarden, makeTestGardenA } from "../../../helpers.js"
-import type { WorkflowConfig, TriggerSpec } from "../../../../src/config/workflow.js"
+import type { WorkflowConfig, WorkflowStepSpec, TriggerSpec } from "../../../../src/config/workflow.js"
 import {
   resolveWorkflowConfig,
   populateNamespaceForTriggers,
@@ -39,6 +39,12 @@ describe("resolveWorkflowConfig", () => {
     keepAliveHours: 48,
   }
 
+  const defaultWorkflowStep: WorkflowStepSpec = {
+    skip: false,
+    when: "onSuccess",
+    continueOnError: false,
+  }
+
   before(async () => {
     garden = await makeTestGardenA()
     garden["secrets"] = { foo: "bar", bar: "baz", baz: "banana" }
@@ -55,8 +61,15 @@ describe("resolveWorkflowConfig", () => {
       description: "Sample workflow",
       envVars: {},
       steps: [
-        { description: "Deploy the stack", command: ["deploy"], skip: false, when: "onSuccess", envVars: {} },
-        { command: ["test"], skip: false, when: "onSuccess", envVars: {} },
+        {
+          ...defaultWorkflowStep,
+          description: "Deploy the stack",
+          command: ["deploy"],
+          skip: false,
+          when: "onSuccess",
+          envVars: {},
+        },
+        { ...defaultWorkflowStep, command: ["test"], skip: false, when: "onSuccess", envVars: {} },
       ],
       triggers: [
         {
@@ -85,8 +98,15 @@ describe("resolveWorkflowConfig", () => {
       envVars: {},
       limits: minimumWorkflowLimits, // <----
       steps: [
-        { description: "Deploy the stack", command: ["deploy"], skip: false, when: "onSuccess", envVars: {} },
-        { command: ["test"], skip: false, when: "onSuccess", envVars: {} },
+        {
+          ...defaultWorkflowStep,
+          description: "Deploy the stack",
+          command: ["deploy"],
+          skip: false,
+          when: "onSuccess",
+          envVars: {},
+        },
+        { ...defaultWorkflowStep, command: ["test"], skip: false, when: "onSuccess", envVars: {} },
       ],
       triggers: [
         {
@@ -145,12 +165,13 @@ describe("resolveWorkflowConfig", () => {
       envVars: {},
       steps: [
         {
+          ...defaultWorkflowStep,
           description: "Deploy the stack",
           command: ["deploy", "${var.foo}"],
           skip: false,
           when: "onSuccess",
         },
-        { script: "echo ${var.foo}", skip: false, when: "onSuccess" },
+        { ...defaultWorkflowStep, script: "echo ${var.foo}", skip: false, when: "onSuccess" },
       ],
     }
 
@@ -169,8 +190,14 @@ describe("resolveWorkflowConfig", () => {
 
       envVars: {},
       steps: [
-        { description: "Deploy the stack", command: ["deploy"], skip: false, when: "onSuccess" },
-        { command: ["test"], skip: false, when: "onSuccess" },
+        {
+          ...defaultWorkflowStep,
+          description: "Deploy the stack",
+          command: ["deploy"],
+          skip: false,
+          when: "onSuccess",
+        },
+        { ...defaultWorkflowStep, command: ["test"], skip: false, when: "onSuccess" },
       ],
     }
 
@@ -186,8 +213,14 @@ describe("resolveWorkflowConfig", () => {
 
       envVars: {},
       steps: [
-        { description: "Deploy the stack", command: ["deploy"], skip: false, when: "onSuccess" },
-        { command: ["test"], skip: false, when: "onSuccess" },
+        {
+          ...defaultWorkflowStep,
+          description: "Deploy the stack",
+          command: ["deploy"],
+          skip: false,
+          when: "onSuccess",
+        },
+        { ...defaultWorkflowStep, command: ["test"], skip: false, when: "onSuccess" },
       ],
       triggers: [
         {
@@ -221,8 +254,8 @@ describe("resolveWorkflowConfig", () => {
       ...defaults,
       ...config,
       steps: [
-        { description: "Deploy the stack", command: ["deploy"], skip: false, when: "onSuccess", envVars: {} },
-        { command: ["test"], skip: false, when: "onSuccess", envVars: {} },
+        { ...defaultWorkflowStep, description: "Deploy the stack", command: ["deploy"], envVars: {} },
+        { ...defaultWorkflowStep, command: ["test"], envVars: {} },
       ],
     })
   })