diff --git a/actions/setup/js/check_circuit_breaker.cjs b/actions/setup/js/check_circuit_breaker.cjs new file mode 100644 index 0000000000..92436fd229 --- /dev/null +++ b/actions/setup/js/check_circuit_breaker.cjs @@ -0,0 +1,103 @@ +// @ts-check +/// + +const fs = require("fs"); +const path = require("path"); +const { writeDenialSummary } = require("./pre_activation_summary.cjs"); +const { getErrorMessage } = require("./error_helpers.cjs"); + +/** + * Circuit breaker check for agentic workflows. + * + * Reads the circuit-breaker state from /circuit-breaker-state.json + * (downloaded by the preceding actions/download-artifact step) and implements + * the standard closed → open → half-open state machine pattern: + * + * CLOSED → normal execution (consecutive_failures < max) + * OPEN → execution blocked (consecutive_failures >= max AND cooldown not elapsed) + * HALF-OPEN → one retry allowed (consecutive_failures >= max AND cooldown elapsed) + * + * GH_AW_CB_STATE_DIR overrides the default state directory (/tmp/gh-aw) for tests. + */ +async function main() { + const maxFailures = parseInt(process.env.GH_AW_CB_MAX_FAILURES?.trim() || "5", 10); + const timeWindowMinutes = parseInt(process.env.GH_AW_CB_TIME_WINDOW_MINUTES?.trim() || "1440", 10); + const cooldownMinutes = parseInt(process.env.GH_AW_CB_COOLDOWN_MINUTES?.trim() || "60", 10); + const notify = (process.env.GH_AW_CB_NOTIFY?.trim() || "true") === "true"; + const workflowName = process.env.GH_AW_WORKFLOW_NAME || "Unknown Workflow"; + const stateDir = process.env.GH_AW_CB_STATE_DIR || "/tmp/gh-aw"; + + core.info(`🔌 Circuit breaker check for workflow '${workflowName}'`); + core.info(` Configuration: max=${maxFailures} failures, window=${timeWindowMinutes}m, cooldown=${cooldownMinutes}m`); + + // Read the circuit breaker state downloaded by the preceding download-artifact step. 
+ const stateFile = path.join(stateDir, "circuit-breaker-state.json"); + let state = { consecutive_failures: 0 }; + + try { + if (fs.existsSync(stateFile)) { + const content = fs.readFileSync(stateFile, "utf8"); + state = JSON.parse(content); + core.info(` Loaded state: consecutive_failures=${state.consecutive_failures}`); + } else { + core.info(` No previous state found — starting fresh (circuit CLOSED)`); + } + } catch (error) { + // If we can't load the previous state, assume circuit is closed (fail-open for availability) + core.warning(`Could not read previous circuit breaker state from ${stateFile}: ${getErrorMessage(error)}. Assuming circuit is closed.`); + } + + const consecutiveFailures = state.consecutive_failures ?? 0; + const lastFailure = state.last_failure ? new Date(state.last_failure) : null; + + core.info(` Consecutive failures: ${consecutiveFailures} / ${maxFailures}`); + + const nowMs = Date.now(); + const windowMs = timeWindowMinutes * 60 * 1000; + const failureIsRecent = lastFailure !== null && nowMs - lastFailure.getTime() <= windowMs; + + // CLOSED state: fewer failures than threshold, or failures are outside the time window + if (consecutiveFailures < maxFailures || !failureIsRecent) { + core.info(`✅ Circuit breaker is CLOSED — workflow execution allowed`); + core.setOutput("circuit_breaker_ok", "true"); + core.setOutput("consecutive_failures", String(consecutiveFailures)); + return; + } + + // Circuit is OPEN — check if cooldown has elapsed (HALF-OPEN state) + const cooldownMs = cooldownMinutes * 60 * 1000; + const cooldownElapsed = lastFailure !== null && nowMs - lastFailure.getTime() >= cooldownMs; + + if (cooldownElapsed) { + core.info(`🔄 Circuit breaker is HALF-OPEN — cooldown elapsed, allowing one retry`); + core.setOutput("circuit_breaker_ok", "true"); + core.setOutput("consecutive_failures", String(consecutiveFailures)); + return; + } + + // OPEN state: block execution + const minutesSinceFail = lastFailure ? 
Math.floor((nowMs - lastFailure.getTime()) / 60000) : 0; + const minutesUntilRetry = Math.max(0, cooldownMinutes - minutesSinceFail); + + core.warning(`🔴 Circuit breaker is OPEN — workflow execution blocked`); + core.warning(` ${consecutiveFailures} consecutive failures in the last ${timeWindowMinutes} minutes`); + core.warning(` Retry allowed in approximately ${minutesUntilRetry} minute(s)`); + + core.setOutput("circuit_breaker_ok", "false"); + core.setOutput("consecutive_failures", String(consecutiveFailures)); + + if (notify) { + core.error( + `Circuit breaker OPEN for '${workflowName}': ${consecutiveFailures} consecutive failures detected. ` + + `Workflow execution is blocked until the cooldown period expires (≈${minutesUntilRetry} min remaining). ` + + `Fix the underlying issue and wait for the cooldown, or manually reset by deleting the 'circuit-breaker-state' artifact.` + ); + } + + await writeDenialSummary( + `Circuit breaker OPEN for workflow '${workflowName}': ${consecutiveFailures} consecutive failures detected within the ${timeWindowMinutes}-minute window.`, + `The circuit breaker will allow a retry after the cooldown period (≈${minutesUntilRetry} min remaining). 
Fix the underlying issue and wait, or delete the \`circuit-breaker-state\` artifact to reset manually.` + ); +} + +module.exports = { main }; diff --git a/actions/setup/js/check_circuit_breaker.test.cjs b/actions/setup/js/check_circuit_breaker.test.cjs new file mode 100644 index 0000000000..fc6a013d57 --- /dev/null +++ b/actions/setup/js/check_circuit_breaker.test.cjs @@ -0,0 +1,189 @@ +// @ts-check +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import fs from "fs"; +import os from "os"; +import path from "path"; + +// ---- Globals ---- +const mockCore = { + debug: vi.fn(), + info: vi.fn(), + notice: vi.fn(), + warning: vi.fn(), + error: vi.fn(), + setFailed: vi.fn(), + setOutput: vi.fn(), + exportVariable: vi.fn(), + setSecret: vi.fn(), + getCancelled: vi.fn(), + setCancelled: vi.fn(), + setError: vi.fn(), + getInput: vi.fn(), + getBooleanInput: vi.fn(), + getMultilineInput: vi.fn(), + getState: vi.fn(), + saveState: vi.fn(), + startGroup: vi.fn(), + endGroup: vi.fn(), + group: vi.fn(), + addPath: vi.fn(), + setCommandEcho: vi.fn(), + isDebug: vi.fn().mockReturnValue(false), + getIDToken: vi.fn(), + toPlatformPath: vi.fn(), + toPosixPath: vi.fn(), + toWin32Path: vi.fn(), + summary: { addRaw: vi.fn().mockReturnThis(), write: vi.fn().mockResolvedValue(undefined) }, +}; +global.core = mockCore; +global.github = {}; +global.context = { repo: { owner: "test-owner", repo: "test-repo" }, runId: 123456 }; + +// Helper: relative timestamps +const minutesAgoISO = m => new Date(Date.now() - m * 60 * 1000).toISOString(); + +describe("check_circuit_breaker.cjs", () => { + let tmpDir; + let originalEnv; + + beforeEach(() => { + vi.clearAllMocks(); + // Restore summary stubs after clearAllMocks + mockCore.summary.addRaw.mockReturnThis(); + mockCore.summary.write.mockResolvedValue(undefined); + + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "cb-check-test-")); + + originalEnv = { + GH_AW_CB_MAX_FAILURES: process.env.GH_AW_CB_MAX_FAILURES, + 
GH_AW_CB_TIME_WINDOW_MINUTES: process.env.GH_AW_CB_TIME_WINDOW_MINUTES, + GH_AW_CB_COOLDOWN_MINUTES: process.env.GH_AW_CB_COOLDOWN_MINUTES, + GH_AW_CB_NOTIFY: process.env.GH_AW_CB_NOTIFY, + GH_AW_WORKFLOW_NAME: process.env.GH_AW_WORKFLOW_NAME, + GH_AW_CB_STATE_DIR: process.env.GH_AW_CB_STATE_DIR, + }; + process.env.GH_AW_CB_MAX_FAILURES = "5"; + process.env.GH_AW_CB_TIME_WINDOW_MINUTES = "1440"; + process.env.GH_AW_CB_COOLDOWN_MINUTES = "60"; + process.env.GH_AW_CB_NOTIFY = "true"; + process.env.GH_AW_WORKFLOW_NAME = "Test Workflow"; + process.env.GH_AW_CB_STATE_DIR = tmpDir; + }); + + afterEach(() => { + for (const [key, val] of Object.entries(originalEnv)) { + if (val === undefined) { + delete process.env[key]; + } else { + process.env[key] = val; + } + } + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + /** Write a state file to the temp dir. */ + function writeState(state) { + fs.writeFileSync(path.join(tmpDir, "circuit-breaker-state.json"), JSON.stringify(state), "utf8"); + } + + async function runCheck() { + vi.resetModules(); + const mod = await import("./check_circuit_breaker.cjs"); + await mod.main(); + } + + it("CLOSED — no previous state file: allows execution", async () => { + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + expect(mockCore.setOutput).toHaveBeenCalledWith("consecutive_failures", "0"); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("CLOSED")); + }); + + it("CLOSED — failures below threshold: allows execution", async () => { + writeState({ consecutive_failures: 3, last_failure: minutesAgoISO(10) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("CLOSED")); + }); + + it("CLOSED — failures at threshold but outside time window: allows execution", async () => { + // 2 days ago — outside the 24h (1440 min) window + writeState({ 
consecutive_failures: 5, last_failure: minutesAgoISO(2 * 24 * 60) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("CLOSED")); + }); + + it("OPEN — failures at threshold within window: blocks execution", async () => { + writeState({ consecutive_failures: 5, last_failure: minutesAgoISO(5) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "false"); + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("OPEN")); + }); + + it("OPEN — notify=true posts an error annotation", async () => { + process.env.GH_AW_CB_NOTIFY = "true"; + writeState({ consecutive_failures: 5, last_failure: minutesAgoISO(5) }); + + await runCheck(); + + expect(mockCore.error).toHaveBeenCalledWith(expect.stringContaining("Circuit breaker OPEN")); + }); + + it("OPEN — notify=false skips the error annotation", async () => { + process.env.GH_AW_CB_NOTIFY = "false"; + writeState({ consecutive_failures: 5, last_failure: minutesAgoISO(5) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "false"); + expect(mockCore.error).not.toHaveBeenCalled(); + }); + + it("HALF-OPEN — cooldown elapsed: allows one retry", async () => { + process.env.GH_AW_CB_COOLDOWN_MINUTES = "60"; + // 90 min ago — cooldown (60 min) elapsed, still within 24h window + writeState({ consecutive_failures: 5, last_failure: minutesAgoISO(90) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("HALF-OPEN")); + }); + + it("OPEN — cooldown not yet elapsed: blocks execution", async () => { + process.env.GH_AW_CB_COOLDOWN_MINUTES = "60"; + // 30 min ago — cooldown (60 min) not yet elapsed + writeState({ consecutive_failures: 5, last_failure: minutesAgoISO(30) }); + + await 
runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "false"); + }); + + it("CLOSED — custom lower threshold respected", async () => { + process.env.GH_AW_CB_MAX_FAILURES = "3"; + writeState({ consecutive_failures: 2, last_failure: minutesAgoISO(5) }); + + await runCheck(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + }); + + it("handles corrupt state file gracefully — circuit CLOSED (fail-open)", async () => { + fs.writeFileSync(path.join(tmpDir, "circuit-breaker-state.json"), "NOT VALID JSON", "utf8"); + + await runCheck(); + + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("Could not read")); + // Fail-open: allow execution + expect(mockCore.setOutput).toHaveBeenCalledWith("circuit_breaker_ok", "true"); + }); +}); diff --git a/actions/setup/js/find_circuit_breaker_artifact.cjs b/actions/setup/js/find_circuit_breaker_artifact.cjs new file mode 100644 index 0000000000..910532234d --- /dev/null +++ b/actions/setup/js/find_circuit_breaker_artifact.cjs @@ -0,0 +1,71 @@ +// @ts-check +/// + +const { getErrorMessage } = require("./error_helpers.cjs"); + +/** + * Find the most recent completed workflow run (other than the current one) + * that has a 'circuit-breaker-state' artifact, and output its run ID. + * + * Output: previous_run_id — the run ID string, or empty string if not found. 
+ */ +async function main() { + const { + repo: { owner, repo }, + runId, + } = context; + + core.info(`🔌 Looking for previous circuit-breaker-state artifact`); + + try { + // Get the workflow ID of the current run + const { data: runData } = await github.rest.actions.getWorkflowRun({ + owner, + repo, + run_id: runId, + }); + const workflowId = runData.workflow_id; + core.info(` Workflow ID: ${workflowId}`); + + // List recent completed runs for this workflow (excluding the current one) + const { data: runsData } = await github.rest.actions.listWorkflowRuns({ + owner, + repo, + workflow_id: workflowId, + status: "completed", + per_page: 20, + }); + + core.info(` Found ${runsData.workflow_runs.length} recent completed runs`); + + for (const run of runsData.workflow_runs) { + if (run.id === runId) continue; + + try { + const { data: artifactsData } = await github.rest.actions.listWorkflowRunArtifacts({ + owner, + repo, + run_id: run.id, + }); + + const artifact = artifactsData.artifacts.find(a => a.name === "circuit-breaker-state" && !a.expired); + if (artifact) { + core.info(` Found circuit-breaker-state artifact in run #${run.id}`); + core.setOutput("previous_run_id", String(run.id)); + return; + } + } catch (error) { + core.debug(` Could not list artifacts for run #${run.id}: ${getErrorMessage(error)}`); + continue; + } + } + + core.info(` No previous circuit-breaker-state artifact found`); + core.setOutput("previous_run_id", ""); + } catch (error) { + core.warning(`Could not search for previous circuit breaker state: ${getErrorMessage(error)}`); + core.setOutput("previous_run_id", ""); + } +} + +module.exports = { main }; diff --git a/actions/setup/js/find_circuit_breaker_artifact.test.cjs b/actions/setup/js/find_circuit_breaker_artifact.test.cjs new file mode 100644 index 0000000000..8c00ad20ca --- /dev/null +++ b/actions/setup/js/find_circuit_breaker_artifact.test.cjs @@ -0,0 +1,206 @@ +// @ts-check +import { describe, it, expect, beforeEach, vi } from 
"vitest"; + +const mockCore = { + debug: vi.fn(), + info: vi.fn(), + notice: vi.fn(), + warning: vi.fn(), + error: vi.fn(), + setFailed: vi.fn(), + setOutput: vi.fn(), + exportVariable: vi.fn(), + setSecret: vi.fn(), + setCancelled: vi.fn(), + setError: vi.fn(), + getInput: vi.fn(), + getBooleanInput: vi.fn(), + getMultilineInput: vi.fn(), + getState: vi.fn(), + saveState: vi.fn(), + startGroup: vi.fn(), + endGroup: vi.fn(), + group: vi.fn(), + addPath: vi.fn(), + setCommandEcho: vi.fn(), + isDebug: vi.fn().mockReturnValue(false), + getIDToken: vi.fn(), + toPlatformPath: vi.fn(), + toPosixPath: vi.fn(), + toWin32Path: vi.fn(), + summary: { addRaw: vi.fn().mockReturnThis(), write: vi.fn().mockResolvedValue(undefined) }, +}; + +const mockGithub = { + rest: { + actions: { + getWorkflowRun: vi.fn(), + listWorkflowRuns: vi.fn(), + listWorkflowRunArtifacts: vi.fn(), + }, + }, +}; + +const mockContext = { + repo: { owner: "test-owner", repo: "test-repo" }, + runId: 99999, +}; + +global.core = mockCore; +global.github = mockGithub; +global.context = mockContext; + +describe("find_circuit_breaker_artifact.cjs", () => { + let findArtifact; + + beforeEach(async () => { + vi.clearAllMocks(); + vi.resetModules(); + findArtifact = await import("./find_circuit_breaker_artifact.cjs"); + + // Default: current run is workflow 42 + mockGithub.rest.actions.getWorkflowRun.mockResolvedValue({ + data: { workflow_id: 42 }, + }); + }); + + it("returns empty when no completed runs exist", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { workflow_runs: [] }, + }); + + await findArtifact.main(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", ""); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("No previous circuit-breaker-state artifact found")); + }); + + it("skips the current run ID", async () => { + // The only completed run is the current run itself + 
mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { + workflow_runs: [{ id: 99999 }], + }, + }); + + await findArtifact.main(); + + expect(mockGithub.rest.actions.listWorkflowRunArtifacts).not.toHaveBeenCalled(); + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", ""); + }); + + it("returns the run ID of the most recent run with the artifact", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { + workflow_runs: [{ id: 100 }, { id: 200 }], + }, + }); + + // Run 100 has the artifact + mockGithub.rest.actions.listWorkflowRunArtifacts.mockImplementation(({ run_id }) => { + if (run_id === 100) { + return Promise.resolve({ + data: { + artifacts: [{ name: "circuit-breaker-state", expired: false }], + }, + }); + } + return Promise.resolve({ data: { artifacts: [] } }); + }); + + await findArtifact.main(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", "100"); + }); + + it("skips expired artifacts and continues to next run", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { + workflow_runs: [{ id: 100 }, { id: 200 }], + }, + }); + + // Run 100 has the artifact but it's expired; run 200 has it fresh + mockGithub.rest.actions.listWorkflowRunArtifacts.mockImplementation(({ run_id }) => { + if (run_id === 100) { + return Promise.resolve({ + data: { + artifacts: [{ name: "circuit-breaker-state", expired: true }], + }, + }); + } + if (run_id === 200) { + return Promise.resolve({ + data: { + artifacts: [{ name: "circuit-breaker-state", expired: false }], + }, + }); + } + return Promise.resolve({ data: { artifacts: [] } }); + }); + + await findArtifact.main(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", "200"); + }); + + it("continues past runs where artifact listing fails", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { + workflow_runs: [{ id: 100 }, { id: 200 }], + }, + }); + + // Run 
100 throws an error; run 200 has the artifact + mockGithub.rest.actions.listWorkflowRunArtifacts.mockImplementation(({ run_id }) => { + if (run_id === 100) { + return Promise.reject(new Error("403 Forbidden")); + } + return Promise.resolve({ + data: { + artifacts: [{ name: "circuit-breaker-state", expired: false }], + }, + }); + }); + + await findArtifact.main(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", "200"); + expect(mockCore.debug).toHaveBeenCalledWith(expect.stringContaining("Could not list artifacts")); + }); + + it("returns empty when no run has the named artifact", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockResolvedValue({ + data: { + workflow_runs: [{ id: 100 }, { id: 200 }], + }, + }); + + // Neither run has the circuit-breaker-state artifact + mockGithub.rest.actions.listWorkflowRunArtifacts.mockResolvedValue({ + data: { artifacts: [{ name: "some-other-artifact", expired: false }] }, + }); + + await findArtifact.main(); + + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", ""); + }); + + it("handles getWorkflowRun API failure gracefully", async () => { + mockGithub.rest.actions.getWorkflowRun.mockRejectedValue(new Error("API unavailable")); + + await findArtifact.main(); + + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("Could not search for previous circuit breaker state")); + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", ""); + }); + + it("handles listWorkflowRuns API failure gracefully", async () => { + mockGithub.rest.actions.listWorkflowRuns.mockRejectedValue(new Error("Rate limited")); + + await findArtifact.main(); + + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("Could not search for previous circuit breaker state")); + expect(mockCore.setOutput).toHaveBeenCalledWith("previous_run_id", ""); + }); +}); diff --git a/actions/setup/js/update_circuit_breaker.cjs b/actions/setup/js/update_circuit_breaker.cjs new file 
mode 100644 index 0000000000..fe1206423c --- /dev/null +++ b/actions/setup/js/update_circuit_breaker.cjs @@ -0,0 +1,101 @@ +// @ts-check +/// + +const fs = require("fs"); +const path = require("path"); +const { getErrorMessage } = require("./error_helpers.cjs"); + +/** + * Circuit breaker state update — runs with if: always() after agent execution. + * + * Reads the current job status and updates the circuit breaker state: + * - SUCCESS: reset consecutive_failures to 0 + * - FAILURE/CANCELLED: increment consecutive_failures (but only count failures + * within the configured time window; older failures are discarded) + * + * The updated state is written to /circuit-breaker-state.json, + * which is then uploaded as the 'circuit-breaker-state' artifact. + * + * GH_AW_CB_STATE_DIR overrides the default state directory (/tmp/gh-aw) for tests. + */ +async function main() { + const jobStatus = (process.env.GH_AW_CB_JOB_STATUS || "").toLowerCase(); + const maxFailures = parseInt(process.env.GH_AW_CB_MAX_FAILURES?.trim() || "5", 10); + const timeWindowMinutes = parseInt(process.env.GH_AW_CB_TIME_WINDOW_MINUTES?.trim() || "1440", 10); + const workflowName = process.env.GH_AW_WORKFLOW_NAME || "Unknown Workflow"; + const stateDir = process.env.GH_AW_CB_STATE_DIR || "/tmp/gh-aw"; + + core.info(`🔌 Updating circuit breaker state for workflow '${workflowName}'`); + core.info(` Job status: ${jobStatus}`); + + // Load the previous state from the artifact downloaded in the check step (if available). + // The check step would have written the state to /circuit-breaker-state.json + // if one was found; otherwise we start fresh. 
+ let previousState = { consecutive_failures: 0 }; + + const stateFile = path.join(stateDir, "circuit-breaker-state.json"); + + try { + if (fs.existsSync(stateFile)) { + const content = fs.readFileSync(stateFile, "utf8"); + previousState = JSON.parse(content); + core.info(` Loaded previous state: consecutive_failures=${previousState.consecutive_failures}`); + } + } catch (error) { + core.warning(`Could not load existing circuit breaker state: ${getErrorMessage(error)}. Starting fresh.`); + } + + const nowISO = new Date().toISOString(); + const nowMs = Date.now(); + const windowMs = timeWindowMinutes * 60 * 1000; + + // If the last failure is outside the time window, the accumulated count no longer + // applies and we treat the previous state as if it were a fresh start. + const lastFailureMs = previousState.last_failure ? new Date(previousState.last_failure).getTime() : null; + const previousCountInWindow = lastFailureMs !== null && nowMs - lastFailureMs <= windowMs ? (previousState.consecutive_failures ?? 0) : 0; + + let newState; + + if (jobStatus === "success") { + // Success — reset the circuit breaker + newState = { + consecutive_failures: 0, + last_success: nowISO, + last_failure: previousState.last_failure ?? null, + circuit_opened_at: null, + }; + core.info(`✅ Job succeeded — resetting circuit breaker (was ${previousState.consecutive_failures} failures)`); + } else { + // Failure or cancellation — increment the failure counter (using only in-window count) + const newCount = previousCountInWindow + 1; + // Preserve the original circuit_opened_at timestamp from when the circuit first opened. + // Using ?? ensures we only record the timestamp on the first opening (newCount === maxFailures), + // and keep that value on all subsequent failures without overwriting it. + const circuitOpenedAt = newCount >= maxFailures ? (previousState.circuit_opened_at ?? 
nowISO) : null; + + newState = { + consecutive_failures: newCount, + last_failure: nowISO, + last_success: previousState.last_success ?? null, + circuit_opened_at: circuitOpenedAt, + }; + + core.info(`❌ Job failed — consecutive failures: ${newCount} / ${maxFailures}`); + if (newCount >= maxFailures) { + core.warning(`🔴 Circuit breaker threshold reached (${newCount} consecutive failures). Circuit is now OPEN.`); + } + } + + // Write the updated state to disk so the upload-artifact step can find it + try { + if (!fs.existsSync(stateDir)) { + fs.mkdirSync(stateDir, { recursive: true }); + } + fs.writeFileSync(stateFile, JSON.stringify(newState, null, 2), "utf8"); + core.info(` State written to ${stateFile}`); + } catch (error) { + core.error(`Failed to write circuit breaker state: ${getErrorMessage(error)}`); + } +} + +module.exports = { main }; diff --git a/actions/setup/js/update_circuit_breaker.test.cjs b/actions/setup/js/update_circuit_breaker.test.cjs new file mode 100644 index 0000000000..24ff6f345e --- /dev/null +++ b/actions/setup/js/update_circuit_breaker.test.cjs @@ -0,0 +1,212 @@ +// @ts-check +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import fs from "fs"; +import os from "os"; +import path from "path"; + +// ---- Globals ---- +const mockCore = { + debug: vi.fn(), + info: vi.fn(), + notice: vi.fn(), + warning: vi.fn(), + error: vi.fn(), + setFailed: vi.fn(), + setOutput: vi.fn(), + exportVariable: vi.fn(), + setSecret: vi.fn(), + getCancelled: vi.fn(), + setCancelled: vi.fn(), + setError: vi.fn(), + getInput: vi.fn(), + getBooleanInput: vi.fn(), + getMultilineInput: vi.fn(), + getState: vi.fn(), + saveState: vi.fn(), + startGroup: vi.fn(), + endGroup: vi.fn(), + group: vi.fn(), + addPath: vi.fn(), + setCommandEcho: vi.fn(), + isDebug: vi.fn().mockReturnValue(false), + getIDToken: vi.fn(), + toPlatformPath: vi.fn(), + toPosixPath: vi.fn(), + toWin32Path: vi.fn(), + summary: { addRaw: vi.fn().mockReturnThis(), write: 
vi.fn().mockResolvedValue(undefined) }, +}; +global.core = mockCore; +global.github = {}; +global.context = { repo: { owner: "test-owner", repo: "test-repo" }, runId: 123456 }; + +// Helper: relative timestamps +const minutesAgoISO = m => new Date(Date.now() - m * 60 * 1000).toISOString(); + +describe("update_circuit_breaker.cjs", () => { + let tmpDir; + let stateFile; + let originalEnv; + + beforeEach(() => { + vi.clearAllMocks(); + // Restore summary stubs after clearAllMocks + mockCore.summary.addRaw.mockReturnThis(); + mockCore.summary.write.mockResolvedValue(undefined); + + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "cb-update-test-")); + stateFile = path.join(tmpDir, "circuit-breaker-state.json"); + + originalEnv = { + GH_AW_CB_JOB_STATUS: process.env.GH_AW_CB_JOB_STATUS, + GH_AW_CB_MAX_FAILURES: process.env.GH_AW_CB_MAX_FAILURES, + GH_AW_CB_TIME_WINDOW_MINUTES: process.env.GH_AW_CB_TIME_WINDOW_MINUTES, + GH_AW_WORKFLOW_NAME: process.env.GH_AW_WORKFLOW_NAME, + GH_AW_CB_STATE_DIR: process.env.GH_AW_CB_STATE_DIR, + }; + process.env.GH_AW_CB_MAX_FAILURES = "5"; + process.env.GH_AW_CB_TIME_WINDOW_MINUTES = "1440"; + process.env.GH_AW_WORKFLOW_NAME = "Test Workflow"; + process.env.GH_AW_CB_STATE_DIR = tmpDir; + }); + + afterEach(() => { + for (const [key, val] of Object.entries(originalEnv)) { + if (val === undefined) { + delete process.env[key]; + } else { + process.env[key] = val; + } + } + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + /** Write a previous state JSON to the temp dir. */ + function writePreviousState(state) { + fs.writeFileSync(stateFile, JSON.stringify(state), "utf8"); + } + + /** Read the state that was written by the script. 
*/ + function readWrittenState() { + expect(fs.existsSync(stateFile)).toBe(true); + return JSON.parse(fs.readFileSync(stateFile, "utf8")); + } + + async function runUpdate() { + vi.resetModules(); + const mod = await import("./update_circuit_breaker.cjs"); + await mod.main(); + } + + it("SUCCESS — resets consecutive_failures to 0", async () => { + process.env.GH_AW_CB_JOB_STATUS = "success"; + writePreviousState({ consecutive_failures: 3, last_failure: minutesAgoISO(10) }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(0); + expect(state.last_success).toBeTruthy(); + expect(state.circuit_opened_at).toBeNull(); + expect(mockCore.info).toHaveBeenCalledWith(expect.stringContaining("resetting circuit breaker")); + }); + + it("SUCCESS — preserves last_failure timestamp after reset", async () => { + process.env.GH_AW_CB_JOB_STATUS = "success"; + const pastFailure = minutesAgoISO(60); + writePreviousState({ consecutive_failures: 2, last_failure: pastFailure }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(0); + expect(state.last_failure).toBe(pastFailure); + }); + + it("FAILURE — increments consecutive_failures from 0 (no prior state)", async () => { + process.env.GH_AW_CB_JOB_STATUS = "failure"; + // No previous state file written + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(1); + expect(state.last_failure).toBeTruthy(); + }); + + it("FAILURE — increments consecutive_failures from existing in-window count", async () => { + process.env.GH_AW_CB_JOB_STATUS = "failure"; + writePreviousState({ consecutive_failures: 3, last_failure: minutesAgoISO(5) }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(4); + }); + + it("FAILURE — sets circuit_opened_at when threshold is first reached", async () => { + process.env.GH_AW_CB_JOB_STATUS = "failure"; + 
process.env.GH_AW_CB_MAX_FAILURES = "5"; + // 4 existing failures → this run makes 5, hitting the threshold + writePreviousState({ consecutive_failures: 4, last_failure: minutesAgoISO(5) }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(5); + expect(state.circuit_opened_at).toBeTruthy(); + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("threshold reached")); + }); + + it("FAILURE — preserves original circuit_opened_at on subsequent failures", async () => { + process.env.GH_AW_CB_JOB_STATUS = "failure"; + process.env.GH_AW_CB_MAX_FAILURES = "5"; + const originalOpenedAt = minutesAgoISO(30); + writePreviousState({ + consecutive_failures: 6, + last_failure: minutesAgoISO(5), + circuit_opened_at: originalOpenedAt, + }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(7); + // Original timestamp must be preserved, not overwritten + expect(state.circuit_opened_at).toBe(originalOpenedAt); + }); + + it("FAILURE — resets counter when last_failure is outside time window", async () => { + process.env.GH_AW_CB_JOB_STATUS = "failure"; + process.env.GH_AW_CB_TIME_WINDOW_MINUTES = "60"; // 1-hour window + // Last failure was 2 hours ago — outside window + writePreviousState({ consecutive_failures: 4, last_failure: minutesAgoISO(120) }); + + await runUpdate(); + + // Old out-of-window failures are ignored; counter starts fresh at 1 + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(1); + }); + + it("CANCELLED — treated as failure, increments counter", async () => { + process.env.GH_AW_CB_JOB_STATUS = "cancelled"; + writePreviousState({ consecutive_failures: 2, last_failure: minutesAgoISO(5) }); + + await runUpdate(); + + const state = readWrittenState(); + expect(state.consecutive_failures).toBe(3); + }); + + it("corrupt state file — starts fresh rather than crashing", async () => { + process.env.GH_AW_CB_JOB_STATUS = 
"failure"; + fs.writeFileSync(stateFile, "NOT VALID JSON{{{", "utf8"); + + await runUpdate(); + + expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("Could not load")); + const state = readWrittenState(); + // Starts from 0 + 1 failure = 1 + expect(state.consecutive_failures).toBe(1); + }); +}); diff --git a/docs/adr/28778-circuit-breaker-for-repeatedly-failing-workflows.md b/docs/adr/28778-circuit-breaker-for-repeatedly-failing-workflows.md new file mode 100644 index 0000000000..c6a74b2d20 --- /dev/null +++ b/docs/adr/28778-circuit-breaker-for-repeatedly-failing-workflows.md @@ -0,0 +1,110 @@ +# ADR-28778: Circuit Breaker for Repeatedly Failing Agentic Workflows + +**Date**: 2026-04-27 +**Status**: Draft +**Deciders**: lpcox, copilot-swe-agent + +--- + +## Part 1 — Narrative (Human-Friendly) + +### Context + +Agentic workflows compiled by gh-aw run on GitHub Actions runners on a scheduled or event-driven basis. When a workflow is misconfigured, encounters a broken dependency, or has a systematic runtime error, it will continue triggering on every event and consuming runner resources indefinitely, because GitHub Actions has no native mechanism to suppress a workflow that fails repeatedly. This is a resource-exhaustion risk catalogued as OWASP Agentic Top-10 item ASI-08. The system needed an opt-in guard that workflow authors could add with a single frontmatter field and that did not introduce external infrastructure dependencies beyond what GitHub Actions already provides. + +### Decision + +We will implement the classic closed → open → half-open circuit breaker state machine as an opt-in frontmatter feature (`circuit-breaker: true` or a full object form). State is persisted across runs exclusively via GitHub Actions artifacts: a pre-activation job reads the previous state artifact from the most recent completed run, evaluates the state machine, and gates the `activated` output. 
After agent execution, two `if: always()` steps update the state and re-upload the artifact. This approach requires no infrastructure beyond a standard GitHub Actions token with `actions: read` permission. + +### Alternatives Considered + +#### Alternative 1: External State Store (Redis / Database) + +A persistent external datastore (Redis, PostgreSQL, or similar) could track failure counts reliably across runs without artifact expiry or manual deletion risks. This was rejected because it introduces an infrastructure dependency that does not exist in the current gh-aw deployment model and would require every adopting repository to provision and credential-manage an external service, dramatically increasing the adoption barrier for what is an optional resilience feature. + +#### Alternative 2: GitHub Repository/Environment Variables via the API + +Failure counts could be written back to a GitHub Actions environment variable or repository variable via the REST API on each run. This avoids binary artifact management. It was rejected because updating repository or environment variables requires elevated `repo` or `admin` scopes that violate the principle of least privilege; environment variables also lack the structured JSON schema needed to store timestamps and state transitions cleanly. + +#### Alternative 3: No Built-in Circuit Breaker (External Monitoring) + +Teams could configure external alerting (Datadog, PagerDuty, etc.) to detect repeated failures and disable workflows manually. This was rejected because it places the burden on each team to set up monitoring, does not enforce a cooldown programmatically, and provides no self-healing (half-open) behaviour that allows automatic recovery once the root cause is fixed. + +### Consequences + +#### Positive +- Prevents resource exhaustion from runaway failing workflows with zero external infrastructure. 
+- Backward-compatible opt-in design: existing workflows are unaffected unless `circuit-breaker` frontmatter is added. +- Follows standard resilience engineering pattern (closed/open/half-open) that operators already understand. +- Self-healing: the half-open state allows automatic recovery after the cooldown period without manual intervention. +- Configurable thresholds (`max-consecutive-failures`, `time-window`, `cooldown`) let teams tune behaviour for their failure tolerance. + +#### Negative +- State is stored in GitHub Actions artifacts, which are subject to repository retention policies and can be manually deleted, inadvertently resetting the circuit. +- Artifact I/O (list runs, list artifacts, download, upload) adds latency and API calls to every workflow execution that enables the feature. +- The `actions: read` permission is added automatically to the pre-activation job; teams with strict minimal-permission policies must be aware of this implicit grant. +- The feature relies on GitHub's artifact API being available; artifact service degradation could silently prevent state updates, leaving the breaker CLOSED and allowing execution to continue despite repeated failures (fail-open in the availability sense). + +#### Neutral +- The circuit breaker state is identified by workflow ID, so renaming a workflow file effectively resets the breaker. +- State inspection currently requires downloading the `circuit-breaker-state` artifact manually; no UI or CLI command is provided to read current state. +- Manual reset is performed by deleting the `circuit-breaker-state` artifact from the most recent run. + +--- + +## Part 2 — Normative Specification (RFC 2119) + +> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119). + +### Feature Activation + +1. 
The circuit breaker feature **MUST** be opt-in: workflows that do not declare `circuit-breaker` frontmatter **MUST NOT** have any circuit breaker steps injected into their compiled output. +2. A workflow **MAY** enable the circuit breaker with `circuit-breaker: true` (boolean shorthand using all defaults) or with a `circuit-breaker` object specifying explicit configuration fields. +3. A workflow **MAY** also enable the circuit breaker via `features.circuit-breaker: true`; this form **MUST** behave identically to `circuit-breaker: true`. +4. Setting `circuit-breaker: false` **MUST** be treated as explicitly disabled and **MUST NOT** inject any steps. + +### State Machine + +1. Implementations **MUST** implement the three-state closed → open → half-open circuit breaker state machine. +2. In the **CLOSED** state, implementations **MUST** allow workflow execution to proceed (output `circuit_breaker_ok=true`). +3. A circuit **MUST** transition from CLOSED to OPEN when `consecutive_failures` reaches or exceeds `max-consecutive-failures` within the configured `time-window`. +4. In the **OPEN** state, implementations **MUST** block workflow execution (output `circuit_breaker_ok=false`) until the `cooldown` period has elapsed since the last recorded failure. +5. When the `cooldown` period has elapsed, the circuit **MUST** transition to the **HALF-OPEN** state and allow exactly one probe execution (output `circuit_breaker_ok=true`). +6. A successful probe in HALF-OPEN **MUST** reset `consecutive_failures` to zero and return the circuit to CLOSED. +7. A failed probe in HALF-OPEN **MUST** increment `consecutive_failures` and return the circuit to OPEN with an updated `last_failure` timestamp. +8. If no previous state artifact is found, implementations **MUST** treat the circuit as CLOSED (fail-open for availability). + +### State Persistence + +1. 
Circuit breaker state **MUST** be persisted as a JSON artifact named `circuit-breaker-state` uploaded to GitHub Actions after every execution, regardless of job outcome. +2. The update step **MUST** use `if: always()` to ensure state is written even when the agent job fails or is cancelled. +3. The state JSON **MUST** include at minimum: `consecutive_failures` (integer), `last_failure` (ISO 8601 timestamp or null), and `circuit_opened_at` (ISO 8601 timestamp or null). +4. Implementations **MUST** use `overwrite: true` when uploading the artifact so that only the latest state is retained per run. +5. Implementations **SHOULD** use `if-no-files-found: ignore` on the upload step to tolerate cases where the state file could not be written. + +### Configuration Defaults + +1. When not explicitly specified, `max-consecutive-failures` **MUST** default to `5`. +2. When not explicitly specified, `time-window` **MUST** default to `24h`. +3. When not explicitly specified, `cooldown` **MUST** default to `1h`. +4. When not explicitly specified, `notify` **MUST** default to `true`. +5. Duration values for `time-window` and `cooldown` **MUST** be parseable by Go's `time.ParseDuration`. Implementations **MUST NOT** accept arbitrary string formats that cannot be parsed by this function. + +### Permissions + +1. When the circuit breaker is enabled, the pre-activation job **MUST** be granted `actions: read` permission to allow listing workflow runs and downloading artifacts. +2. Implementations **MUST NOT** require permissions beyond `actions: read` and the standard `GITHUB_TOKEN` for circuit breaker operation. + +### Notifications + +1. When `notify: true` (the default), implementations **MUST** emit a workflow error annotation via `core.error()` when the circuit is OPEN and blocking execution. +2. When `notify: false`, implementations **MUST NOT** emit error annotations for circuit breaker state changes. +3. 
Implementations **SHOULD** emit a GitHub Actions step summary when the circuit is OPEN, describing the failure count, time window, and estimated retry time. + +### Conformance + +An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance. Specifically: (a) the circuit breaker **MUST** be opt-in, (b) the three-state machine transitions **MUST** be correctly implemented, (c) state **MUST** be persisted via artifact after every execution, and (d) the pre-activation job **MUST** gate on `circuit_breaker_ok=true` when the feature is enabled. + +--- + +*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/25013046554) workflow. The PR author must review, complete, and finalize this document before the PR can merge.* diff --git a/pkg/constants/job_constants.go b/pkg/constants/job_constants.go index bcf3944332..df306f585b 100644 --- a/pkg/constants/job_constants.go +++ b/pkg/constants/job_constants.go @@ -185,6 +185,19 @@ const CheckSkipRolesStepID StepID = "check_skip_roles" const CheckSkipBotsStepID StepID = "check_skip_bots" const CheckSkipIfCheckFailingStepID StepID = "check_skip_if_check_failing" +// CheckCircuitBreakerStepID is the step ID for the circuit breaker check in the pre-activation job. +const CheckCircuitBreakerStepID StepID = "check_circuit_breaker" + +// FindCircuitBreakerArtifactStepID is the step ID for the step that finds the previous +// circuit-breaker-state artifact run ID in the pre-activation job. +const FindCircuitBreakerArtifactStepID StepID = "find_circuit_breaker_artifact" + +// UpdateCircuitBreakerStepID is the step ID for the circuit breaker state update in the agent job. +const UpdateCircuitBreakerStepID StepID = "update_circuit_breaker" + +// CircuitBreakerArtifactName is the artifact name used to persist the circuit breaker state. 
+const CircuitBreakerArtifactName = "circuit-breaker-state" + // PreActivationAppTokenStepID is the step ID for the unified GitHub App token mint step // emitted in the pre-activation job when on.github-app is configured alongside skip-if checks. const PreActivationAppTokenStepID StepID = "pre-activation-app-token" @@ -206,6 +219,7 @@ const SkipRolesOkOutput = "skip_roles_ok" const SkipBotsOkOutput = "skip_bots_ok" const SkipIfCheckFailingOkOutput = "skip_if_check_failing_ok" const ActivatedOutput = "activated" +const CircuitBreakerOkOutput = "circuit_breaker_ok" // Rate limit defaults const DefaultRateLimitMax = 5 // Default maximum runs per time window diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json index 060101e3e1..09e454b322 100644 --- a/pkg/parser/schemas/main_workflow_schema.json +++ b/pkg/parser/schemas/main_workflow_schema.json @@ -8788,6 +8788,59 @@ } ] }, + "circuit-breaker": { + "description": "Circuit breaker configuration to prevent runaway execution of a consistently failing workflow. Opens after N consecutive failures and blocks execution until the cooldown period expires. Follows the standard closed → open → half-open state machine pattern.", + "oneOf": [ + { + "type": "boolean", + "description": "Set to true to enable the circuit breaker with default settings (5 failures, 24h window, 1h cooldown). Set to false to explicitly disable it." + }, + { + "type": "object", + "description": "Circuit breaker object configuration with explicit thresholds.", + "properties": { + "max-consecutive-failures": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 5, + "description": "Number of consecutive failures before the circuit opens. Defaults to 5." + }, + "time-window": { + "type": "string", + "default": "24h", + "description": "Duration window within which consecutive failures are counted (e.g. '24h', '1h', '30m'). 
Defaults to '24h'.", + "examples": ["24h", "6h", "1h", "30m"] + }, + "cooldown": { + "type": "string", + "default": "1h", + "description": "Duration to wait before allowing a retry (half-open probe) after the circuit opens (e.g. '1h', '30m'). Defaults to '1h'.", + "examples": ["1h", "30m", "15m"] + }, + "notify": { + "type": "boolean", + "default": true, + "description": "Post a workflow annotation when the circuit opens or closes. Defaults to true." + } + }, + "additionalProperties": false, + "examples": [ + { + "max-consecutive-failures": 5, + "time-window": "24h", + "cooldown": "1h" + }, + { + "max-consecutive-failures": 3, + "time-window": "6h", + "cooldown": "30m", + "notify": true + } + ] + } + ] + }, "strict": { "type": "boolean", "default": true, diff --git a/pkg/workflow/circuit_breaker.go b/pkg/workflow/circuit_breaker.go new file mode 100644 index 0000000000..9c17585bef --- /dev/null +++ b/pkg/workflow/circuit_breaker.go @@ -0,0 +1,234 @@ +package workflow + +import ( + "fmt" + "strings" + "time" + + "github.com/github/gh-aw/pkg/constants" + "github.com/github/gh-aw/pkg/logger" +) + +var circuitBreakerLog = logger.New("workflow:circuit_breaker") + +// defaultCircuitBreakerMaxConsecutiveFailures is the default number of consecutive failures +// before the circuit breaker opens. +const defaultCircuitBreakerMaxConsecutiveFailures = 5 + +// defaultCircuitBreakerTimeWindow is the default time window for counting failures. +const defaultCircuitBreakerTimeWindow = "24h" + +// defaultCircuitBreakerCooldown is the default cooldown period after the circuit opens. +const defaultCircuitBreakerCooldown = "1h" + +// extractCircuitBreakerConfig extracts the 'circuit-breaker' field from frontmatter. +// It also handles the feature flag form: features.circuit-breaker: true. 
+func (c *Compiler) extractCircuitBreakerConfig(frontmatter map[string]any) *CircuitBreakerConfig { + // Check for explicit circuit-breaker configuration + if cbValue, exists := frontmatter["circuit-breaker"]; exists && cbValue != nil { + switch v := cbValue.(type) { + case map[string]any: + config := &CircuitBreakerConfig{} + + // Extract max-consecutive-failures (default: 5) + if maxValue, ok := v["max-consecutive-failures"]; ok { + switch max := maxValue.(type) { + case int: + config.MaxConsecutiveFailures = max + case int64: + config.MaxConsecutiveFailures = int(max) + case uint64: + config.MaxConsecutiveFailures = int(max) + case float64: + config.MaxConsecutiveFailures = int(max) + } + } + + // Extract time-window (default: "24h") + if windowValue, ok := v["time-window"]; ok { + if str, ok := windowValue.(string); ok { + config.TimeWindow = str + } + } + + // Extract cooldown (default: "1h") + if cooldownValue, ok := v["cooldown"]; ok { + if str, ok := cooldownValue.(string); ok { + config.Cooldown = str + } + } + + // Extract notify (default: true) + if notifyValue, ok := v["notify"]; ok { + if b, ok := notifyValue.(bool); ok { + config.Notify = &b + } + } + + applyCircuitBreakerDefaults(config) + circuitBreakerLog.Printf("Extracted circuit-breaker config: max=%d, window=%s, cooldown=%s", + config.MaxConsecutiveFailures, config.TimeWindow, config.Cooldown) + return config + + case bool: + if v { + // circuit-breaker: true → use all defaults + config := &CircuitBreakerConfig{} + applyCircuitBreakerDefaults(config) + circuitBreakerLog.Print("Circuit-breaker enabled via boolean flag (using defaults)") + return config + } + // circuit-breaker: false → explicitly disabled + circuitBreakerLog.Print("Circuit-breaker explicitly disabled via boolean false") + return nil + } + } + + // Check the feature flag: features.circuit-breaker: true + if featuresValue, exists := frontmatter["features"]; exists && featuresValue != nil { + if features, ok := 
featuresValue.(map[string]any); ok { + if cbFeature, exists := features["circuit-breaker"]; exists { + if b, ok := cbFeature.(bool); ok && b { + config := &CircuitBreakerConfig{} + applyCircuitBreakerDefaults(config) + circuitBreakerLog.Print("Circuit-breaker enabled via features.circuit-breaker: true (using defaults)") + return config + } + } + } + } + + circuitBreakerLog.Print("No circuit-breaker configuration specified") + return nil +} + +// applyCircuitBreakerDefaults fills in default values for any unset circuit-breaker config fields. +func applyCircuitBreakerDefaults(config *CircuitBreakerConfig) { + if config.MaxConsecutiveFailures <= 0 { + config.MaxConsecutiveFailures = defaultCircuitBreakerMaxConsecutiveFailures + } + if config.TimeWindow == "" { + config.TimeWindow = defaultCircuitBreakerTimeWindow + } + if config.Cooldown == "" { + config.Cooldown = defaultCircuitBreakerCooldown + } + if config.Notify == nil { + t := true + config.Notify = &t + } +} + +// circuitBreakerDurationToMinutes parses a duration string (e.g. "24h", "30m") and returns +// the equivalent number of minutes as an integer. Sub-minute durations are rounded up to 1 minute +// to prevent a 0-minute window/cooldown from breaking the check logic. +func circuitBreakerDurationToMinutes(d string) (int, error) { + dur, err := time.ParseDuration(d) + if err != nil { + return 0, fmt.Errorf("invalid circuit-breaker duration %q: %w", d, err) + } + // Round up to 1 minute to avoid a 0-minute window that would disable the check. + return max(int(dur.Minutes()), 1), nil +} + +// generateCircuitBreakerCheckSteps generates the pre-activation steps that check whether +// the circuit breaker is open. Three steps are generated: +// 1. Find the previous run with a circuit-breaker-state artifact (GitHub Script). +// 2. Download the artifact if found (actions/download-artifact@v4). +// 3. Read the JSON file and evaluate the circuit state, outputting circuit_breaker_ok. 
+func (c *Compiler) generateCircuitBreakerCheckSteps(data *WorkflowData, steps []string) []string { + cfg := data.CircuitBreaker + if cfg == nil { + return steps + } + + timeWindowMinutes, err := circuitBreakerDurationToMinutes(cfg.TimeWindow) + if err != nil { + circuitBreakerLog.Printf("Warning: could not parse circuit-breaker time-window %q, using default 1440 minutes: %v", cfg.TimeWindow, err) + timeWindowMinutes = 1440 + } + cooldownMinutes, err := circuitBreakerDurationToMinutes(cfg.Cooldown) + if err != nil { + circuitBreakerLog.Printf("Warning: could not parse circuit-breaker cooldown %q, using default 60 minutes: %v", cfg.Cooldown, err) + cooldownMinutes = 60 + } + + notify := "true" + if cfg.Notify != nil && !*cfg.Notify { + notify = "false" + } + + // Step 1: Find the previous run with the circuit-breaker-state artifact. + steps = append(steps, " - name: Find previous circuit breaker state\n") + steps = append(steps, fmt.Sprintf(" id: %s\n", constants.FindCircuitBreakerArtifactStepID)) + steps = append(steps, fmt.Sprintf(" uses: %s\n", getCachedActionPin("actions/github-script", data))) + steps = append(steps, " with:\n") + steps = append(steps, " script: |\n") + steps = append(steps, generateGitHubScriptWithRequire("find_circuit_breaker_artifact.cjs")) + + // Step 2: Download the artifact (actions/download-artifact handles ZIP extraction natively). 
+ steps = append(steps, " - name: Download previous circuit breaker state\n") + steps = append(steps, fmt.Sprintf(" if: steps.%s.outputs.previous_run_id != ''\n", constants.FindCircuitBreakerArtifactStepID)) + steps = append(steps, fmt.Sprintf(" uses: %s\n", getActionPin("actions/download-artifact"))) + steps = append(steps, " with:\n") + steps = append(steps, fmt.Sprintf(" name: %s\n", constants.CircuitBreakerArtifactName)) + steps = append(steps, fmt.Sprintf(" run-id: ${{ steps.%s.outputs.previous_run_id }}\n", constants.FindCircuitBreakerArtifactStepID)) + steps = append(steps, " path: /tmp/gh-aw\n") + steps = append(steps, " github-token: ${{ secrets.GITHUB_TOKEN }}\n") + + // Step 3: Read the JSON and evaluate the circuit breaker state. + steps = append(steps, " - name: Check circuit breaker\n") + steps = append(steps, fmt.Sprintf(" id: %s\n", constants.CheckCircuitBreakerStepID)) + steps = append(steps, fmt.Sprintf(" uses: %s\n", getCachedActionPin("actions/github-script", data))) + steps = append(steps, " env:\n") + steps = append(steps, fmt.Sprintf(" GH_AW_CB_MAX_FAILURES: \"%d\"\n", cfg.MaxConsecutiveFailures)) + steps = append(steps, fmt.Sprintf(" GH_AW_CB_TIME_WINDOW_MINUTES: \"%d\"\n", timeWindowMinutes)) + steps = append(steps, fmt.Sprintf(" GH_AW_CB_COOLDOWN_MINUTES: \"%d\"\n", cooldownMinutes)) + steps = append(steps, fmt.Sprintf(" GH_AW_CB_NOTIFY: \"%s\"\n", notify)) + steps = append(steps, fmt.Sprintf(" GH_AW_WORKFLOW_NAME: %q\n", data.Name)) + steps = append(steps, " with:\n") + steps = append(steps, " script: |\n") + steps = append(steps, generateGitHubScriptWithRequire("check_circuit_breaker.cjs")) + + circuitBreakerLog.Printf("Added circuit breaker check steps: max=%d, window=%dm, cooldown=%dm", + cfg.MaxConsecutiveFailures, timeWindowMinutes, cooldownMinutes) + return steps +} + +// generateCircuitBreakerUpdateSteps generates the post-execution steps that update the circuit +// breaker state artifact. 
These steps use if: always() so they run regardless of job outcome. +func (c *Compiler) generateCircuitBreakerUpdateSteps(yaml *strings.Builder, data *WorkflowData) { + if data.CircuitBreaker == nil { + return + } + + timeWindowMinutes, err := circuitBreakerDurationToMinutes(data.CircuitBreaker.TimeWindow) + if err != nil { + circuitBreakerLog.Printf("Warning: could not parse circuit-breaker time-window %q for update step, using default 1440 minutes: %v", data.CircuitBreaker.TimeWindow, err) + timeWindowMinutes = 1440 + } + + circuitBreakerLog.Print("Adding circuit breaker state update steps to agent job") + + yaml.WriteString(" - name: Update circuit breaker state\n") + yaml.WriteString(" if: always()\n") + fmt.Fprintf(yaml, " id: %s\n", constants.UpdateCircuitBreakerStepID) + fmt.Fprintf(yaml, " uses: %s\n", getCachedActionPin("actions/github-script", data)) + yaml.WriteString(" env:\n") + yaml.WriteString(" GH_AW_CB_JOB_STATUS: ${{ job.status }}\n") + fmt.Fprintf(yaml, " GH_AW_CB_MAX_FAILURES: \"%d\"\n", data.CircuitBreaker.MaxConsecutiveFailures) + fmt.Fprintf(yaml, " GH_AW_CB_TIME_WINDOW_MINUTES: \"%d\"\n", timeWindowMinutes) + fmt.Fprintf(yaml, " GH_AW_WORKFLOW_NAME: %q\n", data.Name) + yaml.WriteString(" with:\n") + yaml.WriteString(" script: |\n") + yaml.WriteString(generateGitHubScriptWithRequire("update_circuit_breaker.cjs")) + + yaml.WriteString(" - name: Upload circuit breaker state\n") + yaml.WriteString(" if: always()\n") + fmt.Fprintf(yaml, " uses: %s\n", getActionPin("actions/upload-artifact")) + yaml.WriteString(" with:\n") + fmt.Fprintf(yaml, " name: %s\n", constants.CircuitBreakerArtifactName) + yaml.WriteString(" path: /tmp/gh-aw/circuit-breaker-state.json\n") + yaml.WriteString(" if-no-files-found: ignore\n") + yaml.WriteString(" overwrite: true\n") +} diff --git a/pkg/workflow/circuit_breaker_test.go b/pkg/workflow/circuit_breaker_test.go new file mode 100644 index 0000000000..5208e576ef --- /dev/null +++ b/pkg/workflow/circuit_breaker_test.go 
@@ -0,0 +1,345 @@ +//go:build !integration + +package workflow + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestExtractCircuitBreakerConfig tests that circuit-breaker frontmatter is correctly parsed. +func TestExtractCircuitBreakerConfig(t *testing.T) { + compiler := &Compiler{} + + tests := []struct { + name string + frontmatter map[string]any + expectedConfig *CircuitBreakerConfig + }{ + { + name: "no circuit-breaker key", + frontmatter: map[string]any{}, + expectedConfig: nil, + }, + { + name: "circuit-breaker: false (explicit disable)", + frontmatter: map[string]any{"circuit-breaker": false}, + expectedConfig: nil, + }, + { + name: "circuit-breaker: true (boolean enable, use defaults)", + frontmatter: map[string]any{"circuit-breaker": true}, + expectedConfig: func() *CircuitBreakerConfig { + t := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &t, + } + }(), + }, + { + name: "circuit-breaker: object with all fields", + frontmatter: map[string]any{ + "circuit-breaker": map[string]any{ + "max-consecutive-failures": 3, + "time-window": "6h", + "cooldown": "30m", + "notify": false, + }, + }, + expectedConfig: func() *CircuitBreakerConfig { + f := false + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 3, + TimeWindow: "6h", + Cooldown: "30m", + Notify: &f, + } + }(), + }, + { + name: "circuit-breaker: object with defaults applied", + frontmatter: map[string]any{ + "circuit-breaker": map[string]any{ + "max-consecutive-failures": 10, + }, + }, + expectedConfig: func() *CircuitBreakerConfig { + tr := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 10, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + } + }(), + }, + { + name: "circuit-breaker enabled via features flag", + frontmatter: map[string]any{ + "features": map[string]any{ + "circuit-breaker": true, + }, + }, + expectedConfig: func() 
*CircuitBreakerConfig { + tr := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + } + }(), + }, + { + name: "circuit-breaker NOT enabled via features flag (false)", + frontmatter: map[string]any{ + "features": map[string]any{ + "circuit-breaker": false, + }, + }, + expectedConfig: nil, + }, + { + name: "max-consecutive-failures as float64 (YAML parser produces float64 for numbers)", + frontmatter: map[string]any{ + "circuit-breaker": map[string]any{ + "max-consecutive-failures": float64(7), + }, + }, + expectedConfig: func() *CircuitBreakerConfig { + tr := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 7, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + } + }(), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := compiler.extractCircuitBreakerConfig(tt.frontmatter) + if tt.expectedConfig == nil { + assert.Nil(t, got, "expected nil CircuitBreakerConfig") + return + } + require.NotNil(t, got, "expected non-nil CircuitBreakerConfig") + assert.Equal(t, tt.expectedConfig.MaxConsecutiveFailures, got.MaxConsecutiveFailures, + "MaxConsecutiveFailures should match") + assert.Equal(t, tt.expectedConfig.TimeWindow, got.TimeWindow, + "TimeWindow should match") + assert.Equal(t, tt.expectedConfig.Cooldown, got.Cooldown, + "Cooldown should match") + require.NotNil(t, got.Notify, "Notify should not be nil") + assert.Equal(t, *tt.expectedConfig.Notify, *got.Notify, + "Notify should match") + }) + } +} + +// TestApplyCircuitBreakerDefaults tests that defaults are applied correctly. 
+func TestApplyCircuitBreakerDefaults(t *testing.T) { + tests := []struct { + name string + input *CircuitBreakerConfig + expected *CircuitBreakerConfig + }{ + { + name: "empty config gets all defaults", + input: &CircuitBreakerConfig{}, + expected: func() *CircuitBreakerConfig { + tr := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + } + }(), + }, + { + name: "existing values are preserved", + input: func() *CircuitBreakerConfig { + f := false + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 3, + TimeWindow: "6h", + Cooldown: "30m", + Notify: &f, + } + }(), + expected: func() *CircuitBreakerConfig { + f := false + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 3, + TimeWindow: "6h", + Cooldown: "30m", + Notify: &f, + } + }(), + }, + { + name: "zero max-consecutive-failures gets default", + input: &CircuitBreakerConfig{ + MaxConsecutiveFailures: 0, + TimeWindow: "6h", + }, + expected: func() *CircuitBreakerConfig { + tr := true + return &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "6h", + Cooldown: "1h", + Notify: &tr, + } + }(), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + applyCircuitBreakerDefaults(tt.input) + assert.Equal(t, tt.expected.MaxConsecutiveFailures, tt.input.MaxConsecutiveFailures, + "MaxConsecutiveFailures should match") + assert.Equal(t, tt.expected.TimeWindow, tt.input.TimeWindow, + "TimeWindow should match") + assert.Equal(t, tt.expected.Cooldown, tt.input.Cooldown, + "Cooldown should match") + require.NotNil(t, tt.input.Notify, "Notify should not be nil after defaults") + assert.Equal(t, *tt.expected.Notify, *tt.input.Notify, + "Notify should match") + }) + } +} + +// TestCircuitBreakerDurationToMinutes tests duration string parsing. 
+func TestCircuitBreakerDurationToMinutes(t *testing.T) { + tests := []struct { + name string + input string + expected int + expectError bool + }{ + {name: "1 hour", input: "1h", expected: 60}, + {name: "24 hours", input: "24h", expected: 1440}, + {name: "30 minutes", input: "30m", expected: 30}, + {name: "90 minutes", input: "1h30m", expected: 90}, + {name: "sub-minute (30s) rounds up to 1", input: "30s", expected: 1}, + {name: "zero seconds rounds up to 1", input: "0s", expected: 1}, + {name: "invalid duration", input: "invalid", expectError: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := circuitBreakerDurationToMinutes(tt.input) + if tt.expectError { + assert.Error(t, err, "should return error for invalid duration") + return + } + require.NoError(t, err, "should not return error for valid duration") + assert.Equal(t, tt.expected, got, "duration in minutes should match") + }) + } +} + +// TestGenerateCircuitBreakerCheckSteps tests that the generated YAML steps are well-formed. 
+func TestGenerateCircuitBreakerCheckSteps(t *testing.T) { + tr := true + data := &WorkflowData{ + Name: "My Workflow", + CircuitBreaker: &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + }, + } + + compiler := &Compiler{} + var steps []string + steps = compiler.generateCircuitBreakerCheckSteps(data, steps) + + require.NotEmpty(t, steps, "should generate steps") + + combined := strings.Join(steps, "") + + // Step 1: find artifact + assert.Contains(t, combined, "find_circuit_breaker_artifact", "find-artifact step ID should be present") + assert.Contains(t, combined, "find_circuit_breaker_artifact.cjs", "find-artifact script should be referenced") + + // Step 2: download artifact + assert.Contains(t, combined, "Download previous circuit breaker state", "download step name should be present") + assert.Contains(t, combined, "circuit-breaker-state", "artifact name should be present") + assert.Contains(t, combined, "previous_run_id", "run-id reference should be present") + + // Step 3: evaluate state + assert.Contains(t, combined, "check_circuit_breaker", "check step ID should be present") + assert.Contains(t, combined, "GH_AW_CB_MAX_FAILURES", "max failures env var should be present") + assert.Contains(t, combined, "GH_AW_CB_TIME_WINDOW_MINUTES", "time window env var should be present") + assert.Contains(t, combined, "GH_AW_CB_COOLDOWN_MINUTES", "cooldown env var should be present") + assert.Contains(t, combined, "GH_AW_CB_NOTIFY", "notify env var should be present") + assert.Contains(t, combined, "check_circuit_breaker.cjs", "check script should be referenced") +} + +// TestGenerateCircuitBreakerCheckSteps_NilConfig ensures no steps are generated when disabled. 
+func TestGenerateCircuitBreakerCheckSteps_NilConfig(t *testing.T) { + data := &WorkflowData{ + Name: "My Workflow", + CircuitBreaker: nil, + } + + compiler := &Compiler{} + var steps []string + steps = compiler.generateCircuitBreakerCheckSteps(data, steps) + + assert.Empty(t, steps, "should generate no steps when circuit breaker is disabled") +} + +// TestGenerateCircuitBreakerUpdateSteps tests that update steps are generated correctly. +func TestGenerateCircuitBreakerUpdateSteps(t *testing.T) { + tr := true + data := &WorkflowData{ + Name: "My Workflow", + CircuitBreaker: &CircuitBreakerConfig{ + MaxConsecutiveFailures: 5, + TimeWindow: "24h", + Cooldown: "1h", + Notify: &tr, + }, + } + + compiler := &Compiler{} + var yaml strings.Builder + compiler.generateCircuitBreakerUpdateSteps(&yaml, data) + + output := yaml.String() + assert.Contains(t, output, "Update circuit breaker state", "update step name should be present") + assert.Contains(t, output, "if: always()", "update step should run always") + assert.Contains(t, output, "update_circuit_breaker.cjs", "update script should be referenced") + assert.Contains(t, output, "Upload circuit breaker state", "upload step should be present") + assert.Contains(t, output, "circuit-breaker-state", "artifact name should be present") + assert.Contains(t, output, "GH_AW_CB_JOB_STATUS", "job status env var should be present") + assert.Contains(t, output, "GH_AW_CB_TIME_WINDOW_MINUTES", "time window env var should be passed to update step") +} + +// TestGenerateCircuitBreakerUpdateSteps_NilConfig ensures no steps are generated when disabled. 
+func TestGenerateCircuitBreakerUpdateSteps_NilConfig(t *testing.T) { + data := &WorkflowData{ + Name: "My Workflow", + CircuitBreaker: nil, + } + + compiler := &Compiler{} + var yaml strings.Builder + compiler.generateCircuitBreakerUpdateSteps(&yaml, data) + + assert.Empty(t, yaml.String(), "should generate no steps when circuit breaker is disabled") +} diff --git a/pkg/workflow/compiler_jobs.go b/pkg/workflow/compiler_jobs.go index 8c3e50d998..7477a6cdae 100644 --- a/pkg/workflow/compiler_jobs.go +++ b/pkg/workflow/compiler_jobs.go @@ -266,12 +266,16 @@ func (c *Compiler) buildPreActivationAndActivationJobs(data *WorkflowData, front hasSkipBots := len(data.SkipBots) > 0 hasCommandTrigger := len(data.Command) > 0 hasRateLimit := data.RateLimit != nil + hasCircuitBreaker := data.CircuitBreaker != nil hasOnSteps := len(data.OnSteps) > 0 hasOnNeeds := len(data.OnNeeds) > 0 - compilerJobsLog.Printf("Job configuration: needsPermissionCheck=%v, hasStopTime=%v, hasSkipIfMatch=%v, hasSkipIfNoMatch=%v, hasSkipRoles=%v, hasSkipBots=%v, hasCommand=%v, hasRateLimit=%v, hasOnSteps=%v, hasOnNeeds=%v", needsPermissionCheck, hasStopTime, hasSkipIfMatch, hasSkipIfNoMatch, hasSkipRoles, hasSkipBots, hasCommandTrigger, hasRateLimit, hasOnSteps, hasOnNeeds) + // Log pre-activation gate configuration in grouped sections for readability. 
+ compilerJobsLog.Printf("Job configuration — permission/time: needsPermissionCheck=%v, hasStopTime=%v", needsPermissionCheck, hasStopTime) + compilerJobsLog.Printf("Job configuration — skip checks: hasSkipIfMatch=%v, hasSkipIfNoMatch=%v, hasSkipRoles=%v, hasSkipBots=%v", hasSkipIfMatch, hasSkipIfNoMatch, hasSkipRoles, hasSkipBots) + compilerJobsLog.Printf("Job configuration — features: hasCommand=%v, hasRateLimit=%v, hasCircuitBreaker=%v, hasOnSteps=%v, hasOnNeeds=%v", hasCommandTrigger, hasRateLimit, hasCircuitBreaker, hasOnSteps, hasOnNeeds) - // Build pre-activation job if needed (combines membership checks, stop-time validation, skip-if-match check, skip-if-no-match check, skip-roles check, skip-bots check, rate limit check, command position check, and on.steps injection) - if needsPermissionCheck || hasStopTime || hasSkipIfMatch || hasSkipIfNoMatch || hasSkipRoles || hasSkipBots || hasCommandTrigger || hasRateLimit || hasOnSteps || hasOnNeeds { + // Build pre-activation job if needed (combines membership checks, stop-time validation, skip-if-match check, skip-if-no-match check, skip-roles check, skip-bots check, rate limit check, circuit breaker check, command position check, and on.steps injection) + if needsPermissionCheck || hasStopTime || hasSkipIfMatch || hasSkipIfNoMatch || hasSkipRoles || hasSkipBots || hasCommandTrigger || hasRateLimit || hasCircuitBreaker || hasOnSteps || hasOnNeeds { compilerJobsLog.Print("Building pre-activation job") preActivationJob, err := c.buildPreActivationJob(data, needsPermissionCheck) if err != nil { diff --git a/pkg/workflow/compiler_orchestrator_workflow.go b/pkg/workflow/compiler_orchestrator_workflow.go index 93a6c740c9..47c3e34a6b 100644 --- a/pkg/workflow/compiler_orchestrator_workflow.go +++ b/pkg/workflow/compiler_orchestrator_workflow.go @@ -267,6 +267,7 @@ func (c *Compiler) extractAdditionalConfigurations( workflowData.Roles = c.extractRoles(frontmatter) workflowData.Bots = c.extractBots(frontmatter) 
workflowData.RateLimit = c.extractRateLimitConfig(frontmatter) + workflowData.CircuitBreaker = c.extractCircuitBreakerConfig(frontmatter) workflowData.SkipRoles = c.mergeSkipRoles(c.extractSkipRoles(frontmatter), importsResult.MergedSkipRoles) workflowData.SkipBots = c.mergeSkipBots(c.extractSkipBots(frontmatter), importsResult.MergedSkipBots) workflowData.ActivationGitHubToken = c.resolveActivationGitHubToken(frontmatter, importsResult) diff --git a/pkg/workflow/compiler_pre_activation_job.go b/pkg/workflow/compiler_pre_activation_job.go index dcde9ff6bf..0393bb401d 100644 --- a/pkg/workflow/compiler_pre_activation_job.go +++ b/pkg/workflow/compiler_pre_activation_job.go @@ -56,6 +56,14 @@ func (c *Compiler) buildPreActivationJob(data *WorkflowData, needsPermissionChec perms.Set(PermissionActions, PermissionRead) } + // Add actions: read permission if circuit breaker is configured (needed to list/download artifacts) + if data.CircuitBreaker != nil { + if perms == nil { + perms = NewPermissions() + } + perms.Set(PermissionActions, PermissionRead) + } + // Merge on.permissions into the pre-activation job permissions. // on.permissions lets users declare extra scopes required by their on.steps steps. 
if data.OnPermissions != nil { @@ -80,6 +88,11 @@ func (c *Compiler) buildPreActivationJob(data *WorkflowData, needsPermissionChec steps = c.generateRateLimitCheck(data, steps) } + // Add circuit breaker check if configured + if data.CircuitBreaker != nil { + steps = c.generateCircuitBreakerCheckSteps(data, steps) + } + // Add stop-time check if configured if data.StopTime != "" { compilerActivationJobsLog.Printf("Adding stop-time check step: stop_time=%s", data.StopTime) @@ -339,6 +352,16 @@ func (c *Compiler) buildPreActivationJob(data *WorkflowData, needsPermissionChec conditions = append(conditions, rateLimitCheck) } + if data.CircuitBreaker != nil { + // Add circuit breaker check condition + circuitBreakerCheck := BuildComparison( + BuildPropertyAccess(fmt.Sprintf("steps.%s.outputs.%s", constants.CheckCircuitBreakerStepID, constants.CircuitBreakerOkOutput)), + "==", + BuildStringLiteral("true"), + ) + conditions = append(conditions, circuitBreakerCheck) + } + if len(data.Command) > 0 { // Add command position check condition commandPositionCheck := BuildComparison( diff --git a/pkg/workflow/compiler_types.go b/pkg/workflow/compiler_types.go index 267db0c76d..8666fef3a4 100644 --- a/pkg/workflow/compiler_types.go +++ b/pkg/workflow/compiler_types.go @@ -475,6 +475,7 @@ type WorkflowData struct { Roles []string // permission levels required to trigger workflow Bots []string // allow list of bot identifiers that can trigger workflow RateLimit *RateLimitConfig // rate limiting configuration for workflow triggers + CircuitBreaker *CircuitBreakerConfig // circuit breaker configuration for repeated failure protection CacheMemoryConfig *CacheMemoryConfig // parsed cache-memory configuration RepoMemoryConfig *RepoMemoryConfig // parsed repo-memory configuration Runtimes map[string]any // runtime version overrides from frontmatter diff --git a/pkg/workflow/compiler_yaml_main_job.go b/pkg/workflow/compiler_yaml_main_job.go index f50e3dbbd4..9cd7b3ba65 100644 --- 
a/pkg/workflow/compiler_yaml_main_job.go +++ b/pkg/workflow/compiler_yaml_main_job.go @@ -611,6 +611,10 @@ func (c *Compiler) generateMainJobSteps(yaml *strings.Builder, data *WorkflowDat // Add post-steps (if any) after AI execution c.generatePostSteps(yaml, data) + // Add circuit breaker state update and upload (runs always, even on failure) + // This must run after agent execution so that the job.status reflects the agent outcome. + c.generateCircuitBreakerUpdateSteps(yaml, data) + // Include firewall audit/observability logs in the unified agent artifact // so all agent job outputs ship as a single artifact (AWF v0.25.0+). if isFirewallEnabled(data) { diff --git a/pkg/workflow/frontmatter_types.go b/pkg/workflow/frontmatter_types.go index 46761b4d18..f464c40651 100644 --- a/pkg/workflow/frontmatter_types.go +++ b/pkg/workflow/frontmatter_types.go @@ -105,6 +105,17 @@ type PermissionsConfig struct { GitHubAppPermissionsConfig } +// CircuitBreakerConfig represents the circuit breaker configuration for a workflow. +// The circuit breaker prevents repeated execution of a consistently failing workflow, +// following the standard closed → open → half-open state machine pattern. +// See: https://github.github.com/gh-aw/reference/frontmatter/#circuit-breaker +type CircuitBreakerConfig struct { + MaxConsecutiveFailures int `json:"max-consecutive-failures,omitempty"` // Number of consecutive failures before circuit opens (default: 5) + TimeWindow string `json:"time-window,omitempty"` // Duration window for counting failures, e.g. "24h" (default: "24h") + Cooldown string `json:"cooldown,omitempty"` // Duration to wait before allowing retry after circuit opens, e.g. 
"1h" (default: "1h") + Notify *bool `json:"notify,omitempty"` // Post workflow annotation when circuit opens/closes (default: true) +} + // RateLimitConfig represents rate limiting configuration for workflow triggers // Limits how many times a user can trigger a workflow within a time window type RateLimitConfig struct { @@ -210,6 +221,9 @@ type FrontmatterConfig struct { // Rate limiting configuration RateLimit *RateLimitConfig `json:"rate-limit,omitempty"` + // Circuit breaker configuration + CircuitBreaker *CircuitBreakerConfig `json:"circuit-breaker,omitempty"` + // Update check configuration. // When set to false, the version update check step is skipped in the activation job. // This flag is not allowed in strict mode.