diff --git a/.github/workflows/aw-failure-investigator.lock.yml b/.github/workflows/aw-failure-investigator.lock.yml index a06fc67096..b9a1cce496 100644 --- a/.github/workflows/aw-failure-investigator.lock.yml +++ b/.github/workflows/aw-failure-investigator.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"1ad248ea1db0894bbc04d5258f256ae1f0b0641b145e5b79b3b3d327b83a27ca","strict":true,"agent_id":"claude"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"7042be7235aba0afc736f3291e4bcc19917eb7f5fc389058f4631ad497bdd587","strict":true,"agent_id":"claude"} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"373c709c69115d41ff229c7e5df9f8788daa9553","version":"v9"},{"repo":"actions/setup-go","sha":"4a3601121dd01d1626a1e23e37211e3254c1c06c","version":"v6.4.0"},{"repo":"actions/setup-node","sha":"53b83947a5a98c8d113130e565377fae1a50d02f","version":"v6.3.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"docker/build-push-action","sha":"bcafcacb16a39f128d818304e6c9c0c18556b85f","version":"v7.1.0"},{"repo":"docker/setup-buildx-action","sha":"4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd","version":"v4"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.22"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.22"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.22"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.2.22"},{"image":"ghcr.io/github/github-mcp-server:v0.32.0","digest":"sha256:2763823c63bcca718ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28","pinned_image":"ghcr.io/github/github-mcp-server:v0.32.0@sha256:2763823c63bcca71
8ce53850a1d7fcf2f501ec84028394f1b63ce7e9f4f9be28"},{"image":"node:lts-alpine","digest":"sha256:01743339035a5c3c11a373cd7c83aeab6ed1457b55da6a69e014a95ac4e4700b","pinned_image":"node:lts-alpine@sha256:01743339035a5c3c11a373cd7c83aeab6ed1457b55da6a69e014a95ac4e4700b"}]} # ___ _ _ # / _ \ | | (_) @@ -22,7 +22,7 @@ # # For more information: https://github.github.com/gh-aw/introduction/overview/ # -# Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, and opens a parent report with fix sub-issues +# Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, closes fixed issues, and opens focused fix sub-issues when needed # # Resolved workflow manifest: # Imports: @@ -178,21 +178,21 @@ jobs: run: | bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" { - cat << 'GH_AW_PROMPT_cecbe9d9e6fd732f_EOF' + cat << 'GH_AW_PROMPT_8dd97efdd6795a37_EOF' - GH_AW_PROMPT_cecbe9d9e6fd732f_EOF + GH_AW_PROMPT_8dd97efdd6795a37_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/agentic_workflows_guide.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_cecbe9d9e6fd732f_EOF' + cat << 'GH_AW_PROMPT_8dd97efdd6795a37_EOF' - Tools: create_issue(max:8), link_sub_issue(max:20), missing_tool, missing_data, noop + Tools: create_issue(max:2), update_issue(max:10), link_sub_issue(max:10), missing_tool, missing_data, noop - GH_AW_PROMPT_cecbe9d9e6fd732f_EOF + GH_AW_PROMPT_8dd97efdd6795a37_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md" - cat << 'GH_AW_PROMPT_cecbe9d9e6fd732f_EOF' + cat << 'GH_AW_PROMPT_8dd97efdd6795a37_EOF' The following GitHub context information is available for this workflow: {{#if __GH_AW_GITHUB_ACTOR__ }} @@ -221,13 +221,13 @@ jobs: {{/if}} - GH_AW_PROMPT_cecbe9d9e6fd732f_EOF + GH_AW_PROMPT_8dd97efdd6795a37_EOF cat 
"${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_cecbe9d9e6fd732f_EOF' + cat << 'GH_AW_PROMPT_8dd97efdd6795a37_EOF' {{#runtime-import .github/workflows/shared/reporting.md}} {{#runtime-import .github/workflows/aw-failure-investigator.md}} - GH_AW_PROMPT_cecbe9d9e6fd732f_EOF + GH_AW_PROMPT_8dd97efdd6795a37_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 @@ -463,16 +463,17 @@ jobs: mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_001b9ee7083ebf0d_EOF' - {"create_issue":{"expires":168,"group":true,"labels":["agentic-workflows","automation","cookie"],"max":8,"title_prefix":"[aw-failures] "},"create_report_incomplete_issue":{},"link_sub_issue":{"max":20},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{}} - GH_AW_SAFE_OUTPUTS_CONFIG_001b9ee7083ebf0d_EOF + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_49a37fa9d22cb844_EOF' + {"create_issue":{"expires":168,"group":true,"labels":["agentic-workflows","automation","cookie"],"max":2,"title_prefix":"[aw-failures] "},"create_report_incomplete_issue":{},"link_sub_issue":{"max":10},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{},"update_issue":{"allow_body":true,"max":10,"target":"*"}} + GH_AW_SAFE_OUTPUTS_CONFIG_49a37fa9d22cb844_EOF - name: Write Safe Outputs Tools env: GH_AW_TOOLS_META_JSON: | { "description_suffixes": { - "create_issue": " CONSTRAINTS: Maximum 8 issue(s) can be created. Title will be prefixed with \"[aw-failures] \". 
Labels [\"agentic-workflows\" \"automation\" \"cookie\"] will be automatically added.", - "link_sub_issue": " CONSTRAINTS: Maximum 20 sub-issue link(s) can be created." + "create_issue": " CONSTRAINTS: Maximum 2 issue(s) can be created. Title will be prefixed with \"[aw-failures] \". Labels [\"agentic-workflows\" \"automation\" \"cookie\"] will be automatically added.", + "link_sub_issue": " CONSTRAINTS: Maximum 10 sub-issue link(s) can be created.", + "update_issue": " CONSTRAINTS: Maximum 10 issue(s) can be updated. Target: *." }, "repo_params": {}, "dynamic_tools": [] @@ -602,6 +603,60 @@ jobs: "maxLength": 1024 } } + }, + "update_issue": { + "defaultMax": 1, + "fields": { + "assignees": { + "type": "array", + "itemType": "string", + "itemSanitize": true, + "itemMaxLength": 39 + }, + "body": { + "type": "string", + "sanitize": true, + "maxLength": 65000 + }, + "issue_number": { + "issueOrPRNumber": true + }, + "labels": { + "type": "array", + "itemType": "string", + "itemSanitize": true, + "itemMaxLength": 128 + }, + "milestone": { + "optionalPositiveInteger": true + }, + "operation": { + "type": "string", + "enum": [ + "replace", + "append", + "prepend", + "replace-island" + ] + }, + "repo": { + "type": "string", + "maxLength": 256 + }, + "status": { + "type": "string", + "enum": [ + "open", + "closed" + ] + }, + "title": { + "type": "string", + "sanitize": true, + "maxLength": 128 + } + }, + "customValidation": "requiresOneOf:status,title,body" } } uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 @@ -683,7 +738,7 @@ jobs: export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host --group-add '"${DOCKER_SOCK_GID}"' -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH 
-e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.22' GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) - cat << GH_AW_MCP_CONFIG_445612af8682882b_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + cat << GH_AW_MCP_CONFIG_1c0184af1338d201_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" { "mcpServers": { "agenticworkflows": { @@ -741,7 +796,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_445612af8682882b_EOF + GH_AW_MCP_CONFIG_1c0184af1338d201_EOF - name: Mount MCP servers as CLIs id: mount-mcp-clis continue-on-error: true @@ -1241,7 +1296,7 @@ jobs: uses: actions/github-script@373c709c69115d41ff229c7e5df9f8788daa9553 # v9 env: WORKFLOW_NAME: "[aw] Failure Investigator (6h)" - WORKFLOW_DESCRIPTION: "Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, and opens a parent report with fix sub-issues" + WORKFLOW_DESCRIPTION: "Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, closes fixed issues, and opens focused fix sub-issues when needed" HAS_PATCH: ${{ needs.agent.outputs.has_patch 
}} with: script: | @@ -1403,7 +1458,7 @@ jobs: GH_AW_ALLOWED_DOMAINS: "*.githubusercontent.com,anthropic.com,api.anthropic.com,api.github.com,api.snapcraft.io,archive.ubuntu.com,azure.archive.ubuntu.com,cdn.playwright.dev,codeload.github.com,crl.geotrust.com,crl.globalsign.com,crl.identrust.com,crl.sectigo.com,crl.thawte.com,crl.usertrust.com,crl.verisign.com,crl3.digicert.com,crl4.digicert.com,crls.ssl.com,files.pythonhosted.org,ghcr.io,github-cloud.githubusercontent.com,github-cloud.s3.amazonaws.com,github.com,host.docker.internal,json-schema.org,json.schemastore.org,keyserver.ubuntu.com,lfs.github.com,objects.githubusercontent.com,ocsp.digicert.com,ocsp.geotrust.com,ocsp.globalsign.com,ocsp.identrust.com,ocsp.sectigo.com,ocsp.ssl.com,ocsp.thawte.com,ocsp.usertrust.com,ocsp.verisign.com,packagecloud.io,packages.cloud.google.com,packages.microsoft.com,playwright.download.prss.microsoft.com,ppa.launchpad.net,pypi.org,raw.githubusercontent.com,registry.npmjs.org,s.symcb.com,s.symcd.com,security.ubuntu.com,sentry.io,statsig.anthropic.com,ts-crl.ws.symantec.com,ts-ocsp.ws.symantec.com,www.googleapis.com" GITHUB_SERVER_URL: ${{ github.server_url }} GITHUB_API_URL: ${{ github.api_url }} - GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_issue\":{\"expires\":168,\"group\":true,\"labels\":[\"agentic-workflows\",\"automation\",\"cookie\"],\"max\":8,\"title_prefix\":\"[aw-failures] \"},\"create_report_incomplete_issue\":{},\"link_sub_issue\":{\"max\":20},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"true\"},\"report_incomplete\":{}}" + GH_AW_SAFE_OUTPUTS_HANDLER_CONFIG: "{\"create_issue\":{\"expires\":168,\"group\":true,\"labels\":[\"agentic-workflows\",\"automation\",\"cookie\"],\"max\":2,\"title_prefix\":\"[aw-failures] 
\"},\"create_report_incomplete_issue\":{},\"link_sub_issue\":{\"max\":10},\"missing_data\":{},\"missing_tool\":{},\"noop\":{\"max\":1,\"report-as-issue\":\"true\"},\"report_incomplete\":{},\"update_issue\":{\"allow_body\":true,\"max\":10,\"target\":\"*\"}}" with: github-token: ${{ secrets.GH_AW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/aw-failure-investigator.md b/.github/workflows/aw-failure-investigator.md index d27e49da22..a1942a2457 100644 --- a/.github/workflows/aw-failure-investigator.md +++ b/.github/workflows/aw-failure-investigator.md @@ -1,5 +1,5 @@ --- -description: Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, and opens a parent report with fix sub-issues +description: Investigates [aw] failures from the last 6 hours, correlates with open agentic-workflows issues, closes fixed issues, and opens focused fix sub-issues when needed on: schedule: - cron: "every 6h" @@ -22,10 +22,13 @@ safe-outputs: expires: 7d title-prefix: "[aw-failures] " labels: [agentic-workflows, automation, cookie] - max: 8 + max: 2 group: true + update-issue: + target: "*" + max: 10 link-sub-issue: - max: 20 + max: 10 noop: timeout-minutes: 60 imports: @@ -49,7 +52,7 @@ Investigate agentic workflow failures from the last 6 hours and produce actionab 1. Find recent failures from agentic workflows in the last 6 hours. 2. Correlate findings with currently open `agentic-workflows` issues. 3. Perform large-scale failure analysis using logs + audit + audit-diff. -4. Create one parent report issue and linked sub-issues proposing concrete fixes. +4. Close fixed/stale issues first, then create only the minimum necessary linked fix sub-issues. ## Required Investigation Steps @@ -91,16 +94,15 @@ Use `agentic-workflows` MCP `audit-diff` to compare: Identify regressions and deltas (metrics/tooling/firewall/MCP behavior) that support fix recommendations. 
-### 5) Create parent report issue + sub-issues +### 5) Close fixed issues first, then add focused sub-issues -Create a **single parent report issue** with a temporary ID (format `aw_` + 3-8 alphanumeric characters) summarizing: -- observed failure clusters in last 6h -- links to analyzed run IDs -- evidence from logs/audit/audit-diff -- mapping to existing open issues (duplicate / related / new) -- prioritized fix plan +First, identify currently open `agentic-workflows` issues that are now fixed, stale, or no longer actionable based on fresh evidence, and close them by calling the `update_issue` safe-output tool with `status: "closed"`. -Then create **sub-issues** (linked to the parent) for concrete fixes. Each sub-issue must include: +Then, if new uncovered work remains, add **sub-issues** for concrete fixes to the **most recent open parent report issue** instead of creating a new parent by default. + +Only create a new parent report issue (temporary ID format `aw_` + 3-8 alphanumeric characters) when **P0 failures have no existing tracking coverage**. + +Each new sub-issue must include: - clear problem statement - affected workflows and run IDs - probable root cause @@ -128,7 +130,9 @@ Include these sections: ## Decision Rules - If there are **no failures** in the last 6h, or no actionable delta vs existing issues, call `noop` with a concise reason. -- If failures exist but are already fully tracked, update by creating a minimal parent report that links to existing issues and only create new sub-issues for uncovered gaps. +- If failures exist but are already fully tracked, prefer closing stale/fixed issues and avoid creating new issues. +- Only create a new parent report issue when P0 failures have no existing tracking coverage. +- Prefer closing stale/fixed issues over creating new issues when issue volume is high. - Always be explicit about confidence and unknowns. **Important**: If no action is needed after completing your analysis, you **MUST** call the `noop` safe-output tool with a brief explanation.