diff --git a/.github/workflows/outcome-collector.lock.yml b/.github/workflows/outcome-collector.lock.yml index 9e8f4862439..70903a801f6 100644 --- a/.github/workflows/outcome-collector.lock.yml +++ b/.github/workflows/outcome-collector.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"3ad63fcd0883d76bc969fa87251bdb3dcd466af5d829c186bb36f1b0957a49d9","body_hash":"3565b4cf9ab34b225d570f274fb56525421c1ff7da6fe786d59ff9118a00c4cf","strict":true,"agent_id":"copilot","agent_model":"claude-haiku-4.5"} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"42da1460588c18a99e30c64976af9003edee365e4da7180a0cdff8ca6000d492","body_hash":"cd82d186131dc6ce8076dce733396924205463a125355290b44d3dc1c0deb065","strict":true,"agent_id":"copilot","agent_model":"claude-haiku-4.5"} # gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.56"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.56"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.25.56"},{"image":"ghcr.io/git
hub/gh-aw-firewall/squid:0.25.56"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.20"},{"image":"ghcr.io/github/github-mcp-server:v1.1.0"},{"image":"node:lts-alpine","digest":"sha256:2bdb65ed1dab192432bc31c95f94155ca5ad7fc1392fb7eb7526ab682fa5bf14","pinned_image":"node:lts-alpine@sha256:2bdb65ed1dab192432bc31c95f94155ca5ad7fc1392fb7eb7526ab682fa5bf14"}]} # ___ _ _ # / _ \ | | (_) @@ -60,8 +60,8 @@ name: "Outcome Collector" on: schedule: - - cron: "54 */6 * * *" - # Friendly format: every 6 hours (scattered) + - cron: "0 0 */3 * *" + # Friendly format: every 3 days workflow_dispatch: inputs: aw_context: @@ -206,21 +206,21 @@ jobs: run: | bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" { - cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF' + cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF' - GH_AW_PROMPT_404234a26d1e1f34_EOF + GH_AW_PROMPT_95e9db6a69cbc077_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/cache_memory_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF' + cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF' Tools: create_issue, missing_tool, missing_data, noop - GH_AW_PROMPT_404234a26d1e1f34_EOF + GH_AW_PROMPT_95e9db6a69cbc077_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md" - cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF' + cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF' The following GitHub context information is available for this workflow: {{#if github.actor}} @@ -249,13 +249,13 @@ jobs: {{/if}} - GH_AW_PROMPT_404234a26d1e1f34_EOF + GH_AW_PROMPT_95e9db6a69cbc077_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/cli_proxy_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF' + cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF' {{#runtime-import .github/workflows/shared/otlp.md}} {{#runtime-import .github/workflows/outcome-collector.md}} - 
GH_AW_PROMPT_404234a26d1e1f34_EOF + GH_AW_PROMPT_95e9db6a69cbc077_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 @@ -513,9 +513,9 @@ jobs: mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_d1e1c30b757f9430_EOF' + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_79868090f537bed5_EOF' {"create_issue":{"close_older_issues":true,"expires":168,"group_by_day":true,"labels":["automation","observability","outcomes"],"max":1,"title_prefix":"[Outcome Report]"},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{}} - GH_AW_SAFE_OUTPUTS_CONFIG_d1e1c30b757f9430_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_79868090f537bed5_EOF - name: Generate Safe Outputs Tools env: GH_AW_TOOLS_META_JSON: | @@ -720,7 +720,7 @@ jobs: mkdir -p /home/runner/.copilot GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) - cat << GH_AW_MCP_CONFIG_6dba530ac7daeba4_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + cat << GH_AW_MCP_CONFIG_bdbd1c859418aee5_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" { "mcpServers": { "safeoutputs": { @@ -750,7 +750,7 @@ jobs: } } } - GH_AW_MCP_CONFIG_6dba530ac7daeba4_EOF + GH_AW_MCP_CONFIG_bdbd1c859418aee5_EOF - name: Mount MCP servers as CLIs id: mount-mcp-clis continue-on-error: true diff --git a/.github/workflows/outcome-collector.md b/.github/workflows/outcome-collector.md index 453a1df555b..fad4a8b41ca 100644 --- a/.github/workflows/outcome-collector.md +++ b/.github/workflows/outcome-collector.md @@ -4,7 +4,7 @@ name: Outcome Collector description: Periodic evaluation of safe output outcomes to measure workflow value and 
acceptance rates on: schedule: - - cron: every 6 hours + - cron: every 3 days workflow_dispatch: permissions: contents: read @@ -118,13 +118,14 @@ The report must open with an executive-first view. Place the following at the to **Executive read:** {one sentence: overall quality signal, where unresolved volume is concentrated, and whether any workflows are stuck or underdefined} -| Workflow | Status | Lifecycle health | -|---|---|---| -| {workflow_name} | {status_bar} | {lifecycle_emoji} {lifecycle_label} | +| Workflow | Status | Lifecycle health | References | +|---|---|---|---| +| {workflow_name} | {status_bar} | {lifecycle_emoji} {lifecycle_label} | {reference_links_by_status e.g. `A: [#123](...) [#456](...) · R: [#78](...) · P: [#90](...)`} | **Legend:** - **Status:** 🟩 accepted · 🟥 rejected · 🟨 pending · ⬜ unknown - **Lifecycle health:** 🟢 resolving · 🟡 in flight · 🟠 aging · 🔴 stuck · ⚪ underdefined +- **References:** accepted/rejected/pending/ignored/unknown links for quick verification ``` **Status bar rules:** @@ -133,6 +134,23 @@ The report must open with an executive-first view. Place the following at the to - Do not include numeric counts in the top table — the bar communicates volume. - Sort rows by management attention: most pending first, then most unknown, then resolved-only workflows last. +**References column rules:** +- Add grouped links for each status present in that workflow (accepted, rejected, pending, ignored, unknown). +- Use short status prefixes and compact link lists (example format: `A: [#123](...) [#456](...) · R: [#78](...) · P: [#90](...)`). +- Link labels must be the real item identifiers when available (issue/PR/discussion/comment number, run id, or short commit SHA), not a synthetic sequence. +- Include only valid issue/PR/discussion/comment/run URLs from the evaluated outcomes. + +### 🔴 Action Items + +List concrete actions the team should take based on the data directly under the executive summary table (outside `
`): + +1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration. +2. **Stuck pending items** — List any items pending >48 hours or any workflow classified as 🔴 stuck. These need human review or the workflow needs a timeout. +3. **Underdefined workflows** — Any workflow classified as ⚪ underdefined needs clearer acceptance/rejection criteria or a dedicated evaluator. The outcome model for that workflow is not yet mature. +4. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement. +5. **High ignored rate** — If ignored items exceed 30% of total outcomes, the workflow may be producing outputs that nobody engages with; consider refining targeting or output type. +6. **Data quality: fallback evaluations** — If `fallback_exists_only_count` > 20% of total outcomes, many items were evaluated with only a generic existence check (weak signal). This means the acceptance numbers may be overstated; note this in the report. + **Lifecycle health classification** — assign one label per workflow based on its outcome history: | Label | Emoji | When to assign | @@ -147,11 +165,11 @@ Use cache-memory to determine lifecycle health: compare this run's per-workflow ### Details section (inside `
<details>`) -Place all detailed metrics, numeric breakdowns, evidence quality, trends, and action items inside a collapsible block: +Place all detailed metrics, numeric breakdowns, evidence quality, and trends inside a collapsible block: ```markdown
-Detailed metrics, evidence quality, workflow counts, and actions +Detailed metrics, evidence quality, workflow counts, and trends ### Outcome Scorecard — {date} @@ -171,17 +189,6 @@ Place all detailed metrics, numeric breakdowns, evidence quality, trends, and ac | Pending | {pending} | — | | Runs checked | {runs_checked} | — | -### 🔴 Action Items - -List concrete actions the team should take based on the data: - -1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration. -2. **Stuck pending items** — List any items pending >48 hours or any workflow classified as 🔴 stuck. These need human review or the workflow needs a timeout. -3. **Underdefined workflows** — Any workflow classified as ⚪ underdefined needs clearer acceptance/rejection criteria or a dedicated evaluator. The outcome model for that workflow is not yet mature. -4. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement. -5. **High ignored rate** — If ignored items exceed 30% of total outcomes, the workflow may be producing outputs that nobody engages with; consider refining targeting or output type. -6. **Data quality: fallback evaluations** — If `fallback_exists_only_count` > 20% of total outcomes, many items were evaluated with only a generic existence check (weak signal). This means the acceptance numbers may be overstated; note this in the report. - ### Per-Workflow Breakdown For each workflow with outcomes, show a mini-scorecard: @@ -213,7 +220,7 @@ If no previous data exists, skip this section. - Keep the report factual — numbers only, no speculation - Do not re-evaluate outcomes — use the pre-computed data -- Optimize the top executive section for at-a-glance scanning; put all numeric detail in the `
<details>` block +- Optimize the top executive section for at-a-glance scanning; keep action items directly under the executive summary table and put numeric detail in the `
<details>` block - Sort the executive table rows by management attention: most pending first, then most unknown, then resolved-only workflows last. - Sort the per-workflow breakdown inside `
<details>` by waste rate descending (worst first) - Flag any workflow with acceptance rate <60% as needing attention