diff --git a/.github/workflows/outcome-collector.lock.yml b/.github/workflows/outcome-collector.lock.yml
index 9e8f4862439..70903a801f6 100644
--- a/.github/workflows/outcome-collector.lock.yml
+++ b/.github/workflows/outcome-collector.lock.yml
@@ -1,4 +1,4 @@
-# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"3ad63fcd0883d76bc969fa87251bdb3dcd466af5d829c186bb36f1b0957a49d9","body_hash":"3565b4cf9ab34b225d570f274fb56525421c1ff7da6fe786d59ff9118a00c4cf","strict":true,"agent_id":"copilot","agent_model":"claude-haiku-4.5"}
+# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"42da1460588c18a99e30c64976af9003edee365e4da7180a0cdff8ca6000d492","body_hash":"cd82d186131dc6ce8076dce733396924205463a125355290b44d3dc1c0deb065","strict":true,"agent_id":"copilot","agent_model":"claude-haiku-4.5"}
# gh-aw-manifest: {"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.56"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.56"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.25.56"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.56"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.20"},{"image":"ghcr.io/github/github-mcp-server:v1.1.0"},{"image":"node:lts-alpine","digest":"sha256:2bdb65ed1dab192432bc31c95f94155ca5ad7fc1392fb7eb7526ab682fa5bf14","pinned_image":"node:lts-alpine@sha256:2bdb65ed1dab192432bc31c95f94155ca5ad7fc1392fb7eb7526ab682fa5bf14"}]}
# ___ _ _
# / _ \ | | (_)
@@ -60,8 +60,8 @@
name: "Outcome Collector"
on:
schedule:
- - cron: "54 */6 * * *"
- # Friendly format: every 6 hours (scattered)
+ - cron: "0 0 */3 * *"
+ # Friendly format: every 3 days (days 1, 4, 7, ... 31 of each month; the cycle restarts on the 1st)
workflow_dispatch:
inputs:
aw_context:
@@ -206,21 +206,21 @@ jobs:
run: |
bash "${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh"
{
- cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF'
+ cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF'
- GH_AW_PROMPT_404234a26d1e1f34_EOF
+ GH_AW_PROMPT_95e9db6a69cbc077_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/cache_memory_prompt.md"
cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md"
- cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF'
+ cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF'
Tools: create_issue, missing_tool, missing_data, noop
- GH_AW_PROMPT_404234a26d1e1f34_EOF
+ GH_AW_PROMPT_95e9db6a69cbc077_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md"
- cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF'
+ cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF'
The following GitHub context information is available for this workflow:
{{#if github.actor}}
@@ -249,13 +249,13 @@ jobs:
{{/if}}
- GH_AW_PROMPT_404234a26d1e1f34_EOF
+ GH_AW_PROMPT_95e9db6a69cbc077_EOF
cat "${RUNNER_TEMP}/gh-aw/prompts/cli_proxy_with_safeoutputs_prompt.md"
- cat << 'GH_AW_PROMPT_404234a26d1e1f34_EOF'
+ cat << 'GH_AW_PROMPT_95e9db6a69cbc077_EOF'
{{#runtime-import .github/workflows/shared/otlp.md}}
{{#runtime-import .github/workflows/outcome-collector.md}}
- GH_AW_PROMPT_404234a26d1e1f34_EOF
+ GH_AW_PROMPT_95e9db6a69cbc077_EOF
} > "$GH_AW_PROMPT"
- name: Interpolate variables and render templates
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
@@ -513,9 +513,9 @@ jobs:
mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs"
mkdir -p /tmp/gh-aw/safeoutputs
mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs
- cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_d1e1c30b757f9430_EOF'
+ cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_79868090f537bed5_EOF'
{"create_issue":{"close_older_issues":true,"expires":168,"group_by_day":true,"labels":["automation","observability","outcomes"],"max":1,"title_prefix":"[Outcome Report]"},"create_report_incomplete_issue":{},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{}}
- GH_AW_SAFE_OUTPUTS_CONFIG_d1e1c30b757f9430_EOF
+ GH_AW_SAFE_OUTPUTS_CONFIG_79868090f537bed5_EOF
- name: Generate Safe Outputs Tools
env:
GH_AW_TOOLS_META_JSON: |
@@ -720,7 +720,7 @@ jobs:
mkdir -p /home/runner/.copilot
GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node)
- cat << GH_AW_MCP_CONFIG_6dba530ac7daeba4_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
+ cat << GH_AW_MCP_CONFIG_bdbd1c859418aee5_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs"
{
"mcpServers": {
"safeoutputs": {
@@ -750,7 +750,7 @@ jobs:
}
}
}
- GH_AW_MCP_CONFIG_6dba530ac7daeba4_EOF
+ GH_AW_MCP_CONFIG_bdbd1c859418aee5_EOF
- name: Mount MCP servers as CLIs
id: mount-mcp-clis
continue-on-error: true
diff --git a/.github/workflows/outcome-collector.md b/.github/workflows/outcome-collector.md
index 453a1df555b..fad4a8b41ca 100644
--- a/.github/workflows/outcome-collector.md
+++ b/.github/workflows/outcome-collector.md
@@ -4,7 +4,7 @@ name: Outcome Collector
description: Periodic evaluation of safe output outcomes to measure workflow value and acceptance rates
on:
schedule:
- - cron: every 6 hours
+ - cron: every 3 days
workflow_dispatch:
permissions:
contents: read
@@ -118,13 +118,14 @@ The report must open with an executive-first view. Place the following at the to
**Executive read:** {one sentence: overall quality signal, where unresolved volume is concentrated, and whether any workflows are stuck or underdefined}
-| Workflow | Status | Lifecycle health |
-|---|---|---|
-| {workflow_name} | {status_bar} | {lifecycle_emoji} {lifecycle_label} |
+| Workflow | Status | Lifecycle health | References |
+|---|---|---|---|
+| {workflow_name} | {status_bar} | {lifecycle_emoji} {lifecycle_label} | {reference_links_by_status e.g. `A: [#123](...) [#456](...) · R: [#78](...) · P: [#90](...)`} |
**Legend:**
- **Status:** 🟩 accepted · 🟥 rejected · 🟨 pending · ⬜ unknown
- **Lifecycle health:** 🟢 resolving · 🟡 in flight · 🟠 aging · 🔴 stuck · ⚪ underdefined
+- **References:** links grouped by status (A accepted · R rejected · P pending · I ignored · U unknown) for quick verification
```
**Status bar rules:**
@@ -133,6 +134,23 @@ The report must open with an executive-first view. Place the following at the to
- Do not include numeric counts in the top table — the bar communicates volume.
- Sort rows by management attention: most pending first, then most unknown, then resolved-only workflows last.
+**References column rules:**
+- Add grouped links for each status present in that workflow (accepted, rejected, pending, ignored, unknown).
+- Use short status prefixes (A = accepted, R = rejected, P = pending, I = ignored, U = unknown) and compact link lists (example format: `A: [#123](...) [#456](...) · R: [#78](...) · P: [#90](...)`).
+- Link labels must be the real item identifiers when available (issue/PR/discussion/comment number, run id, or short commit SHA), not a synthetic sequence.
+- Include only valid issue/PR/discussion/comment/run/commit URLs from the evaluated outcomes.
+
+### 🔴 Action Items
+
+Directly under the executive summary table (outside `<details>`), list concrete actions the team should take based on the data:
+
+1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration.
+2. **Stuck pending items** — List any items pending >48 hours or any workflow classified as 🔴 stuck. These need human review or the workflow needs a timeout.
+3. **Underdefined workflows** — Any workflow classified as ⚪ underdefined needs clearer acceptance/rejection criteria or a dedicated evaluator. The outcome model for that workflow is not yet mature.
+4. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement.
+5. **High ignored rate** — If ignored items exceed 30% of total outcomes, the workflow may be producing outputs that nobody engages with; consider refining targeting or output type.
+6. **Data quality: fallback evaluations** — If `fallback_exists_only_count` > 20% of total outcomes, many items were evaluated with only a generic existence check (weak signal). This means the acceptance numbers may be overstated; note this in the report.
+
**Lifecycle health classification** — assign one label per workflow based on its outcome history:
| Label | Emoji | When to assign |
@@ -147,11 +165,11 @@ Use cache-memory to determine lifecycle health: compare this run's per-workflow
### Details section (inside `<details>`)
-Place all detailed metrics, numeric breakdowns, evidence quality, trends, and action items inside a collapsible block:
+Place all detailed metrics, numeric breakdowns, evidence quality, and trends inside a collapsible block:
```markdown
-Detailed metrics, evidence quality, workflow counts, and actions
+Detailed metrics, evidence quality, workflow counts, and trends
### Outcome Scorecard — {date}
@@ -171,17 +189,6 @@ Place all detailed metrics, numeric breakdowns, evidence quality, trends, and ac
| Pending | {pending} | — |
| Runs checked | {runs_checked} | — |
-### 🔴 Action Items
-
-List concrete actions the team should take based on the data:
-
-1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration.
-2. **Stuck pending items** — List any items pending >48 hours or any workflow classified as 🔴 stuck. These need human review or the workflow needs a timeout.
-3. **Underdefined workflows** — Any workflow classified as ⚪ underdefined needs clearer acceptance/rejection criteria or a dedicated evaluator. The outcome model for that workflow is not yet mature.
-4. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement.
-5. **High ignored rate** — If ignored items exceed 30% of total outcomes, the workflow may be producing outputs that nobody engages with; consider refining targeting or output type.
-6. **Data quality: fallback evaluations** — If `fallback_exists_only_count` > 20% of total outcomes, many items were evaluated with only a generic existence check (weak signal). This means the acceptance numbers may be overstated; note this in the report.
-
### Per-Workflow Breakdown
For each workflow with outcomes, show a mini-scorecard:
@@ -213,7 +220,7 @@ If no previous data exists, skip this section.
- Keep the report factual — numbers only, no speculation
- Do not re-evaluate outcomes — use the pre-computed data
-- Optimize the top executive section for at-a-glance scanning; put all numeric detail in the `` block
+- Optimize the top executive section for at-a-glance scanning; keep action items directly under the executive summary table and put numeric detail in the `<details>` block
- Sort the executive table rows by management attention: most pending first, then most unknown, then resolved-only workflows last.
- Sort the per-workflow breakdown inside `<details>` by waste rate descending (worst first)
- Flag any workflow with acceptance rate <60% as needing attention