From b55518d601ad69848007f339d43adfe5d53a848b Mon Sep 17 00:00:00 2001 From: Mara Nikola Kiefer Date: Thu, 21 May 2026 07:46:15 +0200 Subject: [PATCH 1/7] chore: update otlp data quality validator description and architecture details --- .github/agents/agentic-workflows.agent.md | 34 +- .../workflows/otlp-data-quality-validator.md | 356 +++++++++--------- 2 files changed, 192 insertions(+), 198 deletions(-) diff --git a/.github/agents/agentic-workflows.agent.md b/.github/agents/agentic-workflows.agent.md index 43071e7216c..f7e5eb4f1cd 100644 --- a/.github/agents/agentic-workflows.agent.md +++ b/.github/agents/agentic-workflows.agent.md @@ -25,7 +25,7 @@ This is a **dispatcher agent** that routes your request to the appropriate speci - **Choosing workflow architectures and design patterns**: Routes to `patterns` guide โ€” consult this whenever the user asks for strategy, architecture, operating models, or pattern selection for agentic workflows > [!IMPORTANT] -> For architecture/pattern-selection requests, load `https://github.com/github/gh-aw/blob/main/.github/aw/patterns.md` first. +> For architecture/pattern-selection requests, load `https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/patterns.md` first. 
Workflows may optionally include: @@ -37,7 +37,7 @@ Workflows may optionally include: - Workflow files: `.github/workflows/*.md` and `.github/workflows/**/*.md` - Workflow lock files: `.github/workflows/*.lock.yml` - Shared components: `.github/workflows/shared/*.md` -- Configuration: https://github.com/github/gh-aw/blob/main/.github/aw/github-agentic-workflows.md +- Configuration: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/github-agentic-workflows.md ## Problems This Solves @@ -59,7 +59,7 @@ When you interact with this agent, it will: ### Create New Workflow **Load when**: User wants to create a new workflow from scratch, add automation, or design a workflow that doesn't exist yet -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/create-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/create-agentic-workflow.md **Use cases**: - "Create a workflow that triages issues" @@ -69,7 +69,7 @@ When you interact with this agent, it will: ### Update Existing Workflow **Load when**: User wants to modify, improve, or refactor an existing workflow -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/update-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/update-agentic-workflow.md **Use cases**: - "Add web-fetch tool to the issue-classifier workflow" @@ -79,7 +79,7 @@ When you interact with this agent, it will: ### Debug Workflow **Load when**: User needs to investigate, audit, debug, or understand a workflow, troubleshoot issues, analyze logs, or fix errors -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/debug-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/debug-agentic-workflow.md **Use cases**: - "Why is this workflow failing?" 
@@ -89,7 +89,7 @@ When you interact with this agent, it will: ### Upgrade Agentic Workflows **Load when**: User wants to upgrade workflows to a new gh-aw version or fix deprecations -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/upgrade-agentic-workflows.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/upgrade-agentic-workflows.md **Use cases**: - "Upgrade all workflows to the latest version" @@ -99,7 +99,7 @@ When you interact with this agent, it will: ### Create a Report-Generating Workflow **Load when**: The workflow being created or updated produces reports — recurring status updates, audit summaries, analyses, or any structured output posted as a GitHub issue, discussion, or comment -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/report.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/report.md **Use cases**: - "Create a weekly CI health report" @@ -109,7 +109,7 @@ When you interact with this agent, it will: ### Create Shared Agentic Workflow **Load when**: User wants to create a reusable workflow component or wrap an MCP server -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/create-shared-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/create-shared-agentic-workflow.md **Use cases**: - "Create a shared component for Notion integration" @@ -119,7 +119,7 @@ When you interact with this agent, it will: ### Fix Dependabot PRs **Load when**: User needs to close or fix open Dependabot PRs that update dependencies in generated manifest files (`.github/workflows/package.json`, `.github/workflows/requirements.txt`, `.github/workflows/go.mod`) -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/dependabot.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/dependabot.md **Use cases**: - "Fix the open Dependabot PRs for npm dependencies" @@ -129,7 +129,7 @@ 
When you interact with this agent, it will: ### Analyze Test Coverage **Load when**: The workflow reads, analyzes, or reports test coverage — whether triggered by a PR, a schedule, or a slash command. Always consult this prompt before designing the coverage data strategy. -**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/test-coverage.md +**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/test-coverage.md **Use cases**: - "Create a workflow that comments coverage on PRs" @@ -139,7 +139,7 @@ When you interact with this agent, it will: ### Render ASCII Charts in Markdown **Load when**: The workflow needs in-markdown charts (sparklines, bars, table+trend views) that must align cleanly and render reliably across GitHub surfaces, including mobile. -**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/asciicharts.md +**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/asciicharts.md **Use cases**: - "Show a compact trend chart in an issue comment" @@ -149,7 +149,7 @@ When you interact with this agent, it will: ### CLI Commands Reference **Load when**: The user asks how to run, compile, debug, or manage workflows from the command line; needs the MCP tool equivalent of a `gh aw` command; or is in a restricted environment (e.g., Copilot Cloud) without direct CLI access. -**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/cli-commands.md +**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/cli-commands.md **Use cases**: - "How do I trigger workflow X on the main branch?" @@ -160,7 +160,7 @@ When you interact with this agent, it will: ### Token Consumption Optimization **Load when**: The user asks how to reduce token usage, lower workflow costs, make a workflow faster or cheaper, or measure the impact of prompt or configuration changes. 
-**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/token-optimization.md +**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/token-optimization.md **Use cases**: - "How do I reduce the token cost of this workflow?" @@ -173,7 +173,7 @@ When you interact with this agent, it will: ### Workflow Pattern Selection **Load when**: The user asks for architecture, strategy, operating model selection, or pattern recommendations for building agentic workflows. -**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/patterns.md +**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/patterns.md **Use cases**: - "Which pattern should I use for multi-repo rollout?" @@ -225,12 +225,12 @@ gh aw compile --validate ## Important Notes -- Always reference the instructions file at https://github.com/github/gh-aw/blob/main/.github/aw/github-agentic-workflows.md for complete documentation +- Always reference the instructions file at https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/github-agentic-workflows.md for complete documentation - Use the MCP tool `agentic-workflows` when running in GitHub Copilot Cloud - Workflows must be compiled to `.lock.yml` files before running in GitHub Actions - **Bash tools are enabled by default** - Don't restrict bash commands unnecessarily since workflows are sandboxed by the AWF - Follow security best practices: minimal permissions, explicit network access, no template injection -- **Network configuration**: Use ecosystem identifiers (`node`, `python`, `go`, etc.) or explicit FQDNs in `network.allowed`. Bare shorthands like `npm` or `pypi` are **not** valid. See https://github.com/github/gh-aw/blob/main/.github/aw/network.md for the full list of valid ecosystem identifiers and domain patterns. +- **Network configuration**: Use ecosystem identifiers (`node`, `python`, `go`, etc.) or explicit FQDNs in `network.allowed`. 
Bare shorthands like `npm` or `pypi` are **not** valid. See https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/network.md for the full list of valid ecosystem identifiers and domain patterns. - **Single-file output**: When creating a workflow, produce exactly **one** workflow `.md` file. Do not create separate documentation files (architecture docs, runbooks, usage guides, etc.). If documentation is needed, add a brief `## Usage` section inside the workflow file itself. - **Triggering runs**: Always use `gh aw run ` to trigger a workflow on demand โ€” not `gh workflow run .lock.yml`. `gh aw run` handles workflow resolution by short name, input parsing and validation, and correct run-tracking for agentic workflows. Use `--ref ` to run on a specific branch. -- **CLI commands reference**: For a complete guide on all `gh aw` commands and their MCP tool equivalents (for restricted environments), see https://github.com/github/gh-aw/blob/main/.github/aw/cli-commands.md +- **CLI commands reference**: For a complete guide on all `gh aw` commands and their MCP tool equivalents (for restricted environments), see https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/cli-commands.md diff --git a/.github/workflows/otlp-data-quality-validator.md b/.github/workflows/otlp-data-quality-validator.md index e910b595237..5f2baf979cf 100644 --- a/.github/workflows/otlp-data-quality-validator.md +++ b/.github/workflows/otlp-data-quality-validator.md @@ -1,7 +1,7 @@ --- emoji: "๐Ÿงญ" name: OTLP Data Quality Validator -description: Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility +description: Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility on: schedule: daily on weekdays workflow_dispatch: @@ -35,30 +35,30 @@ imports: # OTLP Data Quality Validator -You are an OpenTelemetry/OTLP data quality validation agent. 
+You are an OpenTelemetry/OTLP data quality validation agent for GitHub Agentic Workflows (`gh-aw`). -Your goal is to determine whether telemetry data is complete, deduplicated, correctly shaped, and reliably flowing from source applications through the Collector to the observability backend. +Your goal is to determine whether gh-aw trace data is complete, deduplicated, correctly shaped, and reliably flowing from the workflow runtime to configured OTLP vendor endpoints. -Signal scope: -- traces -- metrics -- logs +## Architecture -Pipeline scope: -- SDK/app emission -- Collector receiver -- Collector processors -- Collector exporters -- backend ingestion and query-visible layer +gh-aw emits **traces only** (no metrics or logs). It sends OTLP spans **directly to vendor endpoints** — there is no OpenTelemetry Collector in the pipeline. + +```text +gh-aw workflow runtime (actions/setup/js/send_otlp_span.cjs) + → local JSONL mirror (/tmp/gh-aw/otel.jsonl) + → OTLP/HTTP POST to vendor endpoints (concurrent fan-out) + → vendor backends (Sentry, Grafana Tempo, Datadog, etc.) +``` + +Normative specification: `specs/otel-observability-spec.md` Use the cheapest trustworthy source first: -1. local files/artifacts and mirrors (for example `/tmp/gh-aw/otel.jsonl`) -2. Collector/internal telemetry artifacts -3. backend queries +1. local JSONL mirror (`/tmp/gh-aw/otel.jsonl`) and export error logs (`/tmp/gh-aw/otlp-export-errors.jsonl`) +2. backend queries via MCP tools (when available) Always distinguish: -- emitted vs ingested vs query-visible -- true loss vs expected sampling or visibility delay +- emitted (in JSONL mirror) vs exported (HTTP response) vs query-visible (backend) +- true loss vs expected visibility delay - suspected cause vs proven cause If required evidence is unavailable, continue and mark confidence/uncertainty explicitly. 
@@ -69,156 +69,150 @@ If required evidence is unavailable, continue and mark confidence/uncertainty ex Define and report: - validation time window (start/end) -- expected services, environments, namespaces, and signal types - -When synthetic fields exist, prefer exact matching using: -- `validation.run_id` -- `validation.sequence_id` -- `validation.expected_count` +- expected `service.name` values (format: `gh-aw.`) +- expected job names and span operations (setup, conclusion, agent) -If synthetic fields do not exist, infer expectations from: -- source-side counters -- Collector receiver counts -- backend ingestion/query counts +Infer expectations from: +- local JSONL mirror span count +- `github.run_id` from resource attributes +- export error count from `/tmp/gh-aw/otlp-export-errors.count` ### Step 2: Validate trace completeness and integrity -Compute and report: -- unique `trace_id` count +From the local JSONL mirror (`/tmp/gh-aw/otel.jsonl`), compute and report: +- unique `trace_id` count (expect 1 per workflow run) - unique span identity count using `trace_id + span_id` - duplicate spans with same `trace_id + span_id` -When expected per-trace span counts exist, compare expected vs observed. 
- -Validate structure: -- every non-root span must reference an existing `parent_span_id` in the same trace -- root spans must not have `parent_span_id` +Validate the expected span hierarchy per the spec (§9.3): +- all setup spans share a single global `parent_span_id` +- each conclusion span parents under its job's setup span +- agent spans parent under the conclusion span +- root setup parent has no parent Validate required fields per span: -- `trace_id` -- `span_id` -- `name` -- `kind` -- `start_time` -- `end_time` -- `service.name` -- resource attributes +- `trace_id` (32-char hex) +- `span_id` (16-char hex) +- `name` (must match pattern `gh-aw..`) +- `kind` (INTERNAL=1 for setup/conclusion, CLIENT=3 for agent) +- `start_time_unix_nano` +- `end_time_unix_nano` Flag timestamp issues: - `start_time > end_time` - far-future timestamps - timestamps far outside the validation window -### Step 3: Validate metric completeness and quality - -Report: -- observed metric names -- diff between observed names and expected metric inventory - -Count metric points by: -- metric name -- resource identity -- scope/instrumentation library -- datapoint attributes -- timestamp - -Detect duplicate datapoints using: -`resource identity + scope + metric name + datapoint attributes + timestamp` - -Validate temporality: -- cumulative counters should not reset unexpectedly -- delta counters must not be interpreted as cumulative - -Flag suspicious behavior: -- missing datapoints -- counter decreases without reset evidence -- unexpected zero values -- cardinality spikes -- missing required dimensions - -### Step 4: Validate log completeness and correlation - -Report total log records in the validation window. 
- -Detect duplicates using stable fingerprint: -`timestamp + observed timestamp + body hash + severity + trace_id + span_id + resource identity` - -If `validation.sequence_id` exists: -- identify missing sequence IDs -- identify duplicate sequence IDs - -Validate required fields: -- `timestamp` -- `body` -- `severity` or `severity_text` -- `service.name` -- resource attributes - -Check trace correlation: -- logs emitted inside traces should contain both `trace_id` and `span_id` - -### Step 5: Check Collector health - -Inspect and report Collector internal telemetry. Use actual metric names when version-specific names differ. - -Cover: -- accepted records by receiver -- refused records by receiver -- dropped records by processor -- sent records by exporter -- failed sends by exporter -- retry counts -- queue size/capacity -- memory limiter drops -- batch behavior -- timeout/rate-limit exporter errors - -Pay special attention to metrics such as: -- `otelcol_receiver_accepted_spans` -- `otelcol_receiver_refused_spans` -- `otelcol_processor_dropped_spans` -- `otelcol_exporter_sent_spans` -- `otelcol_exporter_send_failed_spans` -- `otelcol_receiver_accepted_metric_points` -- `otelcol_processor_dropped_metric_points` -- `otelcol_exporter_sent_metric_points` -- `otelcol_receiver_accepted_log_records` -- `otelcol_processor_dropped_log_records` -- `otelcol_exporter_sent_log_records` - -### Step 6: Reconcile pipeline stages - -For traces, metrics, and logs independently, reconcile: - -app emitted -โ†’ Collector received -โ†’ Collector processed -โ†’ Collector exported -โ†’ backend ingested -โ†’ backend query-visible - -For each mismatch, identify the most likely stage of loss, duplication, or transformation. - -Do not claim data loss unless cross-stage evidence supports it. 
+```bash +# Example: Extract span summary from JSONL mirror +jq -c '.resourceSpans[].scopeSpans[].spans[] | {name, traceId, spanId, parentSpanId, kind, status}' /tmp/gh-aw/otel.jsonl +``` + +### Step 3: Validate span attribute contract + +Check setup spans for required attributes (spec ยง10.1): +- `gh-aw.job.name` +- `gh-aw.workflow.name` +- `gh-aw.run.id` +- `gh-aw.run.attempt` +- `gh-aw.run.actor` +- `gh-aw.repository` +- `gh-aw.staged` + +Check conclusion spans for required attributes (spec ยง10.2): +- `gh-aw.run.status` (must be `success`, `failure`, `timeout`, or `cancelled`) +- `gh-aw.error_count` +- `gh-aw.warning_count` +- `gh-aw.action_minutes` +- `gh-aw.output.item_count` +- `gh-aw.otlp.export_errors` + +Check agent spans for GenAI semantic conventions (spec ยง10.3): +- `gen_ai.system` +- `gen_ai.request.model` +- `gen_ai.operation.name` (must be `"chat"`) +- `gen_ai.usage.input_tokens` +- `gen_ai.usage.output_tokens` + +```bash +# Example: Check required attributes on setup spans +jq -c '.resourceSpans[].scopeSpans[].spans[] | select(.name | endswith(".setup")) | {name, attrs: [.attributes[]? 
| {(.key): .value}] | add}' /tmp/gh-aw/otel.jsonl +``` + +### Step 4: Validate resource attributes + +Check all spans for required resource attributes (spec §11.1): +- `service.name` (format: `gh-aw.` or `gh-aw`) +- `service.version` +- `github.repository` +- `github.run_id` +- `github.run_attempt` +- `github.actions.run_url` + +Check instrumentation scope: +- `scope.name` must be `gh-aw` +- `scope.version` should match `service.version` + +```bash +# Example: Extract resource attributes +jq -c '.resourceSpans[].resource.attributes[] | {(.key): .value}' /tmp/gh-aw/otel.jsonl | sort -u +``` + +### Step 5: Validate trace ID propagation + +Verify trace ID consistency across jobs (spec §12): +- all spans in a single workflow run share the same `trace_id` +- setup spans across different jobs share the same global `parent_span_id` +- the JSONL mirror `trace_id` matches the value in `GITHUB_AW_OTEL_TRACE_ID` + +If export errors exist, check `/tmp/gh-aw/otlp-export-errors.jsonl`: +- which endpoints failed +- HTTP status codes +- whether failures are transient (retryable) or permanent + +```bash +# Example: Check trace ID consistency +jq -r '.resourceSpans[].scopeSpans[].spans[].traceId' /tmp/gh-aw/otel.jsonl | sort -u | wc -l +# Expected: 1 (single trace ID per run) + +# Example: Check export errors +cat /tmp/gh-aw/otlp-export-errors.jsonl 2>/dev/null || echo "No export errors" +cat /tmp/gh-aw/otlp-export-errors.count 2>/dev/null || echo "0" +``` + +### Step 6: Reconcile local mirror vs backend visibility + +For each configured OTLP endpoint, reconcile: + +```text +local JSONL mirror (emitted) + → OTLP/HTTP export (sent) + → vendor backend (query-visible) +``` + +Check: +- span count in JSONL mirror vs backend +- whether all span names from the mirror appear in the backend +- whether resource attributes survived backend ingestion +- whether `trace_id` is searchable in the backend + +For multi-endpoint fan-out, validate each endpoint independently. 
Failure on one endpoint SHOULD NOT affect others. + +Do not claim data loss unless cross-stage evidence supports it. Distinguish ingestion delay from actual loss. ### Step 7: Root-cause hypotheses -Evaluate likely causes, including: -- SDK not flushing on shutdown -- sampling misconfiguration -- duplicate exporters in app config -- duplicate flow through both agent and gateway -- multiple Collectors scraping same source -- retry behavior causing duplicate ingestion -- filelog receiver offset rereads -- batch timeout/size effects -- memory limiter drops -- exporter queue overflow -- backend rate limits -- resource attribute mutation/overwrite -- OTLP gRPC/HTTP protocol mismatch -- wrong endpoint/path -- metrics temporality mismatch +Evaluate likely causes for any issues found, including: +- OTLP endpoint misconfiguration (wrong URL, missing `/v1/traces` suffix) +- authentication failures (expired API key, wrong header name) +- Sentry header rewrite not applied (`Authorization` should become `x-sentry-auth`) +- network allowlist missing vendor hostname +- `if-missing: error` blocking gateway OTLP when secrets are unresolved +- retry exhaustion (3 attempts with exponential backoff) +- OTLP/HTTP JSON vs OTLP/HTTP protobuf mismatch +- vendor rate limits or ingestion delays +- span attribute redaction removing useful diagnostic data +- proxy configuration interfering with `fetch`-based export Rank hypotheses by evidence strength and include alternatives. @@ -231,31 +225,32 @@ Create exactly one issue with these sections in order: - main risks - most likely root cause (if any) -### B. Completeness results -Per signal (traces/metrics/logs): -- expected count -- observed count -- missing count -- duplicate count +### B. Trace completeness +- expected span count (from JSONL mirror) +- observed span count (in backend) +- missing spans +- duplicate spans +- trace ID consistency (single trace per run) - confidence level -### C. 
Duplicate analysis -- duplicate keys -- affected services -- affected windows -- sample duplicate records - -### D. Schema and quality issues -- missing fields -- invalid timestamps -- missing resource attributes -- cardinality problems -- trace/log correlation gaps - -### E. Pipeline health -- Collector receiver/processor/exporter counters -- dropped/refused/failed signals -- queue/retry indicators +### C. Span hierarchy validation +- setup spans share global parent: pass/fail +- conclusion spans parent under setup: pass/fail +- agent spans parent under conclusion: pass/fail +- span naming pattern `gh-aw..`: pass/fail + +### D. Attribute contract validation +- setup span required attributes: present/missing list +- conclusion span required attributes: present/missing list +- agent span GenAI attributes: present/missing list +- resource attributes: present/missing list +- instrumentation scope: correct/incorrect + +### E. Export and fan-out health +- per-endpoint export status (success/fail/partial) +- export error count and details +- JSONL mirror write status +- multi-endpoint fan-out independence ### F. Root-cause hypothesis - likely cause @@ -263,18 +258,17 @@ Per signal (traces/metrics/logs): - alternative explanations ### G. Recommended fixes (prioritized) -1. stop data loss -2. stop duplication -3. fix schema/resource attributes -4. improve observability and alerts +1. fix data loss or export failures +2. fix missing required attributes +3. fix span hierarchy or naming issues +4. improve diagnostic coverage ### H. Validation queries or commands -Provide concrete queries/commands/pseudocode used. +Provide concrete jq/bash commands used against the JSONL mirror and backend. Rules: - Never assume missing equals lost without cross-stage evidence. - Always distinguish ingestion completeness from query visibility. -- Treat sampled traces as intentionally incomplete only when sampling config is verified. 
-- Do not flag legitimate metric resets as errors when reset metadata or restart evidence exists. -- Prefer exact validation keyed by `validation.run_id` and `validation.sequence_id` when available. +- Do not flag visibility delays under 5 minutes as data loss. - Be explicit about uncertainty. +- Reference the normative spec (`specs/otel-observability-spec.md`) section numbers when reporting violations. From b0f9068bf75a9f41c919de7c5f3ad5efcde7f024 Mon Sep 17 00:00:00 2001 From: Mara Nikola Kiefer Date: Thu, 21 May 2026 07:59:05 +0200 Subject: [PATCH 2/7] update otel observability spec --- .github/agents/agentic-workflows.agent.md | 34 +- specs/otel-observability-spec.md | 385 +++++++++++++++++++++- 2 files changed, 390 insertions(+), 29 deletions(-) diff --git a/.github/agents/agentic-workflows.agent.md b/.github/agents/agentic-workflows.agent.md index f7e5eb4f1cd..43071e7216c 100644 --- a/.github/agents/agentic-workflows.agent.md +++ b/.github/agents/agentic-workflows.agent.md @@ -25,7 +25,7 @@ This is a **dispatcher agent** that routes your request to the appropriate speci - **Choosing workflow architectures and design patterns**: Routes to `patterns` guide โ€” consult this whenever the user asks for strategy, architecture, operating models, or pattern selection for agentic workflows > [!IMPORTANT] -> For architecture/pattern-selection requests, load `https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/patterns.md` first. +> For architecture/pattern-selection requests, load `https://github.com/github/gh-aw/blob/main/.github/aw/patterns.md` first. 
Workflows may optionally include: @@ -37,7 +37,7 @@ Workflows may optionally include: - Workflow files: `.github/workflows/*.md` and `.github/workflows/**/*.md` - Workflow lock files: `.github/workflows/*.lock.yml` - Shared components: `.github/workflows/shared/*.md` -- Configuration: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/github-agentic-workflows.md +- Configuration: https://github.com/github/gh-aw/blob/main/.github/aw/github-agentic-workflows.md ## Problems This Solves @@ -59,7 +59,7 @@ When you interact with this agent, it will: ### Create New Workflow **Load when**: User wants to create a new workflow from scratch, add automation, or design a workflow that doesn't exist yet -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/create-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/create-agentic-workflow.md **Use cases**: - "Create a workflow that triages issues" @@ -69,7 +69,7 @@ When you interact with this agent, it will: ### Update Existing Workflow **Load when**: User wants to modify, improve, or refactor an existing workflow -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/update-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/update-agentic-workflow.md **Use cases**: - "Add web-fetch tool to the issue-classifier workflow" @@ -79,7 +79,7 @@ When you interact with this agent, it will: ### Debug Workflow **Load when**: User needs to investigate, audit, debug, or understand a workflow, troubleshoot issues, analyze logs, or fix errors -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/debug-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/debug-agentic-workflow.md **Use cases**: - "Why is this workflow failing?" 
@@ -89,7 +89,7 @@ When you interact with this agent, it will: ### Upgrade Agentic Workflows **Load when**: User wants to upgrade workflows to a new gh-aw version or fix deprecations -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/upgrade-agentic-workflows.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/upgrade-agentic-workflows.md **Use cases**: - "Upgrade all workflows to the latest version" @@ -99,7 +99,7 @@ When you interact with this agent, it will: ### Create a Report-Generating Workflow **Load when**: The workflow being created or updated produces reports โ€” recurring status updates, audit summaries, analyses, or any structured output posted as a GitHub issue, discussion, or comment -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/report.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/report.md **Use cases**: - "Create a weekly CI health report" @@ -109,7 +109,7 @@ When you interact with this agent, it will: ### Create Shared Agentic Workflow **Load when**: User wants to create a reusable workflow component or wrap an MCP server -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/create-shared-agentic-workflow.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/create-shared-agentic-workflow.md **Use cases**: - "Create a shared component for Notion integration" @@ -119,7 +119,7 @@ When you interact with this agent, it will: ### Fix Dependabot PRs **Load when**: User needs to close or fix open Dependabot PRs that update dependencies in generated manifest files (`.github/workflows/package.json`, `.github/workflows/requirements.txt`, `.github/workflows/go.mod`) -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/dependabot.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/dependabot.md **Use cases**: - "Fix the open Dependabot PRs for npm dependencies" @@ -129,7 +129,7 @@ 
When you interact with this agent, it will: ### Analyze Test Coverage **Load when**: The workflow reads, analyzes, or reports test coverage โ€” whether triggered by a PR, a schedule, or a slash command. Always consult this prompt before designing the coverage data strategy. -**Prompt file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/test-coverage.md +**Prompt file**: https://github.com/github/gh-aw/blob/main/.github/aw/test-coverage.md **Use cases**: - "Create a workflow that comments coverage on PRs" @@ -139,7 +139,7 @@ When you interact with this agent, it will: ### Render ASCII Charts in Markdown **Load when**: The workflow needs in-markdown charts (sparklines, bars, table+trend views) that must align cleanly and render reliably across GitHub surfaces, including mobile. -**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/asciicharts.md +**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/asciicharts.md **Use cases**: - "Show a compact trend chart in an issue comment" @@ -149,7 +149,7 @@ When you interact with this agent, it will: ### CLI Commands Reference **Load when**: The user asks how to run, compile, debug, or manage workflows from the command line; needs the MCP tool equivalent of a `gh aw` command; or is in a restricted environment (e.g., Copilot Cloud) without direct CLI access. -**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/cli-commands.md +**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/cli-commands.md **Use cases**: - "How do I trigger workflow X on the main branch?" @@ -160,7 +160,7 @@ When you interact with this agent, it will: ### Token Consumption Optimization **Load when**: The user asks how to reduce token usage, lower workflow costs, make a workflow faster or cheaper, or measure the impact of prompt or configuration changes. 
-**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/token-optimization.md +**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/token-optimization.md **Use cases**: - "How do I reduce the token cost of this workflow?" @@ -173,7 +173,7 @@ When you interact with this agent, it will: ### Workflow Pattern Selection **Load when**: The user asks for architecture, strategy, operating model selection, or pattern recommendations for building agentic workflows. -**Reference file**: https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/patterns.md +**Reference file**: https://github.com/github/gh-aw/blob/main/.github/aw/patterns.md **Use cases**: - "Which pattern should I use for multi-repo rollout?" @@ -225,12 +225,12 @@ gh aw compile --validate ## Important Notes -- Always reference the instructions file at https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/github-agentic-workflows.md for complete documentation +- Always reference the instructions file at https://github.com/github/gh-aw/blob/main/.github/aw/github-agentic-workflows.md for complete documentation - Use the MCP tool `agentic-workflows` when running in GitHub Copilot Cloud - Workflows must be compiled to `.lock.yml` files before running in GitHub Actions - **Bash tools are enabled by default** - Don't restrict bash commands unnecessarily since workflows are sandboxed by the AWF - Follow security best practices: minimal permissions, explicit network access, no template injection -- **Network configuration**: Use ecosystem identifiers (`node`, `python`, `go`, etc.) or explicit FQDNs in `network.allowed`. Bare shorthands like `npm` or `pypi` are **not** valid. See https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/network.md for the full list of valid ecosystem identifiers and domain patterns. +- **Network configuration**: Use ecosystem identifiers (`node`, `python`, `go`, etc.) or explicit FQDNs in `network.allowed`. 
Bare shorthands like `npm` or `pypi` are **not** valid. See https://github.com/github/gh-aw/blob/main/.github/aw/network.md for the full list of valid ecosystem identifiers and domain patterns. - **Single-file output**: When creating a workflow, produce exactly **one** workflow `.md` file. Do not create separate documentation files (architecture docs, runbooks, usage guides, etc.). If documentation is needed, add a brief `## Usage` section inside the workflow file itself. - **Triggering runs**: Always use `gh aw run ` to trigger a workflow on demand โ€” not `gh workflow run .lock.yml`. `gh aw run` handles workflow resolution by short name, input parsing and validation, and correct run-tracking for agentic workflows. Use `--ref ` to run on a specific branch. -- **CLI commands reference**: For a complete guide on all `gh aw` commands and their MCP tool equivalents (for restricted environments), see https://github.com/github/gh-aw/blob/v0.74.8/.github/aw/cli-commands.md +- **CLI commands reference**: For a complete guide on all `gh aw` commands and their MCP tool equivalents (for restricted environments), see https://github.com/github/gh-aw/blob/main/.github/aw/cli-commands.md diff --git a/specs/otel-observability-spec.md b/specs/otel-observability-spec.md index 375aa0e9797..236ec0cc920 100644 --- a/specs/otel-observability-spec.md +++ b/specs/otel-observability-spec.md @@ -1,9 +1,9 @@ --- title: OTel Observability Specification -version: 0.1.0 +version: 0.2.0 status: Working Draft date: 2026-05-19 -last_updated: 2026-05-19 +last_updated: 2026-05-21 editors: - GitHub gh-aw Team --- @@ -38,10 +38,14 @@ Changes to `observability.otlp`, OTLP environment injection, MCP gateway tracing 6. [Export and Gateway Integration](#6-export-and-gateway-integration) 7. [Local Mirrors and Artifacts](#7-local-mirrors-and-artifacts) 8. [Security and Privacy Requirements](#8-security-and-privacy-requirements) -9. [Implementation Mapping](#9-implementation-mapping) -10. 
[Compliance Testing](#10-compliance-testing) -11. [References](#11-references) -12. [Change Log](#12-change-log) +9. [Trace Model](#9-trace-model) +10. [Span Attribute Contract](#10-span-attribute-contract) +11. [Resource Attributes](#11-resource-attributes) +12. [Trace ID Propagation and Lookup](#12-trace-id-propagation-and-lookup) +13. [Implementation Mapping](#13-implementation-mapping) +14. [Compliance Testing](#14-compliance-testing) +15. [References](#15-references) +16. [Change Log](#16-change-log) --- @@ -85,7 +89,7 @@ The following documents are informative companions and do not override this spec ## 2. Conformance -An implementation conforms to this specification if it satisfies all MUST and MUST NOT requirements in Sections 4 through 10. +An implementation conforms to this specification if it satisfies all MUST and MUST NOT requirements in Sections 4 through 14. The key words **MUST**, **MUST NOT**, **SHOULD**, **SHOULD NOT**, and **MAY** are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119). @@ -97,7 +101,7 @@ This specification defines three conformance levels: |---|---| | **Level 1 - Config** | Correct parsing and normalization of `observability.otlp` and workflow environment injection as defined in Sections 4 and 5. | | **Level 2 - Runtime** | Level 1 plus MCP gateway integration and degraded-mode export behavior from Section 6. | -| **Level 3 - Complete** | Level 2 plus local mirror, artifact, implementation-mapping, and compliance obligations in Sections 7 through 10. | +| **Level 3 - Complete** | Level 2 plus local mirror, artifact, trace model, span attribute contract, resource attributes, trace ID propagation, implementation-mapping, and compliance obligations in Sections 7 through 14. | --- @@ -267,7 +271,326 @@ The JavaScript OTLP helper layer SHOULD remain non-fatal: --- -## 9. 
Trace Model + +### 9.1 Overview + +gh-aw emits OpenTelemetry trace spans directly to configured OTLP-compatible vendor endpoints. gh-aw does **not** require or run an OpenTelemetry Collector. All transformation, batching, retry, endpoint selection, and authentication happen in-process before sending to the vendor OTLP endpoint. + +Tracing is best-effort. Export failures MUST NOT fail the workflow. + +### 9.2 Span Naming Convention + +All gh-aw span names MUST follow the pattern: `gh-aw.<job-name>.<span-type>`. + +When no job name is available, the fallback `job` MUST be used, yielding names such as `gh-aw.job.setup`. + +### 9.3 Span Hierarchy + +A single trace ID is shared across all jobs in a workflow run. All setup spans share a global parent span ID so they render as siblings in OTLP backends. + +```text +Single Trace: trace_id (32-char hex, shared across all jobs in a run) +├── Root Setup Parent: parent_span_id (global, shared across all jobs) +│ +├── Activation Job +│ ├── gh-aw.activation.setup (parent: root setup parent) +│ └── gh-aw.activation.conclusion (parent: activation setup span) +│ +├── Agent Job +│ ├── gh-aw.agent.setup (parent: root setup parent) +│ ├── gh-aw.agent.conclusion (parent: agent setup span) +│ │ └── gh-aw.agent.agent (parent: agent conclusion span) +│ │ [dedicated AI latency measurement] +│ │ +│ +└── Other Jobs + ├── gh-aw.<job-name>.setup (parent: root setup parent) + └── gh-aw.<job-name>.conclusion (parent: job setup span) +``` + +### 9.4 Span Kinds + +Span kind assignments MUST follow these rules: + +| Span | OTLP `kind` | Rationale | +|---|---|---| +| `gh-aw.*.setup` | `SPAN_KIND_INTERNAL` (1) | Internal job lifecycle | +| `gh-aw.*.conclusion` | `SPAN_KIND_INTERNAL` (1) | Internal job lifecycle | +| `gh-aw.*.agent` | `SPAN_KIND_CLIENT` (3) | Outbound AI model request | + +### 9.5 Span Status + +Conclusion spans MUST set `status.code` based on the job outcome: + +| Outcome | 
`status.code` | +|---|---| +| `success` | `OK` (1) | +| `failure`, `timeout`, `cancelled` | `ERROR` (2) | + +### 9.6 Exception Events + +When errors are present in `agent_output.json`, the conclusion span MUST emit OTel exception events: + +```json +{ + "timeUnixNano": "...", + "name": "exception", + "attributes": [ + {"key": "exception.type", "value": {"stringValue": "gh-aw."}}, + {"key": "exception.message", "value": {"stringValue": "Error description"}} + ] +} +``` + +Exception type resolution: + +1. If the error message matches the format `type:message`, use `gh-aw.` as the exception type. +2. Otherwise, derive the type from the run status: `gh-aw.AgentError`, `gh-aw.AgentFailed`, `gh-aw.AgentTimedOut`, or `gh-aw.AgentCancelled`. + +--- + +## 10. Span Attribute Contract + +This section defines the attributes each span type MUST or MAY carry. + +### 10.1 Setup Span Attributes + +**Required attributes** (MUST be present on every setup span): + +| Attribute | Type | Description | +|---|---|---| +| `gh-aw.job.name` | string | Job name from action input | +| `gh-aw.workflow.name` | string | Workflow name or ID | +| `gh-aw.run.id` | string | GitHub Actions run ID | +| `gh-aw.run.attempt` | string | Run attempt number | +| `gh-aw.run.actor` | string | User or bot initiating the run | +| `gh-aw.repository` | string | `owner/repo` | +| `gh-aw.staged` | boolean | Whether this is a staging deployment | + +**Conditional attributes** (MUST be present when the value is available): + +| Attribute | Type | Description | +|---|---|---| +| `gen_ai.system` | string | Mapped AI system name (e.g., `github_models`, `anthropic`, `openai`) | +| `gh-aw.engine.id` | string | Raw engine identifier (`copilot`, `claude`, `codex`, `gemini`, custom) | +| `gh-aw.event_name` | string | GitHub event type | +| `gh-aw.trigger.item_type` | string | Triggering item (`issue`, `pull_request`, `discussion`, etc.) 
| +| `gh-aw.trigger.item_number` | string | Triggering item ID/number | +| `gh-aw.trigger.label` | string | Label on triggering item | +| `gh-aw.trigger.comment_id` | string | Comment ID on triggering item | +| `gh-aw.episode.id` | string | Episode/session ID for cross-run correlation | +| `gh-aw.episode.kind` | string | `run` or `workflow_call` | +| `gh-aw.hop.id` | string | Current workflow invocation ID | +| `gh-aw.hop.parent_id` | string | Parent workflow invocation ID | +| `gh-aw.origin.event` | string | Origin event type | +| `gh-aw.root.repo` | string | Root repository (for dispatched workflows) | +| `gh-aw.root.workflow_id` | string | Root workflow ID | +| `gh-aw.frontmatter.source` | string | Frontmatter source type | +| `gh-aw.frontmatter.emoji` | string | Frontmatter emoji | +| `gh-aw.frontmatter.body_modified` | boolean | Whether body was edited | +| `gh-aw.experiment.` | string | Per-experiment variant assignment | +| `gh-aw.experiments` | string | Compact JSON of all experiment assignments | +| `gh-aw.deployment.state` | string | Deployment status | +| `gh-aw.workflow_run.conclusion` | string | Workflow-level outcome | + +### 10.2 Conclusion Span Attributes + +**Required attributes** (MUST be present on every conclusion span): + +| Attribute | Type | Description | +|---|---|---| +| `gh-aw.workflow.name` | string | Workflow name | +| `gh-aw.run.id` | string | Run ID | +| `gh-aw.run.attempt` | string | Attempt number | +| `gh-aw.run.actor` | string | Actor | +| `gh-aw.repository` | string | Repository | +| `gh-aw.run.status` | string | Run outcome (`success`, `failure`, `timeout`, `cancelled`) | +| `gh-aw.error_count` | int | Number of errors | +| `gh-aw.warning_count` | int | Number of warnings | +| `gh-aw.action_minutes` | double | Duration in minutes | +| `gh-aw.output.item_count` | int | Safe output items produced | +| `gh-aw.otlp.export_errors` | int | Count of OTLP export failures during this run | + +**Conditional attributes** (MUST be present 
when the value is available): + +| Attribute | Type | Description | +|---|---|---| +| `gh-aw.job.name` | string | Job name | +| `gen_ai.system` | string | AI system | +| `gh-aw.engine.id` | string | Engine ID | +| `gen_ai.request.model` | string | Requested model name | +| `gh-aw.tracker.id` | string | Tracker identifier | +| `gh-aw.event_name` | string | Event type | +| `gh-aw.staged` | boolean | Staging flag | +| `gh-aw.trigger.*` | string | Trigger context (same fields as setup span) | +| `gh-aw.frontmatter.*` | string | Frontmatter metadata (same fields as setup span) | +| `gh-aw.effective_tokens` | int | Effective token count | +| `gh-aw.turns` | int | Number of agent turns | +| `gh-aw.estimated_cost_usd` | double | Estimated cost | +| `gh-aw.agent.conclusion` | string | Agent job outcome | +| `gh-aw.detection.conclusion` | string | Threat detection outcome | +| `gh-aw.detection.reason` | string | Detection reasoning | +| `gh-aw.otlp.export_error_details` | string | Export failure details | +| `gh-aw.error.count` | int | Output error count | +| `gh-aw.error.messages` | string | Error messages joined by ` \| ` | +| `gh-aw.output.item_types` | string | Comma-separated types of safe output items | +| `gh-aw.github.rate_limit.remaining` | int | API rate limit remaining | +| `gh-aw.github.rate_limit.limit` | int | API rate limit total | +| `gh-aw.github.rate_limit.used` | int | API rate limit used | +| `gh-aw.github.rate_limit.resource` | string | Rate limit resource category | +| `gh-aw.github.rate_limit.reset` | string | ISO 8601 rate limit reset time | +| `gh-aw.outcome.total` | int | Total outcomes | +| `gh-aw.outcome.accepted` | int | Accepted outcomes | +| `gh-aw.outcome.rejected` | int | Rejected outcomes | +| `gh-aw.outcome.pending` | int | Pending outcomes | +| `gh-aw.outcome.ignored` | int | Ignored outcomes | +| `gh-aw.outcome.acceptance_rate` | double | Acceptance rate | +| `gh-aw.outcome.waste_rate` | double | Waste rate | + +### 10.3 Agent Span 
Attributes + +The dedicated agent span (`gh-aw.*.agent`) follows OpenTelemetry [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/). + +**Required attributes** (MUST be present when available from the AI engine): + +| Attribute | Type | Description | +|---|---|---| +| `gen_ai.system` | string | Mapped AI system name | +| `gen_ai.request.model` | string | Requested model | +| `gen_ai.response.model` | string | Resolved runtime model | +| `gen_ai.operation.name` | string | Always `"chat"` | +| `gen_ai.workflow.name` | string | Workflow name | +| `gen_ai.usage.input_tokens` | int | Input tokens consumed | +| `gen_ai.usage.output_tokens` | int | Output tokens generated | +| `gen_ai.usage.total_tokens` | int | Total tokens (input + output, excluding cache) | +| `gen_ai.response.finish_reasons` | string[] | Stop reasons (e.g., `["stop"]`, `["length"]`, `["timeout"]`) | + +**Optional attributes** (MAY be present): + +| Attribute | Type | Description | +|---|---|---| +| `gen_ai.usage.cache_read.input_tokens` | int | Cache read tokens | +| `gen_ai.usage.cache_creation.input_tokens` | int | Cache write tokens | + +--- + +## 11. Resource Attributes + +Resource attributes are applied to all OTLP spans and describe the service and execution environment. 
+ +### 11.1 Required Resource Attributes + +A conforming implementation MUST include these resource attributes on every exported span: + +| Attribute | Type | Description | Example | +|---|---|---|---| +| `service.name` | string | `gh-aw.` or `gh-aw` | `gh-aw.daily-report` | +| `service.version` | string | gh-aw CLI version or commit SHA | `v0.23.4` | +| `github.repository` | string | `owner/repo` | `github/gh-aw` | +| `github.run_id` | string | GitHub Actions run ID | `12345678` | +| `github.run_attempt` | string | Run attempt number | `1` | +| `github.actions.run_url` | string | URL to the run | `https://github.com/owner/repo/actions/runs/123` | + +### 11.2 Conditional Resource Attributes + +These resource attributes MUST be included when the corresponding value is available: + +| Attribute | Type | Description | +|---|---|---| +| `github.event_name` | string | Event type (e.g., `push`, `pull_request`) | +| `github.ref` | string | Git ref (branch/tag) | +| `github.ref_name` | string | Ref name | +| `github.head_ref` | string | Head ref (for PRs) | +| `github.sha` | string | Commit SHA | +| `github.job` | string | Job name | +| `github.workflow_ref` | string | Workflow ref | +| `github.actor_id` | string | Actor ID | +| `runner.os` | string | Runner OS (`Linux`, `Windows`, `macOS`) | +| `runner.arch` | string | Runner architecture (`X64`, `ARM64`) | +| `runner.name` | string | Runner name/label | +| `runner.environment` | string | Runner environment | +| `gh-aw.awf.version` | string | Agentic Workflows Framework version | +| `gh-aw.awmg.version` | string | Agentic Workflows Manager version | +| `deployment.environment` | string | `staging` or `production` | + +### 11.3 Instrumentation Scope + +All gh-aw spans MUST be emitted under an instrumentation scope with: + +| Field | Value | +|---|---| +| `scope.name` | `gh-aw` | +| `scope.version` | The gh-aw CLI version | + +--- + +## 12. 
Trace ID Propagation and Lookup + +### 12.1 Trace ID Format + +The OTLP trace ID is a 32-character lowercase hexadecimal string (16 random bytes). The span ID is a 16-character lowercase hexadecimal string (8 random bytes). + +Do **not** confuse the OTLP trace ID with `workflow_call_id`, which is derived from the GitHub run ID and attempt number. The OTLP trace ID is the value to search for in vendor backends (Sentry, Honeycomb, Datadog, Grafana Tempo, etc.). + +### 12.2 Trace ID Resolution Order + +The setup span MUST resolve the trace ID using the following priority order: + +1. **Explicit option** — `options.traceId` passed to the setup function (used for activation job reuse). +2. **Action input** — `INPUT_TRACE_ID` environment variable (from `trace-id` action input, used for cross-job propagation). +3. **Parent context** — `aw_info.context.otel_trace_id` (propagated from parent workflow via `aw_context`). +4. **Generate new** — 32-character random hex string via `randomBytes(16).toString("hex")`. + +The conclusion span MUST resolve the trace ID using the following priority order: + +1. **Job environment** — `GITHUB_AW_OTEL_TRACE_ID` (set by this job's setup step). +2. **Parent context** — `aw_info.context.otel_trace_id` (inherited from parent). +3. **Legacy fallback** — `aw_info.context.workflow_call_id` (converted to hex). +4. **Generate new** — 32-character random hex string. + +### 12.3 Trace ID Storage + +After generating or resolving a trace ID, the setup step MUST: + +1. **Write to `$GITHUB_OUTPUT`** so downstream jobs can access: + - `trace-id` — 32-char hex trace ID + - `span-id` — 16-char hex setup span ID + - `parent-span-id` — 16-char hex global parent span ID + +2. 
**Write to `$GITHUB_ENV`** so downstream steps in the same job can access: + - `GITHUB_AW_OTEL_TRACE_ID` — Trace ID + - `GITHUB_AW_OTEL_PARENT_SPAN_ID` — Setup span ID (parent for conclusion span) + - `GITHUB_AW_OTEL_JOB_START_MS` — Epoch milliseconds when setup completed + +### 12.4 Cross-Job Propagation + +The compiler MUST wire setup outputs through the job dependency graph so all jobs in a run share a single trace ID. Downstream jobs receive `needs.<job_id>.outputs.trace-id` and `needs.<job_id>.outputs.parent-span-id` as action inputs. + +### 12.5 Dispatch and Composite Action Propagation + +When a workflow dispatches a child workflow or composite action, parent trace context MUST be passed via `aw_context`: + +- `aw_context.otel_trace_id` → child inherits parent trace ID +- `aw_context.otel_parent_span_id` → child setup span is parented under the parent's setup span + +This context is written to `/tmp/gh-aw/aw_info.json` and propagated through action inputs. + +### 12.6 Trace ID Lookup + +To find a trace in an OTLP backend: + +1. Locate the OTLP trace ID from the GitHub Actions job summary or the `trace-id` output. +2. Search the backend by trace ID (32-char hex string). +3. For local debugging, query the JSONL mirror: + +```bash +jq '.resourceSpans[].scopeSpans[].spans[] | {name, traceId, spanId, status}' /tmp/gh-aw/otel.jsonl +``` + +--- + +## 13. Implementation Mapping This section maps the normative behavior in this specification to the current `gh-aw` implementation. These mappings MUST be kept in sync when behavior changes. 
@@ -280,12 +603,16 @@ This section maps the normative behavior in this specification to the current `g | §6.5 | Trace Context Variables | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/aw_context.cjs` | | §7 | Local Mirrors and Artifacts | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/constants.cjs`, `actions/setup/post.js` | | §8 | Security and Privacy Requirements | `pkg/workflow/observability_otlp.go`, `pkg/workflow/mcp_renderer.go`, `pkg/workflow/mcp_setup_generator.go`, `actions/setup/js/send_otlp_span.cjs` | +| §9 | Trace Model | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs` | +| §10 | Span Attribute Contract | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs` | +| §11 | Resource Attributes | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs` | +| §12 | Trace ID Propagation | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/aw_context.cjs`, `pkg/workflow/compiler_yaml.go` | When behavior changes in any mapped file, this table SHOULD be updated in the same change set. --- -## 10. Compliance Testing +## 14. Compliance Testing A conforming implementation MUST include automated coverage for the following behaviors. @@ -301,12 +628,28 @@ A conforming implementation MUST include automated coverage for the following be | `T-OTEL-OBS-008` | Local mirror persistence | Helper emission writes `/tmp/gh-aw/otel.jsonl` even when OTLP export fails or is absent. | `actions/setup/js/send_otlp_span.test.cjs` | | `T-OTEL-OBS-009` | Trace context propagation | Setup writes valid trace and parent span IDs into runtime environment. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/otlp.test.cjs` | | `T-OTEL-OBS-010` | Artifact inclusion | Observability artifacts include the OTEL JSONL mirror when artifact collection is enabled. 
| `pkg/workflow/compiled_lock_files_test.go` | +| `T-OTEL-OBS-011` | Span naming convention | All emitted span names follow the `gh-aw.<job-name>.<span-type>` pattern. | `actions/setup/js/send_otlp_span.test.cjs` | +| `T-OTEL-OBS-012` | Span hierarchy | Setup spans share a global parent span ID; conclusion spans are parented under the setup span. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/action_conclusion_otlp.test.cjs` | +| `T-OTEL-OBS-013` | Span attribute contract | Setup and conclusion spans contain all required attributes from §10. | `actions/setup/js/action_setup_otlp.test.cjs`, `actions/setup/js/action_conclusion_otlp.test.cjs` | +| `T-OTEL-OBS-014` | Resource attributes | All exported spans include required resource attributes from §11. | `actions/setup/js/send_otlp_span.test.cjs` | +| `T-OTEL-OBS-015` | Trace ID resolution order | Trace ID follows the priority chain: explicit option → action input → parent context → generate new. | `actions/setup/js/action_setup_otlp.test.cjs` | Additional tests SHOULD be added when new helper APIs, new OTLP normalization rules, or new runtime sinks become normative. +### 14.1 Runtime Conformance Workflows + +The following agentic workflows provide runtime conformance validation: + +| Workflow | Purpose | Coverage | +|---|---|---| +| [`smoke-otel-backends.md`](../.github/workflows/smoke-otel-backends.md) | End-to-end OTLP smoke test | Local mirror + Sentry/Grafana/Datadog visibility | +| [`daily-otel-instrumentation-advisor.md`](../.github/workflows/daily-otel-instrumentation-advisor.md) | Daily code review + live data validation | Sentry + Grafana backend data | +| [`daily-grafana-otel-instrumentation-advisor.md`](../.github/workflows/daily-grafana-otel-instrumentation-advisor.md) | Grafana-only variant | Grafana Tempo data | +| [`otlp-data-quality-validator.md`](../.github/workflows/otlp-data-quality-validator.md) | OTLP data quality validation | JSONL + vendor traces + attribute contract | + --- -## 11. 
References +## 15. References ### Normative References @@ -321,9 +664,27 @@ Additional tests SHOULD be added when new helper APIs, new OTLP normalization ru - [specs/aw-harness.md](./aw-harness.md) - [specs/safe-output-outcome-evaluation.md](./safe-output-outcome-evaluation.md) +### Runtime Conformance Workflows + +- [.github/workflows/smoke-otel-backends.md](../.github/workflows/smoke-otel-backends.md) — End-to-end OTLP smoke test +- [.github/workflows/daily-otel-instrumentation-advisor.md](../.github/workflows/daily-otel-instrumentation-advisor.md) — Daily code review + live data validation +- [.github/workflows/daily-grafana-otel-instrumentation-advisor.md](../.github/workflows/daily-grafana-otel-instrumentation-advisor.md) — Grafana-only variant +- [.github/workflows/otlp-data-quality-validator.md](../.github/workflows/otlp-data-quality-validator.md) — OTLP data quality validation + --- -## 12. Change Log +## 16. Change Log + +### Version 0.2.0 (Working Draft) + +- Added §9 Trace Model: span naming, hierarchy, kinds, status, exception events +- Added §10 Span Attribute Contract: required and conditional attributes for setup, conclusion, and agent spans +- Added §11 Resource Attributes: required and conditional resource attributes, instrumentation scope +- Added §12 Trace ID Propagation and Lookup: resolution order, storage, cross-job and dispatch propagation +- Added §14.1 Runtime Conformance Workflows +- Added compliance tests T-OTEL-OBS-011 through T-OTEL-OBS-015 +- Updated implementation mapping table with §9–§12 entries +- Renumbered the former §9–§12 (Implementation Mapping, Compliance Testing, References, Change Log) to §13–§16 ### Version 0.1.0 (Working Draft) From 469f81380e24943687c80946a6ee2cc3d3401f22 Mon Sep 17 00:00:00 2001 From: Mara Nikola Kiefer Date: Thu, 21 May 2026 08:41:05 +0200 Subject: [PATCH 3/7] enhance outcome evaluation with additional attributes and metrics --- actions/setup/js/emit_outcome_spans.cjs | 35 ++++++++++---- actions/setup/js/evaluate_outcomes.cjs | 61 +++++++++++++++++++++++++ 
specs/otel-observability-spec.md | 56 ++++++++++++++++++++++- 3 files changed, 142 insertions(+), 10 deletions(-) diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs index c857223e899..344770bb841 100644 --- a/actions/setup/js/emit_outcome_spans.cjs +++ b/actions/setup/js/emit_outcome_spans.cjs @@ -148,6 +148,11 @@ async function main() { const changedFiles = typeof eval_.changed_files === "number" ? eval_.changed_files : null; const additions = typeof eval_.additions === "number" ? eval_.additions : null; const deletions = typeof eval_.deletions === "number" ? eval_.deletions : null; + const reactionsTotal = typeof eval_.reactions_total === "number" ? eval_.reactions_total : null; + const reactionsPositive = typeof eval_.reactions_positive === "number" ? eval_.reactions_positive : null; + const reactionsNegative = typeof eval_.reactions_negative === "number" ? eval_.reactions_negative : null; + const comments = typeof eval_.comments === "number" ? 
eval_.comments : null; + const zeroTouch = eval_.zero_touch === true; const attributes = [ buildAttr("gh-aw.exporter.name", "outcome-collector"), @@ -168,6 +173,11 @@ async function main() { if (changedFiles !== null) attributes.push(buildAttr("gh-aw.outcome.changed_files", changedFiles)); if (additions !== null) attributes.push(buildAttr("gh-aw.outcome.additions", additions)); if (deletions !== null) attributes.push(buildAttr("gh-aw.outcome.deletions", deletions)); + if (reactionsTotal !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_total", reactionsTotal)); + if (reactionsPositive !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_positive", reactionsPositive)); + if (reactionsNegative !== null) attributes.push(buildAttr("gh-aw.outcome.reactions_negative", reactionsNegative)); + if (comments !== null) attributes.push(buildAttr("gh-aw.outcome.comments", comments)); + if (zeroTouch) attributes.push(buildAttr("gh-aw.outcome.zero_touch", true)); // Map result to OTLP status: accepted=OK, rejected=ERROR, noop=UNSET, pending/ignored=UNSET const statusCode = result === "rejected" ? 2 : result === "accepted" ? 
1 : 0; @@ -205,6 +215,8 @@ async function main() { buildAttr("gh-aw.outcome.acceptance_rate", getSummaryNumber("acceptance_rate", 0)), buildAttr("gh-aw.outcome.waste_rate", getSummaryNumber("waste_rate", 0)), buildAttr("gh-aw.outcome.noop_rate", getSummaryNumber("noop_rate", 0)), + buildAttr("gh-aw.outcome.zero_touch", getSummaryNumber("zero_touch", 0)), + buildAttr("gh-aw.outcome.zero_touch_rate", getSummaryNumber("zero_touch_rate", 0)), buildAttr("gh-aw.outcome.item_count", evaluations.length), ]; @@ -212,15 +224,20 @@ async function main() { summaryAttributes.push(buildAttr("gh-aw.outcome.date", summary.date)); } - // Median time-to-resolution for resolved items - const resolutionTimes = evaluations - .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0) - .map(e => e.resolution_sec) - .sort((a, b) => a - b); - if (resolutionTimes.length > 0) { - const mid = Math.floor(resolutionTimes.length / 2); - const median = resolutionTimes.length % 2 !== 0 ? resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2); - summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", median)); + // Median time-to-resolution: prefer summary value, fall back to local computation + const summaryMedian = summary && typeof summary.median_resolution_sec === "number" ? summary.median_resolution_sec : null; + if (summaryMedian !== null) { + summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", summaryMedian)); + } else { + const resolutionTimes = evaluations + .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0) + .map(e => e.resolution_sec) + .sort((a, b) => a - b); + if (resolutionTimes.length > 0) { + const mid = Math.floor(resolutionTimes.length / 2); + const median = resolutionTimes.length % 2 !== 0 ? 
resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2); + summaryAttributes.push(buildAttr("gh-aw.outcome.median_resolution_sec", median)); + } } // Trigger type distribution diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index eef8a218129..1addf3239fe 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -163,6 +163,11 @@ function secondsBetween(from, to) { * @property {number | null} changed_files * @property {number | null} additions * @property {number | null} deletions + * @property {number | null} reactions_total + * @property {number | null} reactions_positive + * @property {number | null} reactions_negative + * @property {number | null} comments + * @property {boolean} zero_touch */ /** @@ -186,6 +191,11 @@ function evaluateItem(item, defaultRepo) { changed_files: null, additions: null, deletions: null, + reactions_total: null, + reactions_positive: null, + reactions_negative: null, + comments: null, + zero_touch: false, }; if (!url) { @@ -206,6 +216,18 @@ function evaluateItem(item, defaultRepo) { } out.result = "accepted"; out.detail = data.state; + out.comments = typeof data.comments === "number" ? data.comments : null; + + // Reactions on issues + if (data.reactions && typeof data.reactions === "object") { + const r = data.reactions; + const positive = (r["+1"] || 0) + (r.heart || 0) + (r.hooray || 0) + (r.rocket || 0); + const negative = (r["-1"] || 0) + (r.confused || 0); + out.reactions_total = (r.total_count != null) ? r.total_count : positive + negative + (r.laugh || 0) + (r.eyes || 0); + out.reactions_positive = positive; + out.reactions_negative = negative; + } + if (data.state === "closed" && data.created_at && data.closed_at) { out.resolution_sec = secondsBetween(data.created_at, data.closed_at); } @@ -228,6 +250,22 @@ function evaluateItem(item, defaultRepo) { out.changed_files = typeof data.changed_files === "number" ? 
data.changed_files : null; out.additions = typeof data.additions === "number" ? data.additions : null; out.deletions = typeof data.deletions === "number" ? data.deletions : null; + out.comments = typeof data.comments === "number" ? data.comments : null; + + // Reactions + if (data.reactions && typeof data.reactions === "object") { + const r = data.reactions; + const positive = (r["+1"] || 0) + (r.heart || 0) + (r.hooray || 0) + (r.rocket || 0); + const negative = (r["-1"] || 0) + (r.confused || 0); + out.reactions_total = (r.total_count != null) ? r.total_count : positive + negative + (r.laugh || 0) + (r.eyes || 0); + out.reactions_positive = positive; + out.reactions_negative = negative; + } + + // Zero-touch: merged with no human review comments and no issue-level comments + if (data.merged === true && (out.review_comments === 0 || out.review_comments === null) && (out.comments === 0 || out.comments === null)) { + out.zero_touch = true; + } if (data.merged === true) { out.result = "accepted"; @@ -420,6 +458,11 @@ function main() { changed_files: evalResult.changed_files, additions: evalResult.additions, deletions: evalResult.deletions, + reactions_total: evalResult.reactions_total, + reactions_positive: evalResult.reactions_positive, + reactions_negative: evalResult.reactions_negative, + comments: evalResult.comments, + zero_touch: evalResult.zero_touch || false, }) + "\n" ); } @@ -442,6 +485,21 @@ function main() { const wasteRate = total > 0 ? rejected / total : 0; const noopRate = total + noop > 0 ? noop / (total + noop) : 0; + // Economics: zero-touch rate and median time-to-outcome + const allEvals = readJSONL(EVAL_JSONL); + const acceptedEvals = allEvals.filter(e => e.result === "accepted"); + const zeroTouchCount = acceptedEvals.filter(e => e.zero_touch === true).length; + const zeroTouchRate = acceptedEvals.length > 0 ? 
zeroTouchCount / acceptedEvals.length : 0; + const resolutionTimes = allEvals + .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0) + .map(e => e.resolution_sec) + .sort((a, b) => a - b); + let medianResolutionSec = null; + if (resolutionTimes.length > 0) { + const mid = Math.floor(resolutionTimes.length / 2); + medianResolutionSec = resolutionTimes.length % 2 !== 0 ? resolutionTimes[mid] : Math.round((resolutionTimes[mid - 1] + resolutionTimes[mid]) / 2); + } + writeJSONAtomic(SUMMARY_PATH, { runs_checked: checked, total_outcomes: total, @@ -453,6 +511,9 @@ function main() { acceptance_rate: Math.round(acceptanceRate * 10000) / 10000, waste_rate: Math.round(wasteRate * 10000) / 10000, noop_rate: Math.round(noopRate * 10000) / 10000, + zero_touch: zeroTouchCount, + zero_touch_rate: Math.round(zeroTouchRate * 10000) / 10000, + median_resolution_sec: medianResolutionSec, date: new Date().toISOString().slice(0, 10), }); diff --git a/specs/otel-observability-spec.md b/specs/otel-observability-spec.md index 236ec0cc920..626efd5ca0b 100644 --- a/specs/otel-observability-spec.md +++ b/specs/otel-observability-spec.md @@ -473,6 +473,58 @@ The dedicated agent span (`gh-aw.*.agent`) follows OpenTelemetry [GenAI semantic | `gen_ai.usage.cache_read.input_tokens` | int | Cache read tokens | | `gen_ai.usage.cache_creation.input_tokens` | int | Cache write tokens | +### 10.4 Outcome Evaluation Span Attributes + +Per-item outcome evaluation spans (`gh-aw.outcome.evaluation`) are emitted by the outcome-collector workflow. Each span represents one safe output item evaluated against the GitHub API. 
+ +| Attribute | Type | Condition | Description | +|---|---|---|---| +| `gh-aw.outcome.type` | string | Required | Safe output type (e.g., `create_pull_request`, `create_issue`) | +| `gh-aw.outcome.result` | string | Required | `accepted`, `rejected`, `pending`, `ignored`, `noop` | +| `gh-aw.outcome.workflow` | string | Required | Source workflow name | +| `gh-aw.outcome.run_id` | int | Required | Source run ID | +| `gh-aw.outcome.repo` | string | Required | Repository | +| `gh-aw.outcome.url` | string | When available | URL to the created object | +| `gh-aw.outcome.detail` | string | When available | Result detail (e.g., `merged`, `closed`, `open`) | +| `gh-aw.outcome.created_at` | string | When available | Item creation timestamp | +| `gh-aw.outcome.event` | string | When available | Triggering event type | +| `gh-aw.outcome.resolution_sec` | int | When resolved | Seconds from creation to resolution | +| `gh-aw.outcome.pending_age_sec` | int | When pending | Seconds since creation | +| `gh-aw.outcome.review_comments` | int | PRs only | Number of review comments | +| `gh-aw.outcome.comments` | int | When available | Number of issue-level comments | +| `gh-aw.outcome.changed_files` | int | PRs only | Files changed | +| `gh-aw.outcome.additions` | int | PRs only | Lines added | +| `gh-aw.outcome.deletions` | int | PRs only | Lines deleted | +| `gh-aw.outcome.reactions_total` | int | When available | Total reaction count | +| `gh-aw.outcome.reactions_positive` | int | When available | Positive reactions (+1, heart, hooray, rocket) | +| `gh-aw.outcome.reactions_negative` | int | When available | Negative reactions (-1, confused) | +| `gh-aw.outcome.zero_touch` | boolean | When true | Accepted with no human review comments or issue comments | + +### 10.5 Outcome Summary Span Attributes + +The fleet summary span (`gh-aw.outcome.summary`) aggregates all evaluated outcomes into a single span with economics metrics. 
+ +| Attribute | Type | Description | +|---|---|---| +| `gh-aw.outcome.runs_checked` | int | Number of runs evaluated | +| `gh-aw.outcome.total` | int | Total actionable outcomes | +| `gh-aw.outcome.accepted` | int | Accepted outcomes | +| `gh-aw.outcome.rejected` | int | Rejected outcomes | +| `gh-aw.outcome.ignored` | int | Ignored outcomes | +| `gh-aw.outcome.pending` | int | Pending outcomes | +| `gh-aw.outcome.noop` | int | Noop outcomes | +| `gh-aw.outcome.acceptance_rate` | double | Accepted / (accepted + rejected) | +| `gh-aw.outcome.waste_rate` | double | Rejected / total | +| `gh-aw.outcome.noop_rate` | double | Noop / (total + noop) | +| `gh-aw.outcome.zero_touch` | int | Count of zero-touch accepted outcomes | +| `gh-aw.outcome.zero_touch_rate` | double | Zero-touch / accepted | +| `gh-aw.outcome.median_resolution_sec` | int | Median seconds from creation to resolution | +| `gh-aw.outcome.item_count` | int | Number of per-item spans emitted | +| `gh-aw.outcome.date` | string | Evaluation date (YYYY-MM-DD) | +| `gh-aw.outcome.events` | string | Comma-separated distinct trigger events | +| `gh-aw.outcome.workflows` | string | Comma-separated distinct workflow names | +| `gh-aw.outcome.types` | string | Comma-separated distinct outcome types | + --- ## 11. 
Resource Attributes @@ -604,7 +656,7 @@ This section maps the normative behavior in this specification to the current `g | §7 | Local Mirrors and Artifacts | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/constants.cjs`, `actions/setup/post.js` | | §8 | Security and Privacy Requirements | `pkg/workflow/observability_otlp.go`, `pkg/workflow/mcp_renderer.go`, `pkg/workflow/mcp_setup_generator.go`, `actions/setup/js/send_otlp_span.cjs` | | §9 | Trace Model | `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs` | -| §10 | Span Attribute Contract | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs` | +| §10 | Span Attribute Contract | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/action_conclusion_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs`, `actions/setup/js/evaluate_outcomes.cjs`, `actions/setup/js/emit_outcome_spans.cjs` | | §11 | Resource Attributes | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/send_otlp_span.cjs` | | §12 | Trace ID Propagation | `actions/setup/js/action_setup_otlp.cjs`, `actions/setup/js/aw_context.cjs`, `pkg/workflow/compiler_yaml.go` | @@ -679,6 +731,8 @@ The following agentic workflows provide runtime conformance validation: - Added §9 Trace Model: span naming, hierarchy, kinds, status, exception events - Added §10 Span Attribute Contract: required and conditional attributes for setup, conclusion, and agent spans +- Added §10.4 Outcome Evaluation Span Attributes: reactions, zero-touch, comments +- Added §10.5 Outcome Summary Span Attributes: zero-touch rate, median resolution, economics metrics - Added §11 Resource Attributes: required and conditional resource attributes, instrumentation scope - Added §12 Trace ID Propagation and Lookup: resolution order, storage, cross-job and dispatch propagation - Added §14.1 Runtime Conformance 
Workflows From ed15447b73bba3d67d2349713c27637b19fb8f26 Mon Sep 17 00:00:00 2001 From: Mara Nikola Kiefer Date: Thu, 21 May 2026 08:41:58 +0200 Subject: [PATCH 4/7] refactor outcome report structure to enhance clarity and actionable insights --- .github/workflows/outcome-collector.md | 73 +++++++++++++++++--------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/.github/workflows/outcome-collector.md b/.github/workflows/outcome-collector.md index 0a69a87a464..c33c757f2e0 100644 --- a/.github/workflows/outcome-collector.md +++ b/.github/workflows/outcome-collector.md @@ -86,44 +86,69 @@ Use h3 (`###`) or lower for all headers in your report. Never use h1 (`#`) or h2 Wrap long sections in `
<details><summary>Section Name</summary>` tags to improve readability and reduce scrolling. Keep critical summaries and key metrics always visible. Suggested structure: -- Brief summary (always visible) -- Key metrics or highlights (always visible) -- Detailed analysis (in `
<details>` tags) -- Recommendations (always visible) +- Scorecard with economics metrics (always visible) +- Actionable recommendations with specific next steps (always visible) +- Per-workflow breakdown (in `
<details>` tags) +- Detailed per-run data (in `
<details>` tags) ```markdown -## Safe Output Outcomes — {date} +### Outcome Scorecard — {date} -### Fleet Summary +| Metric | Value | Status | +|--------|-------|--------| +| **Acceptance rate** | **{acceptance_rate}%** | 🟢 >80% / 🟡 60-80% / 🔴 <60% | +| **Zero-touch rate** | **{zero_touch_rate}%** | 🟢 >50% / 🟡 25-50% / 🔴 <25% | +| **Waste rate** | {waste_rate}% | 🟢 <10% / 🟡 10-25% / 🔴 >25% | +| **Median time to resolution** | {median_resolution} | — | +| Accepted | {accepted} / {total_outcomes} | — | +| Rejected | {rejected} | — | +| Zero-touch | {zero_touch} / {accepted} | — | +| Pending | {pending} | — | +| Runs checked | {runs_checked} | — | -| Metric | Value | -|--------|-------| -| Runs checked | {runs_checked} | -| Total outcomes | {total_outcomes} | -| Accepted | {accepted} | -| Rejected | {rejected} | -| Ignored | {ignored} | -| Pending | {pending} | -| **Acceptance rate** | **{acceptance_rate}%** | -| Waste rate | {waste_rate}% | +### 🔴 Action Items + +List concrete actions the team should take based on the data: + +1. **Highest-waste workflows** — Name the top 2-3 workflows by waste rate. If waste rate >25%, recommend reviewing the prompt or safe-output configuration. +2. **Stuck pending items** — List any items pending >48 hours. These need human review or the workflow needs a timeout. +3. **Low zero-touch workflows** — Workflows where accepted items always need human edits indicate the agent's output quality needs improvement. +4. **Negative reactions** — Items with negative reactions (👎, confused) signal user dissatisfaction even on "accepted" items. 
### Per-Workflow Breakdown -For each workflow with outcomes, show: -- Workflow name -- Outcomes: accepted / rejected / ignored -- Acceptance rate +For each workflow with outcomes, show a mini-scorecard: + +| Workflow | Accepted | Rejected | Pending | Acceptance | Zero-touch | Reactions 👍/👎 | +|----------|----------|----------|---------|------------|------------|----------------| + +Sort by waste rate descending (worst first). + +### Reaction Summary + +If any items have reactions, summarize: +- Items with positive reactions (👍 heart rocket hooray): these workflows are producing valued output +- Items with negative reactions (👎 confused): these need prompt or quality improvements +- Items with zero reactions: no signal yet + +### Trend Signal -### Key Observations +Compare today's acceptance rate and zero-touch rate against the previous report in cache-memory (if available). Flag: +- ⬆️ Improving: acceptance rate up >5pp or zero-touch rate up >10pp +- ⬇️ Regressing: acceptance rate down >5pp or waste rate up >5pp +- ➡️ Stable: within 5pp of previous -- Which workflows have the highest acceptance rate? -- Which workflows have the highest waste rate? -- Any workflows with all outcomes ignored (noise signal)? +If no previous data exists, skip this section. 
``` ## Guidelines - Keep the report factual — numbers only, no speculation - Do not re-evaluate outcomes — use the pre-computed data +- Sort workflows by waste rate descending so the worst performers are at the top +- Flag any workflow with acceptance rate <60% as needing attention +- Flag any item pending >48 hours +- If reactions data is available, include it in the per-workflow breakdown +- Save this report's key metrics to cache-memory for trend comparison in the next run - If no outcomes exist, use `noop` - Stop immediately after creating the issue From cedba0c7e69a2f25c09be0f34d0228cff3f62320 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 May 2026 06:46:14 +0000 Subject: [PATCH 5/7] chore: outline workflow recompilation plan Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- .../otlp-data-quality-validator.lock.yml | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/otlp-data-quality-validator.lock.yml b/.github/workflows/otlp-data-quality-validator.lock.yml index 6d4286d5a00..46b54f639e5 100644 --- a/.github/workflows/otlp-data-quality-validator.lock.yml +++ b/.github/workflows/otlp-data-quality-validator.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"17dcabe392f10a701b05312a2a2a544024a389a44bbf590159964c1892c52074","strict":true,"agent_id":"copilot"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"759e69cd162496de334aa3b7220316b6485908c3c63e4436e2a2963728bf6146","strict":true,"agent_id":"copilot"} # gh-aw-manifest: 
{"version":1,"secrets":["COPILOT_GITHUB_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"de0fac2e4500dabe0009e67214ff5f5447ce83dd","version":"v6.0.2"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.25.49"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.25.49"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.9","digest":"sha256:64828b42a4482f58fab16509d7f8f495a6d97c972a98a68aff20543531ac0388","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.9@sha256:64828b42a4482f58fab16509d7f8f495a6d97c972a98a68aff20543531ac0388"},{"image":"ghcr.io/github/github-mcp-server:v1.0.4"},{"image":"node:lts-alpine","digest":"sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f","pinned_image":"node:lts-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f"}]} # ___ _ _ # / _ \ | | (_) @@ -22,7 +22,7 @@ # # For more information: https://github.github.com/gh-aw/introduction/overview/ # -# Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility +# Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility # # Resolved workflow manifest: # Imports: @@ -202,20 +202,20 @@ jobs: run: | bash 
"${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh" { - cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF' + cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF' - GH_AW_PROMPT_7de7fa5e3739b47b_EOF + GH_AW_PROMPT_bc29e1568146c495_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF' + cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF' Tools: create_issue, missing_tool, missing_data, noop - GH_AW_PROMPT_7de7fa5e3739b47b_EOF + GH_AW_PROMPT_bc29e1568146c495_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/mcp_cli_tools_prompt.md" - cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF' + cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF' The following GitHub context information is available for this workflow: {{#if github.actor}} @@ -244,14 +244,14 @@ jobs: {{/if}} - GH_AW_PROMPT_7de7fa5e3739b47b_EOF + GH_AW_PROMPT_bc29e1568146c495_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/cli_proxy_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_7de7fa5e3739b47b_EOF' + cat << 'GH_AW_PROMPT_bc29e1568146c495_EOF' {{#runtime-import .github/workflows/shared/otlp.md}} {{#runtime-import .github/workflows/shared/otel-queries.md}} {{#runtime-import .github/workflows/otlp-data-quality-validator.md}} - GH_AW_PROMPT_7de7fa5e3739b47b_EOF + GH_AW_PROMPT_bc29e1568146c495_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 @@ -466,9 +466,9 @@ jobs: mkdir -p "${RUNNER_TEMP}/gh-aw/safeoutputs" mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_4c77f8b71cbb283e_EOF' + cat > "${RUNNER_TEMP}/gh-aw/safeoutputs/config.json" << 'GH_AW_SAFE_OUTPUTS_CONFIG_53860d35baa3701f_EOF' 
{"create_issue":{"close_older_issues":true,"expires":168,"labels":["observability","telemetry","report"],"max":1,"title_prefix":"[OTLP Validation] "},"create_report_incomplete_issue":{},"max_bot_mentions":1,"mentions":{"enabled":false},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"report_incomplete":{}} - GH_AW_SAFE_OUTPUTS_CONFIG_4c77f8b71cbb283e_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_53860d35baa3701f_EOF - name: Generate Safe Outputs Tools env: GH_AW_TOOLS_META_JSON: | @@ -673,7 +673,7 @@ jobs: mkdir -p /home/runner/.copilot GH_AW_NODE=$(which node 2>/dev/null || command -v node 2>/dev/null || echo node) - cat << GH_AW_MCP_CONFIG_14247717b4285c48_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" + cat << GH_AW_MCP_CONFIG_2c6df0af9284b001_EOF | "$GH_AW_NODE" "${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.cjs" { "mcpServers": { "safeoutputs": { @@ -703,7 +703,7 @@ jobs: } } } - GH_AW_MCP_CONFIG_14247717b4285c48_EOF + GH_AW_MCP_CONFIG_2c6df0af9284b001_EOF - name: Mount MCP servers as CLIs id: mount-mcp-clis continue-on-error: true @@ -1214,7 +1214,7 @@ jobs: uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: WORKFLOW_NAME: "OTLP Data Quality Validator" - WORKFLOW_DESCRIPTION: "Validates OTLP trace, metric, and log data quality across app emission, Collector processing, and backend visibility" + WORKFLOW_DESCRIPTION: "Validates gh-aw OTLP trace data quality across local JSONL mirror, direct vendor export, and backend visibility" HAS_PATCH: ${{ needs.agent.outputs.has_patch }} with: script: | From f913f0d5c11cbf5753c5fc560e731b464536c209 Mon Sep 17 00:00:00 2001 From: Mara Nikola Kiefer <8320933+mnkiefer@users.noreply.github.com> Date: Thu, 21 May 2026 08:54:07 +0200 Subject: [PATCH 6/7] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- actions/setup/js/evaluate_outcomes.cjs | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index 1addf3239fe..6abce70d250 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -263,7 +263,7 @@ function evaluateItem(item, defaultRepo) { } // Zero-touch: merged with no human review comments and no issue-level comments - if (data.merged === true && (out.review_comments === 0 || out.review_comments === null) && (out.comments === 0 || out.comments === null)) { + if (data.merged === true && out.review_comments === 0 && out.comments === 0) { out.zero_touch = true; } From 15ceccdb3aa096c3768aab39044f6e90b667775c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 May 2026 07:01:12 +0000 Subject: [PATCH 7/7] fix: address remaining OTLP outcome review feedback Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- .../workflows/otlp-data-quality-validator.md | 16 ++++----- actions/setup/js/emit_outcome_spans.cjs | 2 +- actions/setup/js/emit_outcome_spans.test.cjs | 33 +++++++++++++++++++ actions/setup/js/evaluate_outcomes.cjs | 19 ++++++----- specs/otel-observability-spec.md | 4 +-- 5 files changed, 55 insertions(+), 19 deletions(-) diff --git a/.github/workflows/otlp-data-quality-validator.md b/.github/workflows/otlp-data-quality-validator.md index 5f2baf979cf..aba848a7090 100644 --- a/.github/workflows/otlp-data-quality-validator.md +++ b/.github/workflows/otlp-data-quality-validator.md @@ -80,23 +80,23 @@ Infer expectations from: ### Step 2: Validate trace completeness and integrity From the local JSONL mirror (`/tmp/gh-aw/otel.jsonl`), compute and report: -- unique `trace_id` count (expect 1 per workflow run) -- unique span identity count using `trace_id + span_id` -- duplicate spans with same `trace_id + span_id` +- unique `traceId` count (expect 1 per workflow run) +- unique span identity count using `traceId + 
spanId` +- duplicate spans with same `traceId + spanId` Validate the expected span hierarchy per the spec (ยง9.3): -- all setup spans share a single global `parent_span_id` +- all setup spans share a single global `parentSpanId` - each conclusion span parents under its job's setup span - agent spans parent under the conclusion span - root setup parent has no parent Validate required fields per span: -- `trace_id` (32-char hex) -- `span_id` (16-char hex) +- `traceId` (32-char hex) +- `spanId` (16-char hex) - `name` (must match pattern `gh-aw..`) - `kind` (INTERNAL=1 for setup/conclusion, CLIENT=3 for agent) -- `start_time_unix_nano` -- `end_time_unix_nano` +- `startTimeUnixNano` +- `endTimeUnixNano` Flag timestamp issues: - `start_time > end_time` diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs index 344770bb841..2d7eccaf95e 100644 --- a/actions/setup/js/emit_outcome_spans.cjs +++ b/actions/setup/js/emit_outcome_spans.cjs @@ -215,7 +215,7 @@ async function main() { buildAttr("gh-aw.outcome.acceptance_rate", getSummaryNumber("acceptance_rate", 0)), buildAttr("gh-aw.outcome.waste_rate", getSummaryNumber("waste_rate", 0)), buildAttr("gh-aw.outcome.noop_rate", getSummaryNumber("noop_rate", 0)), - buildAttr("gh-aw.outcome.zero_touch", getSummaryNumber("zero_touch", 0)), + buildAttr("gh-aw.outcome.zero_touch_count", getSummaryNumber("zero_touch", 0)), buildAttr("gh-aw.outcome.zero_touch_rate", getSummaryNumber("zero_touch_rate", 0)), buildAttr("gh-aw.outcome.item_count", evaluations.length), ]; diff --git a/actions/setup/js/emit_outcome_spans.test.cjs b/actions/setup/js/emit_outcome_spans.test.cjs index a590f91f948..f13bb36f64e 100644 --- a/actions/setup/js/emit_outcome_spans.test.cjs +++ b/actions/setup/js/emit_outcome_spans.test.cjs @@ -182,6 +182,11 @@ describe("emit_outcome_spans.cjs", () => { rejected: 1, ignored: 0, pending: 0, + noop: 0, + noop_rate: 0, + zero_touch: 1, + zero_touch_rate: 1, + median_resolution_sec: 
42, acceptance_rate: 0.5, waste_rate: 0.5, date: "2026-05-13", @@ -198,6 +203,15 @@ describe("emit_outcome_spans.cjs", () => { url: "https://github.com/github/gh-aw/issues/1", repo: "github/gh-aw", timestamp: "2026-05-13T09:00:00Z", + review_comments: 0, + changed_files: 3, + additions: 10, + deletions: 2, + reactions_total: 5, + reactions_positive: 4, + reactions_negative: 1, + comments: 0, + zero_touch: true, }), JSON.stringify({ type: "comment", @@ -263,10 +277,29 @@ describe("emit_outcome_spans.cjs", () => { expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" }); expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.date", value: "2026-05-13" }); + expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.zero_touch_count", value: 1 }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.url", value: "https://github.com/github/gh-aw/issues/1" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.detail", value: "created item" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.created_at", value: "2026-05-13T09:00:00Z" }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.review_comments", value: 0 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.changed_files", value: 3 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.additions", value: 10 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.deletions", value: 2 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_total", value: 5 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_positive", value: 4 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.reactions_negative", value: 1 }); + expect(spans[1].attributes).toContainEqual({ key: 
"gh-aw.outcome.comments", value: 0 }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.zero_touch", value: true }); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.review_comments")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.changed_files")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.additions")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.deletions")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_total")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_positive")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_negative")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.comments")).toBeUndefined(); + expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.zero_touch")).toBeUndefined(); expect(mockAppendToOTLPJSONL).toHaveBeenCalledOnce(); expect(mockSendOTLPToAllEndpoints).not.toHaveBeenCalled(); diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index 6abce70d250..349d34b9e0e 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -353,6 +353,9 @@ function main() { let pending = 0; let total = 0; let noop = 0; + let zeroTouchCount = 0; + /** @type {number[]} */ + const resolutionTimes = []; // Clear the evaluations file fs.writeFileSync(EVAL_JSONL, ""); @@ -431,6 +434,9 @@ function main() { switch (evalResult.result) { case "accepted": accepted++; + if (evalResult.zero_touch === true) { + zeroTouchCount++; + } break; case "rejected": rejected++; @@ -439,6 +445,9 @@ function main() { pending++; break; } + if (typeof evalResult.resolution_sec === "number" && evalResult.resolution_sec > 0) { + 
resolutionTimes.push(evalResult.resolution_sec); + } fs.appendFileSync( EVAL_JSONL, @@ -486,14 +495,8 @@ function main() { const noopRate = total + noop > 0 ? noop / (total + noop) : 0; // Economics: zero-touch rate and median time-to-outcome - const allEvals = readJSONL(EVAL_JSONL); - const acceptedEvals = allEvals.filter(e => e.result === "accepted"); - const zeroTouchCount = acceptedEvals.filter(e => e.zero_touch === true).length; - const zeroTouchRate = acceptedEvals.length > 0 ? zeroTouchCount / acceptedEvals.length : 0; - const resolutionTimes = allEvals - .filter(e => typeof e.resolution_sec === "number" && e.resolution_sec > 0) - .map(e => e.resolution_sec) - .sort((a, b) => a - b); + const zeroTouchRate = accepted > 0 ? zeroTouchCount / accepted : 0; + resolutionTimes.sort((a, b) => a - b); let medianResolutionSec = null; if (resolutionTimes.length > 0) { const mid = Math.floor(resolutionTimes.length / 2); diff --git a/specs/otel-observability-spec.md b/specs/otel-observability-spec.md index 626efd5ca0b..d0a2989f31b 100644 --- a/specs/otel-observability-spec.md +++ b/specs/otel-observability-spec.md @@ -516,7 +516,7 @@ The fleet summary span (`gh-aw.outcome.summary`) aggregates all evaluated outcom | `gh-aw.outcome.acceptance_rate` | double | Accepted / (accepted + rejected) | | `gh-aw.outcome.waste_rate` | double | Rejected / total | | `gh-aw.outcome.noop_rate` | double | Noop / (total + noop) | -| `gh-aw.outcome.zero_touch` | int | Count of zero-touch accepted outcomes | +| `gh-aw.outcome.zero_touch_count` | int | Count of zero-touch accepted outcomes | | `gh-aw.outcome.zero_touch_rate` | double | Zero-touch / accepted | | `gh-aw.outcome.median_resolution_sec` | int | Median seconds from creation to resolution | | `gh-aw.outcome.item_count` | int | Number of per-item spans emitted | @@ -744,4 +744,4 @@ The following agentic workflows provide runtime conformance validation: - Initial repository-level OTel observability specification - Defined the 
normative `observability.otlp` contract for compiler and runtime behavior -- Added gateway-integration, local-mirror, implementation-mapping, and conformance-test sections \ No newline at end of file +- Added gateway-integration, local-mirror, implementation-mapping, and conformance-test sections