Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions actions/setup/js/send_otlp_span.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -1173,7 +1173,7 @@ async function sendJobSetupSpan(options = {}) {
startMs,
endMs,
serviceName,
scopeVersion: process.env.GH_AW_INFO_VERSION || process.env.GH_AW_INFO_CLI_VERSION || "unknown",
scopeVersion: process.env.GH_AW_INFO_VERSION || process.env.GH_AW_INFO_CLI_VERSION || process.env.GITHUB_SHA || "unknown",
attributes,
resourceAttributes,
});
Expand Down Expand Up @@ -1627,7 +1627,7 @@ async function sendJobConclusionSpan(spanName, options = {}) {
const effectiveTokens = rawET ? parseInt(rawET, 10) : NaN;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] The GITHUB_SHA fallback correctly provides a commit anchor for Sentry. One observability enhancement to consider:

Current behavior: GITHUB_SHA is a 40-character full commit hash. Sentry release identifiers are typically shorter for readability in dashboards.

Suggestion: Consider truncating to the first 7-8 characters when using GITHUB_SHA as a fallback:

const gitShaFallback = process.env.GITHUB_SHA?.substring(0, 8);
const version = 
  awInfo.agent_version || awInfo.version || 
  process.env.GH_AW_INFO_VERSION || 
  awInfo.cli_version || 
  process.env.GH_AW_INFO_CLI_VERSION || 
  gitShaFallback || 
  "unknown";

This would make Sentry releases more readable while still providing a unique commit anchor. However, full SHAs are also valid — document the decision either way.


const serviceName = process.env.OTEL_SERVICE_NAME || "gh-aw";
const version = awInfo.agent_version || awInfo.version || process.env.GH_AW_INFO_VERSION || awInfo.cli_version || process.env.GH_AW_INFO_CLI_VERSION || "unknown";
const version = awInfo.agent_version || awInfo.version || process.env.GH_AW_INFO_VERSION || awInfo.cli_version || process.env.GH_AW_INFO_CLI_VERSION || process.env.GITHUB_SHA || "unknown";

// Prefer GITHUB_AW_OTEL_TRACE_ID (written to GITHUB_ENV by this job's setup step) so
// all spans in the same job share one trace. Fall back to aw_context.otel_trace_id
Expand Down Expand Up @@ -1791,11 +1791,12 @@ async function sendJobConclusionSpan(spanName, options = {}) {
if (workflowName) attributes.push(buildAttr("gen_ai.workflow.name", workflowName));
if (runtimeMetrics.resolvedModel) attributes.push(buildAttr("gen_ai.response.model", runtimeMetrics.resolvedModel));
// Use the engine-reported stop_reason when available; fall back to "timeout" when
// the agent was killed before it could write a result entry to agent-stdio.log.
const effectiveStopReason = runtimeMetrics.stopReason || (isAgentTimedOut ? "timeout" : undefined);
if (effectiveStopReason) {
attributes.push(buildArrayAttr("gen_ai.response.finish_reasons", [effectiveStopReason]));
}
// the agent was killed before it could write a result entry to agent-stdio.log;
// use "unknown" as a sentinel for engines (e.g. copilot, codex) that do not emit
// a result entry at all, so that gen_ai.response.finish_reasons is always present
// and length-truncation is always queryable in Sentry/dashboards.
const effectiveStopReason = runtimeMetrics.stopReason || (isAgentTimedOut ? "timeout" : "unknown");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] The fix correctly addresses the root cause by making gen_ai.response.finish_reasons always present. However, the sentinel value "unknown" is now conflated with two distinct scenarios:

  1. Engines that never emit a result entry (Copilot, Codex)
  2. Edge cases where the span is emitted before the agent writes a result entry

Consider: Would "no_result_entry" be a more specific sentinel than "unknown"? This would make dashboards more actionable — "unknown" could mean many things, while "no_result_entry" explicitly identifies the Copilot/Codex engine limitation.

Alternative: Document in a comment that "unknown" specifically means "engine did not emit result entry", distinguishing it from other potential unknown states.

attributes.push(buildArrayAttr("gen_ai.response.finish_reasons", [effectiveStopReason]));
}

if (agentConclusion) {
Expand Down
49 changes: 44 additions & 5 deletions actions/setup/js/send_otlp_span.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -1822,6 +1822,22 @@ describe("sendJobSetupSpan", () => {
expect(body.resourceSpans[0].scopeSpans[0].scope.version).toBe("v2.5.0");
});

it("falls back to GITHUB_SHA for service.version when no gh-aw version is available", async () => {
const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
vi.stubGlobal("fetch", mockFetch);

process.env.GH_AW_OTLP_ENDPOINTS = JSON.stringify([{ url: "https://traces.example.com" }]);
// No GH_AW_INFO_VERSION or GH_AW_INFO_CLI_VERSION — only SHA available
process.env.GITHUB_SHA = "aabbccdd1122334455667788aabbccdd11223344";

await sendJobSetupSpan();

const body = JSON.parse(mockFetch.mock.calls[0][1].body);
const resourceAttrs = body.resourceSpans[0].resource.attributes;
expect(resourceAttrs).toContainEqual({ key: "service.version", value: { stringValue: "aabbccdd1122334455667788aabbccdd11223344" } });
expect(body.resourceSpans[0].scopeSpans[0].scope.version).toBe("aabbccdd1122334455667788aabbccdd11223344");
});

it("includes gh-aw.awf.version and gh-aw.awmg.version resource attributes from aw_info.json", async () => {
const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
vi.stubGlobal("fetch", mockFetch);
Expand Down Expand Up @@ -3040,7 +3056,7 @@ describe("sendJobConclusionSpan", () => {
expect(conclusionAttrs["gen_ai.response.model"]).toBe("claude-sonnet-4-20250514");
});

it("omits gen_ai.response.finish_reasons from the agent span when stop_reason is absent in agent-stdio.log", async () => {
it("emits gen_ai.response.finish_reasons=[unknown] on the agent span when stop_reason is absent from agent-stdio.log", async () => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] Consider adding a test that explicitly validates the documented priority order for finish_reasons: stopReason"timeout""unknown". Currently, each case is tested in isolation, but the fallback chain isn't verified in a single test.

Example:

it("applies finish_reasons priority: stopReason → timeout → unknown", async () => {
  const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", mockFetch);

  process.env.GH_AW_OTLP_ENDPOINTS = JSON.stringify([{ url: "(traces.example.com/redacted)" }]);

  // Case 1: stop_reason present (highest priority)
  const stdio1 = JSON.stringify({ type: "result", result: { stop_reason: "end_turn" } }) + "\n";
  fs.writeFileSync("/tmp/gh-aw/test/agent-stdio.log", stdio1, "utf-8");
  await sendJobConclusionSpan("gh-aw.job.conclusion");
  let finishAttr = JSON.parse(mockFetch.mock.calls[0][1].body)
    .resourceSpans[0].scopeSpans[0].spans[0].attributes
    .find(a => a.key === "gen_ai.response.finish_reasons");
  expect(finishAttr.value.arrayValue.values).toEqual([{ stringValue: "end_turn" }]);

  // Case 2: timeout (no stop_reason, agent timed out)
  fs.writeFileSync("/tmp/gh-aw/test/agent-stdio.log", "", "utf-8");
  process.env.GITHUB_AW_AGENT_TIMEOUT_TRIGGERED = "true";
  await sendJobConclusionSpan("gh-aw.job.conclusion");
  finishAttr = JSON.parse(mockFetch.mock.calls[1][1].body)
    .resourceSpans[0].scopeSpans[0].spans[0].attributes
    .find(a => a.key === "gen_ai.response.finish_reasons");
  expect(finishAttr.value.arrayValue.values).toEqual([{ stringValue: "timeout" }]);

  // Case 3: unknown (no stop_reason, not timed out)
  delete process.env.GITHUB_AW_AGENT_TIMEOUT_TRIGGERED;
  await sendJobConclusionSpan("gh-aw.job.conclusion");
  finishAttr = JSON.parse(mockFetch.mock.calls[2][1].body)
    .resourceSpans[0].scopeSpans[0].spans[0].attributes
    .find(a => a.key === "gen_ai.response.finish_reasons");
  expect(finishAttr.value.arrayValue.values).toEqual([{ stringValue: "unknown" }]);
});

This would serve as executable documentation of the fallback chain and catch regressions if the priority order ever changes unintentionally.

const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
vi.stubGlobal("fetch", mockFetch);

Expand All @@ -3065,13 +3081,15 @@ describe("sendJobConclusionSpan", () => {
const agentBody = JSON.parse(mockFetch.mock.calls[0][1].body);
const agentSpan = agentBody.resourceSpans[0].scopeSpans[0].spans[0];
expect(agentSpan.name).toBe("gh-aw.agent.agent");
const keys = agentSpan.attributes.map(a => a.key);
expect(keys).not.toContain("gen_ai.response.finish_reasons");
const agentFinishAttr = agentSpan.attributes.find(a => a.key === "gen_ai.response.finish_reasons");
expect(agentFinishAttr).toBeDefined();
expect(agentFinishAttr.value.arrayValue.values).toEqual([{ stringValue: "unknown" }]);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The test verifies that gen_ai.response.finish_reasons is present with value ["unknown"], but it does not verify the conclusion span behavior when stop_reason is absent.

The PR description mentions this affected "5/7 of the failed runs" — the test should validate both the agent span and the conclusion span independently to ensure they both emit the attribute. Currently, lines 3084-3087 test the conclusion span, but consider adding a comment clarifying which span is under test.

Suggestion: Add an explicit assertion comment:

// Agent span should always emit finish_reasons, even when stop_reason is absent
const agentFinishAttr = agentSpan.attributes.find(a => a.key === "gen_ai.response.finish_reasons");

// Conclusion span should mirror the same behavior
const conclusionFinishAttr = conclusionSpan.attributes.find(a => a.key === "gen_ai.response.finish_reasons");

const conclusionBody = JSON.parse(mockFetch.mock.calls[1][1].body);
const conclusionSpan = conclusionBody.resourceSpans[0].scopeSpans[0].spans[0];
const conclusionKeys = conclusionSpan.attributes.map(a => a.key);
expect(conclusionKeys).not.toContain("gen_ai.response.finish_reasons");
const conclusionFinishAttr = conclusionSpan.attributes.find(a => a.key === "gen_ai.response.finish_reasons");
expect(conclusionFinishAttr).toBeDefined();
expect(conclusionFinishAttr.value.arrayValue.values).toEqual([{ stringValue: "unknown" }]);
});

it("includes gen_ai.response.finish_reasons with max_tokens on the agent span when truncated", async () => {
Expand Down Expand Up @@ -3934,6 +3952,27 @@ describe("sendJobConclusionSpan", () => {
expect(resourceAttrs).toContainEqual({ key: "service.version", value: { stringValue: "v3.0.0" } });
});

it("falls back to GITHUB_SHA for service.version on conclusion span when no gh-aw version is available", async () => {
const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
vi.stubGlobal("fetch", mockFetch);

process.env.GH_AW_OTLP_ENDPOINTS = JSON.stringify([{ url: "https://traces.example.com" }]);
// No GH_AW_INFO_VERSION, GH_AW_INFO_CLI_VERSION, or aw_info.json version fields
process.env.GITHUB_SHA = "deadbeef1234567890abcdef1234567890abcdef";

const readFileSpy = vi.spyOn(fs, "readFileSync").mockImplementation(() => {
throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
});

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] This test validates the GITHUB_SHA fallback by mocking readFileSync to simulate missing aw_info.json. Excellent regression coverage!

Edge case to consider: What happens when GITHUB_SHA itself is undefined or empty? This could occur in local development or non-GitHub-hosted runners.

Suggestion: Add a test case:

it("falls back to 'unknown' for service.version when GITHUB_SHA is also unavailable", async () => {
  const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200, statusText: "OK" });
  vi.stubGlobal("fetch", mockFetch);

  process.env.GH_AW_OTLP_ENDPOINTS = JSON.stringify([{ url: "(traces.example.com/redacted)" }]);
  delete process.env.GITHUB_SHA;
  delete process.env.GH_AW_INFO_VERSION;
  delete process.env.GH_AW_INFO_CLI_VERSION;
  
  const readFileSpy = vi.spyOn(fs, "readFileSync").mockImplementation(() => {
    throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
  });

  await sendJobConclusionSpan("gh-aw.job.conclusion");
  readFileSpy.mockRestore();

  const body = JSON.parse(mockFetch.mock.calls[0][1].body);
  const resourceAttrs = body.resourceSpans[0].resource.attributes;
  expect(resourceAttrs).toContainEqual({ key: "service.version", value: { stringValue: "unknown" } });
});

This ensures the fallback chain is fully tested.

await sendJobConclusionSpan("gh-aw.job.conclusion");
readFileSpy.mockRestore();

const body = JSON.parse(mockFetch.mock.calls[0][1].body);
const resourceAttrs = body.resourceSpans[0].resource.attributes;
expect(resourceAttrs).toContainEqual({ key: "service.version", value: { stringValue: "deadbeef1234567890abcdef1234567890abcdef" } });
expect(body.resourceSpans[0].scopeSpans[0].scope.version).toBe("deadbeef1234567890abcdef1234567890abcdef");
});

describe("agent_output.json error enrichment", () => {
let readFileSpy;

Expand Down
Loading