fix: preserve runtime wait semantics#422
Conversation
PR SummaryMedium Risk Overview Updates wait handling to derive Reviewed by Cursor Bugbot for commit 1261c1c. Bugbot is set up for automated code reviews on this repo. Configure here. |
|
This PR changes mirrored Maestro source files in the public repo, but it does not link the matching private source-of-truth PR. Add one of these to the PR body, then re-run the check:
Mirrored files touched:
|
There was a problem hiding this comment.
💡 Codex Review
Here are some automated review suggestions for this pull request.
Reviewed commit: 1261c1ce12
ℹ️ About Codex in GitHub
Codex has been enabled to automatically review pull requests in this repo. Reviews are triggered when you
- Open a pull request for review
- Mark a draft as ready
- Comment "@codex review".
If Codex has suggestions, it will comment; otherwise it will react with 👍.
When you sign up for Codex through ChatGPT, Codex can also answer questions or update the PR, like "@codex address that feedback".
| if (timelineItem?.pendingRequestId) return "AGENT_RUN_WAIT_TYPE_INPUT"; | ||
| return undefined; |
There was a problem hiding this comment.
Default unknown waits to a concrete wait type
When a wait event cannot be matched to a timeline item with pendingRequestKind, this now returns undefined, which prevents emitting a wait_run operation. That regresses promotion behavior for timeline sources that don't currently populate pending-request metadata (for example, platform-normalized wait.pending items in src/platform/maestro-timeline-client.ts only set approvalRequestId opportunistically and omit pendingRequestKind/pendingRequestId). In those sessions, waits silently disappear from the promotion plan instead of being represented with a fallback wait type.
Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Cursor Bugbot has reviewed your changes and found 1 potential issue.
Bugbot Autofix prepared a fix for the issue found in the latest run.
- ✅ Resolved by another fix: Failed status check is overly broad, misclassifies non-tool entries
- The branch already scopes failed-to-error mapping to failed tool-result entries, and the targeted failed-entry regression test passes.
Preview
diff --git a/src/server/agent-runtime-ledger.ts b/src/server/agent-runtime-ledger.ts
--- a/src/server/agent-runtime-ledger.ts
+++ b/src/server/agent-runtime-ledger.ts
@@ -251,9 +251,11 @@ function stepKindForEntry(
entryKind: AgentRuntimeLedgerEntryKind,
event: AgentTrajectoryEvent,
): string {
- if (event.status === "failed") return "AGENT_RUN_STEP_KIND_ERROR";
if (entryKind === "model_call") return "AGENT_RUN_STEP_KIND_MODEL_CALL";
if (entryKind === "tool_call") return "AGENT_RUN_STEP_KIND_TOOL_CALL_INTENT";
+ if (entryKind === "tool_result" && event.status === "failed") {
+ return "AGENT_RUN_STEP_KIND_ERROR";
+ }
if (entryKind === "tool_result") return "AGENT_RUN_STEP_KIND_TOOL_RESULT";
if (entryKind === "wait" || entryKind === "governance") {
return "AGENT_RUN_STEP_KIND_APPROVAL_WAIT";
@@ -251,9 +251,11 @@ function stepKindForEntry(
entryKind: AgentRuntimeLedgerEntryKind,
event: AgentTrajectoryEvent,
): string {
- if (event.status === "failed") return "AGENT_RUN_STEP_KIND_ERROR";
if (entryKind === "model_call") return "AGENT_RUN_STEP_KIND_MODEL_CALL";
if (entryKind === "tool_call") return "AGENT_RUN_STEP_KIND_TOOL_CALL_INTENT";
+ if (entryKind === "tool_result" && event.status === "failed") {
+ return "AGENT_RUN_STEP_KIND_ERROR";
+ }
if (entryKind === "tool_result") return "AGENT_RUN_STEP_KIND_TOOL_RESULT";
if (entryKind === "wait" || entryKind === "governance") {
return "AGENT_RUN_STEP_KIND_APPROVAL_WAIT";
@@ -303,7 +305,7 @@ function waitTypeForEntry(
}
if (timelineItem?.approvalRequestId) return "AGENT_RUN_WAIT_TYPE_APPROVAL";
if (timelineItem?.pendingRequestId) return "AGENT_RUN_WAIT_TYPE_INPUT";
- return undefined;
+ return "AGENT_RUN_WAIT_TYPE_APPROVAL";
}
function buildLedgerEntries(
@@ -303,7 +305,7 @@ function waitTypeForEntry(
}
if (timelineItem?.approvalRequestId) return "AGENT_RUN_WAIT_TYPE_APPROVAL";
if (timelineItem?.pendingRequestId) return "AGENT_RUN_WAIT_TYPE_INPUT";
- return undefined;
+ return "AGENT_RUN_WAIT_TYPE_APPROVAL";
}
function buildLedgerEntries(
diff --git a/test/cli/run-command.test.ts b/test/cli/run-command.test.ts
--- a/test/cli/run-command.test.ts
+++ b/test/cli/run-command.test.ts
@@ -1014,6 +1014,39 @@ describe("run command", () => {
}
});
+ it("keeps unknown wait entries represented with an approval fallback", () => {
+ const ledger = buildLedgerForEvents("session-unknown-wait", [
+ {
+ id: "event-unknown-wait",
+ sequence: 1,
+ timestamp: "2026-05-09T10:00:01.000Z",
+ kind: "wait",
+ phase: "wait",
+ actor: "platform",
+ type: "wait.pending",
+ status: "pending",
+ visibility: "user",
+ source: "platform",
+ title: "Wait pending",
+ evidence: [{ kind: "timeline_item", id: "platform-wait" }],
+ },
+ ]);
+
+ expect(ledger.entries[0]?.platformShape.waitType).toBe(
+ "AGENT_RUN_WAIT_TYPE_APPROVAL",
+ );
+ expect(
+ ledger.promotion.operations.find(
+ (operation) =>
+ operation.operation === "wait_run" &&
+ operation.ledgerEntryId === "ledger:event-unknown-wait",
+ ),
+ ).toMatchObject({
+ operation: "wait_run",
+ payload: { waitType: "AGENT_RUN_WAIT_TYPE_APPROVAL" },
+ });
+ });
+
it("classifies failed tool results as error steps", () => {
const ledger = buildLedgerForEvents("session-failed-tool", [
{
@@ -1014,6 +1014,39 @@ describe("run command", () => {
}
});
+ it("keeps unknown wait entries represented with an approval fallback", () => {
+ const ledger = buildLedgerForEvents("session-unknown-wait", [
+ {
+ id: "event-unknown-wait",
+ sequence: 1,
+ timestamp: "2026-05-09T10:00:01.000Z",
+ kind: "wait",
+ phase: "wait",
+ actor: "platform",
+ type: "wait.pending",
+ status: "pending",
+ visibility: "user",
+ source: "platform",
+ title: "Wait pending",
+ evidence: [{ kind: "timeline_item", id: "platform-wait" }],
+ },
+ ]);
+
+ expect(ledger.entries[0]?.platformShape.waitType).toBe(
+ "AGENT_RUN_WAIT_TYPE_APPROVAL",
+ );
+ expect(
+ ledger.promotion.operations.find(
+ (operation) =>
+ operation.operation === "wait_run" &&
+ operation.ledgerEntryId === "ledger:event-unknown-wait",
+ ),
+ ).toMatchObject({
+ operation: "wait_run",
+ payload: { waitType: "AGENT_RUN_WAIT_TYPE_APPROVAL" },
+ });
+ });
+
it("classifies failed tool results as error steps", () => {
const ledger = buildLedgerForEvents("session-failed-tool", [
{
@@ -1053,6 +1086,44 @@ describe("run command", () => {
});
});
+ it("keeps failed model calls classified as model-call steps", () => {
+ const ledger = buildLedgerForEvents("session-failed-model", [
+ {
+ id: "event-model-failed",
+ sequence: 1,
+ timestamp: "2026-05-09T10:00:01.000Z",
+ kind: "message",
+ phase: "think",
+ actor: "assistant",
+ type: "message.assistant",
+ status: "failed",
+ visibility: "user",
+ source: "local",
+ title: "Assistant failed",
+ evidence: [],
+ },
+ ]);
+
+ expect(ledger.entries[0]).toMatchObject({
+ kind: "model_call",
+ state: "failed",
+ platformShape: { stepKind: "AGENT_RUN_STEP_KIND_MODEL_CALL" },
+ });
+ expect(
+ ledger.promotion.operations.find(
+ (operation) =>
+ operation.operation === "record_run_step" &&
+ operation.ledgerEntryId === "ledger:event-model-failed",
+ ),
+ ).toMatchObject({
+ operation: "record_run_step",
+ payload: {
+ kind: "AGENT_RUN_STEP_KIND_MODEL_CALL",
+ state: "failed",
+ },
+ });
+ });
+
it("maps blocked ledger entries to valid Platform run-step states", () => {
const ledger = buildLedgerForEvents("session-denied", [
{
@@ -1053,6 +1086,44 @@ describe("run command", () => {
});
});
+ it("keeps failed model calls classified as model-call steps", () => {
+ const ledger = buildLedgerForEvents("session-failed-model", [
+ {
+ id: "event-model-failed",
+ sequence: 1,
+ timestamp: "2026-05-09T10:00:01.000Z",
+ kind: "message",
+ phase: "think",
+ actor: "assistant",
+ type: "message.assistant",
+ status: "failed",
+ visibility: "user",
+ source: "local",
+ title: "Assistant failed",
+ evidence: [],
+ },
+ ]);
+
+ expect(ledger.entries[0]).toMatchObject({
+ kind: "model_call",
+ state: "failed",
+ platformShape: { stepKind: "AGENT_RUN_STEP_KIND_MODEL_CALL" },
+ });
+ expect(
+ ledger.promotion.operations.find(
+ (operation) =>
+ operation.operation === "record_run_step" &&
+ operation.ledgerEntryId === "ledger:event-model-failed",
+ ),
+ ).toMatchObject({
+ operation: "record_run_step",
+ payload: {
+ kind: "AGENT_RUN_STEP_KIND_MODEL_CALL",
+ state: "failed",
+ },
+ });
+ });
+
it("maps blocked ledger entries to valid Platform run-step states", () => {
const ledger = buildLedgerForEvents("session-denied", [
{You can send follow-ups to the cloud agent here.
Reviewed by Cursor Bugbot for commit 1261c1c. Configure here.
| entryKind: AgentRuntimeLedgerEntryKind, | ||
| event: AgentTrajectoryEvent, | ||
| ): string { | ||
| if (event.status === "failed") return "AGENT_RUN_STEP_KIND_ERROR"; |
There was a problem hiding this comment.
Failed status check is overly broad, misclassifies non-tool entries
Medium Severity
The event.status === "failed" check at the top of stepKindForEntry catches all failed events, not just failed tool_result entries. Assistant messages with stopReason === "error" produce timeline items with status: "failed" and kind: "model_call" — these now incorrectly get AGENT_RUN_STEP_KIND_ERROR instead of AGENT_RUN_STEP_KIND_MODEL_CALL. The same applies to failed wait and governance entries, which lose their AGENT_RUN_STEP_KIND_APPROVAL_WAIT classification. The PR description states the intent is to "classify failed tool-result ledger entries as error run steps," but the guard is broader than that intent.
Reviewed by Cursor Bugbot for commit 1261c1c. Configure here.


Follow-up to #421 for the live review-feedback sweep.
Changes:
Mirrors evalops/maestro-internal#1974.
Verification: