From c5c6b697729491fcb92e3ed88d340a3802058f22 Mon Sep 17 00:00:00 2001 From: betegon Date: Tue, 26 May 2026 12:06:06 +0200 Subject: [PATCH 1/2] fix(init): recover run-level stale resume errors --- src/lib/init/wizard-runner.ts | 59 +++++++++++++++++------------ test/lib/init/wizard-runner.test.ts | 39 +++++++++++++++++-- 2 files changed, 70 insertions(+), 28 deletions(-) diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts index 0dc35f0fc..76473c751 100644 --- a/src/lib/init/wizard-runner.ts +++ b/src/lib/init/wizard-runner.ts @@ -437,6 +437,7 @@ async function preamble( const MAX_RESUME_RETRIES = 3; const RETRY_BACKOFF_MS = [2000, 4000, 8000]; +const RUN_STATE_RECOVERY_BACKOFF_MS = [0, 250, 750, 1500]; type ResumeRetryArgs = { run: { @@ -454,11 +455,13 @@ type ResumeRetryArgs = { }; /** - * Detect Mastra's "step not suspended" 500 — means the server already + * Detect Mastra's "not suspended" 500 — means the server already * processed this step (our previous request succeeded but the response was * dropped before we received it). The MastraClientError message embeds the * server body, e.g.: * "HTTP error! status: 500 - {"error":"This workflow step 'X' was not suspended..."}" + * or: + * "HTTP error! status: 500 - {"error":"This workflow run was not suspended"}" */ function isStepAlreadyAdvancedError(err: unknown): boolean { return err instanceof Error && err.message.includes("was not suspended"); @@ -473,31 +476,39 @@ async function tryRecoverCurrentRunState( workflow: ResumeRetryArgs["workflow"], runId: string ): Promise { - try { - const raw = await withTimeout( - workflow.runById(runId, { - fields: ["steps", "activeStepsPath", "result"], - }), - API_TIMEOUT_MS, - "Run state recovery" - ); - // runById returns activeStepsPath (Record) but - // not suspended (string[][]). The main loop reads result.suspended to - // find the active step; without it, stepId falls back to "unknown" and - // extractSuspendPayload iterates all steps — picking the first with any - // suspendPayload, which could be a completed step with stale D1 data. - // Derive suspended from the activeStepsPath keys so the lookup is - // deterministic: those keys are exactly the currently-active step IDs. - const state = raw as Record; - if (!state.suspended && state.activeStepsPath) { - state.suspended = Object.keys( - state.activeStepsPath as Record - ).map((id) => [id]); + for (const delayMs of RUN_STATE_RECOVERY_BACKOFF_MS) { + if (delayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + try { + const raw = await withTimeout( + workflow.runById(runId, { + fields: ["steps", "activeStepsPath", "result"], + }), + API_TIMEOUT_MS, + "Run state recovery" + ); + // runById returns activeStepsPath (Record) but + // not suspended (string[][]). The main loop reads result.suspended to + // find the active step; without it, stepId falls back to "unknown" and + // extractSuspendPayload iterates all steps — picking the first with any + // suspendPayload, which could be a completed step with stale D1 data. + // Derive suspended from the activeStepsPath keys so the lookup is + // deterministic: those keys are exactly the currently-active step IDs. + const state = raw as Record; + if (!state.suspended && state.activeStepsPath) { + state.suspended = Object.keys( + state.activeStepsPath as Record + ).map((id) => [id]); + } + return assertWorkflowResult(state); + } catch { + // Mastra/D1 can briefly return a not-yet-readable or intermediate run + // state immediately after rejecting a stale resume. Poll a few times + // before surfacing the original 500 to the user. } - return assertWorkflowResult(state); - } catch { - return null; } + return null; } // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: retry loop branches across transient errors, stale-step recovery, and backoff diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts index 343521a15..a47d5f408 100644 --- a/test/lib/init/wizard-runner.test.ts +++ b/test/lib/init/wizard-runner.test.ts @@ -976,6 +976,13 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => { ); } + function staleRunError(): Error { + return new Error( + "HTTP error! status: 500 - " + + JSON.stringify({ error: "This workflow run was not suspended" }) + ); + } + test("recovers when server has already advanced to the next step", async () => { mockStartResult = { status: "suspended", @@ -1005,13 +1012,37 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => { expect(resumeCount).toBe(1); }); - test("throws immediately when stale-step error occurs and runById fails", async () => { + test("recovers from run-level not-suspended errors after transient runById failure", async () => { + mockStartResult = { + status: "suspended", + suspended: [["tool-step"]], + steps: { "tool-step": { suspendPayload: toolPayload } }, + }; + runByIdMock + .mockRejectedValueOnce(new Error("D1 snapshot not ready")) + .mockResolvedValueOnce({ status: "success" }); + + let resumeCount = 0; + makeStaleStepRun(() => { + resumeCount += 1; + return Promise.reject(staleRunError()); + }); + + await runWizard(makeOptions()); + + expect(formatResultSpy).toHaveBeenCalled(); + expect(runByIdMock).toHaveBeenCalledTimes(2); + expect(resumeCount).toBe(1); + }); + + test("throws when stale-step error occurs and runById keeps failing", async () => { mockStartResult = { status: "suspended", suspended: [["tool-step"]], steps: { "tool-step": { suspendPayload: toolPayload } }, }; - // runById is unreachable — recovery fails, wizard throws without retrying. + // runById is unreachable — recovery fails, wizard throws without retrying + // the stale resume request. mockRunByIdResult = new Error("runById network error"); let resumeCount = 0; @@ -1022,9 +1053,9 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => { await expect(runWizard(makeOptions())).rejects.toThrow(WizardError); - // Threw immediately after recovery failed — no futile retries of the stale step. + // Threw after recovery polling failed — no futile retries of the stale step. expect(resumeCount).toBe(1); - expect(runByIdMock).toHaveBeenCalledTimes(1); + expect(runByIdMock).toHaveBeenCalledTimes(4); }); }); From 5909b0b8fd273d69de9c85017b5cf198b4afa9fb Mon Sep 17 00:00:00 2001 From: betegon Date: Tue, 26 May 2026 12:13:33 +0200 Subject: [PATCH 2/2] fix(init): improve install failure recovery telemetry --- src/lib/init/tools/run-commands.ts | 1 + src/lib/init/wizard-runner.ts | 3 ++- test/lib/run-commands.mocked.test.ts | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lib/init/tools/run-commands.ts b/src/lib/init/tools/run-commands.ts index f869db1b1..9cece52e3 100644 --- a/src/lib/init/tools/run-commands.ts +++ b/src/lib/init/tools/run-commands.ts @@ -54,6 +54,7 @@ export async function runCommands( message: `Command failed: ${command.original}`, data: { exitCode: result.exitCode, + stdout: result.stdout.slice(0, 500), stderr: result.stderr.slice(0, 500), cwd: payload.cwd, }, diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts index 76473c751..6927f2336 100644 --- a/src/lib/init/wizard-runner.ts +++ b/src/lib/init/wizard-runner.ts @@ -438,6 +438,7 @@ async function preamble( const MAX_RESUME_RETRIES = 3; const RETRY_BACKOFF_MS = [2000, 4000, 8000]; const RUN_STATE_RECOVERY_BACKOFF_MS = [0, 250, 750, 1500]; +const RUN_STATE_RECOVERY_TIMEOUT_MS = 10_000; type ResumeRetryArgs = { run: { @@ -485,7 +486,7 @@ async function tryRecoverCurrentRunState( workflow.runById(runId, { fields: ["steps", "activeStepsPath", "result"], }), - API_TIMEOUT_MS, + RUN_STATE_RECOVERY_TIMEOUT_MS, "Run state recovery" ); // runById returns activeStepsPath (Record) but diff --git a/test/lib/run-commands.mocked.test.ts b/test/lib/run-commands.mocked.test.ts index d9ca3f2d4..d119f31ee 100644 --- a/test/lib/run-commands.mocked.test.ts +++ b/test/lib/run-commands.mocked.test.ts @@ -15,7 +15,7 @@ import type { RunCommandsPayload } from "../../src/lib/init/types.js"; type Breadcrumb = { level: string; message: string; - data: { exitCode: number; stderr: string; cwd: string }; + data: { exitCode: number; stdout: string; stderr: string; cwd: string }; }; const { breadcrumbs } = vi.hoisted(() => ({ @@ -60,6 +60,7 @@ describe("runCommands breadcrumb on failure", () => { expect(crumb.level).toBe("error"); expect(crumb.message).toContain("ls"); expect(crumb.data.exitCode).not.toBe(0); + expect(typeof crumb.data.stdout).toBe("string"); expect(typeof crumb.data.stderr).toBe("string"); expect(crumb.data.cwd).toBe("/tmp"); });