From c5c6b697729491fcb92e3ed88d340a3802058f22 Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 26 May 2026 12:06:06 +0200
Subject: [PATCH 1/2] fix(init): recover run-level stale resume errors

---
 src/lib/init/wizard-runner.ts       | 59 +++++++++++++++++------------
 test/lib/init/wizard-runner.test.ts | 39 +++++++++++++++++--
 2 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index 0dc35f0fc..76473c751 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -437,6 +437,7 @@ async function preamble(
 
 const MAX_RESUME_RETRIES = 3;
 const RETRY_BACKOFF_MS = [2000, 4000, 8000];
+const RUN_STATE_RECOVERY_BACKOFF_MS = [0, 250, 750, 1500];
 
 type ResumeRetryArgs = {
   run: {
@@ -454,11 +455,13 @@ type ResumeRetryArgs = {
 };
 
 /**
- * Detect Mastra's "step not suspended" 500 — means the server already
+ * Detect Mastra's "not suspended" 500 — means the server already
  * processed this step (our previous request succeeded but the response was
  * dropped before we received it). The MastraClientError message embeds the
  * server body, e.g.:
  *   "HTTP error! status: 500 - {"error":"This workflow step 'X' was not suspended..."}"
+ * or:
+ *   "HTTP error! status: 500 - {"error":"This workflow run was not suspended"}"
  */
 function isStepAlreadyAdvancedError(err: unknown): boolean {
   return err instanceof Error && err.message.includes("was not suspended");
@@ -473,31 +476,39 @@ async function tryRecoverCurrentRunState(
   workflow: ResumeRetryArgs["workflow"],
   runId: string
 ): Promise<WorkflowRunResult | null> {
-  try {
-    const raw = await withTimeout(
-      workflow.runById(runId, {
-        fields: ["steps", "activeStepsPath", "result"],
-      }),
-      API_TIMEOUT_MS,
-      "Run state recovery"
-    );
-    // runById returns activeStepsPath (Record<stepId, executionPath>) but
-    // not suspended (string[][]). The main loop reads result.suspended to
-    // find the active step; without it, stepId falls back to "unknown" and
-    // extractSuspendPayload iterates all steps — picking the first with any
-    // suspendPayload, which could be a completed step with stale D1 data.
-    // Derive suspended from the activeStepsPath keys so the lookup is
-    // deterministic: those keys are exactly the currently-active step IDs.
-    const state = raw as Record<string, unknown>;
-    if (!state.suspended && state.activeStepsPath) {
-      state.suspended = Object.keys(
-        state.activeStepsPath as Record<string, unknown>
-      ).map((id) => [id]);
+  for (const delayMs of RUN_STATE_RECOVERY_BACKOFF_MS) {
+    if (delayMs > 0) {
+      await new Promise((resolve) => setTimeout(resolve, delayMs));
+    }
+    try {
+      const raw = await withTimeout(
+        workflow.runById(runId, {
+          fields: ["steps", "activeStepsPath", "result"],
+        }),
+        API_TIMEOUT_MS,
+        "Run state recovery"
+      );
+      // runById returns activeStepsPath (Record<stepId, executionPath>) but
+      // not suspended (string[][]). The main loop reads result.suspended to
+      // find the active step; without it, stepId falls back to "unknown" and
+      // extractSuspendPayload iterates all steps — picking the first with any
+      // suspendPayload, which could be a completed step with stale D1 data.
+      // Derive suspended from the activeStepsPath keys so the lookup is
+      // deterministic: those keys are exactly the currently-active step IDs.
+      const state = raw as Record<string, unknown>;
+      if (!state.suspended && state.activeStepsPath) {
+        state.suspended = Object.keys(
+          state.activeStepsPath as Record<string, unknown>
+        ).map((id) => [id]);
+      }
+      return assertWorkflowResult(state);
+    } catch {
+      // Mastra/D1 can briefly return a not-yet-readable or intermediate run
+      // state immediately after rejecting a stale resume. Poll a few times
+      // before surfacing the original 500 to the user.
     }
-    return assertWorkflowResult(state);
-  } catch {
-    return null;
   }
+  return null;
 }
 
 // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: retry loop branches across transient errors, stale-step recovery, and backoff
diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts
index 343521a15..a47d5f408 100644
--- a/test/lib/init/wizard-runner.test.ts
+++ b/test/lib/init/wizard-runner.test.ts
@@ -976,6 +976,13 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     );
   }
 
+  function staleRunError(): Error {
+    return new Error(
+      "HTTP error! status: 500 - " +
+        JSON.stringify({ error: "This workflow run was not suspended" })
+    );
+  }
+
   test("recovers when server has already advanced to the next step", async () => {
     mockStartResult = {
       status: "suspended",
@@ -1005,13 +1012,37 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     expect(resumeCount).toBe(1);
   });
 
-  test("throws immediately when stale-step error occurs and runById fails", async () => {
+  test("recovers from run-level not-suspended errors after transient runById failure", async () => {
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["tool-step"]],
+      steps: { "tool-step": { suspendPayload: toolPayload } },
+    };
+    runByIdMock
+      .mockRejectedValueOnce(new Error("D1 snapshot not ready"))
+      .mockResolvedValueOnce({ status: "success" });
+
+    let resumeCount = 0;
+    makeStaleStepRun(() => {
+      resumeCount += 1;
+      return Promise.reject(staleRunError());
+    });
+
+    await runWizard(makeOptions());
+
+    expect(formatResultSpy).toHaveBeenCalled();
+    expect(runByIdMock).toHaveBeenCalledTimes(2);
+    expect(resumeCount).toBe(1);
+  });
+
+  test("throws when stale-step error occurs and runById keeps failing", async () => {
     mockStartResult = {
       status: "suspended",
       suspended: [["tool-step"]],
       steps: { "tool-step": { suspendPayload: toolPayload } },
     };
-    // runById is unreachable — recovery fails, wizard throws without retrying.
+    // runById is unreachable — recovery fails, wizard throws without retrying
+    // the stale resume request.
     mockRunByIdResult = new Error("runById network error");
 
     let resumeCount = 0;
@@ -1022,9 +1053,9 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
 
     await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
 
-    // Threw immediately after recovery failed — no futile retries of the stale step.
+    // Threw after recovery polling failed — no futile retries of the stale step.
     expect(resumeCount).toBe(1);
-    expect(runByIdMock).toHaveBeenCalledTimes(1);
+    expect(runByIdMock).toHaveBeenCalledTimes(4);
   });
 });
 

From 5909b0b8fd273d69de9c85017b5cf198b4afa9fb Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 26 May 2026 12:13:33 +0200
Subject: [PATCH 2/2] fix(init): improve install failure recovery telemetry

---
 src/lib/init/tools/run-commands.ts   | 1 +
 src/lib/init/wizard-runner.ts        | 3 ++-
 test/lib/run-commands.mocked.test.ts | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lib/init/tools/run-commands.ts b/src/lib/init/tools/run-commands.ts
index f869db1b1..9cece52e3 100644
--- a/src/lib/init/tools/run-commands.ts
+++ b/src/lib/init/tools/run-commands.ts
@@ -54,6 +54,7 @@ export async function runCommands(
         message: `Command failed: ${command.original}`,
         data: {
           exitCode: result.exitCode,
+          stdout: result.stdout.slice(0, 500),
           stderr: result.stderr.slice(0, 500),
           cwd: payload.cwd,
         },
diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index 76473c751..6927f2336 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -438,6 +438,7 @@ async function preamble(
 const MAX_RESUME_RETRIES = 3;
 const RETRY_BACKOFF_MS = [2000, 4000, 8000];
 const RUN_STATE_RECOVERY_BACKOFF_MS = [0, 250, 750, 1500];
+const RUN_STATE_RECOVERY_TIMEOUT_MS = 10_000;
 
 type ResumeRetryArgs = {
   run: {
@@ -485,7 +486,7 @@ async function tryRecoverCurrentRunState(
         workflow.runById(runId, {
           fields: ["steps", "activeStepsPath", "result"],
         }),
-        API_TIMEOUT_MS,
+        RUN_STATE_RECOVERY_TIMEOUT_MS,
         "Run state recovery"
       );
       // runById returns activeStepsPath (Record<stepId, executionPath>) but
diff --git a/test/lib/run-commands.mocked.test.ts b/test/lib/run-commands.mocked.test.ts
index d9ca3f2d4..d119f31ee 100644
--- a/test/lib/run-commands.mocked.test.ts
+++ b/test/lib/run-commands.mocked.test.ts
@@ -15,7 +15,7 @@ import type { RunCommandsPayload } from "../../src/lib/init/types.js";
 type Breadcrumb = {
   level: string;
   message: string;
-  data: { exitCode: number; stderr: string; cwd: string };
+  data: { exitCode: number; stdout: string; stderr: string; cwd: string };
 };
 
 const { breadcrumbs } = vi.hoisted(() => ({
@@ -60,6 +60,7 @@ describe("runCommands breadcrumb on failure", () => {
     expect(crumb.level).toBe("error");
     expect(crumb.message).toContain("ls");
     expect(crumb.data.exitCode).not.toBe(0);
+    expect(typeof crumb.data.stdout).toBe("string");
     expect(typeof crumb.data.stderr).toBe("string");
     expect(crumb.data.cwd).toBe("/tmp");
   });