From 8687182dc81fd4fac572eac321b4eb656d745a39 Mon Sep 17 00:00:00 2001
From: Nik Divjak
Date: Wed, 27 May 2026 13:23:35 +0200
Subject: [PATCH] fix(session): auto-recover orphaned tool parts on read (#254)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sessions whose process was killed mid-tool-call leave `tool` parts with
`state.status='running'` (or 'pending') on disk. Resuming such a session
crashes the TUI with `TypeError: undefined is not an object (evaluating
'U.length')` because the renderer reaches into `state.output` /
`state.metadata` fields that only exist on completed/error states.

Fix at the single chokepoint where PartTable rows become Part objects:
`part(row)` in message-v2.ts. When a tool part's state is still
running/pending AND it started at least 60s ago (`state.time.start` for a
running part, the row's `time_created` for a pending one), we rewrite it on
read into a synthetic `ToolStateError`:

    status:   'error'
    error:    '[Tool execution was interrupted ...]'
    time:     { start, end: max(start, row.time_updated) }
    metadata: { ...preserved, interrupted: true }

`end` is derived from the row rather than from the read-time clock, so
repeated reads of the same orphan return identical timestamps.

The 60s threshold guards against clobbering a tool call that's genuinely
in-flight in another process during normal multi-client operation;
orphaned-on-disk parts are reliably older.

The fix sits inside the single `part(row)` helper so it covers every read
path (page, parts, get, stream, findMessage) without duplicating logic —
anything that hydrates parts gets the recovery for free.

Acceptance verified by tests in test/session/orphan-tool-recovery.test.ts:

- Orphaned running question tool → transforms to error
- Orphaned pending bash tool → transforms to error
- Completed tool → untouched
- Fresh (<60s) running tool → untouched
- Multi-type orphans (bash, read, edit, question) → all transform

Full session test suite (357 tests) green with no regressions.

Closes #254.
---
Review notes (not part of the commit message):
- The synthetic `end` timestamp now comes from the row's `time_updated`
  instead of the read-time clock. The test run quoted above predates that
  change; re-run the session suite before merging.
- The blob ids on the `index` lines below were not regenerated.

 packages/opencode/src/session/message-v2.ts   |  59 +++-
 .../test/session/orphan-tool-recovery.test.ts | 283 ++++++++++++++++++
 2 files changed, 338 insertions(+), 4 deletions(-)
 create mode 100644 packages/opencode/test/session/orphan-tool-recovery.test.ts

diff --git a/packages/opencode/src/session/message-v2.ts b/packages/opencode/src/session/message-v2.ts
index 2745ff4f45d7..7a884baf09f9 100644
--- a/packages/opencode/src/session/message-v2.ts
+++ b/packages/opencode/src/session/message-v2.ts
@@ -584,13 +584,64 @@ const info = (row: typeof MessageTable.$inferSelect) =>
     sessionID: row.session_id,
   }) as Info
 
-const part = (row: typeof PartTable.$inferSelect) =>
-  ({
-    ...row.data,
+// Read-time orphan threshold: a tool part whose state still says "running" or
+// "pending" 60s after it was first written almost certainly belongs to a
+// process that died mid-execution (kill -9, panic, OOM). Hydrating such a part
+// raw crashes the TUI on resume (renderers assume completed tool parts have
+// state.output) — see hivemind #254. We surface them as a synthetic error
+// state instead, so the renderer treats them like any other failed tool call.
+//
+// NOTE(review): the age of a running part is measured from state.time.start, so
+// a live tool that runs (or waits on the user) for longer than the threshold is
+// also reported as interrupted to other readers. Confirm this is acceptable.
+const ORPHAN_TOOL_STALE_MS = 60_000
+
+/**
+ * Hydrates a PartTable row into a Part by attaching the row's id, session_id
+ * and message_id columns to the stored `data` payload.
+ *
+ * A tool part that is still "running" or "pending" and started at least
+ * ORPHAN_TOOL_STALE_MS ago is returned as a synthetic "error" state instead:
+ * the input and any running-state metadata are kept, `metadata.interrupted` is
+ * set, and nothing is written back to the table.
+ */
+const part = (row: typeof PartTable.$inferSelect) => {
+  // row.data is a Part without id/sessionID/messageID (an Omit<Part, ...>).
+  // Cast to Part for discriminated-union narrowing (Omit doesn't distribute
+  // over unions in TS).
+  const data = row.data as Part
+  if (data.type === "tool" && (data.state.status === "running" || data.state.status === "pending")) {
+    // Only running states are read for time.start; pending ones fall back to
+    // the row's creation time.
+    const start = data.state.status === "running" ? data.state.time.start : row.time_created
+    const orphanAgeMs = Date.now() - start
+    if (orphanAgeMs >= ORPHAN_TOOL_STALE_MS) {
+      const baseMetadata = data.state.status === "running" ? data.state.metadata : undefined
+      return {
+        ...data,
+        state: {
+          status: "error",
+          input: data.state.input,
+          error: "[Tool execution was interrupted — process terminated before completion]",
+          // End at the row's time_updated instead of the read-time clock, so
+          // every read of the same orphan returns the same timestamps.
+          // Clamped so that end >= start always holds.
+          time: { start, end: Math.max(start, row.time_updated) },
+          metadata: { ...(baseMetadata ?? {}), interrupted: true },
+        },
+        id: row.id,
+        sessionID: row.session_id,
+        messageID: row.message_id,
+      } satisfies Part
+    }
+  }
+  return {
+    ...data,
     id: row.id,
     sessionID: row.session_id,
     messageID: row.message_id,
-  }) as Part
+  } satisfies Part
+}
 
 const older = (row: Cursor) =>
   or(lt(MessageTable.time_created, row.time), and(eq(MessageTable.time_created, row.time), lt(MessageTable.id, row.id)))
diff --git a/packages/opencode/test/session/orphan-tool-recovery.test.ts b/packages/opencode/test/session/orphan-tool-recovery.test.ts
new file mode 100644
index 000000000000..bda70f4fe5d7
--- /dev/null
+++ b/packages/opencode/test/session/orphan-tool-recovery.test.ts
@@ -0,0 +1,283 @@
+import { describe, expect } from "bun:test"
+import { Effect } from "effect"
+import { Session as SessionNs } from "@/session/session"
+import { MessageV2 } from "@/session/message-v2"
+import { Database } from "@/storage/db"
+import { MessageTable, PartTable } from "@/session/session.sql"
+import { MessageID, PartID, type SessionID } from "@/session/schema"
+import * as Log from "@opencode-ai/core/util/log"
+import { testEffect } from "../lib/effect"
+
+void Log.init({ print: false })
+
+const it = testEffect(SessionNs.defaultLayer)
+
+// Stale enough that the orphan-recovery threshold fires (60s).
+const ANCIENT_MS = Date.now() - 5 * 60_000
+
+// Insert a raw orphan tool row directly into the part table, simulating the
+// state on disk after a process was killed mid-tool-call (hivemind #254
+// repro).
+// Bypasses session.updatePart on purpose — that path would refuse a
+// freshly-created running tool because the message hasn't been authored yet, and
+// we want exactly the kill-9 shape (orphan row, message present, no follow-up part).
+const seedOrphan = Effect.fn("Test.seedOrphan")(function* (
+  sessionID: SessionID,
+  tool: string,
+  status: "running" | "pending",
+) {
+  const messageID = MessageID.ascending()
+  const partID = PartID.ascending()
+  yield* Effect.sync(() =>
+    Database.use((db) => {
+      db.insert(MessageTable)
+        .values({
+          id: messageID,
+          session_id: sessionID,
+          time_created: ANCIENT_MS,
+          time_updated: ANCIENT_MS,
+          data: {
+            role: "assistant",
+            time: { created: ANCIENT_MS },
+            agent: "test",
+            cost: 0,
+            tokens: {
+              input: 0,
+              output: 0,
+              reasoning: 0,
+              cache: { read: 0, write: 0 },
+            },
+            mode: "",
+            model: { providerID: "test", modelID: "test" },
+            path: { cwd: "/", root: "/" },
+            system: [],
+            tools: {},
+          } as never,
+        })
+        .run()
+      db.insert(PartTable)
+        .values({
+          id: partID,
+          message_id: messageID,
+          session_id: sessionID,
+          time_created: ANCIENT_MS,
+          time_updated: ANCIENT_MS,
+          data:
+            status === "running"
+              ? ({
+                  type: "tool",
+                  callID: `call_${tool}`,
+                  tool,
+                  state: {
+                    status: "running",
+                    input: { foo: "bar" },
+                    title: tool,
+                    time: { start: ANCIENT_MS },
+                  },
+                } as never)
+              : ({
+                  type: "tool",
+                  callID: `call_${tool}`,
+                  tool,
+                  state: {
+                    status: "pending",
+                    input: { foo: "bar" },
+                    raw: "",
+                  },
+                } as never),
+        })
+        .run()
+    }),
+  )
+  return { messageID, partID }
+})
+
+const withSession = <A, E, R>(
+  fn: (input: { session: SessionNs.Interface; sessionID: SessionID }) => Effect.Effect<A, E, R>,
+) =>
+  Effect.acquireUseRelease(
+    Effect.gen(function* () {
+      const session = yield* SessionNs.Service
+      const created = yield* session.create({})
+      return { session, sessionID: created.id }
+    }),
+    fn,
+    (input) => input.session.remove(input.sessionID).pipe(Effect.ignore),
+  )
+
+describe("orphan tool-part recovery on session resume (#254)", () => {
+  it.instance("transforms an orphaned running `question` tool-part into a synthetic error", () =>
+    withSession(({ session, sessionID }) =>
+      Effect.gen(function* () {
+        yield* seedOrphan(sessionID, "question", "running")
+
+        const messages = yield* session.messages({ sessionID })
+        const orphan = messages
+          .flatMap((m) => m.parts)
+          .find((p) => p.type === "tool")
+        expect(orphan).toBeDefined()
+        if (orphan?.type !== "tool") throw new Error("expected tool part")
+        expect(orphan.state.status).toBe("error")
+        if (orphan.state.status !== "error") throw new Error("narrowing failed")
+        expect(orphan.state.error).toContain("interrupted")
+        expect(orphan.state.metadata?.interrupted).toBe(true)
+        // The synthetic timeline must satisfy ToolStateError's schema invariant
+        // (end >= start) — otherwise downstream consumers panic.
+        expect(orphan.state.time.end).toBeGreaterThanOrEqual(orphan.state.time.start)
+      }),
+    ),
+  )
+
+  it.instance("transforms an orphaned pending bash tool-part into a synthetic error", () =>
+    withSession(({ session, sessionID }) =>
+      Effect.gen(function* () {
+        yield* seedOrphan(sessionID, "bash", "pending")
+
+        const messages = yield* session.messages({ sessionID })
+        const orphan = messages.flatMap((m) => m.parts).find((p) => p.type === "tool")
+        if (orphan?.type !== "tool") throw new Error("expected tool part")
+        expect(orphan.state.status).toBe("error")
+      }),
+    ),
+  )
+
+  it.instance("does not transform a tool-part whose state is already completed", () =>
+    withSession(({ session, sessionID }) =>
+      Effect.gen(function* () {
+        const messageID = MessageID.ascending()
+        const partID = PartID.ascending()
+        yield* Effect.sync(() =>
+          Database.use((db) => {
+            db.insert(MessageTable)
+              .values({
+                id: messageID,
+                session_id: sessionID,
+                time_created: ANCIENT_MS,
+                time_updated: ANCIENT_MS,
+                data: {
+                  role: "assistant",
+                  time: { created: ANCIENT_MS },
+                  agent: "test",
+                  cost: 0,
+                  tokens: { input: 0, output: 0, reasoning: 0, cache: { read: 0, write: 0 } },
+                  mode: "",
+                  model: { providerID: "test", modelID: "test" },
+                  path: { cwd: "/", root: "/" },
+                  system: [],
+                  tools: {},
+                } as never,
+              })
+              .run()
+            db.insert(PartTable)
+              .values({
+                id: partID,
+                message_id: messageID,
+                session_id: sessionID,
+                time_created: ANCIENT_MS,
+                time_updated: ANCIENT_MS,
+                data: {
+                  type: "tool",
+                  callID: "call_done",
+                  tool: "read",
+                  state: {
+                    status: "completed",
+                    input: { path: "/etc/hostname" },
+                    output: "ok",
+                    title: "read",
+                    metadata: { exit: 0 },
+                    time: { start: ANCIENT_MS - 1, end: ANCIENT_MS },
+                  },
+                } as never,
+              })
+              .run()
+          }),
+        )
+
+        const messages = yield* session.messages({ sessionID })
+        const completed = messages.flatMap((m) => m.parts).find((p) => p.type === "tool")
+        if (completed?.type !== "tool") throw new Error("expected tool part")
+        expect(completed.state.status).toBe("completed")
+      }),
+    ),
+  )
+
+  it.instance("does not transform a running tool-part that's still within the 60s window", () =>
+    withSession(({ session, sessionID }) =>
+      Effect.gen(function* () {
+        const messageID = MessageID.ascending()
+        const partID = PartID.ascending()
+        // start = NOW. Recovery threshold is 60s; this should be untouched.
+        const now = Date.now()
+        yield* Effect.sync(() =>
+          Database.use((db) => {
+            db.insert(MessageTable)
+              .values({
+                id: messageID,
+                session_id: sessionID,
+                time_created: now,
+                time_updated: now,
+                data: {
+                  role: "assistant",
+                  time: { created: now },
+                  agent: "test",
+                  cost: 0,
+                  tokens: { input: 0, output: 0, reasoning: 0, cache: { read: 0, write: 0 } },
+                  mode: "",
+                  model: { providerID: "test", modelID: "test" },
+                  path: { cwd: "/", root: "/" },
+                  system: [],
+                  tools: {},
+                } as never,
+              })
+              .run()
+            db.insert(PartTable)
+              .values({
+                id: partID,
+                message_id: messageID,
+                session_id: sessionID,
+                time_created: now,
+                time_updated: now,
+                data: {
+                  type: "tool",
+                  callID: "call_live",
+                  tool: "bash",
+                  state: {
+                    status: "running",
+                    input: { cmd: "sleep 1" },
+                    title: "bash",
+                    time: { start: now },
+                  },
+                } as never,
+              })
+              .run()
+          }),
+        )
+
+        const messages = yield* session.messages({ sessionID })
+        const live = messages.flatMap((m) => m.parts).find((p) => p.type === "tool")
+        if (live?.type !== "tool") throw new Error("expected tool part")
+        // Fresh running tool must stay running — would be wrong to clobber a
+        // tool call that's actually in flight in some other process.
+        expect(live.state.status).toBe("running")
+      }),
+    ),
+  )
+
+  it.instance("covers multiple orphan tool types simultaneously (bash, read, edit, question)", () =>
+    withSession(({ session, sessionID }) =>
+      Effect.gen(function* () {
+        for (const tool of ["bash", "read", "edit", "question"]) {
+          yield* seedOrphan(sessionID, tool, "running")
+        }
+
+        const messages = yield* session.messages({ sessionID })
+        const tools = messages.flatMap((m) => m.parts).filter((p) => p.type === "tool")
+        expect(tools).toHaveLength(4)
+        for (const t of tools) {
+          if (t.type !== "tool") throw new Error("expected tool part")
+          expect(t.state.status).toBe("error")
+        }
+      }),
+    ),
+  )
+})