fix(orchestration): stop on node failure, persist skippedNodes in checkpoints, fix branch resume

jddunn · jddunn · commit 15656c842ccb · 2026-03-24T09:25:56.000-07:00
diff --git a/src/orchestration/__tests__/checkpoint-store.test.ts b/src/orchestration/__tests__/checkpoint-store.test.ts
@@ -62,6 +62,16 @@ describe('InMemoryCheckpointStore', () => {
     expect(loaded!.runId).toBe('run-1');
   });
 
+  // -------------------------------------------------------------------------
+  it('get returns a checkpoint by exact checkpoint id', async () => {
+    const cp = makeCheckpoint({ id: 'cp-direct', runId: 'run-1' });
+    await store.save(cp);
+
+    const loaded = await store.get('cp-direct');
+    expect(loaded).not.toBeNull();
+    expect(loaded!.id).toBe('cp-direct');
+  });
+
   // -------------------------------------------------------------------------
   it('load returns null for an unknown runId', async () => {
     const result = await store.load('run-missing');
diff --git a/src/orchestration/__tests__/graph-runtime.test.ts b/src/orchestration/__tests__/graph-runtime.test.ts
@@ -316,4 +316,132 @@ describe('GraphRuntime', () => {
     // The resume should complete without throwing.
     expect(resumeResult).toBeDefined();
   });
+
+  it('accepts an exact checkpoint id in resume()', async () => {
+    const store = new InMemoryCheckpointStore();
+    const executeMock = vi.fn().mockResolvedValue({
+      success: true,
+      output: 'resume-output',
+    } satisfies NodeExecutionResult);
+
+    const runtime = new GraphRuntime({
+      checkpointStore: store,
+      nodeExecutor: makeExecutorWithMock(executeMock),
+    });
+
+    const graph = makeLinearGraph(
+      'g-resume-checkpoint-id',
+      [makeNode('a'), makeNode('b')],
+      { checkpointPolicy: 'every_node' },
+    );
+
+    await runtime.invoke(graph, { seed: 7 });
+    const checkpoints = await store.list('g-resume-checkpoint-id');
+    const checkpointForA = checkpoints.find((cp) => cp.nodeId === 'a');
+    expect(checkpointForA).toBeDefined();
+
+    executeMock.mockClear();
+    const resumeResult = await runtime.resume(graph, checkpointForA!.id);
+
+    expect(resumeResult).toBeDefined();
+    expect(executeMock).toHaveBeenCalled();
+  });
+
+  it('halts on node failure and emits error/interruption events', async () => {
+    const store = new InMemoryCheckpointStore();
+    const executeMock = vi.fn().mockImplementation(async (node: GraphNode): Promise<NodeExecutionResult> => {
+      if (node.id === 'a') {
+        return { success: false, error: 'boom' };
+      }
+      return { success: true, output: `${node.id}-done` };
+    });
+
+    const runtime = new GraphRuntime({
+      checkpointStore: store,
+      nodeExecutor: makeExecutorWithMock(executeMock),
+    });
+
+    const graph = makeLinearGraph('g-failure', [makeNode('a'), makeNode('b')]);
+    const events = [];
+    for await (const event of runtime.stream(graph, {})) {
+      events.push(event);
+    }
+
+    expect(executeMock).toHaveBeenCalledTimes(1);
+    expect(events.some((event) => event.type === 'error')).toBe(true);
+    expect(events.some((event) => event.type === 'interrupt')).toBe(true);
+    expect(events.some((event) => event.type === 'run_end')).toBe(true);
+    expect(events.some((event) => event.type === 'node_start' && event.nodeId === 'b')).toBe(false);
+  });
+
+  it('persists skipped conditional branches so resume does not execute the bypassed arm', async () => {
+    const store = new InMemoryCheckpointStore();
+    const executeMock = vi.fn().mockImplementation(async (node: GraphNode): Promise<NodeExecutionResult> => {
+      if (node.id === 'a') {
+        return { success: true, output: 'a-done', scratchUpdate: { goToB: true } };
+      }
+      return { success: true, output: `${node.id}-done` };
+    });
+
+    const runtime = new GraphRuntime({
+      checkpointStore: store,
+      nodeExecutor: makeExecutorWithMock(executeMock),
+    });
+
+    const nodeA = makeNode('a');
+    const nodeB = makeNode('b');
+    const nodeC = makeNode('c');
+
+    const graph: CompiledExecutionGraph = {
+      id: 'g-conditional-resume',
+      name: 'conditional-resume-test',
+      nodes: [nodeA, nodeB, nodeC],
+      edges: [
+        { id: 'e0', source: START, target: 'a', type: 'static' },
+        {
+          id: 'e1',
+          source: 'a',
+          target: 'b',
+          type: 'conditional',
+          condition: {
+            type: 'function',
+            fn: (state: GraphState) =>
+              (state.scratch as Record<string, unknown>).goToB ? 'b' : 'c',
+          },
+        },
+        {
+          id: 'e2',
+          source: 'a',
+          target: 'c',
+          type: 'conditional',
+          condition: {
+            type: 'function',
+            fn: (state: GraphState) =>
+              (state.scratch as Record<string, unknown>).goToB ? 'b' : 'c',
+          },
+        },
+        { id: 'e3', source: 'b', target: END, type: 'static' },
+        { id: 'e4', source: 'c', target: END, type: 'static' },
+      ],
+      stateSchema: { input: {}, scratch: {}, artifacts: {} },
+      reducers: {},
+      checkpointPolicy: 'every_node',
+      memoryConsistency: 'snapshot',
+    };
+
+    await runtime.invoke(graph, {});
+
+    const checkpoints = await store.list('g-conditional-resume');
+    const checkpointForA = checkpoints.find((cp) => cp.nodeId === 'a');
+    expect(checkpointForA).toBeDefined();
+
+    const forkedRunId = await store.fork(checkpointForA!.id);
+    executeMock.mockClear();
+
+    await runtime.resume(graph, forkedRunId);
+
+    const executedNodeIds = executeMock.mock.calls.map(([node]) => (node as GraphNode).id);
+    expect(executedNodeIds).toContain('b');
+    expect(executedNodeIds).not.toContain('c');
+  });
 });
diff --git a/src/orchestration/__tests__/integration.test.ts b/src/orchestration/__tests__/integration.test.ts
@@ -12,7 +12,7 @@
  * 3. mission: compile → invoke lifecycle
  * 4. Checkpoint time-travel: fork with modified state
  * 5. Streaming emits correct event sequence
- * 6. Error handling: node failure with graceful continuation
+ * 6. Error handling: node failure halts the run with explicit error events
  */
 
 import { describe, it, expect, vi } from 'vitest';
@@ -160,15 +160,14 @@ describe('E2E Integration — AgentGraph lifecycle', () => {
       scratch: z.object({}),
       artifacts: z.object({ answer: z.string().optional() }),
     })
-      .addNode('step1', toolNode('search'))
-      .addNode('step2', toolNode('summarize'))
+      .addNode('step1', gmiNode({ instructions: 'Search for the answer.' }))
+      .addNode('step2', gmiNode({ instructions: 'Summarize the answer.' }))
       .addEdge(START, 'step1')
       .addEdge('step1', 'step2')
       .addEdge('step2', END)
       .compile();
 
     const result = await graph.invoke({ query: 'test' });
-    // Default NodeExecutor returns a stub — result should be defined (empty artifacts object)
     expect(result).toBeDefined();
   });
 
@@ -178,8 +177,8 @@ describe('E2E Integration — AgentGraph lifecycle', () => {
       scratch: z.object({}),
       artifacts: z.object({}),
     })
-      .addNode('a', toolNode('tool_a'))
-      .addNode('b', toolNode('tool_b'))
+      .addNode('a', gmiNode({ instructions: 'Step A' }))
+      .addNode('b', gmiNode({ instructions: 'Step B' }))
       .addEdge(START, 'a')
       .addEdge('a', 'b')
       .addEdge('b', END)
@@ -469,7 +468,7 @@ describe('E2E Integration — streaming event sequence', () => {
 // ---------------------------------------------------------------------------
 
 describe('E2E Integration — error handling', () => {
-  it('invoke resolves even when node executor returns success:false', async () => {
+  it('halts the run when node executor returns success:false', async () => {
     const executor = {
       execute: vi.fn().mockResolvedValue({
         success: false,
@@ -480,9 +479,13 @@ describe('E2E Integration — error handling', () => {
     const store = new InMemoryCheckpointStore();
     const runtime = new GraphRuntime({ checkpointStore: store, nodeExecutor: executor as any });
 
-    const graph = makeLinearGraph('error-graph', [makeNode('failing-node')]);
-    // The runtime does not throw on node success:false — it returns final artifacts
-    await expect(runtime.invoke(graph, {})).resolves.toBeDefined();
+    const graph = makeLinearGraph('error-graph', [makeNode('failing-node'), makeNode('downstream-node')]);
+    const events = await collectEvents(runtime.stream(graph, {}));
+
+    expect(executor.execute).toHaveBeenCalledTimes(1);
+    expect(events.some((event) => event.type === 'error')).toBe(true);
+    expect(events.some((event) => event.type === 'interrupt')).toBe(true);
+    expect(events.some((event) => event.type === 'node_start' && (event as any).nodeId === 'downstream-node')).toBe(false);
   });
 
   it('resume throws when no checkpoint exists for runId', async () => {
diff --git a/src/orchestration/checkpoint/ICheckpointStore.ts b/src/orchestration/checkpoint/ICheckpointStore.ts
@@ -89,6 +89,16 @@ export interface Checkpoint {
   /** Ordered list of node ids that had completed execution when this checkpoint was taken. */
   visitedNodes: string[];
 
+  /**
+   * Ordered list of node ids that were explicitly bypassed by routing decisions
+   * (for example, the non-selected arm of a conditional branch).
+   *
+   * Persisting this list is required for correct resume semantics on branched
+   * graphs: otherwise a resumed run cannot distinguish "not run yet" from
+   * "intentionally skipped" and may stall on dead branches.
+   */
+  skippedNodes?: string[];
+
   /** Ids of edges that had been emitted but whose target nodes had not yet started. */
   pendingEdges: string[];
 }
@@ -115,6 +125,14 @@ export interface ICheckpointStore {
    */
   save(checkpoint: Checkpoint): Promise<void>;
 
+  /**
+   * Load a checkpoint by its unique checkpoint identifier, as opposed to by `runId`.
+   *
+   * @param checkpointId - The exact checkpoint id assigned at save-time.
+   * @returns The matching checkpoint, or `null` when none exists.
+   */
+  get(checkpointId: string): Promise<Checkpoint | null>;
+
   /**
    * Load a checkpoint for the given `runId`.
    *
diff --git a/src/orchestration/checkpoint/InMemoryCheckpointStore.ts b/src/orchestration/checkpoint/InMemoryCheckpointStore.ts
@@ -62,6 +62,15 @@ export class InMemoryCheckpointStore implements ICheckpointStore {
     this._checkpoints.set(checkpoint.id, checkpoint);
   }
 
+  /**
+   * Load a checkpoint by its unique checkpoint id.
+   *
+   * {@inheritDoc ICheckpointStore.get}
+   */
+  async get(checkpointId: string): Promise<Checkpoint | null> {
+    return this._checkpoints.get(checkpointId) ?? null;
+  }
+
   /**
    * Load a checkpoint for the given `runId`.
    *
diff --git a/src/orchestration/runtime/GraphRuntime.ts b/src/orchestration/runtime/GraphRuntime.ts