ghostwright · mcheemaa · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/src/agent/__tests__/judge-query.test.ts b/src/agent/__tests__/judge-query.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, test } from "bun:test";
 import { z } from "zod/v4";
-import { __absorbUsageForTest, parseJsonFromResponse } from "../judge-query.ts";
+import { __absorbUsageForTest, buildSystemPrompt, parseJsonFromResponse } from "../judge-query.ts";
 
 // parseJsonFromResponse is the shape-normalization layer for judge subprocess output.
 // Models sometimes return markdown fences, leading prose, or trailing whitespace even
@@ -117,6 +117,27 @@ describe("parseJsonFromResponse", () => {
 		expect(partial.costUsd).toBe(0);
 	});
 
+	test("buildSystemPrompt returns the claude_code preset envelope by default", () => {
+		const prompt = buildSystemPrompt("evaluate this", false);
+		expect(typeof prompt).toBe("object");
+		if (typeof prompt === "object") {
+			expect(prompt.type).toBe("preset");
+			expect(prompt.preset).toBe("claude_code");
+			expect(prompt.append).toBe("evaluate this");
+		}
+	});
+
+	test("buildSystemPrompt returns a plain string when omitPreset is true", () => {
+		// This is the C3 fix shape: the SDK accepts a plain-string systemPrompt
+		// per its docstring ("Use a custom system prompt") and that path skips
+		// the preset's base prompt and full tool catalog. The gate is the only
+		// caller that flips this on, because it is a pure pass/skip evaluation
+		// with no tool use. Other judges keep the preset.
+		const prompt = buildSystemPrompt("evaluate this", true);
+		expect(typeof prompt).toBe("string");
+		expect(prompt).toBe("evaluate this");
+	});
+
 	test("parses nested structures", () => {
 		const Nested = z.object({
 			flags: z.array(z.object({ category: z.string(), severity: z.enum(["critical", "warning", "info"]) })),

diff --git a/src/agent/judge-query.ts b/src/agent/judge-query.ts
@@ -16,6 +16,17 @@ export type JudgeQueryOptions<T> = {
 	schema: z.ZodType<T>;
 	model?: string;
 	maxTokens?: number;
+	/**
+	 * When true, the SDK is invoked with a plain-string `systemPrompt` instead
+	 * of the `claude_code` preset envelope. The preset bundles the full Claude
+	 * Code base prompt and tool catalog, which costs thousands of input tokens
+	 * per call. Pure evaluation calls that do not use any tools (the
+	 * conditional firing gate is the canonical example) should set this so
+	 * Haiku only sees the judge prompt and the session summary, not the entire
+	 * Claude Code surface area. Other judges that legitimately want the preset
+	 * keep the default.
+	 */
+	omitPreset?: boolean;
 };
 
 export type JudgeQueryResult<T> = {
@@ -142,17 +153,15 @@ export async function runJudgeQuery<T>(
 	// whenever ANTHROPIC_API_KEY happened to be set in the shell.
 	const providerEnv = buildProviderEnv(config);
 
+	const systemPrompt = buildSystemPrompt(judgePrompt, options.omitPreset === true);
+
 	const queryStream = query({
 		prompt: options.userMessage,
 		options: {
 			model: resolvedModel,
 			permissionMode: "bypassPermissions",
 			allowDangerouslySkipPermissions: true,
-			systemPrompt: {
-				type: "preset" as const,
-				preset: "claude_code" as const,
-				append: judgePrompt,
-			},
+			systemPrompt,
 			maxTurns: 1,
 			effort: "low",
 			persistSession: false,
@@ -308,6 +317,29 @@ export function __absorbUsageForTest(
 	return partial;
 }
 
+/**
+ * Compose the SDK `systemPrompt` value. The default path keeps the
+ * `claude_code` preset envelope so judges designed against the Claude Code
+ * tool catalog continue to receive the full base prompt. When the caller
+ * sets `omitPreset`, the SDK is handed a plain string instead, which the
+ * SDK documents as "Use a custom system prompt" and which skips the preset's
+ * base prompt and tool descriptions entirely. The gate uses this to drop
+ * two orders of magnitude of input tokens per call.
+ */
+export function buildSystemPrompt(
+	judgePrompt: string,
+	omitPreset: boolean,
+): string | { type: "preset"; preset: "claude_code"; append: string } {
+	if (omitPreset) {
+		return judgePrompt;
+	}
+	return {
+		type: "preset" as const,
+		preset: "claude_code" as const,
+		append: judgePrompt,
+	};
+}
+
 function buildJudgePrompt(systemPrompt: string, schemaJson: unknown): string {
 	return [
 		systemPrompt,

diff --git a/src/evolution/__tests__/cadence.test.ts b/src/evolution/__tests__/cadence.test.ts
@@ -42,9 +42,15 @@ function newDb(): Database {
 }
 
 function makeSummary(overrides: Partial<SessionSummary> = {}): SessionSummary {
+	const sessionId = overrides.session_id ?? "s1";
+	// Default the session_key off the session_id so each fixture row gets a
+	// distinct key. The queue dedups by session_key, so two enqueues with the
+	// same key collapse into a single row, which is correct production
+	// behavior but breaks any test that uses session_id alone to distinguish
+	// rows.
 	return {
-		session_id: "s1",
-		session_key: "slack:C1:T1",
+		session_id: sessionId,
+		session_key: `slack:C-${sessionId}:T-${sessionId}`,
 		user_id: "u1",
 		user_messages: ["help"],
 		assistant_messages: ["ok"],
@@ -297,6 +303,42 @@ describe("EvolutionCadence", () => {
 		}
 	});
 
+	test("failed pipeline rows stay in the queue while ok rows are deleted", async () => {
+		const config = setupEnv();
+		const db = newDb();
+		const queue = new EvolutionQueue(db);
+		let calls = 0;
+		const engine = fakeEngine({
+			onRun: async (session) => {
+				calls += 1;
+				if (session.session_id === "fail-me") {
+					throw new Error("simulated transient pipeline failure");
+				}
+				return { version: 1, changes_applied: [], changes_rejected: [] };
+			},
+		});
+		const cadence = new EvolutionCadence(engine, queue, config, { cadenceMinutes: 1_000_000, demandTriggerDepth: 999 });
+		cadence.start();
+		try {
+			// Distinct session_keys so the dedup-on-enqueue from M3 does not
+			// collapse them into a single row.
+			queue.enqueue(makeSummary({ session_id: "ok-1", session_key: "slack:C1:T1" }), DECISION);
+			queue.enqueue(makeSummary({ session_id: "fail-me", session_key: "slack:C2:T2" }), DECISION);
+			queue.enqueue(makeSummary({ session_id: "ok-2", session_key: "slack:C3:T3" }), DECISION);
+			const result = await cadence.triggerNow();
+			expect(result?.processed).toBe(3);
+			expect(result?.failureCount).toBe(1);
+			expect(calls).toBe(3);
+
+			// Only the failing row remains; the two ok rows were deleted.
+			const remaining = queue.drainAll();
+			expect(remaining).toHaveLength(1);
+			expect(remaining[0].session_id).toBe("fail-me");
+		} finally {
+			cadence.stop();
+		}
+	});
+
 	test("queue_stats counters increment after each drain", async () => {
 		const config = setupEnv();
 		const db = newDb();

diff --git a/src/evolution/__tests__/engine.test.ts b/src/evolution/__tests__/engine.test.ts
@@ -201,17 +201,14 @@ describe("EvolutionEngine", () => {
 		const session = makeSession({ outcome: "success" });
 		await engine.afterSession(session);
 
-		// Phase 0 M4: `updateAfterSession` is called twice on the normal run
-		// path. Once at the top of `afterSession` so the mutex-skip path also
-		// updates counters (without that, dashboards undercount during bursts),
-		// and again inside `runCycle` with the real `hadCorrections` value.
-		// Both calls increment `session_count`, so the normal path double-counts
-		// by one. This is the tradeoff the Phase 0 reviewer explicitly accepted
-		// because Phase 2 replaces the drop-on-floor mutex with a real queue
-		// and removes this bookkeeping entirely.
+		// Phase 1+2 collapsed `updateAfterSession` to a single call inside
+		// `runCycle`, after observation extraction supplies the real
+		// `hadCorrections` value. The Phase 0 M4 increments at the top of
+		// `afterSessionInternal` and inside `enqueueIfWorthy` were obsolete
+		// once the persistent queue replaced the drop-on-floor mutex path.
 		const metrics = engine.getMetrics();
-		expect(metrics.session_count).toBe(2);
-		expect(metrics.success_count).toBe(2);
+		expect(metrics.session_count).toBe(1);
+		expect(metrics.success_count).toBe(1);
 	});
 
 	test("constitution violation is rejected", async () => {
@@ -249,6 +246,34 @@ describe("EvolutionEngine", () => {
 		expect(metrics.rollback_count).toBe(1);
 	});
 
+	test("rollback fires the runtime refresh callback with the rolled-back state", async () => {
+		// Codex finding on PR #63: the apply path refreshes the runtime via
+		// `notifyConfigApplied` but the auto-rollback branch was reverting disk
+		// state without firing the callback, leaving the runtime serving the
+		// rolled-forward snapshot. The fix wires the same callback into
+		// `rollback()` so any path that mutates disk state also refreshes the
+		// runtime. `getConfig()` re-reads from disk on every call, so the
+		// callback observes the post-rollback content.
+		const engine = new EvolutionEngine(CONFIG_PATH);
+		const refreshes: Array<{ version: number; userProfile: string }> = [];
+		engine.setOnConfigApplied(() => {
+			const config = engine.getConfig();
+			refreshes.push({ version: engine.getCurrentVersion(), userProfile: config.userProfile });
+		});
+
+		await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
+		expect(refreshes.length).toBe(1);
+		expect(refreshes[0].version).toBeGreaterThan(0);
+		expect(refreshes[0].userProfile).toContain("TypeScript");
+
+		engine.rollback(0);
+		// Two callback invocations now: one from the original apply, one from
+		// the rollback. The second sees the version-0 state on disk.
+		expect(refreshes.length).toBe(2);
+		expect(refreshes[1].version).toBe(0);
+		expect(refreshes[1].userProfile).not.toContain("TypeScript");
+	});
+
 	test("preference is detected and applied", async () => {
 		const engine = new EvolutionEngine(CONFIG_PATH);
 		const session = makeSession({
@@ -261,6 +286,46 @@ describe("EvolutionEngine", () => {
 		expect(userProfile.toLowerCase()).toContain("vim");
 	});
 
+	test("setOnConfigApplied fires after a successful afterSession that applies changes", async () => {
+		// C1 contract: the engine owns the runtime refresh callback and fires
+		// it from the apply path. Production wiring in src/index.ts uses this
+		// exact setter to refresh the runtime's evolvedConfig snapshot from
+		// disk. Test against the engine directly so a future refactor that
+		// breaks the callback shape fails here, not at boot time on a VM.
+		const engine = new EvolutionEngine(CONFIG_PATH);
+		const versions: number[] = [];
+		engine.setOnConfigApplied(() => {
+			versions.push(engine.getCurrentVersion());
+		});
+		await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
+		expect(versions.length).toBeGreaterThanOrEqual(1);
+		expect(versions.at(-1)).toBe(engine.getCurrentVersion());
+	});
+
+	test("setOnConfigApplied does not fire when no changes are applied", async () => {
+		const engine = new EvolutionEngine(CONFIG_PATH);
+		let calls = 0;
+		engine.setOnConfigApplied(() => {
+			calls += 1;
+		});
+		// A neutral session with no correction signals applies zero changes.
+		await engine.afterSession(makeSession({ user_messages: ["What time is it?"] }));
+		expect(calls).toBe(0);
+	});
+
+	test("setOnConfigApplied callback errors do not wedge the pipeline", async () => {
+		// A telemetry/refresh failure in the callback must not poison the
+		// evolution result. The engine swallows and warns, the pipeline
+		// returns normally, and the next applied change retries the refresh.
+		const engine = new EvolutionEngine(CONFIG_PATH);
+		engine.setOnConfigApplied(() => {
+			throw new Error("simulated runtime refresh failure");
+		});
+		const result = await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
+		expect(result.changes_applied.length).toBeGreaterThan(0);
+		expect(result.version).toBeGreaterThan(0);
+	});
+
 	test("evolved config is available in getConfig after changes", async () => {
 		const engine = new EvolutionEngine(CONFIG_PATH);
 		const session = makeSession({

diff --git a/src/evolution/__tests__/gate.test.ts b/src/evolution/__tests__/gate.test.ts
@@ -65,6 +65,7 @@ type FakeRuntime = {
 		schema: unknown;
 		model?: string;
 		maxTokens?: number;
+		omitPreset?: boolean;
 	}) => Promise<unknown>;
 };
 
@@ -93,6 +94,41 @@ describe("decideGate Haiku happy path", () => {
 		expect(decision.reason).toContain("naming");
 	});
 
+	test("gate forwards omitPreset=true so judgeQuery skips the claude_code preset envelope", async () => {
+		// The gate is a pure pass/skip evaluation that never reads files or
+		// runs tools, so it must opt out of the `claude_code` system prompt
+		// preset that bundles the full Claude Code base prompt and tool
+		// catalog. Live fleet data showed gate cost running 20-180x the
+		// research target until this flag was wired through. Asserting the
+		// flag at the call site is the durable defense against a future
+		// refactor accidentally re-introducing the preset overhead.
+		const captured: Array<{ omitPreset?: boolean; model?: string; maxTokens?: number }> = [];
+		const runtime: FakeRuntime = {
+			judgeQuery: async (options) => {
+				captured.push({
+					omitPreset: options.omitPreset,
+					model: options.model,
+					maxTokens: options.maxTokens,
+				});
+				return {
+					verdict: "pass" as const,
+					confidence: 0.9,
+					reasoning: "",
+					data: { evolve: false, reason: "routine" },
+					model: "claude-haiku-4-5",
+					inputTokens: 420,
+					outputTokens: 28,
+					costUsd: 0.0005,
+					durationMs: 900,
+				};
+			},
+		};
+		await decideGate(makeSession(), runtime as unknown as AgentRuntime);
+		expect(captured).toHaveLength(1);
+		expect(captured[0].omitPreset).toBe(true);
+		expect(captured[0].maxTokens).toBe(200);
+	});
+
 	test("evolve=false returns skip source=haiku", async () => {
 		const runtime: FakeRuntime = {
 			judgeQuery: async () => ({

diff --git a/src/evolution/__tests__/mutex-and-retry-ceiling.test.ts b/src/evolution/__tests__/mutex-and-retry-ceiling.test.ts
@@ -1,5 +1,5 @@
 import { afterEach, beforeEach, describe, expect, test } from "bun:test";
-import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { mkdirSync, rmSync, writeFileSync } from "node:fs";
 import { JudgeSubprocessError } from "../../agent/judge-query.ts";
 import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { PhantomConfig } from "../../config/types.ts";
@@ -356,11 +356,17 @@ describe("Phase 0 mutex guard", () => {
 		}
 	});
 
-	test("mutex skip path still bumps session_count so dashboards do not undercount", async () => {
-		// M4: without this the normal-vs-skip paths diverge and operators
-		// watching session_count in the dashboard see undercounting during
-		// bursts. The skip path now updates session metrics with
-		// hadCorrections=false at the top of afterSession.
+	test("mutex skip path returns a skipped result without spawning a new cycle", async () => {
+		// Phase 1+2 collapsed `updateAfterSession` to a single call inside
+		// `runCycle`, after observation extraction supplies the real
+		// `hadCorrections` value. The previous Phase 0 M4 top-of-afterSession
+		// increment was load-bearing only while the mutex could permanently
+		// drop a session; the persistent queue replaces that drop-on-floor
+		// path so an enqueued session eventually reaches `runCycle` and the
+		// counter increments exactly once. This test pins the new shape: the
+		// active cycle still holds the mutex, the second call sees the guard
+		// and returns a skipped result with zero applied changes, and no
+		// extra runCycle is launched on top of the hanging one.
 		type Releaser = (reason: unknown) => void;
 		const releaseHangRef: { fn: Releaser | null } = { fn: null };
 		const hang = new Promise<never>((_resolve, reject) => {
@@ -376,15 +382,12 @@ describe("Phase 0 mutex guard", () => {
 		const callA = engine.afterSession(makeSession({ session_id: "active" }));
 		await Promise.resolve();
 		await Promise.resolve();
-		await engine.afterSession(makeSession({ session_id: "skip-1" }));
-		await engine.afterSession(makeSession({ session_id: "skip-2" }));
-
-		const metricsPath = `${TEST_DIR}/phantom-config/meta/metrics.json`;
-		const metrics = JSON.parse(readFileSync(metricsPath, "utf-8"));
-		// Three afterSession calls, three counter increments from the top-of-
-		// afterSession update. The active call is still hanging so its inner
-		// updateAfterSession has not run yet, so we assert exactly 3.
-		expect(metrics.session_count).toBe(3);
+		const skipResult1 = await engine.afterSession(makeSession({ session_id: "skip-1" }));
+		const skipResult2 = await engine.afterSession(makeSession({ session_id: "skip-2" }));
+
+		expect(skipResult1.changes_applied).toHaveLength(0);
+		expect(skipResult2.changes_applied).toHaveLength(0);
+		expect((engine as unknown as { activeCycleSkipCount: number }).activeCycleSkipCount).toBe(2);
 
 		releaseHangRef.fn?.(new Error("released by test"));
 		try {