Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion src/agent/__tests__/judge-query.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect, test } from "bun:test";
import { z } from "zod/v4";
import { __absorbUsageForTest, parseJsonFromResponse } from "../judge-query.ts";
import { __absorbUsageForTest, buildSystemPrompt, parseJsonFromResponse } from "../judge-query.ts";

// parseJsonFromResponse is the shape-normalization layer for judge subprocess output.
// Models sometimes return markdown fences, leading prose, or trailing whitespace even
Expand Down Expand Up @@ -117,6 +117,27 @@ describe("parseJsonFromResponse", () => {
expect(partial.costUsd).toBe(0);
});

test("buildSystemPrompt returns the claude_code preset envelope by default", () => {
const prompt = buildSystemPrompt("evaluate this", false);
expect(typeof prompt).toBe("object");
if (typeof prompt === "object") {
expect(prompt.type).toBe("preset");
expect(prompt.preset).toBe("claude_code");
expect(prompt.append).toBe("evaluate this");
}
});

test("buildSystemPrompt returns a plain string when omitPreset is true", () => {
// This is the C3 fix shape: the SDK accepts a plain-string systemPrompt
// per its docstring ("Use a custom system prompt") and that path skips
// the preset's base prompt and full tool catalog. The gate is the only
// caller that flips this on, because it is a pure pass/skip evaluation
// with no tool use. Other judges keep the preset.
const prompt = buildSystemPrompt("evaluate this", true);
expect(typeof prompt).toBe("string");
expect(prompt).toBe("evaluate this");
});

test("parses nested structures", () => {
const Nested = z.object({
flags: z.array(z.object({ category: z.string(), severity: z.enum(["critical", "warning", "info"]) })),
Expand Down
42 changes: 37 additions & 5 deletions src/agent/judge-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@ export type JudgeQueryOptions<T> = {
schema: z.ZodType<T>;
model?: string;
maxTokens?: number;
/**
* When true, the SDK is invoked with a plain-string `systemPrompt` instead
* of the `claude_code` preset envelope. The preset bundles the full Claude
* Code base prompt and tool catalog, which costs thousands of input tokens
* per call. Pure evaluation calls that do not use any tools (the
* conditional firing gate is the canonical example) should set this so
* Haiku only sees the judge prompt and the session summary, not the entire
* Claude Code surface area. Other judges that legitimately want the preset
* keep the default.
*/
omitPreset?: boolean;
};

export type JudgeQueryResult<T> = {
Expand Down Expand Up @@ -142,17 +153,15 @@ export async function runJudgeQuery<T>(
// whenever ANTHROPIC_API_KEY happened to be set in the shell.
const providerEnv = buildProviderEnv(config);

const systemPrompt = buildSystemPrompt(judgePrompt, options.omitPreset === true);

const queryStream = query({
prompt: options.userMessage,
options: {
model: resolvedModel,
permissionMode: "bypassPermissions",
allowDangerouslySkipPermissions: true,
systemPrompt: {
type: "preset" as const,
preset: "claude_code" as const,
append: judgePrompt,
},
systemPrompt,
maxTurns: 1,
effort: "low",
persistSession: false,
Expand Down Expand Up @@ -308,6 +317,29 @@ export function __absorbUsageForTest(
return partial;
}

/**
* Compose the SDK `systemPrompt` value. The default path keeps the
* `claude_code` preset envelope so judges designed against the Claude Code
* tool catalog continue to receive the full base prompt. When the caller
* sets `omitPreset`, the SDK is handed a plain string instead, which the
* SDK documents as "Use a custom system prompt" and which skips the preset's
* base prompt and tool descriptions entirely. The gate uses this to drop
* two orders of magnitude of input tokens per call.
*/
export function buildSystemPrompt(
judgePrompt: string,
omitPreset: boolean,
): string | { type: "preset"; preset: "claude_code"; append: string } {
if (omitPreset) {
return judgePrompt;
}
return {
type: "preset" as const,
preset: "claude_code" as const,
append: judgePrompt,
};
}

function buildJudgePrompt(systemPrompt: string, schemaJson: unknown): string {
return [
systemPrompt,
Expand Down
46 changes: 44 additions & 2 deletions src/evolution/__tests__/cadence.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,15 @@ function newDb(): Database {
}

function makeSummary(overrides: Partial<SessionSummary> = {}): SessionSummary {
const sessionId = overrides.session_id ?? "s1";
// Default the session_key off the session_id so each fixture row gets a
// distinct key. The queue dedups by session_key, so two enqueues with the
// same key collapse into a single row, which is correct production
// behavior but breaks any test that uses session_id alone to distinguish
// rows.
return {
session_id: "s1",
session_key: "slack:C1:T1",
session_id: sessionId,
session_key: `slack:C-${sessionId}:T-${sessionId}`,
user_id: "u1",
user_messages: ["help"],
assistant_messages: ["ok"],
Expand Down Expand Up @@ -297,6 +303,42 @@ describe("EvolutionCadence", () => {
}
});

test("failed pipeline rows stay in the queue while ok rows are deleted", async () => {
const config = setupEnv();
const db = newDb();
const queue = new EvolutionQueue(db);
let calls = 0;
const engine = fakeEngine({
onRun: async (session) => {
calls += 1;
if (session.session_id === "fail-me") {
throw new Error("simulated transient pipeline failure");
}
return { version: 1, changes_applied: [], changes_rejected: [] };
},
});
const cadence = new EvolutionCadence(engine, queue, config, { cadenceMinutes: 1_000_000, demandTriggerDepth: 999 });
cadence.start();
try {
// Distinct session_keys so the dedup-on-enqueue from M3 does not
// collapse them into a single row.
queue.enqueue(makeSummary({ session_id: "ok-1", session_key: "slack:C1:T1" }), DECISION);
queue.enqueue(makeSummary({ session_id: "fail-me", session_key: "slack:C2:T2" }), DECISION);
queue.enqueue(makeSummary({ session_id: "ok-2", session_key: "slack:C3:T3" }), DECISION);
const result = await cadence.triggerNow();
expect(result?.processed).toBe(3);
expect(result?.failureCount).toBe(1);
expect(calls).toBe(3);

// Only the failing row remains; the two ok rows were deleted.
const remaining = queue.drainAll();
expect(remaining).toHaveLength(1);
expect(remaining[0].session_id).toBe("fail-me");
} finally {
cadence.stop();
}
});

test("queue_stats counters increment after each drain", async () => {
const config = setupEnv();
const db = newDb();
Expand Down
85 changes: 75 additions & 10 deletions src/evolution/__tests__/engine.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,17 +201,14 @@ describe("EvolutionEngine", () => {
const session = makeSession({ outcome: "success" });
await engine.afterSession(session);

// Phase 0 M4: `updateAfterSession` is called twice on the normal run
// path. Once at the top of `afterSession` so the mutex-skip path also
// updates counters (without that, dashboards undercount during bursts),
// and again inside `runCycle` with the real `hadCorrections` value.
// Both calls increment `session_count`, so the normal path double-counts
// by one. This is the tradeoff the Phase 0 reviewer explicitly accepted
// because Phase 2 replaces the drop-on-floor mutex with a real queue
// and removes this bookkeeping entirely.
// Phase 1+2 collapsed `updateAfterSession` to a single call inside
// `runCycle`, after observation extraction supplies the real
// `hadCorrections` value. The Phase 0 M4 increments at the top of
// `afterSessionInternal` and inside `enqueueIfWorthy` were obsolete
// once the persistent queue replaced the drop-on-floor mutex path.
const metrics = engine.getMetrics();
expect(metrics.session_count).toBe(2);
expect(metrics.success_count).toBe(2);
expect(metrics.session_count).toBe(1);
expect(metrics.success_count).toBe(1);
});

test("constitution violation is rejected", async () => {
Expand Down Expand Up @@ -249,6 +246,34 @@ describe("EvolutionEngine", () => {
expect(metrics.rollback_count).toBe(1);
});

test("rollback fires the runtime refresh callback with the rolled-back state", async () => {
// Codex finding on PR #63: the apply path refreshes the runtime via
// `notifyConfigApplied` but the auto-rollback branch was reverting disk
// state without firing the callback, leaving the runtime serving the
// rolled-forward snapshot. The fix wires the same callback into
// `rollback()` so any path that mutates disk state also refreshes the
// runtime. `getConfig()` re-reads from disk on every call, so the
// callback observes the post-rollback content.
const engine = new EvolutionEngine(CONFIG_PATH);
const refreshes: Array<{ version: number; userProfile: string }> = [];
engine.setOnConfigApplied(() => {
const config = engine.getConfig();
refreshes.push({ version: engine.getCurrentVersion(), userProfile: config.userProfile });
});

await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
expect(refreshes.length).toBe(1);
expect(refreshes[0].version).toBeGreaterThan(0);
expect(refreshes[0].userProfile).toContain("TypeScript");

engine.rollback(0);
// Two callback invocations now: one from the original apply, one from
// the rollback. The second sees the version-0 state on disk.
expect(refreshes.length).toBe(2);
expect(refreshes[1].version).toBe(0);
expect(refreshes[1].userProfile).not.toContain("TypeScript");
});

test("preference is detected and applied", async () => {
const engine = new EvolutionEngine(CONFIG_PATH);
const session = makeSession({
Expand All @@ -261,6 +286,46 @@ describe("EvolutionEngine", () => {
expect(userProfile.toLowerCase()).toContain("vim");
});

test("setOnConfigApplied fires after a successful afterSession that applies changes", async () => {
// C1 contract: the engine owns the runtime refresh callback and fires
// it from the apply path. Production wiring in src/index.ts uses this
// exact setter to refresh the runtime's evolvedConfig snapshot from
// disk. Test against the engine directly so a future refactor that
// breaks the callback shape fails here, not at boot time on a VM.
const engine = new EvolutionEngine(CONFIG_PATH);
const versions: number[] = [];
engine.setOnConfigApplied(() => {
versions.push(engine.getCurrentVersion());
});
await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
expect(versions.length).toBeGreaterThanOrEqual(1);
expect(versions.at(-1)).toBe(engine.getCurrentVersion());
});

test("setOnConfigApplied does not fire when no changes are applied", async () => {
const engine = new EvolutionEngine(CONFIG_PATH);
let calls = 0;
engine.setOnConfigApplied(() => {
calls += 1;
});
// A neutral session with no correction signals applies zero changes.
await engine.afterSession(makeSession({ user_messages: ["What time is it?"] }));
expect(calls).toBe(0);
});

test("setOnConfigApplied callback errors do not wedge the pipeline", async () => {
// A telemetry/refresh failure in the callback must not poison the
// evolution result. The engine swallows and warns, the pipeline
// returns normally, and the next applied change retries the refresh.
const engine = new EvolutionEngine(CONFIG_PATH);
engine.setOnConfigApplied(() => {
throw new Error("simulated runtime refresh failure");
});
const result = await engine.afterSession(makeSession({ user_messages: ["No, use TypeScript not JavaScript"] }));
expect(result.changes_applied.length).toBeGreaterThan(0);
expect(result.version).toBeGreaterThan(0);
});

test("evolved config is available in getConfig after changes", async () => {
const engine = new EvolutionEngine(CONFIG_PATH);
const session = makeSession({
Expand Down
36 changes: 36 additions & 0 deletions src/evolution/__tests__/gate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type FakeRuntime = {
schema: unknown;
model?: string;
maxTokens?: number;
omitPreset?: boolean;
}) => Promise<unknown>;
};

Expand Down Expand Up @@ -93,6 +94,41 @@ describe("decideGate Haiku happy path", () => {
expect(decision.reason).toContain("naming");
});

test("gate forwards omitPreset=true so judgeQuery skips the claude_code preset envelope", async () => {
// The gate is a pure pass/skip evaluation that never reads files or
// runs tools, so it must opt out of the `claude_code` system prompt
// preset that bundles the full Claude Code base prompt and tool
// catalog. Live fleet data showed gate cost running 20-180x the
// research target until this flag was wired through. Asserting the
// flag at the call site is the durable defense against a future
// refactor accidentally re-introducing the preset overhead.
const captured: Array<{ omitPreset?: boolean; model?: string; maxTokens?: number }> = [];
const runtime: FakeRuntime = {
judgeQuery: async (options) => {
captured.push({
omitPreset: options.omitPreset,
model: options.model,
maxTokens: options.maxTokens,
});
return {
verdict: "pass" as const,
confidence: 0.9,
reasoning: "",
data: { evolve: false, reason: "routine" },
model: "claude-haiku-4-5",
inputTokens: 420,
outputTokens: 28,
costUsd: 0.0005,
durationMs: 900,
};
},
};
await decideGate(makeSession(), runtime as unknown as AgentRuntime);
expect(captured).toHaveLength(1);
expect(captured[0].omitPreset).toBe(true);
expect(captured[0].maxTokens).toBe(200);
});

test("evolve=false returns skip source=haiku", async () => {
const runtime: FakeRuntime = {
judgeQuery: async () => ({
Expand Down
33 changes: 18 additions & 15 deletions src/evolution/__tests__/mutex-and-retry-ceiling.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import { mkdirSync, rmSync, writeFileSync } from "node:fs";
import { JudgeSubprocessError } from "../../agent/judge-query.ts";
import type { AgentRuntime } from "../../agent/runtime.ts";
import type { PhantomConfig } from "../../config/types.ts";
Expand Down Expand Up @@ -356,11 +356,17 @@ describe("Phase 0 mutex guard", () => {
}
});

test("mutex skip path still bumps session_count so dashboards do not undercount", async () => {
// M4: without this the normal-vs-skip paths diverge and operators
// watching session_count in the dashboard see undercounting during
// bursts. The skip path now updates session metrics with
// hadCorrections=false at the top of afterSession.
test("mutex skip path returns a skipped result without spawning a new cycle", async () => {
// Phase 1+2 collapsed `updateAfterSession` to a single call inside
// `runCycle`, after observation extraction supplies the real
// `hadCorrections` value. The previous Phase 0 M4 top-of-afterSession
// increment was load-bearing only while the mutex could permanently
// drop a session; the persistent queue replaces that drop-on-floor
// path so an enqueued session eventually reaches `runCycle` and the
// counter increments exactly once. This test pins the new shape: the
// active cycle still holds the mutex, the second call sees the guard
// and returns a skipped result with zero applied changes, and no
// extra runCycle is launched on top of the hanging one.
type Releaser = (reason: unknown) => void;
const releaseHangRef: { fn: Releaser | null } = { fn: null };
const hang = new Promise<never>((_resolve, reject) => {
Expand All @@ -376,15 +382,12 @@ describe("Phase 0 mutex guard", () => {
const callA = engine.afterSession(makeSession({ session_id: "active" }));
await Promise.resolve();
await Promise.resolve();
await engine.afterSession(makeSession({ session_id: "skip-1" }));
await engine.afterSession(makeSession({ session_id: "skip-2" }));

const metricsPath = `${TEST_DIR}/phantom-config/meta/metrics.json`;
const metrics = JSON.parse(readFileSync(metricsPath, "utf-8"));
// Three afterSession calls, three counter increments from the top-of-
// afterSession update. The active call is still hanging so its inner
// updateAfterSession has not run yet, so we assert exactly 3.
expect(metrics.session_count).toBe(3);
const skipResult1 = await engine.afterSession(makeSession({ session_id: "skip-1" }));
const skipResult2 = await engine.afterSession(makeSession({ session_id: "skip-2" }));

expect(skipResult1.changes_applied).toHaveLength(0);
expect(skipResult2.changes_applied).toHaveLength(0);
expect((engine as unknown as { activeCycleSkipCount: number }).activeCycleSkipCount).toBe(2);

releaseHangRef.fn?.(new Error("released by test"));
try {
Expand Down
Loading
Loading