diff --git a/scripts/agent-evals/src/runner/agent-test-runner.ts b/scripts/agent-evals/src/runner/agent-test-runner.ts index e7d40330c88..2bb026df91b 100644 --- a/scripts/agent-evals/src/runner/agent-test-runner.ts +++ b/scripts/agent-evals/src/runner/agent-test-runner.ts @@ -1,3 +1,5 @@ +import { ToolDef } from "./tool-matcher.js"; + export interface AgentTestRunner { /** * Simulates typing a string and waits for the turn to complete. It types one @@ -13,8 +15,8 @@ export interface AgentTestRunner { expectText(text: string | RegExp): Promise; /** - * Reads the agent's telemetry file and looks for the given event. Throws if - * the event is not found + * Reads the agent's telemetry and looks for the given tool calls. Throws if + * an expected tool call is not found */ - expectTelemetryEvent(eventName: string): Promise; + expectToolCalls(tools: ToolDef[]): Promise<void>; } diff --git a/scripts/agent-evals/src/runner/gemini-cli-runner.ts b/scripts/agent-evals/src/runner/gemini-cli-runner.ts index 46919c6b1d1..389356f583d 100644 --- a/scripts/agent-evals/src/runner/gemini-cli-runner.ts +++ b/scripts/agent-evals/src/runner/gemini-cli-runner.ts @@ -1,15 +1,45 @@ -import { mkdirSync, writeFileSync, readFileSync, existsSync } from "node:fs"; +import { mkdirSync, writeFileSync, readFileSync } from "node:fs"; import path from "node:path"; import { InteractiveCLI, poll } from "./interactive-cli.js"; import { AgentTestRunner } from "./agent-test-runner.js"; +import { + ParsedToolLog, + ToolDef, + getToolName, + toolArgumentsMatch, + getToolArgumentsDebug, +} from "./tool-matcher.js"; +import fs from "fs"; +import { throwFailure } from "./logging.js"; const READY_PROMPT = "Type your message"; +interface ParsedTelemetryLog { + attributes?: { + "event.name"?: string; + function_name?: string; + function_args?: string; + success?: boolean; + duration_ms?: number; + }; + scopeMetrics?: { + metrics: { + descriptor: { + name: string; + }; + }[]; + }[]; +} + export class GeminiCliRunner implements AgentTestRunner 
{ private readonly cli: InteractiveCLI; private readonly telemetryPath: string; private readonly telemetryTimeout = 15000; + // Determines which tools to start from for this turn so we don't detect tool + // calls from previous turns + private turnToolIndex = 0; + constructor( private readonly testName: string, testDir: string, @@ -29,8 +59,6 @@ export class GeminiCliRunner implements AgentTestRunner { }, mcpServers: { firebase: { - // TODO: Add a mode where developers can run against their npm run watch command - // command: path.resolve(runDir, "../../../../../lib/bin/firebase.js"), command: "firebase", args: ["experimental:mcp"], }, @@ -52,6 +80,8 @@ export class GeminiCliRunner implements AgentTestRunner { } async type(text: string): Promise { + const toolLogs = this.readToolLogs(); + this.turnToolIndex = toolLogs.length; return this.cli.type(text); } @@ -67,21 +97,115 @@ export class GeminiCliRunner implements AgentTestRunner { * Reads the agent's telemetry file and looks for the given event. Throws if * the event is not found */ - async expectTelemetryEvent(eventName: string): Promise { - // NOTE: This doesn't take into account "turns" yet. 
It will likely look - // through the entire history, not just the last turn - const found = await poll(() => { - if (!existsSync(this.telemetryPath)) { + async expectToolCalls(tools: ToolDef[]): Promise<void> { + await this.waitForTelemetryReady(); + + // We still need to poll because telemetry can take time to write each turn + let messages: string[] = []; + const success = await poll(() => { + messages = []; + let allSucceeded = true; + // Start at this.turnToolIndex so we only read the tools used this turn + const toolLogs = this.readToolLogs().slice(this.turnToolIndex); + const foundToolNames = toolLogs.map((log) => log.name); + for (const toolDef of tools) { + const toolName = getToolName(toolDef); + const matchingTool = toolLogs.find((log) => log.name === toolName); + if (!matchingTool) { + messages.push( + `Did not find expected tool call: "${toolName}" in the telemetry log. Found [${foundToolNames}]`, + ); + allSucceeded = false; + } else { + const foundMatchingArguments = toolLogs.some( + (log) => log.name === toolName && toolArgumentsMatch(toolDef, log), + ); + if (!foundMatchingArguments) { + messages.push( + `Tool arguments matcher "${getToolArgumentsDebug(toolDef)}" for "${toolName}" did not match any tool results in the telemetry log. 
All tools are: [${JSON.stringify(toolLogs)}]`, + ); + allSucceeded = false; + } + } + } + return allSucceeded; + }, this.telemetryTimeout); + + if (!success) { + throwFailure(messages.join("\n")); + } + } + + // Implementation for this is borrowed from the Gemini CLI's test-helper + private async waitForTelemetryReady() { + // Wait for telemetry file to exist and have content + await poll(() => { + if (!fs.existsSync(this.telemetryPath)) return false; + try { + const content = readFileSync(this.telemetryPath, "utf-8"); + // Check if file has at least one event in it + return content.includes('"event.name"'); + } catch { return false; } - const content = readFileSync(this.telemetryPath, "utf-8"); - return content.includes(eventName); }, this.telemetryTimeout); + } + + // Implementation for this is borrowed from the Gemini CLI's test-helper + private readToolLogs(): ParsedToolLog[] { + const parsedLogs = this.readAndParseTelemetryLog(); + const logs: ParsedToolLog[] = []; - if (!found) { - throw new Error(`Did not find expected telemetry event: "${eventName}" in the telemetry log`); - } else { - console.log(` [FOUND] expectTelemetryEvent: ${eventName}`); + for (const logData of parsedLogs) { + // Look for tool call logs + if ( + logData.attributes?.function_name && + logData.attributes["event.name"] === "gemini_cli.tool_call" + ) { + logs.push({ + name: logData.attributes.function_name, + args: logData.attributes.function_args ?? "{}", + success: logData.attributes.success ?? false, + duration_ms: logData.attributes.duration_ms ?? 
0, + }); + } + } + + return logs; + } + + // Implementation for this is borrowed from the Gemini CLI's test-helper + private readAndParseTelemetryLog(): ParsedTelemetryLog[] { + const logFilePath = this.telemetryPath; + if (!logFilePath || !fs.existsSync(logFilePath)) { + return []; + } + + const content = readFileSync(logFilePath, "utf-8"); + + // Split the content into individual JSON objects + // They are separated by "}\n{" + const jsonObjects = content + .split(/}\n{/) + .map((obj, index, array) => { + // Add back the braces we removed during split + if (index > 0) obj = "{" + obj; + if (index < array.length - 1) obj = obj + "}"; + return obj.trim(); + }) + .filter((obj) => obj); + + const logs: ParsedTelemetryLog[] = []; + + for (const jsonStr of jsonObjects) { + try { + const logData = JSON.parse(jsonStr); + logs.push(logData); + } catch { + // Skip objects that aren't valid JSON + } } + + return logs; } } diff --git a/scripts/agent-evals/src/runner/interactive-cli.ts b/scripts/agent-evals/src/runner/interactive-cli.ts index 4c564d16050..f783cddcc4e 100644 --- a/scripts/agent-evals/src/runner/interactive-cli.ts +++ b/scripts/agent-evals/src/runner/interactive-cli.ts @@ -1,6 +1,7 @@ import * as pty from "node-pty"; import { IPty } from "node-pty"; import stripAnsi from "strip-ansi"; +import { throwFailure } from "./logging.js"; export async function poll(predicate: () => boolean, timeout: number): Promise { const startTime = Date.now(); @@ -96,7 +97,7 @@ export class InteractiveCLI { }, this.timeout); if (!found) { - throw new Error(`Did not find expected text: "${text}" in output within ${this.timeout}ms`); + throwFailure(`Did not find expected text: "${text}" in output within ${this.timeout}ms`); } } @@ -121,7 +122,7 @@ export class InteractiveCLI { }, timeout); if (!stoppedChanging) { - throw new Error(`CLI did not stop changing output within ${timeout}ms`); + throwFailure(`CLI did not stop changing output within ${timeout}ms`); } } @@ -140,7 +141,7 @@ 
export class InteractiveCLI { } if (!found) { - throw new Error(`Did not find expected text: "${text}" in the latest output`); + throwFailure(`Did not find expected text: "${text}" in the latest output`); } else { console.log(` [FOUND] expectText: ${text}`); } diff --git a/scripts/agent-evals/src/runner/logging.ts b/scripts/agent-evals/src/runner/logging.ts new file mode 100644 index 00000000000..9efb4ebb6a5 --- /dev/null +++ b/scripts/agent-evals/src/runner/logging.ts @@ -0,0 +1,19 @@ +const COLORS = { + RESET: "\x1b[0m", + BRIGHT: "\x1b[1m", + BLUE: "\x1b[34m", + GREEN: "\x1b[32m", + RED: "\x1b[31m", +}; + +function colorLog(color: string, message: string): void { + console.log(`${color}${message}${COLORS.RESET}`); +} + +export function throwFailure(message: string): never { + // Log this separately because mocha doesn't print errors from failures + // that happen before the final repetition. The failure can be helpful to get + // early signal that the test is going to fail all repetitions + colorLog(COLORS.BRIGHT + COLORS.RED, message); + throw new Error(message); +} diff --git a/scripts/agent-evals/src/runner/tool-matcher.ts b/scripts/agent-evals/src/runner/tool-matcher.ts new file mode 100644 index 00000000000..c1e37a519d1 --- /dev/null +++ b/scripts/agent-evals/src/runner/tool-matcher.ts @@ -0,0 +1,68 @@ +export type ToolDef = + // Asserts that the tool with this name was called successfully + | string + | { + // Name of the tool + name: string; + // Asserts that the tool arguments contain this string + argumentContains?: string; + // Asserts that the tool's success equals this value + successIs?: boolean; + }; + +export interface ParsedToolLog { + name: string; + args: string; + success: boolean; + duration_ms: number; +} + +export function getToolName(toolDef: ToolDef): string { + if (typeof toolDef === "string") { + return toolDef; + } + return toolDef.name; +} + +export function getToolArgumentsDebug(toolDef: ToolDef): string { + if (typeof toolDef !== "string") 
{ + const out = []; + if (toolDef.successIs !== undefined) { + out.push(`success=${toolDef.successIs}`); + // If you don't pass successIs, assert that it was successful + } else { + out.push(`success=true`); + } + if (toolDef.argumentContains) { + out.push(`contains=${toolDef.argumentContains}`); + } + return out.join(","); + } + // If you just pass a string, assert that the tool was successful + return "success=true"; +} + +export function toolArgumentsMatch(toolDef: ToolDef, log: ParsedToolLog): boolean { + let success = true; + if (typeof toolDef !== "string") { + if (toolDef.argumentContains) { + if (!log.args.includes(toolDef.argumentContains)) { + success = false; + } + } + if (toolDef.successIs !== undefined) { + if (log.success !== toolDef.successIs) { + success = false; + } + // If you don't pass successIs, assert that it was successful + } else if (!log.success) { + success = false; + } + // If you just pass a string, assert that the tool was successful + } else { + if (!log.success) { + success = false; + } + } + return success; +} diff --git a/scripts/agent-evals/src/tests/firebase-init.spec.ts b/scripts/agent-evals/src/tests/firebase-init.spec.ts index 2063cfa1518..3769d573d06 100644 --- a/scripts/agent-evals/src/tests/firebase-init.spec.ts +++ b/scripts/agent-evals/src/tests/firebase-init.spec.ts @@ -19,5 +19,13 @@ describe("/firebase:init", function (this: Mocha.Suite) { ); await run.type("Yes that looks good. Use Firebase Project gcli-ext-sam-01"); + await run.expectToolCalls([ + "firebase_update_environment", + { + name: "firebase_read_resources", + argumentContains: "firebase://guides/init/backend", + successIs: true, + }, + ]); }); });