Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions scripts/agent-evals/src/runner/agent-test-runner.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { ToolDef } from "./tool-matcher.js";

export interface AgentTestRunner {
/**
* Simulates typing a string and waits for the turn to complete. It types one
Expand All @@ -13,8 +15,8 @@ export interface AgentTestRunner {
expectText(text: string | RegExp): Promise<void>;

/**
* Reads the agent's telemetry file and looks for the given event. Throws if
* the event is not found
* Reads the agent's telemetry and looks for the given tool calls. Throws if
* an event is not found
*/
expectTelemetryEvent(eventName: string): Promise<void>;
expectToolCalls(tools: ToolDef[]): Promise<void>;
}
151 changes: 137 additions & 14 deletions scripts/agent-evals/src/runner/gemini-cli-runner.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,44 @@
import { mkdirSync, writeFileSync, readFileSync, existsSync } from "node:fs";
import { mkdirSync, writeFileSync, readFileSync } from "node:fs";
import path from "node:path";
import { InteractiveCLI, poll } from "./interactive-cli.js";
import { AgentTestRunner } from "./agent-test-runner.js";
import {
ParsedToolLog,
getToolName,
toolArgumentsMatch,
getToolArgumentsDebug,
} from "./tool-matcher.js";
import fs from "fs";
import { throwFailure } from "./logging.js";

const READY_PROMPT = "Type your message";

/**
 * Shape of a single JSON object parsed out of the agent's telemetry file.
 * Both top-level fields are optional, so one type covers every record kind.
 */
interface ParsedTelemetryLog {
// Attributes attached to a logged event
attributes?: {
// Event type; tool calls are recognised by the value "gemini_cli.tool_call"
"event.name"?: string;
// Name of the tool that was called
function_name?: string;
// Tool arguments as a string. Presumably serialized JSON, since the reader
// falls back to "{}" when it is missing; confirm against the telemetry format
function_args?: string;
// Whether the tool call succeeded; the reader treats a missing value as false
success?: boolean;
// Presumably the duration of the tool call in milliseconds; the reader treats
// a missing value as 0
duration_ms?: number;
};
// NOTE(review): looks like this is only present on metric records. It is not
// read anywhere in the visible code; confirm before relying on it
scopeMetrics?: {
metrics: {
descriptor: {
name: string;
};
}[];
}[];
}

export class GeminiCliRunner implements AgentTestRunner {
private readonly cli: InteractiveCLI;
private readonly telemetryPath: string;
private readonly telemetryTimeout = 15000;

// Determines which tools to start from for this turn so we don't detect tool
// calls from previous turns
private turnToolIndex = 0;

constructor(
private readonly testName: string,
testDir: string,
Expand All @@ -29,8 +58,6 @@
},
mcpServers: {
firebase: {
// TODO: Add a mode where developers can run against their npm run watch command
// command: path.resolve(runDir, "../../../../../lib/bin/firebase.js"),
command: "firebase",
args: ["experimental:mcp"],
},
Expand All @@ -52,6 +79,8 @@
}

async type(text: string): Promise<void> {
  // Remember how many tool calls are already in the telemetry log so that
  // expectToolCalls() only inspects the calls made from this turn onwards
  this.turnToolIndex = this.readToolLogs().length;
  return this.cli.type(text);
}

Expand All @@ -67,21 +96,115 @@
* Reads the agent's telemetry and looks for the given tool calls made during
* the current turn. Throws if an expected tool call is not found
*/
async expectTelemetryEvent(eventName: string): Promise<void> {
// NOTE: This doesn't take into account "turns" yet. It will likely look
// through the entire history, not just the last turn
const found = await poll(() => {
if (!existsSync(this.telemetryPath)) {
async expectToolCalls(tools: string[]): Promise<void> {
await this.waitForTelemetryReady();

// We still need to poll because telemetry can take time to write each turn
let messages: string[] = [];
const success = await poll(() => {
messages = [];
let allSucceeded = true;
// Start at this.turnToolIndex so we only read the tools used this turn
const toolLogs = this.readToolLogs().slice(this.turnToolIndex);
const foundToolNames = toolLogs.map((log) => log.name);
for (const toolDef of tools) {
const toolName = getToolName(toolDef);
const matchingTool = toolLogs.find((log) => log.name === toolName);
if (!matchingTool) {
messages.push(
`Did not find expected tool call: "${toolName}" in the telemetry log. Found [${foundToolNames}]`,

Check warning on line 115 in scripts/agent-evals/src/runner/gemini-cli-runner.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Invalid type "string[]" of template literal expression
);
allSucceeded = false;
} else {
const foundMatchingArguments = toolLogs.some(
(log) => log.name === toolName && toolArgumentsMatch(toolDef, log),
);
if (!foundMatchingArguments) {
messages.push(
`Tool arguments matcher "${getToolArgumentsDebug(toolDef)}" for "${toolName}" did not match any tool results in the telemetry log. All tools are: [${JSON.stringify(toolLogs)}]`,
);
allSucceeded = false;
}
}
}
return allSucceeded;
}, this.telemetryTimeout);

if (!success) {
throwFailure(messages.join("\n"));
}
}

// Implementation for this is borrowed from the Gemini CLI's test-helper
private async waitForTelemetryReady() {

Check warning on line 139 in scripts/agent-evals/src/runner/gemini-cli-runner.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Missing return type on function
// Wait for telemetry file to exist and have content
await poll(() => {
if (!fs.existsSync(this.telemetryPath)) return false;
try {
const content = readFileSync(this.telemetryPath, "utf-8");
// Check if file has at least one event in it
return content.includes('"event.name"');
} catch {
return false;
}
const content = readFileSync(this.telemetryPath, "utf-8");
return content.includes(eventName);
}, this.telemetryTimeout);
}

// Implementation for this is borrowed from the Gemini CLI's test-helper
private readToolLogs(): ParsedToolLog[] {
const parsedLogs = this.readAndParseTelemetryLog();
const logs: ParsedToolLog[] = [];

if (!found) {
throw new Error(`Did not find expected telemetry event: "${eventName}" in the telemetry log`);
} else {
console.log(` [FOUND] expectTelemetryEvent: ${eventName}`);
for (const logData of parsedLogs) {
// Look for tool call logs
if (
logData.attributes?.function_name &&
logData.attributes["event.name"] === "gemini_cli.tool_call"
) {
logs.push({
name: logData.attributes.function_name,
args: logData.attributes.function_args ?? "{}",
success: logData.attributes.success ?? false,
duration_ms: logData.attributes.duration_ms ?? 0,
});
}
}

return logs;
}

// Implementation for this is borrowed from the Gemini CLI's test-helper
private readAndParseTelemetryLog(): ParsedTelemetryLog[] {
const logFilePath = this.telemetryPath;
if (!logFilePath || !fs.existsSync(logFilePath)) {
return [];
}

const content = readFileSync(logFilePath, "utf-8");

// Split the content into individual JSON objects
// They are separated by "}\n{"
const jsonObjects = content
.split(/}\n{/)
.map((obj, index, array) => {
// Add back the braces we removed during split
if (index > 0) obj = "{" + obj;
if (index < array.length - 1) obj = obj + "}";
return obj.trim();
})
.filter((obj) => obj);

const logs: ParsedTelemetryLog[] = [];

for (const jsonStr of jsonObjects) {
try {
const logData = JSON.parse(jsonStr);

Check warning on line 201 in scripts/agent-evals/src/runner/gemini-cli-runner.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe assignment of an `any` value
logs.push(logData);

Check warning on line 202 in scripts/agent-evals/src/runner/gemini-cli-runner.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe argument of type `any` assigned to a parameter of type `ParsedTelemetryLog`
} catch (e) {
// Skip objects that aren't valid JSON
}
}

return logs;
}
}
7 changes: 4 additions & 3 deletions scripts/agent-evals/src/runner/interactive-cli.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import * as pty from "node-pty";
import { IPty } from "node-pty";
import stripAnsi from "strip-ansi";
import { throwFailure } from "./logging.js";

export async function poll(predicate: () => boolean, timeout: number): Promise<boolean> {

Check warning on line 6 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Missing JSDoc comment
const startTime = Date.now();
while (Date.now() - startTime < timeout) {
if (predicate()) {
Expand Down Expand Up @@ -34,7 +35,7 @@
args: string[],
private readonly options: RunInteractiveOptions,
) {
this.ptyProcess = pty.spawn(command, args, {

Check warning on line 38 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe call of an `any` typed value

Check warning on line 38 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe member access .spawn on an `any` value

Check warning on line 38 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe assignment of an `any` value
name: "xterm-color",
cols: 80,
rows: 30,
Expand All @@ -42,7 +43,7 @@
env: { ...process.env, ...options.env },
});

this.ptyProcess.onData((data) => {

Check warning on line 46 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe call of an `any` typed value

Check warning on line 46 in scripts/agent-evals/src/runner/interactive-cli.ts

View workflow job for this annotation

GitHub Actions / lint (20)

Unsafe member access .onData on an `any` value
this.turnOutput += data;
if (options.showOutput) {
process.stdout.write(data);
Expand Down Expand Up @@ -96,7 +97,7 @@
}, this.timeout);

if (!found) {
throw new Error(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
throwFailure(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
}
}

Expand All @@ -121,7 +122,7 @@
}, timeout);

if (!stoppedChanging) {
throw new Error(`CLI did not stop changing output within ${timeout}ms`);
throwFailure(`CLI did not stop changing output within ${timeout}ms`);
}
}

Expand All @@ -140,7 +141,7 @@
}

if (!found) {
throw new Error(`Did not find expected text: "${text}" in the latest output`);
throwFailure(`Did not find expected text: "${text}" in the latest output`);
} else {
console.log(` [FOUND] expectText: ${text}`);
}
Expand Down
19 changes: 19 additions & 0 deletions scripts/agent-evals/src/runner/logging.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/** ANSI escape sequences used to style console output. */
const COLORS = {
  RESET: "\x1b[0m",
  BRIGHT: "\x1b[1m",
  BLUE: "\x1b[34m",
  GREEN: "\x1b[32m",
  RED: "\x1b[31m",
};

/**
 * Prints `message` with console.log, prefixed by the given escape sequence(s)
 * and followed by a reset so later output is not affected.
 */
function colorLog(color: string, message: string): void {
  console.log(`${color}${message}${COLORS.RESET}`);
}

/**
 * Logs `message` in bright red and then throws it as an `Error`.
 *
 * The return type is `never` so that callers get control-flow narrowing after
 * a call (for example `if (!x) throwFailure("..."); use(x);`).
 *
 * @param message Description of the failure; used for both the log line and
 *     the thrown error.
 * @throws Error Always, with `message` as the error message.
 */
export function throwFailure(message: string): never {
  // Log this separately because mocha doesn't print errors from failures
  // that happen before the final repetition. The failure can be helpful to get
  // early signal that the test is going to fail all repetitions
  colorLog(COLORS.BRIGHT + COLORS.RED, message);
  throw new Error(message);
}
68 changes: 68 additions & 0 deletions scripts/agent-evals/src/runner/tool-matcher.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
 * Describes a tool call that a test expects the agent to have made. It is
 * either just the tool name, or an object with extra conditions on the call.
 */
export type ToolDef =
// Name of the tool. Asserts that the tool with this name was called successfully
| string
| {
// Name of the tool
name: string;
// Asserts that the tool arguments contain this string
argumentContains?: string;
// Asserts that the tool's success equals this value. When omitted, the tool
// call is asserted to have succeeded
successIs?: boolean;
};

/**
 * A single tool call extracted from the agent's telemetry log.
 */
export interface ParsedToolLog {
// Name of the tool that was called
name: string;
// Tool arguments as a string; matchers search it by substring
args: string;
// Whether the tool call succeeded
success: boolean;
// Presumably the duration of the tool call in milliseconds (confirm against
// the telemetry format)
duration_ms: number;
}

/**
 * Returns the name of the tool a matcher refers to.
 *
 * @param toolDef Either a bare tool name or a matcher object.
 * @returns The string itself for a bare name, otherwise the object's `name`.
 */
export function getToolName(toolDef: ToolDef): string {
  return typeof toolDef === "string" ? toolDef : toolDef.name;
}

/**
 * Builds a short human-readable description of the conditions a matcher
 * places on a tool call, for use in failure messages.
 *
 * @param toolDef Either a bare tool name or a matcher object.
 * @returns A comma-separated list such as `success=true,contains=foo`.
 */
export function getToolArgumentsDebug(toolDef: ToolDef): string {
  // If you just pass a string, assert that the tool was successful
  if (typeof toolDef === "string") {
    return "success=true";
  }

  const out: string[] = [];
  // If you don't pass successIs, assert that it was successful. Use `??`
  // rather than a truthiness check so that an explicit `successIs: false` is
  // reported as `success=false`, matching what toolArgumentsMatch asserts
  out.push(`success=${toolDef.successIs ?? true}`);
  if (toolDef.argumentContains) {
    out.push(`contains=${toolDef.argumentContains}`);
  }
  return out.join(",");
}

/**
 * Checks whether a logged tool call satisfies the conditions of a matcher.
 * The tool name is NOT compared here; callers filter by name themselves.
 *
 * @param toolDef Either a bare tool name or a matcher object.
 * @param log The logged tool call to test.
 * @returns `true` when every condition of the matcher holds for `log`.
 */
export function toolArgumentsMatch(toolDef: ToolDef, log: ParsedToolLog): boolean {
  // If you just pass a string, assert that the tool was successful
  if (typeof toolDef === "string") {
    return Boolean(log.success);
  }

  const { argumentContains: needle, successIs: expectedSuccess } = toolDef;

  // No substring requested means the arguments always match
  const argsOk = !needle || log.args.includes(needle);

  // If you don't pass successIs, assert that it was successful
  const successOk =
    expectedSuccess === undefined ? Boolean(log.success) : log.success === expectedSuccess;

  return argsOk && successOk;
}
8 changes: 8 additions & 0 deletions scripts/agent-evals/src/tests/firebase-init.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,13 @@ describe("/firebase:init", function (this: Mocha.Suite) {
);

await run.type("Yes that looks good. Use Firebase Project gcli-ext-sam-01");
await run.expectToolCalls([
"firebase_update_environment",
{
name: "firebase_read_resources",
argumentContains: "firebase://guides/init/backend",
successIs: true,
},
]);
});
});
Loading