// ---------------------------------------------------------------------------
// Module-level session state for the MCP server.
// ---------------------------------------------------------------------------

// Container / browser lifecycle.
let codecept = null
let containerInitialized = false
let browserStarted = false

// Shell-session lifecycle (a session opened without running a test).
let shellSessionActive = false
let bootstrapDone = false

// Signature of the plugin overrides the current container was created with.
let currentPluginsSig = ''

// Mirrors the dir the aiTrace plugin computes per test/session.
let currentAiTraceDir = null

// Recompute the trace directory every time a test (or synthetic session test)
// is about to start. Failures are deliberately ignored: the previous value is
// kept rather than letting a listener error interrupt the run.
event.dispatcher.on(event.test.before, test => {
  try {
    let title = 'MCP Session'
    if (test) {
      const ownTitle = test.fullTitle ? test.fullTitle() : test.title
      if (ownTitle) title = ownTitle
    }
    currentAiTraceDir = traceDirFor(test?.file, title, outputBaseDir())
  } catch {}
})

// Error text thrown when a tool that needs a live session is called without one.
const SESSION_REQUIRED_ERROR = 'No active CodeceptJS session. Call `start_browser` to open a shell session, or `run_test` (use `pause()` in the test, or set `pauseAt`) to inspect during a test run.'
/**
 * Call `codecept.bootstrap()` at most once per container lifetime.
 * The `bootstrapDone` flag is cleared again by `teardownContainer`.
 */
async function ensureBootstrap() {
  if (bootstrapDone) return
  await codecept.bootstrap()
  bootstrapDone = true
}

/**
 * Open a shell session that is not tied to a test run.
 *
 * Bootstraps if needed, starts the recorder, then emits synthetic
 * `suite.before` and `test.before` events titled "MCP Session" so that
 * listeners on those events run as they would at the start of a real test.
 * Does nothing when a shell session is already active.
 */
async function startShellSession() {
  if (shellSessionActive) return
  await ensureBootstrap()
  recorder.start()
  event.emit(event.suite.before, {
    fullTitle: () => 'MCP Session',
    tests: [],
    retries: () => {},
  })
  event.emit(event.test.before, {
    title: 'MCP Session',
    artifacts: {},
    retries: () => {},
  })
  shellSessionActive = true
}

/**
 * Close the shell session opened by `startShellSession`.
 *
 * Emits `test.after`, `suite.after` and `all.result` with empty payloads.
 * Each emit is best-effort: a throwing listener must not prevent the
 * remaining events from firing or the session flag from being cleared.
 */
async function endShellSession() {
  if (!shellSessionActive) return
  try { event.emit(event.test.after, {}) } catch {}
  try { event.emit(event.suite.after, {}) } catch {}
  try { event.emit(event.all.result, {}) } catch {}
  shellSessionActive = false
}

/**
 * Guard for tools that need a live session.
 *
 * @throws {Error} `SESSION_REQUIRED_ERROR` when there is neither an active
 *   shell session nor a `pausedController` (presumably set while a test run
 *   is paused — it is defined outside this block).
 */
function ensureSession() {
  if (shellSessionActive || pausedController) return
  throw new Error(SESSION_REQUIRED_ERROR)
}

/**
 * Normalize the `plugins` tool argument into a `{ name: optionsObject }` map.
 *
 * - Anything that is not a plain (non-array) object yields `{}`.
 * - A value of `false` drops the plugin from the result.
 * - A plain options object is passed through unchanged.
 * - Every other value (`true`, `null`, strings, numbers, arrays) means
 *   "enable with defaults" and becomes `{}`. Passing such values through
 *   would make `applyPluginOverrides` spread them into bogus index keys.
 *
 * @param {*} plugins raw value received from the tool call
 * @returns {Object<string, object>} normalized overrides
 */
function normalizePluginOverrides(plugins) {
  if (!plugins || typeof plugins !== 'object' || Array.isArray(plugins)) return {}
  const out = {}
  for (const [name, opts] of Object.entries(plugins)) {
    if (opts === false) continue
    const isOptionsObject = opts !== null && typeof opts === 'object' && !Array.isArray(opts)
    out[name] = isOptionsObject ? opts : {}
  }
  return out
}

/**
 * Merge plugin overrides into `config.plugins`, mutating `config` in place.
 * Override options win over existing ones and `enabled` is forced to `true`.
 *
 * @param {object} config CodeceptJS config object (a `plugins` map is created if absent)
 * @param {Object<string, object>} plugins normalized overrides
 */
function applyPluginOverrides(config, plugins) {
  config.plugins = config.plugins || {}
  for (const [name, opts] of Object.entries(plugins)) {
    config.plugins[name] = { ...(config.plugins[name] || {}), ...opts, enabled: true }
  }
}

/**
 * Return a copy of `value` in which every plain object has its keys sorted,
 * recursively. Arrays keep their element order; primitives and non-plain
 * objects are returned untouched.
 */
function canonicalizeForSignature(value) {
  if (Array.isArray(value)) return value.map(canonicalizeForSignature)
  if (value === null || typeof value !== 'object') return value
  const proto = Object.getPrototypeOf(value)
  if (proto !== Object.prototype && proto !== null) return value
  return Object.fromEntries(
    Object.keys(value).sort().map(key => [key, canonicalizeForSignature(value[key])]),
  )
}

/**
 * Build a string that identifies a set of plugin overrides.
 *
 * Plugin names and nested option keys are both sorted, so two override maps
 * with the same content always produce the same signature regardless of key
 * order. This matters because a signature change makes `initCodecept` tear
 * the container down.
 *
 * @param {Object<string, object>} plugins normalized overrides
 * @returns {string} JSON signature (`'[]'` for an empty map)
 */
function pluginsSignature(plugins) {
  const keys = Object.keys(plugins).sort()
  return JSON.stringify(keys.map(k => [k, canonicalizeForSignature(plugins[k])]))
}

/**
 * Shut the current container down and reset all lifecycle state.
 *
 * Ends any shell session, calls `_finish` on each helper and then
 * `codecept.teardown()` when available (both best-effort). The state reset
 * sits in `finally` so that an unexpected failure, such as
 * `container.helpers()` throwing, cannot leave the module believing a
 * container is still initialized.
 */
async function teardownContainer() {
  if (!containerInitialized) return
  await endShellSession()
  try {
    const helpers = container.helpers()
    for (const helperName in helpers) {
      const helper = helpers[helperName]
      try { if (helper._finish) await helper._finish() } catch {}
    }
    try { if (codecept?.teardown) await codecept.teardown() } catch {}
  } finally {
    containerInitialized = false
    browserStarted = false
    bootstrapDone = false
    codecept = null
    currentPluginsSig = ''
  }
}
+ applyPluginOverrides(config, { aiTrace: {}, ...plugins }) + codecept = new Codecept(config, {}) await codecept.init(testRoot) await container.create(config, {}) @@ -351,8 +452,11 @@ async function initCodecept(configPath) { containerInitialized = true browserStarted = true + currentPluginsSig = sig } +const PLUGINS_DESCRIPTION = 'Enable CodeceptJS plugins for this run, mirroring the CLI `-p` flag. Keys are plugin names (e.g. screencast, aiTrace, pause, pageInfo, heal, retryFailedStep, screenshotOnFail, autoDelay). Value `true` or `{}` enables with defaults; an object merges options, e.g. {"screencast": {"saveScreenshots": true}, "aiTrace": {"on": "fail"}}. Changing the plugin set tears down and re-initializes the container (closes the browser).' + const server = new Server( { name: 'codeceptjs-mcp-server', version: '1.0.0' }, { capabilities: { tools: {} } } @@ -394,6 +498,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({ timeout: { type: 'number' }, config: { type: 'string' }, pauseAt: { type: 'number', description: '1-based step index. Test will pause after the Nth step completes. Useful as a programmatic breakpoint without editing the test.' 
}, + plugins: { type: 'object', description: PLUGINS_DESCRIPTION, additionalProperties: true }, }, required: ['test'], }, @@ -407,6 +512,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({ test: { type: 'string' }, timeout: { type: 'number' }, config: { type: 'string' }, + plugins: { type: 'object', description: PLUGINS_DESCRIPTION, additionalProperties: true }, }, required: ['test'], }, @@ -497,33 +603,26 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { case 'start_browser': { const configPath = args?.config - if (browserStarted) { - return { content: [{ type: 'text', text: JSON.stringify({ status: 'Browser already started' }, null, 2) }] } + if (browserStarted && shellSessionActive) { + return { content: [{ type: 'text', text: JSON.stringify({ status: 'Session already active' }, null, 2) }] } } await initCodecept(configPath) - return { content: [{ type: 'text', text: JSON.stringify({ status: 'Browser started successfully' }, null, 2) }] } + await startShellSession() + return { content: [{ type: 'text', text: JSON.stringify({ status: 'Session started — run_code and snapshot are now available' }, null, 2) }] } } case 'stop_browser': { if (!containerInitialized) { return { content: [{ type: 'text', text: JSON.stringify({ status: 'Browser not initialized' }, null, 2) }] } } - - const helpers = container.helpers() - for (const helperName in helpers) { - const helper = helpers[helperName] - try { if (helper._finish) await helper._finish() } catch {} - } - - browserStarted = false - containerInitialized = false - + await teardownContainer() return { content: [{ type: 'text', text: JSON.stringify({ status: 'Browser stopped successfully' }, null, 2) }] } } case 'snapshot': { const { config: configPath, fullPage = false } = args || {} await initCodecept(configPath) + ensureSession() const helper = pickActingHelper(container.helpers()) if (!helper) throw new Error('No supported acting helper available (Playwright, Puppeteer, 
WebDriver).') @@ -588,6 +687,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { case 'run_code': { const { code, timeout = 60000, config: configPath, saveArtifacts = true } = args await initCodecept(configPath) + ensureSession() const I = container.support('I') if (!I) throw new Error('I object not available. Make sure helpers are configured.') @@ -604,6 +704,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { mkdirp.sync(traceDir) const startedAt = Date.now() + // Pin the latest aiTrace ARIA file before running the code, so we + // can diff after. aiTrace owns per-step capture; we just read it. + const reader = new TraceReader(currentAiTraceDir) + const ariaBefore = reader.last('aria') + const MAX_LOG_ENTRIES = 100 const MAX_LOG_MSG_BYTES = 2000 const MAX_RETURN_BYTES = 20000 @@ -666,6 +771,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { } } + // Diff against the latest aiTrace ARIA file produced by the steps + // that just ran inside this run_code call. + const ariaAfter = reader.last('aria') + if (ariaBefore && ariaAfter && ariaBefore !== ariaAfter) { + const diff = ariaDiff(ariaBefore, ariaAfter) + if (diff) result.ariaDiff = diff + } + const traceFile = writeTraceMarkdown({ dir: traceDir, title: 'run_code', @@ -686,8 +799,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { if (pausedController) { throw new Error('A previous run_test is still paused. 
Call "continue" first.') } - const { test, timeout = 60000, config: configPathArg, pauseAt } = args || {} - await initCodecept(configPathArg) + const { test, timeout = 60000, config: configPathArg, pauseAt, plugins } = args || {} + await initCodecept(configPathArg, plugins) + await endShellSession() return await withSilencedIO(async () => { codecept.loadTests() @@ -740,7 +854,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { let runError = null const runPromise = (async () => { try { - await codecept.bootstrap() + await ensureBootstrap() await codecept.run(testFile) } catch (err) { runError = err @@ -779,8 +893,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { if (pausedController) { throw new Error('A previous run is still paused. Call "continue" first.') } - const { test, timeout = 60000, config: configPath } = args || {} - await initCodecept(configPath) + const { test, timeout = 60000, config: configPath, plugins } = args || {} + await initCodecept(configPath, plugins) + await endShellSession() return await withSilencedIO(async () => { codecept.loadTests() @@ -832,7 +947,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { let runError = null const runPromise = (async () => { try { - await codecept.bootstrap() + await ensureBootstrap() await codecept.run(testFile) } catch (err) { runError = err diff --git a/docs/agents.md b/docs/agents.md new file mode 100644 index 000000000..3358256bd --- /dev/null +++ b/docs/agents.md @@ -0,0 +1,159 @@ +--- +permalink: /agents +title: Agentic Testing +--- + +# Agentic Testing + +CodeceptJS ships an **MCP server and a skillset** that lets an AI agent (Claude Code, Cursor, Codex, others) write and fix tests by driving the real browser. The agent runs the same `I.*` commands the test does, reads how the page responds, and only commits the lines that succeeded. 
+ +## Why MCP + +The traditional agent testing loop is test/fix/retry, where the agent executes a test, watches it fail, reads artifacts, performs code fixes, and reruns the test. The agent applies fixes by intelligent guess — looking at the ARIA tree, HTML, and screenshot — then assumes the fix is enough and reruns the test hoping it will pass. If the guess is wrong and the test runs for over a minute, it may take dozens of minutes of iteration and a lot of wasted tokens. + +To improve that flow, the agent can spawn a browser and open the page the way the test does. This lets it interact with the page more freely and perform multi-step actions. But putting that experience back into test code is not efficient either: actions executed in the browser may not be relevant in test context, so the agent ends up in another guess-and-try loop. + +The problem is that **the test runs in a different context than the agent**. + +The agent can launch a test but can't control it while it's running. It can't access the browser. It can't set a breakpoint. + +This is where CodeceptJS MCP steps in. Connected to the agent, it can: + +- run a test and pause it on failure +- interact with the browser in a test context +- test locators and perform actions live while the test is running +- write successful actions to the test file + +This lets the agent get a test working in one iteration. The agent can live-write the test before your eyes by exploring the page and performing actions that eventually land in the CodeceptJS test file. + +**Live debugging of tests** is what CodeceptJS MCP provides. The agent receives feedback faster — not from a whole test execution but from specific actions on a specific page — so it can adjust and react faster, trying different approaches. + +The MCP server is the agent-facing equivalent of the `pause()` REPL — same access, driven by tool calls instead of keystrokes. Full tool reference at [/mcp](/mcp). 
+ +## The loop + +Whether the agent is writing a new test or fixing an old one, it follows the same cycle. + +1. **Open the page.** Run a stub test (new work) or set a breakpoint at the failing step (fix). The browser lands at the right starting point and yields control to the agent. +2. **Read the page.** MCP saves the page's HTML, ARIA tree, and a screenshot to files (and the agent can call the `snapshot` tool to refresh them). The agent reads those files before deciding what to try next, controlling its token usage. +3. **Run a CodeceptJS command.** The agent tries `I.*` commands like `I.click('Add to cart')`, `I.fillField('Email', secret(process.env.EMAIL))`, `I.see('Confirmed')`. On success, that line goes into the test — same syntax. +4. **Check the result.** The response after each command shows the new page state. If the page changed as expected — for example, the URL changed or a modal opened — the line goes into the verified sequence. If not, the agent reads the page again and tries a different locator or a wait. +5. **Move forward.** The agent looks at the new state and chooses the next command. Steps 2–4 repeat until the scenario is complete. +6. **Commit to the file.** The agent edits the test — replaces `pause()` (new tests) or the broken line (fixes) with the verified sequence — then reruns end-to-end and reads the trace to confirm. + +## How the agent reads the page + +MCP commands are token-efficient — they don't stream large HTML pages back to the model. MCP writes artifacts to disk under `output/trace_*/` and returns file paths. The agent reads each artifact with its own bash tools — `cat`, `grep`, `jq`. 
+ +A `run_code` response, for example, looks like this: + +```json +{ + "status": "success", + "artifacts": { + "url": "http://localhost:8000/", + "html": "file:///output/trace_run_code_.../mcp_page.html", + "aria": "file:///output/trace_run_code_.../mcp_aria.txt", + "screenshot": "file:///output/trace_run_code_.../mcp_screenshot.png", + "console": "file:///output/trace_run_code_.../mcp_console.json", + "storage": "file:///output/trace_run_code_.../mcp_storage.json" + } +} +``` + +Only `url` is inline. The rest are paths the agent opens with the right tool: + +| Artifact | How the agent reads it | +|----------|------------------------| +| `*_screenshot.png` | As an image — most agents are multimodal | +| `*_aria.txt` | Whole — small and structured | +| `*_page.html` | With `grep` — too large for context, searchable for specific elements/attributes | +| `*_console.json` | With `jq` — filter for errors, 4xx/5xx, deprecation warnings | +| `*_storage.json` | Whole — cookies and `localStorage` snapshot | +| `trace.md` | Whole — markdown index linking every step to its artifacts | + +Saved HTML is formatted, with non-semantic elements stripped out: `