From 6d16bd7a6b6164168cd70b0e77067162544cb0f1 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 00:03:28 +0000 Subject: [PATCH 01/11] feat: add in-place workspace support for CLI/benchmark sessions Enables cmux to work directly in provided directories without requiring git worktrees, essential for terminal-bench integration and CLI usage. Changes: - agentSession: Detect in-place workspaces (not under srcBaseDir) and store path directly by setting projectPath === name as sentinel - aiService: Check for in-place mode and use stored path instead of reconstructing via runtime.getWorkspacePath() - streamManager: Fix cleanup safety by running rm -rf on the temp directory's basename from its parent directory, instead of on the full path from the home directory (~) (limits blast radius if path is malformed) Before: Terminal-bench failed with 'Working directory does not exist' After: Agents run successfully in task containers (e.g., /app) Tested with terminal-bench harness running multiple tasks successfully. --- src/services/agentSession.ts | 54 +++++++++++++++++++++++++---------- src/services/aiService.ts | 9 ++++-- src/services/streamManager.ts | 9 ++++-- 3 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/services/agentSession.ts b/src/services/agentSession.ts index ed2d34547..4d01bec95 100644 --- a/src/services/agentSession.ts +++ b/src/services/agentSession.ts @@ -180,11 +180,17 @@ export class AgentSession { if (existing.success) { // Metadata already exists, verify workspace path matches const metadata = existing.data; - // Directory name uses workspace name (not stable ID) - const runtime = createRuntime( - metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir } - ); - const expectedPath = runtime.getWorkspacePath(metadata.projectPath, metadata.name); + // For in-place workspaces (projectPath === name), use path directly + // Otherwise reconstruct using runtime's worktree pattern + const isInPlace = metadata.projectPath === metadata.name; + const expectedPath = isInPlace + ?
metadata.projectPath + : (() => { + const runtime = createRuntime( + metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir } + ); + return runtime.getWorkspacePath(metadata.projectPath, metadata.name); + })(); assert( expectedPath === normalizedWorkspacePath, `Existing metadata workspace path mismatch for ${this.workspaceId}: expected ${expectedPath}, got ${normalizedWorkspacePath}` @@ -192,16 +198,34 @@ export class AgentSession { return; } - // Derive project path from workspace path (parent directory) - const derivedProjectPath = path.dirname(normalizedWorkspacePath); - - const derivedProjectName = - projectName && projectName.trim().length > 0 - ? projectName.trim() - : path.basename(derivedProjectPath) || "unknown"; - - // Extract name from workspace path (last component) - const workspaceName = path.basename(normalizedWorkspacePath); + // Detect in-place workspace: if workspacePath is not under srcBaseDir, + // it's a direct workspace (e.g., for CLI/benchmarks) rather than a worktree + const srcBaseDir = this.config.srcDir; + const normalizedSrcBaseDir = path.resolve(srcBaseDir); + const isUnderSrcBaseDir = normalizedWorkspacePath.startsWith(normalizedSrcBaseDir + path.sep); + + let derivedProjectPath: string; + let workspaceName: string; + let derivedProjectName: string; + + if (isUnderSrcBaseDir) { + // Standard worktree mode: workspace is under ~/.cmux/src/project/branch + derivedProjectPath = path.dirname(normalizedWorkspacePath); + workspaceName = path.basename(normalizedWorkspacePath); + derivedProjectName = + projectName && projectName.trim().length > 0 + ? projectName.trim() + : path.basename(derivedProjectPath) || "unknown"; + } else { + // In-place mode: workspace is a standalone directory + // Store the workspace path directly by setting projectPath === name + derivedProjectPath = normalizedWorkspacePath; + workspaceName = normalizedWorkspacePath; + derivedProjectName = + projectName && projectName.trim().length > 0 + ? 
projectName.trim() + : path.basename(normalizedWorkspacePath) || "unknown"; + } const metadata: WorkspaceMetadata = { id: this.workspaceId, diff --git a/src/services/aiService.ts b/src/services/aiService.ts index 00c68dd9d..dfb528b6e 100644 --- a/src/services/aiService.ts +++ b/src/services/aiService.ts @@ -519,11 +519,16 @@ export class AIService extends EventEmitter { return Err({ type: "unknown", raw: `Workspace ${workspaceId} not found in config` }); } - // Get workspace path (directory name uses workspace name) + // Get workspace path - handle both worktree and in-place modes const runtime = createRuntime( metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir } ); - const workspacePath = runtime.getWorkspacePath(metadata.projectPath, metadata.name); + // In-place workspaces (CLI/benchmarks) have projectPath === name + // Use path directly instead of reconstructing via getWorkspacePath + const isInPlace = metadata.projectPath === metadata.name; + const workspacePath = isInPlace + ? 
metadata.projectPath + : runtime.getWorkspacePath(metadata.projectPath, metadata.name); // Build system message from workspace metadata const systemMessage = await buildSystemMessage( diff --git a/src/services/streamManager.ts b/src/services/streamManager.ts index b71ddd03f..10a5fe23e 100644 --- a/src/services/streamManager.ts +++ b/src/services/streamManager.ts @@ -1,4 +1,5 @@ import { EventEmitter } from "events"; +import * as path from "path"; import { streamText, stepCountIs, @@ -982,9 +983,13 @@ export class StreamManager extends EventEmitter { // Don't block stream completion waiting for directory deletion // This is especially important for SSH where rm -rf can take 500ms-2s if (streamInfo.runtimeTempDir) { + // Use parent directory as cwd for safety - if runtimeTempDir is malformed, + // we won't accidentally run rm -rf from root + const tempDirBasename = path.basename(streamInfo.runtimeTempDir); + const tempDirParent = path.dirname(streamInfo.runtimeTempDir); void streamInfo.runtime - .exec(`rm -rf "${streamInfo.runtimeTempDir}"`, { - cwd: "~", + .exec(`rm -rf "${tempDirBasename}"`, { + cwd: tempDirParent, timeout: 10, }) .then(async (result) => { From 0d1b14ebcd9953bf9aee64f0b13b0a1c72829a73 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 00:41:58 +0000 Subject: [PATCH 02/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20upload=20actual=20b?= =?UTF-8?q?enchmark=20results=20from=20runs/=20directory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workflow was trying to upload terminal-bench-results/ which doesn't exist. Terminal-bench writes results to runs/ by default. 
--- .github/workflows/terminal-bench.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 6aab52166..d014f93fd 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -66,7 +66,6 @@ jobs: with: name: terminal-bench-results path: | - terminal-bench-results/ - *.json + runs/ if-no-files-found: warn From 53490016947dd0b5a52c1c229385e2270cd36773 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 00:56:25 +0000 Subject: [PATCH 03/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20add=20terminal-be?= =?UTF-8?q?nch-results/=20to=20.gitignore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downloaded artifacts from terminal-bench CI runs should not be committed. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6314bc635..ef6b70066 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ tmpfork storybook-static/ *.tgz src/test-workspaces/ +terminal-bench-results/ From d114048c365552553a251ab1073b832ac0ee845d Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:15:59 +0000 Subject: [PATCH 04/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20skip=20worktree=20r?= =?UTF-8?q?emoval=20for=20in-place=20workspaces?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In-place workspaces (identified by projectPath === workspaceName) are direct workspace directories used by CLI/benchmark sessions, not git worktrees. Attempting to run 'git worktree remove' on them fails or attempts to remove the main checkout. This fix detects the in-place sentinel pattern and skips git worktree operations, allowing session cleanup without destructive filesystem operations. Resolves Codex review comment in PR #472. 
--- src/runtime/LocalRuntime.ts | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts index 69c9a61fb..cf31a9f37 100644 --- a/src/runtime/LocalRuntime.ts +++ b/src/runtime/LocalRuntime.ts @@ -512,6 +512,11 @@ export class LocalRuntime implements Runtime { _abortSignal?: AbortSignal ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> { // Note: _abortSignal ignored for local operations (fast, no need for cancellation) + + // In-place workspaces are identified by projectPath === workspaceName + // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees + const isInPlace = projectPath === workspaceName; + // Compute workspace path using the canonical method const deletedPath = this.getWorkspacePath(projectPath, workspaceName); @@ -520,16 +525,25 @@ export class LocalRuntime implements Runtime { await fsPromises.access(deletedPath); } catch { // Directory doesn't exist - operation is idempotent - // Prune stale git records (best effort) - try { - using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`); - await pruneProc.result; - } catch { - // Ignore prune errors - directory is already deleted, which is the goal + // For standard worktrees, prune stale git records (best effort) + if (!isInPlace) { + try { + using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`); + await pruneProc.result; + } catch { + // Ignore prune errors - directory is already deleted, which is the goal + } } return { success: true, deletedPath }; } + // For in-place workspaces, there's no worktree to remove + // Just return success - the workspace directory itself should not be deleted + // as it may contain the user's actual project files + if (isInPlace) { + return { success: true, deletedPath }; + } + try { // Use git worktree remove to delete the worktree // This updates git's internal worktree 
metadata correctly From f490ab74a2cc41b6c0e536e53cb54dc366e4c0f4 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:17:05 +0000 Subject: [PATCH 05/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20automatically=20set?= =?UTF-8?q?=20upstream=20in=20wait=5Fpr=5Fchecks.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the current branch has no upstream, automatically run git push -u to set it instead of failing. This makes the script more user-friendly for new branches. --- scripts/wait_pr_checks.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh index 8e74ac983..671c568d3 100755 --- a/scripts/wait_pr_checks.sh +++ b/scripts/wait_pr_checks.sh @@ -28,9 +28,18 @@ CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/null || echo "") if [[ -z "$REMOTE_BRANCH" ]]; then - echo "❌ Error: Current branch '$CURRENT_BRANCH' has no upstream branch." >&2 - echo "Set an upstream with: git push -u origin $CURRENT_BRANCH" >&2 - exit 1 + echo "⚠️ Current branch '$CURRENT_BRANCH' has no upstream branch." >&2 + echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2 + + # Try to set upstream + if git push -u origin "$CURRENT_BRANCH" 2>&1; then + echo "✅ Upstream set successfully!" >&2 + REMOTE_BRANCH="origin/$CURRENT_BRANCH" + else + echo "❌ Error: Failed to set upstream branch." 
>&2 + echo "You may need to push manually: git push -u origin $CURRENT_BRANCH" >&2 + exit 1 + fi fi # Check if local and remote are in sync From fa8a0695ada91c05d199b2c474ac5c6cf07979e6 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:18:46 +0000 Subject: [PATCH 06/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20format=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/runtime/LocalRuntime.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts index cf31a9f37..d62f4bd60 100644 --- a/src/runtime/LocalRuntime.ts +++ b/src/runtime/LocalRuntime.ts @@ -512,11 +512,11 @@ export class LocalRuntime implements Runtime { _abortSignal?: AbortSignal ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> { // Note: _abortSignal ignored for local operations (fast, no need for cancellation) - + // In-place workspaces are identified by projectPath === workspaceName // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees const isInPlace = projectPath === workspaceName; - + // Compute workspace path using the canonical method const deletedPath = this.getWorkspacePath(projectPath, workspaceName); From 07362af3c52e1bf37248033b1a17ac86929060ae Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:18:52 +0000 Subject: [PATCH 07/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20format=20wait=5Fp?= =?UTF-8?q?r=5Fchecks.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/wait_pr_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh index 671c568d3..77ec30c9b 100755 --- a/scripts/wait_pr_checks.sh +++ b/scripts/wait_pr_checks.sh @@ -30,7 +30,7 @@ REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/nu if [[ -z "$REMOTE_BRANCH" ]]; then echo 
"⚠️ Current branch '$CURRENT_BRANCH' has no upstream branch." >&2 echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2 - + # Try to set upstream if git push -u origin "$CURRENT_BRANCH" 2>&1; then echo "✅ Upstream set successfully!" >&2 From 2355fd1e389c7f989896ccc76b7f82098f171344 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:24:58 +0000 Subject: [PATCH 08/11] =?UTF-8?q?=F0=9F=A4=96=20feat:=20add=20nightly=20te?= =?UTF-8?q?rminal-bench=20schedule?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Run full benchmark suite (~80 tasks) every night at midnight UTC - Concurrency=4 is appropriate for full suite (60-90 min estimated) - Timeout=180 min (3 hours) provides safety margin - Use default fallbacks for scheduled runs (no inputs) - Add unique artifact names with run_id to avoid conflicts - Set 30-day retention for nightly benchmark artifacts --- .github/workflows/terminal-bench.yml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index d014f93fd..c061adb7b 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -1,6 +1,9 @@ name: Terminal-Bench on: + schedule: + # Run full benchmark suite every night at midnight UTC + - cron: '0 0 * * *' workflow_dispatch: inputs: dataset: @@ -31,7 +34,9 @@ jobs: benchmark: name: Run Terminal-Bench runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }} - timeout-minutes: 180 # 3 hours - terminal-bench can take a long time + # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes + # Allow 3 hours for safety margin and slower tasks + timeout-minutes: 180 steps: - name: Checkout code uses: actions/checkout@v4 @@ -52,11 +57,11 @@ jobs: - name: Run Terminal-Bench run: make benchmark-terminal env: - TB_DATASET: ${{ inputs.dataset }} - TB_CONCURRENCY: ${{ inputs.concurrency 
}} + TB_DATASET: ${{ inputs.dataset || 'terminal-bench-core==0.1.1' }} + TB_CONCURRENCY: ${{ inputs.concurrency || '4' }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} - TB_SAMPLE_SIZE: ${{ inputs.sample_size }} - TB_ARGS: ${{ inputs.extra_args }} + TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }} + TB_ARGS: ${{ inputs.extra_args || '' }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -64,8 +69,9 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: terminal-bench-results + name: terminal-bench-results-${{ github.run_id }} path: | runs/ if-no-files-found: warn + retention-days: 30 From 5f09559d4cdd7f9336c94da8326b616253851692 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:30:36 +0000 Subject: [PATCH 09/11] =?UTF-8?q?=F0=9F=A4=96=20docs:=20clarify=20terminal?= =?UTF-8?q?-bench-core=3D=3D0.1.1=20is=20the=20full=20suite?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit terminal-bench-core==0.1.1 contains ~80 tasks, which is the complete stable benchmark suite. The -head version is bleeding-edge dev. 
--- .github/workflows/terminal-bench.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index c061adb7b..4bf272266 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -2,7 +2,8 @@ name: Terminal-Bench on: schedule: - # Run full benchmark suite every night at midnight UTC + # Run full benchmark suite (~80 tasks) every night at midnight UTC + # Uses terminal-bench-core==0.1.1 which is the stable, full benchmark suite - cron: '0 0 * * *' workflow_dispatch: inputs: From dfb6daed5f088e8d97932dd144289a4aa315d25d Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:36:40 +0000 Subject: [PATCH 10/11] =?UTF-8?q?=F0=9F=A4=96=20feat:=20run=20nightly=20be?= =?UTF-8?q?nchmarks=20for=20both=20Sonnet=204.5=20and=20Codex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use matrix strategy to run both models every night: - anthropic:claude-sonnet-4-5 (high thinking) - openai:gpt-5-codex (high thinking) Matrix only applies to scheduled runs (cron), not manual workflow_dispatch. Artifacts are named uniquely per model to avoid conflicts. This enables direct comparison of model performance on the full 80-task suite. 
--- .github/workflows/terminal-bench.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 4bf272266..35355b02d 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -33,11 +33,17 @@ on: jobs: benchmark: - name: Run Terminal-Bench + name: Run Terminal-Bench (${{ matrix.model_name }}) runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }} # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes # Allow 3 hours for safety margin and slower tasks timeout-minutes: 180 + strategy: + # Run scheduled benchmarks for both models + matrix: + model_name: ${{ github.event_name == 'schedule' && fromJSON('["anthropic:claude-sonnet-4-5", "openai:gpt-5-codex"]') || fromJSON('[""]') }} + thinking_level: ${{ github.event_name == 'schedule' && fromJSON('["high"]') || fromJSON('[""]') }} + fail-fast: false steps: - name: Checkout code uses: actions/checkout@v4 @@ -62,7 +68,7 @@ jobs: TB_CONCURRENCY: ${{ inputs.concurrency || '4' }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }} - TB_ARGS: ${{ inputs.extra_args || '' }} + TB_ARGS: ${{ matrix.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', matrix.model_name, matrix.thinking_level, inputs.extra_args || '') || inputs.extra_args || '' }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -70,7 +76,7 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: terminal-bench-results-${{ github.run_id }} + name: terminal-bench-results-${{ matrix.model_name && format('{0}-{1}', matrix.model_name, github.run_id) || github.run_id }} path: | runs/ if-no-files-found: warn From 9dca9486f847e326b43db478f0da29195c4eaa08 Mon Sep 17 00:00:00 2001 From: Ammar Date: Wed, 29 Oct 2025 01:38:44 +0000 
Subject: [PATCH 11/11] =?UTF-8?q?=F0=9F=A4=96=20refactor:=20split=20nightl?= =?UTF-8?q?y=20benchmarks=20into=20separate=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleaner architecture: - terminal-bench.yml: Reusable workflow (workflow_call + workflow_dispatch) - nightly-terminal-bench.yml: Scheduled runner with matrix strategy Benefits: - Main workflow stays simple for manual use - Nightly schedule logic isolated in dedicated file - Easy to add more models to nightly runs - Manual workflow_dispatch supports model/thinking overrides Nightly runs both models at midnight UTC: - anthropic:claude-sonnet-4-5 (high thinking) - openai:gpt-5-codex (high thinking) --- .github/workflows/nightly-terminal-bench.yml | 50 ++++++++++++++ .github/workflows/terminal-bench.yml | 68 +++++++++++++++----- 2 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 .github/workflows/nightly-terminal-bench.yml diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml new file mode 100644 index 000000000..e78b2ce2d --- /dev/null +++ b/.github/workflows/nightly-terminal-bench.yml @@ -0,0 +1,50 @@ +name: Nightly Terminal-Bench + +on: + schedule: + # Run full benchmark suite (~80 tasks) every night at midnight UTC + - cron: '0 0 * * *' + workflow_dispatch: + inputs: + models: + description: 'Models to test (comma-separated, or "all" for both)' + required: false + default: 'all' + type: string + +jobs: + determine-models: + name: Determine models to test + runs-on: ubuntu-latest + outputs: + models: ${{ steps.set-models.outputs.models }} + steps: + - name: Set models matrix + id: set-models + run: | + if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then + echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> $GITHUB_OUTPUT + else + # Convert comma-separated to JSON array + models="${{ inputs.models }}" + models_json=$(echo "$models" | jq 
-R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))') + echo "models=$models_json" >> $GITHUB_OUTPUT + fi + + benchmark: + name: ${{ matrix.model }} + needs: determine-models + strategy: + matrix: + model: ${{ fromJSON(needs.determine-models.outputs.models) }} + fail-fast: false + uses: ./.github/workflows/terminal-bench.yml + with: + model_name: ${{ matrix.model }} + thinking_level: 'high' + dataset: 'terminal-bench-core==0.1.1' + concurrency: '4' + livestream: true + secrets: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 35355b02d..c8fa90057 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -1,10 +1,44 @@ name: Terminal-Bench on: - schedule: - # Run full benchmark suite (~80 tasks) every night at midnight UTC - # Uses terminal-bench-core==0.1.1 which is the stable, full benchmark suite - - cron: '0 0 * * *' + workflow_call: + inputs: + model_name: + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)' + required: false + type: string + thinking_level: + description: 'Thinking level (off, low, medium, high)' + required: false + type: string + dataset: + description: 'Terminal-Bench dataset to use' + required: false + type: string + default: 'terminal-bench-core==0.1.1' + concurrency: + description: 'Number of concurrent tasks (--n-concurrent)' + required: false + type: string + default: '4' + livestream: + description: 'Enable livestream mode' + required: false + type: boolean + default: true + sample_size: + description: 'Number of random tasks to run (empty = all tasks)' + required: false + type: string + extra_args: + description: 'Additional arguments to pass to terminal-bench' + required: false + type: string + secrets: + ANTHROPIC_API_KEY: + required: true + OPENAI_API_KEY: + required: true workflow_dispatch: inputs: dataset: @@ -26,6 +60,14 @@ on: 
description: 'Number of random tasks to run (empty = all tasks)' required: false type: string + model_name: + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)' + required: false + type: string + thinking_level: + description: 'Thinking level (off, low, medium, high)' + required: false + type: string extra_args: description: 'Additional arguments to pass to terminal-bench' required: false @@ -33,17 +75,11 @@ on: jobs: benchmark: - name: Run Terminal-Bench (${{ matrix.model_name }}) + name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }} runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }} # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes # Allow 3 hours for safety margin and slower tasks timeout-minutes: 180 - strategy: - # Run scheduled benchmarks for both models - matrix: - model_name: ${{ github.event_name == 'schedule' && fromJSON('["anthropic:claude-sonnet-4-5", "openai:gpt-5-codex"]') || fromJSON('[""]') }} - thinking_level: ${{ github.event_name == 'schedule' && fromJSON('["high"]') || fromJSON('[""]') }} - fail-fast: false steps: - name: Checkout code uses: actions/checkout@v4 @@ -64,11 +100,11 @@ jobs: - name: Run Terminal-Bench run: make benchmark-terminal env: - TB_DATASET: ${{ inputs.dataset || 'terminal-bench-core==0.1.1' }} - TB_CONCURRENCY: ${{ inputs.concurrency || '4' }} + TB_DATASET: ${{ inputs.dataset }} + TB_CONCURRENCY: ${{ inputs.concurrency }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} - TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }} - TB_ARGS: ${{ matrix.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', matrix.model_name, matrix.thinking_level, inputs.extra_args || '') || inputs.extra_args || '' }} + TB_SAMPLE_SIZE: ${{ inputs.sample_size }} + TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', 
inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -76,7 +112,7 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: terminal-bench-results-${{ matrix.model_name && format('{0}-{1}', matrix.model_name, github.run_id) || github.run_id }} + name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }} path: | runs/ if-no-files-found: warn