From 6d16bd7a6b6164168cd70b0e77067162544cb0f1 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 00:03:28 +0000
Subject: [PATCH 01/11] feat: add in-place workspace support for CLI/benchmark
 sessions

Enables cmux to work directly in provided directories without requiring
git worktrees, essential for terminal-bench integration and CLI usage.

Changes:
- agentSession: Detect in-place workspaces (not under srcBaseDir) and store
  the workspace path directly; projectPath and name are both set to that
  path, so projectPath === name serves as the in-place sentinel
- aiService: Check for in-place mode and use stored path instead of
  reconstructing via runtime.getWorkspacePath()
- streamManager: Fix cleanup safety by running rm -rf on the temp dir's
  basename from its parent directory, instead of on the full path from the
  home directory (limits blast radius if the path is malformed)

Before: Terminal-bench failed with 'Working directory does not exist'
After: Agents run successfully in task containers (e.g., /app)

Tested with terminal-bench harness running multiple tasks successfully.
---
 src/services/agentSession.ts  | 54 +++++++++++++++++++++++++----------
 src/services/aiService.ts     |  9 ++++--
 src/services/streamManager.ts |  9 ++++--
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/src/services/agentSession.ts b/src/services/agentSession.ts
index ed2d34547..4d01bec95 100644
--- a/src/services/agentSession.ts
+++ b/src/services/agentSession.ts
@@ -180,11 +180,17 @@ export class AgentSession {
     if (existing.success) {
       // Metadata already exists, verify workspace path matches
       const metadata = existing.data;
-      // Directory name uses workspace name (not stable ID)
-      const runtime = createRuntime(
-        metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
-      );
-      const expectedPath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+      // For in-place workspaces (projectPath === name), use path directly
+      // Otherwise reconstruct using runtime's worktree pattern
+      const isInPlace = metadata.projectPath === metadata.name;
+      const expectedPath = isInPlace
+        ? metadata.projectPath
+        : (() => {
+            const runtime = createRuntime(
+              metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
+            );
+            return runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+          })();
       assert(
         expectedPath === normalizedWorkspacePath,
         `Existing metadata workspace path mismatch for ${this.workspaceId}: expected ${expectedPath}, got ${normalizedWorkspacePath}`
@@ -192,16 +198,34 @@ export class AgentSession {
       return;
     }
 
-    // Derive project path from workspace path (parent directory)
-    const derivedProjectPath = path.dirname(normalizedWorkspacePath);
-
-    const derivedProjectName =
-      projectName && projectName.trim().length > 0
-        ? projectName.trim()
-        : path.basename(derivedProjectPath) || "unknown";
-
-    // Extract name from workspace path (last component)
-    const workspaceName = path.basename(normalizedWorkspacePath);
+    // Detect in-place workspace: if workspacePath is not under srcBaseDir,
+    // it's a direct workspace (e.g., for CLI/benchmarks) rather than a worktree
+    const srcBaseDir = this.config.srcDir;
+    const normalizedSrcBaseDir = path.resolve(srcBaseDir);
+    const isUnderSrcBaseDir = normalizedWorkspacePath.startsWith(normalizedSrcBaseDir + path.sep);
+
+    let derivedProjectPath: string;
+    let workspaceName: string;
+    let derivedProjectName: string;
+
+    if (isUnderSrcBaseDir) {
+      // Standard worktree mode: workspace is under ~/.cmux/src/project/branch
+      derivedProjectPath = path.dirname(normalizedWorkspacePath);
+      workspaceName = path.basename(normalizedWorkspacePath);
+      derivedProjectName =
+        projectName && projectName.trim().length > 0
+          ? projectName.trim()
+          : path.basename(derivedProjectPath) || "unknown";
+    } else {
+      // In-place mode: workspace is a standalone directory
+      // Store the workspace path directly by setting projectPath === name
+      derivedProjectPath = normalizedWorkspacePath;
+      workspaceName = normalizedWorkspacePath;
+      derivedProjectName =
+        projectName && projectName.trim().length > 0
+          ? projectName.trim()
+          : path.basename(normalizedWorkspacePath) || "unknown";
+    }
 
     const metadata: WorkspaceMetadata = {
       id: this.workspaceId,
diff --git a/src/services/aiService.ts b/src/services/aiService.ts
index 00c68dd9d..dfb528b6e 100644
--- a/src/services/aiService.ts
+++ b/src/services/aiService.ts
@@ -519,11 +519,16 @@ export class AIService extends EventEmitter {
         return Err({ type: "unknown", raw: `Workspace ${workspaceId} not found in config` });
       }
 
-      // Get workspace path (directory name uses workspace name)
+      // Get workspace path - handle both worktree and in-place modes
       const runtime = createRuntime(
         metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
       );
-      const workspacePath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+      // In-place workspaces (CLI/benchmarks) have projectPath === name
+      // Use path directly instead of reconstructing via getWorkspacePath
+      const isInPlace = metadata.projectPath === metadata.name;
+      const workspacePath = isInPlace
+        ? metadata.projectPath
+        : runtime.getWorkspacePath(metadata.projectPath, metadata.name);
 
       // Build system message from workspace metadata
       const systemMessage = await buildSystemMessage(
diff --git a/src/services/streamManager.ts b/src/services/streamManager.ts
index b71ddd03f..10a5fe23e 100644
--- a/src/services/streamManager.ts
+++ b/src/services/streamManager.ts
@@ -1,4 +1,5 @@
 import { EventEmitter } from "events";
+import * as path from "path";
 import {
   streamText,
   stepCountIs,
@@ -982,9 +983,13 @@ export class StreamManager extends EventEmitter {
       // Don't block stream completion waiting for directory deletion
       // This is especially important for SSH where rm -rf can take 500ms-2s
       if (streamInfo.runtimeTempDir) {
+        // Use parent directory as cwd for safety - if runtimeTempDir is malformed,
+        // we won't accidentally run rm -rf from root
+        const tempDirBasename = path.basename(streamInfo.runtimeTempDir);
+        const tempDirParent = path.dirname(streamInfo.runtimeTempDir);
         void streamInfo.runtime
-          .exec(`rm -rf "${streamInfo.runtimeTempDir}"`, {
-            cwd: "~",
+          .exec(`rm -rf "${tempDirBasename}"`, {
+            cwd: tempDirParent,
             timeout: 10,
           })
           .then(async (result) => {

From 0d1b14ebcd9953bf9aee64f0b13b0a1c72829a73 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 00:41:58 +0000
Subject: [PATCH 02/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20upload=20actual=20b?=
 =?UTF-8?q?enchmark=20results=20from=20runs/=20directory?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The workflow was trying to upload terminal-bench-results/ which doesn't exist.
Terminal-bench writes results to runs/ by default.
---
 .github/workflows/terminal-bench.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 6aab52166..d014f93fd 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -66,7 +66,6 @@ jobs:
         with:
           name: terminal-bench-results
           path: |
-            terminal-bench-results/
-            *.json
+            runs/
           if-no-files-found: warn
 

From 53490016947dd0b5a52c1c229385e2270cd36773 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 00:56:25 +0000
Subject: [PATCH 03/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20add=20terminal-be?=
 =?UTF-8?q?nch-results/=20to=20.gitignore?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Downloaded artifacts from terminal-bench CI runs should not be committed.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 6314bc635..ef6b70066 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ tmpfork
 storybook-static/
 *.tgz
 src/test-workspaces/
+terminal-bench-results/

From d114048c365552553a251ab1073b832ac0ee845d Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:15:59 +0000
Subject: [PATCH 04/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20skip=20worktree=20r?=
 =?UTF-8?q?emoval=20for=20in-place=20workspaces?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In-place workspaces (identified by projectPath === workspaceName) are direct
workspace directories used by CLI/benchmark sessions, not git worktrees. Attempting
to run 'git worktree remove' on them fails or attempts to remove the main checkout.

This fix detects the in-place sentinel pattern and skips git worktree operations,
allowing session cleanup without destructive filesystem operations.

Resolves Codex review comment in PR #472.
---
 src/runtime/LocalRuntime.ts | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts
index 69c9a61fb..cf31a9f37 100644
--- a/src/runtime/LocalRuntime.ts
+++ b/src/runtime/LocalRuntime.ts
@@ -512,6 +512,11 @@ export class LocalRuntime implements Runtime {
     _abortSignal?: AbortSignal
   ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> {
     // Note: _abortSignal ignored for local operations (fast, no need for cancellation)
+    
+    // In-place workspaces are identified by projectPath === workspaceName
+    // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees
+    const isInPlace = projectPath === workspaceName;
+    
     // Compute workspace path using the canonical method
     const deletedPath = this.getWorkspacePath(projectPath, workspaceName);
 
@@ -520,16 +525,25 @@ export class LocalRuntime implements Runtime {
       await fsPromises.access(deletedPath);
     } catch {
       // Directory doesn't exist - operation is idempotent
-      // Prune stale git records (best effort)
-      try {
-        using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
-        await pruneProc.result;
-      } catch {
-        // Ignore prune errors - directory is already deleted, which is the goal
+      // For standard worktrees, prune stale git records (best effort)
+      if (!isInPlace) {
+        try {
+          using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
+          await pruneProc.result;
+        } catch {
+          // Ignore prune errors - directory is already deleted, which is the goal
+        }
       }
       return { success: true, deletedPath };
     }
 
+    // For in-place workspaces, there's no worktree to remove
+    // Just return success - the workspace directory itself should not be deleted
+    // as it may contain the user's actual project files
+    if (isInPlace) {
+      return { success: true, deletedPath };
+    }
+
     try {
       // Use git worktree remove to delete the worktree
       // This updates git's internal worktree metadata correctly

From f490ab74a2cc41b6c0e536e53cb54dc366e4c0f4 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:17:05 +0000
Subject: [PATCH 05/11] =?UTF-8?q?=F0=9F=A4=96=20fix:=20automatically=20set?=
 =?UTF-8?q?=20upstream=20in=20wait=5Fpr=5Fchecks.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the current branch has no upstream, automatically run git push -u
to set it instead of failing. This makes the script more user-friendly
for new branches.
---
 scripts/wait_pr_checks.sh | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh
index 8e74ac983..671c568d3 100755
--- a/scripts/wait_pr_checks.sh
+++ b/scripts/wait_pr_checks.sh
@@ -28,9 +28,18 @@ CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
 REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/null || echo "")
 
 if [[ -z "$REMOTE_BRANCH" ]]; then
-  echo "❌ Error: Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
-  echo "Set an upstream with: git push -u origin $CURRENT_BRANCH" >&2
-  exit 1
+  echo "⚠️  Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
+  echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2
+  
+  # Try to set upstream
+  if git push -u origin "$CURRENT_BRANCH" 2>&1; then
+    echo "✅ Upstream set successfully!" >&2
+    REMOTE_BRANCH="origin/$CURRENT_BRANCH"
+  else
+    echo "❌ Error: Failed to set upstream branch." >&2
+    echo "You may need to push manually: git push -u origin $CURRENT_BRANCH" >&2
+    exit 1
+  fi
 fi
 
 # Check if local and remote are in sync

From fa8a0695ada91c05d199b2c474ac5c6cf07979e6 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:18:46 +0000
Subject: [PATCH 06/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20format=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/runtime/LocalRuntime.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts
index cf31a9f37..d62f4bd60 100644
--- a/src/runtime/LocalRuntime.ts
+++ b/src/runtime/LocalRuntime.ts
@@ -512,11 +512,11 @@ export class LocalRuntime implements Runtime {
     _abortSignal?: AbortSignal
   ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> {
     // Note: _abortSignal ignored for local operations (fast, no need for cancellation)
-    
+
     // In-place workspaces are identified by projectPath === workspaceName
     // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees
     const isInPlace = projectPath === workspaceName;
-    
+
     // Compute workspace path using the canonical method
     const deletedPath = this.getWorkspacePath(projectPath, workspaceName);
 

From 07362af3c52e1bf37248033b1a17ac86929060ae Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:18:52 +0000
Subject: [PATCH 07/11] =?UTF-8?q?=F0=9F=A4=96=20chore:=20format=20wait=5Fp?=
 =?UTF-8?q?r=5Fchecks.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/wait_pr_checks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh
index 671c568d3..77ec30c9b 100755
--- a/scripts/wait_pr_checks.sh
+++ b/scripts/wait_pr_checks.sh
@@ -30,7 +30,7 @@ REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/nu
 if [[ -z "$REMOTE_BRANCH" ]]; then
   echo "⚠️  Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
   echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2
-  
+
   # Try to set upstream
   if git push -u origin "$CURRENT_BRANCH" 2>&1; then
     echo "✅ Upstream set successfully!" >&2

From 2355fd1e389c7f989896ccc76b7f82098f171344 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:24:58 +0000
Subject: [PATCH 08/11] =?UTF-8?q?=F0=9F=A4=96=20feat:=20add=20nightly=20te?=
 =?UTF-8?q?rminal-bench=20schedule?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Run full benchmark suite (~80 tasks) every night at midnight UTC
- Concurrency=4 is appropriate for full suite (60-90 min estimated)
- Timeout=180 min (3 hours) provides safety margin
- Use default fallbacks for scheduled runs (no inputs)
- Add unique artifact names with run_id to avoid conflicts
- Set 30-day retention for nightly benchmark artifacts
---
 .github/workflows/terminal-bench.yml | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index d014f93fd..c061adb7b 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -1,6 +1,9 @@
 name: Terminal-Bench
 
 on:
+  schedule:
+    # Run full benchmark suite every night at midnight UTC
+    - cron: '0 0 * * *'
   workflow_dispatch:
     inputs:
       dataset:
@@ -31,7 +34,9 @@ jobs:
   benchmark:
     name: Run Terminal-Bench
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
+    # Allow 3 hours for safety margin and slower tasks
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -52,11 +57,11 @@ jobs:
       - name: Run Terminal-Bench
         run: make benchmark-terminal
         env:
-          TB_DATASET: ${{ inputs.dataset }}
-          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          TB_DATASET: ${{ inputs.dataset || 'terminal-bench-core==0.1.1' }}
+          TB_CONCURRENCY: ${{ inputs.concurrency || '4' }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
-          TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
-          TB_ARGS: ${{ inputs.extra_args }}
+          TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }}
+          TB_ARGS: ${{ inputs.extra_args || '' }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
@@ -64,8 +69,9 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results
+          name: terminal-bench-results-${{ github.run_id }}
           path: |
             runs/
           if-no-files-found: warn
+          retention-days: 30
 

From 5f09559d4cdd7f9336c94da8326b616253851692 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:30:36 +0000
Subject: [PATCH 09/11] =?UTF-8?q?=F0=9F=A4=96=20docs:=20clarify=20terminal?=
 =?UTF-8?q?-bench-core=3D=3D0.1.1=20is=20the=20full=20suite?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

terminal-bench-core==0.1.1 contains ~80 tasks and is the complete, stable
benchmark suite. The -head version is the bleeding-edge development build.
---
 .github/workflows/terminal-bench.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c061adb7b..4bf272266 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -2,7 +2,8 @@ name: Terminal-Bench
 
 on:
   schedule:
-    # Run full benchmark suite every night at midnight UTC
+    # Run full benchmark suite (~80 tasks) every night at midnight UTC
+    # Uses terminal-bench-core==0.1.1 which is the stable, full benchmark suite
     - cron: '0 0 * * *'
   workflow_dispatch:
     inputs:

From dfb6daed5f088e8d97932dd144289a4aa315d25d Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:36:40 +0000
Subject: [PATCH 10/11] =?UTF-8?q?=F0=9F=A4=96=20feat:=20run=20nightly=20be?=
 =?UTF-8?q?nchmarks=20for=20both=20Sonnet=204.5=20and=20Codex?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use matrix strategy to run both models every night:
- anthropic:claude-sonnet-4-5 (high thinking)
- openai:gpt-5-codex (high thinking)

The two-model matrix only takes effect for scheduled runs (cron); manual
workflow_dispatch runs get a single empty matrix entry and use extra_args as-is.
Artifacts are named uniquely per model to avoid conflicts.

This enables direct comparison of model performance on the full 80-task suite.
---
 .github/workflows/terminal-bench.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 4bf272266..35355b02d 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -33,11 +33,17 @@ on:
 
 jobs:
   benchmark:
-    name: Run Terminal-Bench
+    name: Run Terminal-Bench (${{ matrix.model_name }})
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
     # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
     # Allow 3 hours for safety margin and slower tasks
     timeout-minutes: 180
+    strategy:
+      # Run scheduled benchmarks for both models
+      matrix:
+        model_name: ${{ github.event_name == 'schedule' && fromJSON('["anthropic:claude-sonnet-4-5", "openai:gpt-5-codex"]') || fromJSON('[""]') }}
+        thinking_level: ${{ github.event_name == 'schedule' && fromJSON('["high"]') || fromJSON('[""]') }}
+      fail-fast: false
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -62,7 +68,7 @@ jobs:
           TB_CONCURRENCY: ${{ inputs.concurrency || '4' }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }}
-          TB_ARGS: ${{ inputs.extra_args || '' }}
+          TB_ARGS: ${{ matrix.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', matrix.model_name, matrix.thinking_level, inputs.extra_args || '') || inputs.extra_args || '' }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
@@ -70,7 +76,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results-${{ github.run_id }}
+          name: terminal-bench-results-${{ matrix.model_name && format('{0}-{1}', matrix.model_name, github.run_id) || github.run_id }}
           path: |
             runs/
           if-no-files-found: warn

From 9dca9486f847e326b43db478f0da29195c4eaa08 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Wed, 29 Oct 2025 01:38:44 +0000
Subject: [PATCH 11/11] =?UTF-8?q?=F0=9F=A4=96=20refactor:=20split=20nightl?=
 =?UTF-8?q?y=20benchmarks=20into=20separate=20workflow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleaner architecture:
- terminal-bench.yml: Reusable workflow (workflow_call + workflow_dispatch)
- nightly-terminal-bench.yml: Scheduled runner with matrix strategy

Benefits:
- Main workflow stays simple for manual use
- Nightly schedule logic isolated in dedicated file
- Easy to add more models to nightly runs
- Manual workflow_dispatch supports model/thinking overrides

Nightly runs both models at midnight UTC:
- anthropic:claude-sonnet-4-5 (high thinking)
- openai:gpt-5-codex (high thinking)
---
 .github/workflows/nightly-terminal-bench.yml | 50 ++++++++++++++
 .github/workflows/terminal-bench.yml         | 68 +++++++++++++++-----
 2 files changed, 102 insertions(+), 16 deletions(-)
 create mode 100644 .github/workflows/nightly-terminal-bench.yml

diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
new file mode 100644
index 000000000..e78b2ce2d
--- /dev/null
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -0,0 +1,50 @@
+name: Nightly Terminal-Bench
+
+on:
+  schedule:
+    # Run full benchmark suite (~80 tasks) every night at midnight UTC
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Models to test (comma-separated, or "all" for both)'
+        required: false
+        default: 'all'
+        type: string
+
+jobs:
+  determine-models:
+    name: Determine models to test
+    runs-on: ubuntu-latest
+    outputs:
+      models: ${{ steps.set-models.outputs.models }}
+    steps:
+      - name: Set models matrix
+        id: set-models
+        run: |
+          if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then
+            echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> $GITHUB_OUTPUT
+          else
+            # Convert comma-separated to JSON array
+            models="${{ inputs.models }}"
+            models_json=$(echo "$models" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
+            echo "models=$models_json" >> $GITHUB_OUTPUT
+          fi
+
+  benchmark:
+    name: ${{ matrix.model }}
+    needs: determine-models
+    strategy:
+      matrix:
+        model: ${{ fromJSON(needs.determine-models.outputs.models) }}
+      fail-fast: false
+    uses: ./.github/workflows/terminal-bench.yml
+    with:
+      model_name: ${{ matrix.model }}
+      thinking_level: 'high'
+      dataset: 'terminal-bench-core==0.1.1'
+      concurrency: '4'
+      livestream: true
+    secrets:
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 35355b02d..c8fa90057 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -1,10 +1,44 @@
 name: Terminal-Bench
 
 on:
-  schedule:
-    # Run full benchmark suite (~80 tasks) every night at midnight UTC
-    # Uses terminal-bench-core==0.1.1 which is the stable, full benchmark suite
-    - cron: '0 0 * * *'
+  workflow_call:
+    inputs:
+      model_name:
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
+        required: false
+        type: string
+      thinking_level:
+        description: 'Thinking level (off, low, medium, high)'
+        required: false
+        type: string
+      dataset:
+        description: 'Terminal-Bench dataset to use'
+        required: false
+        type: string
+        default: 'terminal-bench-core==0.1.1'
+      concurrency:
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        type: string
+        default: '4'
+      livestream:
+        description: 'Enable livestream mode'
+        required: false
+        type: boolean
+        default: true
+      sample_size:
+        description: 'Number of random tasks to run (empty = all tasks)'
+        required: false
+        type: string
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
+    secrets:
+      ANTHROPIC_API_KEY:
+        required: true
+      OPENAI_API_KEY:
+        required: true
   workflow_dispatch:
     inputs:
       dataset:
@@ -26,6 +60,14 @@ on:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
+      model_name:
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
+        required: false
+        type: string
+      thinking_level:
+        description: 'Thinking level (off, low, medium, high)'
+        required: false
+        type: string
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
@@ -33,17 +75,11 @@ on:
 
 jobs:
   benchmark:
-    name: Run Terminal-Bench (${{ matrix.model_name }})
+    name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
     # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
     # Allow 3 hours for safety margin and slower tasks
     timeout-minutes: 180
-    strategy:
-      # Run scheduled benchmarks for both models
-      matrix:
-        model_name: ${{ github.event_name == 'schedule' && fromJSON('["anthropic:claude-sonnet-4-5", "openai:gpt-5-codex"]') || fromJSON('[""]') }}
-        thinking_level: ${{ github.event_name == 'schedule' && fromJSON('["high"]') || fromJSON('[""]') }}
-      fail-fast: false
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -64,11 +100,11 @@ jobs:
       - name: Run Terminal-Bench
         run: make benchmark-terminal
         env:
-          TB_DATASET: ${{ inputs.dataset || 'terminal-bench-core==0.1.1' }}
-          TB_CONCURRENCY: ${{ inputs.concurrency || '4' }}
+          TB_DATASET: ${{ inputs.dataset }}
+          TB_CONCURRENCY: ${{ inputs.concurrency }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
-          TB_SAMPLE_SIZE: ${{ inputs.sample_size || '' }}
-          TB_ARGS: ${{ matrix.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', matrix.model_name, matrix.thinking_level, inputs.extra_args || '') || inputs.extra_args || '' }}
+          TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
+          TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
@@ -76,7 +112,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results-${{ matrix.model_name && format('{0}-{1}', matrix.model_name, github.run_id) || github.run_id }}
+          name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}
           path: |
             runs/
           if-no-files-found: warn