coder · ammario · Oct 29, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 29, 2025
diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
@@ -0,0 +1,61 @@
+# Nightly Terminal-Bench run: fans out one benchmark job per model by calling
+# the reusable terminal-bench.yml workflow.
+name: Nightly Terminal-Bench
+
+on:
+  schedule:
+    # Run full benchmark suite (~80 tasks) every night at midnight UTC
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Models to test (comma-separated, or "all" for both)'
+        required: false
+        default: 'all'
+        type: string
+
+jobs:
+  # Builds the JSON array of model names that drives the benchmark matrix.
+  determine-models:
+    name: Determine models to test
+    runs-on: ubuntu-latest
+    outputs:
+      models: ${{ steps.set-models.outputs.models }}
+    steps:
+      - name: Set models matrix
+        id: set-models
+        env:
+          # Hand the free-form input to the script through the environment
+          # instead of expanding it inside the script text, so its contents
+          # can never be interpreted as shell code.
+          MODELS_INPUT: ${{ inputs.models }}
+        run: |
+          # An empty value (no input supplied) is treated the same as "all"
+          if [ "$MODELS_INPUT" = "all" ] || [ -z "$MODELS_INPUT" ]; then
+            echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> "$GITHUB_OUTPUT"
+          else
+            # Convert comma-separated to JSON array: trim whitespace around
+            # each entry and drop empty entries (e.g. from a trailing comma)
+            models_json=$(printf '%s' "$MODELS_INPUT" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; "")) | map(select(length > 0))')
+            echo "models=$models_json" >> "$GITHUB_OUTPUT"
+          fi
+
+  # Runs the reusable Terminal-Bench workflow once per selected model.
+  benchmark:
+    name: ${{ matrix.model }}
+    needs: determine-models
+    strategy:
+      matrix:
+        model: ${{ fromJSON(needs.determine-models.outputs.models) }}
+      # Let the remaining models finish even if one model's run fails
+      fail-fast: false
+    uses: ./.github/workflows/terminal-bench.yml
+    with:
+      model_name: ${{ matrix.model }}
+      thinking_level: 'high'
+      dataset: 'terminal-bench-core==0.1.1'
+      concurrency: '4'
+      livestream: true
+    secrets:
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -1,6 +1,44 @@
 name: Terminal-Bench
 
 on:
+  workflow_call:  # Lets other workflows run this file as a reusable workflow
+    inputs:
+      model_name:  # When empty, TB_ARGS carries only extra_args (no --agent-kwarg flags)
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
+        required: false
+        type: string
+      thinking_level:  # Only consulted when model_name is also set
+        description: 'Thinking level (off, low, medium, high)'
+        required: false
+        type: string
+      dataset:
+        description: 'Terminal-Bench dataset to use'
+        required: false
+        type: string
+        default: 'terminal-bench-core==0.1.1'
+      concurrency:  # Exported to the benchmark step as TB_CONCURRENCY
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        type: string
+        default: '4'
+      livestream:  # true exports TB_LIVESTREAM=1; false leaves it empty
+        description: 'Enable livestream mode'
+        required: false
+        type: boolean
+        default: true
+      sample_size:  # Exported to the benchmark step as TB_SAMPLE_SIZE
+        description: 'Number of random tasks to run (empty = all tasks)'
+        required: false
+        type: string
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
+    secrets:  # Both keys are exported to the benchmark step under the same names
+      ANTHROPIC_API_KEY:
+        required: true
+      OPENAI_API_KEY:
+        required: true
   workflow_dispatch:
     inputs:
       dataset:
@@ -22,16 +60,26 @@ on:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
+      model_name:  # When empty, TB_ARGS carries only extra_args (no --agent-kwarg flags)
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
+        required: false
+        type: string
+      thinking_level:  # Only consulted when model_name is also set
+        description: 'Thinking level (off, low, medium, high)'
+        required: false
+        type: string
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
 
 jobs:
   benchmark:
-    name: Run Terminal-Bench
+    name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
+    # Allow 3 hours for safety margin and slower tasks
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -56,17 +104,32 @@ jobs:
           TB_CONCURRENCY: ${{ inputs.concurrency }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
-          TB_ARGS: ${{ inputs.extra_args }}
+          # Add the model flag only when a model is given, and the thinking-level
+          # flag only when a level is given too, so no empty "thinking_level=" is passed
+          TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0}{1} {2}', inputs.model_name, inputs.thinking_level && format(' --agent-kwarg thinking_level={0}', inputs.thinking_level) || '', inputs.extra_args) || inputs.extra_args }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
+      - name: Compute artifact name
+        id: artifact-name
+        if: always()
+        env:
+          MODEL_NAME: ${{ inputs.model_name }}
+          RUN_ID: ${{ github.run_id }}
+        run: |
+          # Artifact names may not contain characters such as ':' or '/', which
+          # model names like "anthropic:claude-sonnet-4-5" do; map anything
+          # outside [A-Za-z0-9._-] to '-'
+          safe_model="${MODEL_NAME//[^A-Za-z0-9._-]/-}"
+          echo "name=terminal-bench-results-${safe_model:+${safe_model}-}${RUN_ID}" >> "$GITHUB_OUTPUT"
+
       - name: Upload benchmark results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results
+          name: ${{ steps.artifact-name.outputs.name }}
           path: |
-            terminal-bench-results/
-            *.json
+            runs/
           if-no-files-found: warn
+          retention-days: 30
 
diff --git a/.gitignore b/.gitignore
@@ -105,3 +105,4 @@ tmpfork
 storybook-static/
 *.tgz
 src/test-workspaces/
+terminal-bench-results/
diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh
@@ -28,9 +28,18 @@ CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
 REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/null || echo "")
 
 if [[ -z "$REMOTE_BRANCH" ]]; then
-  echo "❌ Error: Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
-  echo "Set an upstream with: git push -u origin $CURRENT_BRANCH" >&2
-  exit 1
+  echo "⚠️  Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
+  echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2
+
+  # Publish the branch with origin as its upstream; give up if the push fails
+  if ! git push -u origin "$CURRENT_BRANCH" 2>&1; then
+    echo "❌ Error: Failed to set upstream branch." >&2
+    echo "You may need to push manually: git push -u origin $CURRENT_BRANCH" >&2
+    exit 1
+  fi
+
+  echo "✅ Upstream set successfully!" >&2
+  REMOTE_BRANCH="origin/$CURRENT_BRANCH"
 fi
 
 # Check if local and remote are in sync

diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts
@@ -512,6 +512,11 @@ export class LocalRuntime implements Runtime {
     _abortSignal?: AbortSignal
   ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> {
     // Note: _abortSignal ignored for local operations (fast, no need for cancellation)
+
+    // In-place workspaces are identified by projectPath === workspaceName
+    // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees
+    const isInPlace = projectPath === workspaceName;
+
     // Compute workspace path using the canonical method
     const deletedPath = this.getWorkspacePath(projectPath, workspaceName);
 
@@ -520,16 +525,25 @@ export class LocalRuntime implements Runtime {
       await fsPromises.access(deletedPath);
     } catch {
       // Directory doesn't exist - operation is idempotent
-      // Prune stale git records (best effort)
-      try {
-        using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
-        await pruneProc.result;
-      } catch {
-        // Ignore prune errors - directory is already deleted, which is the goal
+      // For standard worktrees, prune stale git records (best effort)
+      if (!isInPlace) {
+        try {
+          using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
+          await pruneProc.result;
+        } catch {
+          // Ignore prune errors - directory is already deleted, which is the goal
+        }
       }
       return { success: true, deletedPath };
     }
 
+    // For in-place workspaces, there's no worktree to remove
+    // Just return success - the workspace directory itself should not be deleted
+    // as it may contain the user's actual project files
+    if (isInPlace) {
+      return { success: true, deletedPath };
+    }
+
     try {
       // Use git worktree remove to delete the worktree
       // This updates git's internal worktree metadata correctly

diff --git a/src/services/agentSession.ts b/src/services/agentSession.ts
@@ -180,28 +180,52 @@ export class AgentSession {
     if (existing.success) {
       // Metadata already exists, verify workspace path matches
       const metadata = existing.data;
-      // Directory name uses workspace name (not stable ID)
-      const runtime = createRuntime(
-        metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
-      );
-      const expectedPath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+      // For in-place workspaces (projectPath === name), use path directly
+      // Otherwise reconstruct using runtime's worktree pattern
+      const isInPlace = metadata.projectPath === metadata.name;
+      const expectedPath = isInPlace
+        ? metadata.projectPath
+        : (() => {
+            const runtime = createRuntime(
+              metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
+            );
+            return runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+          })();
       assert(
         expectedPath === normalizedWorkspacePath,
         `Existing metadata workspace path mismatch for ${this.workspaceId}: expected ${expectedPath}, got ${normalizedWorkspacePath}`
       );
       return;
     }
 
-    // Derive project path from workspace path (parent directory)
-    const derivedProjectPath = path.dirname(normalizedWorkspacePath);
-
-    const derivedProjectName =
-      projectName && projectName.trim().length > 0
-        ? projectName.trim()
-        : path.basename(derivedProjectPath) || "unknown";
-
-    // Extract name from workspace path (last component)
-    const workspaceName = path.basename(normalizedWorkspacePath);
+    // Detect in-place workspace: if workspacePath is not under srcBaseDir,
+    // it's a direct workspace (e.g., for CLI/benchmarks) rather than a worktree
+    const srcBaseDir = this.config.srcDir;
+    const normalizedSrcBaseDir = path.resolve(srcBaseDir);
+    const isUnderSrcBaseDir = normalizedWorkspacePath.startsWith(normalizedSrcBaseDir + path.sep);
+
+    let derivedProjectPath: string;
+    let workspaceName: string;
+    let derivedProjectName: string;
+
+    if (isUnderSrcBaseDir) {
+      // Standard worktree mode: workspace is under ~/.cmux/src/project/branch
+      derivedProjectPath = path.dirname(normalizedWorkspacePath);
+      workspaceName = path.basename(normalizedWorkspacePath);
+      derivedProjectName =
+        projectName && projectName.trim().length > 0
+          ? projectName.trim()
+          : path.basename(derivedProjectPath) || "unknown";
+    } else {
+      // In-place mode: workspace is a standalone directory
+      // Store the workspace path directly by setting projectPath === name
+      derivedProjectPath = normalizedWorkspacePath;
+      workspaceName = normalizedWorkspacePath;
+      derivedProjectName =
+        projectName && projectName.trim().length > 0
+          ? projectName.trim()
+          : path.basename(normalizedWorkspacePath) || "unknown";
+    }
 
     const metadata: WorkspaceMetadata = {
       id: this.workspaceId,

diff --git a/src/services/aiService.ts b/src/services/aiService.ts
@@ -519,11 +519,16 @@ export class AIService extends EventEmitter {
         return Err({ type: "unknown", raw: `Workspace ${workspaceId} not found in config` });
       }
 
-      // Get workspace path (directory name uses workspace name)
+      // Get workspace path - handle both worktree and in-place modes
       const runtime = createRuntime(
         metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
       );
-      const workspacePath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
+      // In-place workspaces (CLI/benchmarks) have projectPath === name
+      // Use path directly instead of reconstructing via getWorkspacePath
+      const isInPlace = metadata.projectPath === metadata.name;
+      const workspacePath = isInPlace
+        ? metadata.projectPath
+        : runtime.getWorkspacePath(metadata.projectPath, metadata.name);
 
       // Build system message from workspace metadata
       const systemMessage = await buildSystemMessage(

diff --git a/src/services/streamManager.ts b/src/services/streamManager.ts
@@ -1,4 +1,5 @@
 import { EventEmitter } from "events";
+import * as path from "path";
 import {
   streamText,
   stepCountIs,
@@ -982,9 +983,13 @@ export class StreamManager extends EventEmitter {
       // Don't block stream completion waiting for directory deletion
       // This is especially important for SSH where rm -rf can take 500ms-2s
       if (streamInfo.runtimeTempDir) {
+        // Use parent directory as cwd for safety - if runtimeTempDir is malformed,
+        // we won't accidentally run rm -rf from root
+        const tempDirBasename = path.basename(streamInfo.runtimeTempDir);
+        const tempDirParent = path.dirname(streamInfo.runtimeTempDir);
         void streamInfo.runtime
-          .exec(`rm -rf "${streamInfo.runtimeTempDir}"`, {
-            cwd: "~",
+          .exec(`rm -rf "${tempDirBasename}"`, {
+            cwd: tempDirParent,
             timeout: 10,
           })
           .then(async (result) => {