diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml new file mode 100644 index 000000000..e78b2ce2d --- /dev/null +++ b/.github/workflows/nightly-terminal-bench.yml @@ -0,0 +1,52 @@ +name: Nightly Terminal-Bench + +on: + schedule: + # Run full benchmark suite (~80 tasks) every night at midnight UTC + - cron: '0 0 * * *' + workflow_dispatch: + inputs: + models: + description: 'Models to test (comma-separated, or "all" for both)' + required: false + default: 'all' + type: string + +jobs: + determine-models: + name: Determine models to test + runs-on: ubuntu-latest + outputs: + models: ${{ steps.set-models.outputs.models }} + steps: + - name: Set models matrix + id: set-models + env: + # Pass the input through the environment so it is never expanded into the script text + MODELS_INPUT: ${{ inputs.models }} + run: | + if [ "$MODELS_INPUT" = "all" ] || [ -z "$MODELS_INPUT" ]; then + echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> "$GITHUB_OUTPUT" + else + # Convert comma-separated to JSON array, trimming whitespace and dropping empty entries + models_json=$(echo "$MODELS_INPUT" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; "")) | map(select(length > 0))') + echo "models=$models_json" >> "$GITHUB_OUTPUT" + fi + + benchmark: + name: ${{ matrix.model }} + needs: determine-models + strategy: + matrix: + model: ${{ fromJSON(needs.determine-models.outputs.models) }} + fail-fast: false + uses: ./.github/workflows/terminal-bench.yml + with: + model_name: ${{ matrix.model }} + thinking_level: 'high' + dataset: 'terminal-bench-core==0.1.1' + concurrency: '4' + livestream: true + secrets: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 6aab52166..c8fa90057 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -1,6 +1,44 @@ name: Terminal-Bench on: + workflow_call: + inputs: + model_name: + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)' + required: false + type: 
string + thinking_level: + description: 'Thinking level (off, low, medium, high)' + required: false + type: string + dataset: + description: 'Terminal-Bench dataset to use' + required: false + type: string + default: 'terminal-bench-core==0.1.1' + concurrency: + description: 'Number of concurrent tasks (--n-concurrent)' + required: false + type: string + default: '4' + livestream: + description: 'Enable livestream mode' + required: false + type: boolean + default: true + sample_size: + description: 'Number of random tasks to run (empty = all tasks)' + required: false + type: string + extra_args: + description: 'Additional arguments to pass to terminal-bench' + required: false + type: string + secrets: + ANTHROPIC_API_KEY: + required: true + OPENAI_API_KEY: + required: true workflow_dispatch: inputs: dataset: @@ -22,6 +60,14 @@ on: description: 'Number of random tasks to run (empty = all tasks)' required: false type: string + model_name: + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)' + required: false + type: string + thinking_level: + description: 'Thinking level (off, low, medium, high)' + required: false + type: string extra_args: description: 'Additional arguments to pass to terminal-bench' required: false @@ -29,9 +75,11 @@ on: jobs: benchmark: - name: Run Terminal-Bench + name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }} runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }} - timeout-minutes: 180 # 3 hours - terminal-bench can take a long time + # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes + # Allow 3 hours for safety margin and slower tasks + timeout-minutes: 180 steps: - name: Checkout code uses: actions/checkout@v4 @@ -56,7 +104,8 @@ jobs: TB_CONCURRENCY: ${{ inputs.concurrency }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} TB_SAMPLE_SIZE: ${{ inputs.sample_size }} - TB_ARGS: ${{ inputs.extra_args }} + # Omit the thinking_level kwarg when no level was given instead of passing an empty value + 
TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0}{1} {2}', inputs.model_name, inputs.thinking_level && format(' --agent-kwarg thinking_level={0}', inputs.thinking_level) || '', inputs.extra_args) || inputs.extra_args }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -64,9 +113,10 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: terminal-bench-results + # TODO: model_name contains ':' (e.g. anthropic:claude-sonnet-4-5), which upload-artifact@v4 rejects in artifact names; sanitize it before use + name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }} path: | - terminal-bench-results/ - *.json + runs/ if-no-files-found: warn + retention-days: 30 diff --git a/.gitignore b/.gitignore index 6314bc635..ef6b70066 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ tmpfork storybook-static/ *.tgz src/test-workspaces/ +terminal-bench-results/ diff --git a/scripts/wait_pr_checks.sh b/scripts/wait_pr_checks.sh index 8e74ac983..77ec30c9b 100755 --- a/scripts/wait_pr_checks.sh +++ b/scripts/wait_pr_checks.sh @@ -28,9 +28,18 @@ CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/null || echo "") if [[ -z "$REMOTE_BRANCH" ]]; then - echo "❌ Error: Current branch '$CURRENT_BRANCH' has no upstream branch." >&2 - echo "Set an upstream with: git push -u origin $CURRENT_BRANCH" >&2 - exit 1 + echo "⚠️ Current branch '$CURRENT_BRANCH' has no upstream branch." >&2 + echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2 + + # Try to set upstream + if git push -u origin "$CURRENT_BRANCH" 2>&1; then + echo "✅ Upstream set successfully!" >&2 + REMOTE_BRANCH="origin/$CURRENT_BRANCH" + else + echo "❌ Error: Failed to set upstream branch." 
>&2 + echo "You may need to push manually: git push -u origin $CURRENT_BRANCH" >&2 + exit 1 + fi fi # Check if local and remote are in sync diff --git a/src/runtime/LocalRuntime.ts b/src/runtime/LocalRuntime.ts index 69c9a61fb..d62f4bd60 100644 --- a/src/runtime/LocalRuntime.ts +++ b/src/runtime/LocalRuntime.ts @@ -512,6 +512,11 @@ export class LocalRuntime implements Runtime { _abortSignal?: AbortSignal ): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> { // Note: _abortSignal ignored for local operations (fast, no need for cancellation) + + // In-place workspaces are identified by projectPath === workspaceName + // These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees + const isInPlace = projectPath === workspaceName; + // Compute workspace path using the canonical method const deletedPath = this.getWorkspacePath(projectPath, workspaceName); @@ -520,16 +525,25 @@ export class LocalRuntime implements Runtime { await fsPromises.access(deletedPath); } catch { // Directory doesn't exist - operation is idempotent - // Prune stale git records (best effort) - try { - using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`); - await pruneProc.result; - } catch { - // Ignore prune errors - directory is already deleted, which is the goal + // For standard worktrees, prune stale git records (best effort) + if (!isInPlace) { + try { + using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`); + await pruneProc.result; + } catch { + // Ignore prune errors - directory is already deleted, which is the goal + } } return { success: true, deletedPath }; } + // For in-place workspaces, there's no worktree to remove + // Just return success - the workspace directory itself should not be deleted + // as it may contain the user's actual project files + if (isInPlace) { + return { success: true, deletedPath }; + } + try { // Use git worktree remove to delete the worktree // This updates 
git's internal worktree metadata correctly diff --git a/src/services/agentSession.ts b/src/services/agentSession.ts index ed2d34547..4d01bec95 100644 --- a/src/services/agentSession.ts +++ b/src/services/agentSession.ts @@ -180,11 +180,17 @@ export class AgentSession { if (existing.success) { // Metadata already exists, verify workspace path matches const metadata = existing.data; - // Directory name uses workspace name (not stable ID) - const runtime = createRuntime( - metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir } - ); - const expectedPath = runtime.getWorkspacePath(metadata.projectPath, metadata.name); + // For in-place workspaces (projectPath === name), use path directly + // Otherwise reconstruct using runtime's worktree pattern + const isInPlace = metadata.projectPath === metadata.name; + const expectedPath = isInPlace + ? metadata.projectPath + : (() => { + const runtime = createRuntime( + metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir } + ); + return runtime.getWorkspacePath(metadata.projectPath, metadata.name); + })(); assert( expectedPath === normalizedWorkspacePath, `Existing metadata workspace path mismatch for ${this.workspaceId}: expected ${expectedPath}, got ${normalizedWorkspacePath}` @@ -192,16 +198,34 @@ export class AgentSession { return; } - // Derive project path from workspace path (parent directory) - const derivedProjectPath = path.dirname(normalizedWorkspacePath); - - const derivedProjectName = - projectName && projectName.trim().length > 0 - ? 
projectName.trim() - : path.basename(derivedProjectPath) || "unknown"; - - // Extract name from workspace path (last component) - const workspaceName = path.basename(normalizedWorkspacePath); + // Detect in-place workspace: if workspacePath is not under srcBaseDir, + // it's a direct workspace (e.g., for CLI/benchmarks) rather than a worktree + const srcBaseDir = this.config.srcDir; + const normalizedSrcBaseDir = path.resolve(srcBaseDir); + const isUnderSrcBaseDir = normalizedWorkspacePath.startsWith(normalizedSrcBaseDir + path.sep); + + let derivedProjectPath: string; + let workspaceName: string; + let derivedProjectName: string; + + if (isUnderSrcBaseDir) { + // Standard worktree mode: workspace is under ~/.cmux/src/project/branch + derivedProjectPath = path.dirname(normalizedWorkspacePath); + workspaceName = path.basename(normalizedWorkspacePath); + derivedProjectName = + projectName && projectName.trim().length > 0 + ? projectName.trim() + : path.basename(derivedProjectPath) || "unknown"; + } else { + // In-place mode: workspace is a standalone directory + // Store the workspace path directly by setting projectPath === name + derivedProjectPath = normalizedWorkspacePath; + workspaceName = normalizedWorkspacePath; + derivedProjectName = + projectName && projectName.trim().length > 0 + ? projectName.trim() + : path.basename(normalizedWorkspacePath) || "unknown"; + } const metadata: WorkspaceMetadata = { id: this.workspaceId, diff --git a/src/services/aiService.ts b/src/services/aiService.ts index 00c68dd9d..dfb528b6e 100644 --- a/src/services/aiService.ts +++ b/src/services/aiService.ts @@ -519,11 +519,16 @@ export class AIService extends EventEmitter { return Err({ type: "unknown", raw: `Workspace ${workspaceId} not found in config` }); } - // Get workspace path (directory name uses workspace name) + // Get workspace path - handle both worktree and in-place modes const runtime = createRuntime( metadata.runtimeConfig ?? 
{ type: "local", srcBaseDir: this.config.srcDir } ); - const workspacePath = runtime.getWorkspacePath(metadata.projectPath, metadata.name); + // In-place workspaces (CLI/benchmarks) have projectPath === name + // Use path directly instead of reconstructing via getWorkspacePath + const isInPlace = metadata.projectPath === metadata.name; + const workspacePath = isInPlace + ? metadata.projectPath + : runtime.getWorkspacePath(metadata.projectPath, metadata.name); // Build system message from workspace metadata const systemMessage = await buildSystemMessage( diff --git a/src/services/streamManager.ts b/src/services/streamManager.ts index b71ddd03f..10a5fe23e 100644 --- a/src/services/streamManager.ts +++ b/src/services/streamManager.ts @@ -1,4 +1,5 @@ import { EventEmitter } from "events"; +import * as path from "path"; import { streamText, stepCountIs, @@ -982,9 +983,13 @@ export class StreamManager extends EventEmitter { // Don't block stream completion waiting for directory deletion // This is especially important for SSH where rm -rf can take 500ms-2s if (streamInfo.runtimeTempDir) { + // Use parent directory as cwd for safety - if runtimeTempDir is malformed, + // we won't accidentally run rm -rf from root + const tempDirBasename = path.basename(streamInfo.runtimeTempDir); + const tempDirParent = path.dirname(streamInfo.runtimeTempDir); void streamInfo.runtime - .exec(`rm -rf "${streamInfo.runtimeTempDir}"`, { - cwd: "~", + .exec(`rm -rf "${tempDirBasename}"`, { + cwd: tempDirParent, timeout: 10, }) .then(async (result) => {