Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .github/workflows/nightly-terminal-bench.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Nightly Terminal-Bench
#
# Runs the reusable Terminal-Bench workflow once per model, either on the
# nightly schedule or on demand via workflow_dispatch with a custom model list.
name: Nightly Terminal-Bench

on:
  schedule:
    # Run full benchmark suite (~80 tasks) every night at midnight UTC
    - cron: '0 0 * * *'
  workflow_dispatch:
    inputs:
      models:
        description: 'Models to test (comma-separated, or "all" for both)'
        required: false
        default: 'all'
        type: string

jobs:
  # Turns the `models` input into a JSON array consumed by the matrix below.
  determine-models:
    name: Determine models to test
    runs-on: ubuntu-latest
    outputs:
      models: ${{ steps.set-models.outputs.models }}
    steps:
      - name: Set models matrix
        id: set-models
        env:
          # Hand the user-controlled input to the script through the
          # environment. Interpolating ${{ inputs.models }} directly into the
          # script text would let a crafted value inject shell commands.
          # On scheduled runs the input is empty, which selects the default set.
          MODELS_INPUT: ${{ inputs.models }}
        run: |
          if [ "$MODELS_INPUT" = "all" ] || [ -z "$MODELS_INPUT" ]; then
            echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5-codex"]' >> "$GITHUB_OUTPUT"
          else
            # Convert comma-separated to JSON array: split on commas, trim
            # surrounding whitespace, and drop empty entries left by stray commas
            # so no matrix leg runs with an empty model name.
            models_json=$(printf '%s' "$MODELS_INPUT" \
              | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; "")) | map(select(length > 0))')
            echo "models=$models_json" >> "$GITHUB_OUTPUT"
          fi

  # One reusable-workflow invocation per model; a failing model does not
  # cancel the others (fail-fast: false).
  benchmark:
    name: ${{ matrix.model }}
    needs: determine-models
    strategy:
      matrix:
        model: ${{ fromJSON(needs.determine-models.outputs.models) }}
      fail-fast: false
    uses: ./.github/workflows/terminal-bench.yml
    with:
      model_name: ${{ matrix.model }}
      thinking_level: 'high'
      dataset: 'terminal-bench-core==0.1.1'
      concurrency: '4'
      livestream: true
    secrets:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
60 changes: 54 additions & 6 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,44 @@
name: Terminal-Bench

on:
workflow_call:
inputs:
model_name:
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
required: false
type: string
thinking_level:
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
dataset:
description: 'Terminal-Bench dataset to use'
required: false
type: string
default: 'terminal-bench-core==0.1.1'
concurrency:
description: 'Number of concurrent tasks (--n-concurrent)'
required: false
type: string
default: '4'
livestream:
description: 'Enable livestream mode'
required: false
type: boolean
default: true
sample_size:
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
extra_args:
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string
secrets:
ANTHROPIC_API_KEY:
required: true
OPENAI_API_KEY:
required: true
workflow_dispatch:
inputs:
dataset:
Expand All @@ -22,16 +60,26 @@ on:
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
model_name:
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
required: false
type: string
thinking_level:
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
extra_args:
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string

jobs:
benchmark:
name: Run Terminal-Bench
name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
# Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
# Allow 3 hours for safety margin and slower tasks
timeout-minutes: 180
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -56,17 +104,17 @@ jobs:
TB_CONCURRENCY: ${{ inputs.concurrency }}
TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
TB_ARGS: ${{ inputs.extra_args }}
TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: terminal-bench-results
name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}
path: |
terminal-bench-results/
*.json
runs/
if-no-files-found: warn
retention-days: 30

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,4 @@ tmpfork
storybook-static/
*.tgz
src/test-workspaces/
terminal-bench-results/
15 changes: 12 additions & 3 deletions scripts/wait_pr_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,18 @@ CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
REMOTE_BRANCH=$(git rev-parse --abbrev-ref --symbolic-full-name '@{u}' 2>/dev/null || echo "")

# No upstream is configured for the current branch: publish it to origin and
# start tracking it, rather than aborting immediately. The script only exits
# with an error when the push itself fails.
if [[ -z "$REMOTE_BRANCH" ]]; then
  echo "⚠️ Current branch '$CURRENT_BRANCH' has no upstream branch." >&2
  echo "Setting upstream to origin/$CURRENT_BRANCH..." >&2

  # Try to set upstream
  if git push -u origin "$CURRENT_BRANCH" 2>&1; then
    echo "✅ Upstream set successfully!" >&2
    # Record the upstream name so the sync check that follows can use it.
    REMOTE_BRANCH="origin/$CURRENT_BRANCH"
  else
    echo "❌ Error: Failed to set upstream branch." >&2
    echo "You may need to push manually: git push -u origin $CURRENT_BRANCH" >&2
    exit 1
  fi
fi

# Check if local and remote are in sync
Expand Down
26 changes: 20 additions & 6 deletions src/runtime/LocalRuntime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,11 @@ export class LocalRuntime implements Runtime {
_abortSignal?: AbortSignal
): Promise<{ success: true; deletedPath: string } | { success: false; error: string }> {
// Note: _abortSignal ignored for local operations (fast, no need for cancellation)

// In-place workspaces are identified by projectPath === workspaceName
// These are direct workspace directories (e.g., CLI/benchmark sessions), not git worktrees
const isInPlace = projectPath === workspaceName;

// Compute workspace path using the canonical method
const deletedPath = this.getWorkspacePath(projectPath, workspaceName);

Expand All @@ -520,16 +525,25 @@ export class LocalRuntime implements Runtime {
await fsPromises.access(deletedPath);
} catch {
// Directory doesn't exist - operation is idempotent
// Prune stale git records (best effort)
try {
using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
await pruneProc.result;
} catch {
// Ignore prune errors - directory is already deleted, which is the goal
// For standard worktrees, prune stale git records (best effort)
if (!isInPlace) {
try {
using pruneProc = execAsync(`git -C "${projectPath}" worktree prune`);
await pruneProc.result;
} catch {
// Ignore prune errors - directory is already deleted, which is the goal
}
}
return { success: true, deletedPath };
}

// For in-place workspaces, there's no worktree to remove
// Just return success - the workspace directory itself should not be deleted
// as it may contain the user's actual project files
if (isInPlace) {
return { success: true, deletedPath };
}

try {
// Use git worktree remove to delete the worktree
// This updates git's internal worktree metadata correctly
Expand Down
54 changes: 39 additions & 15 deletions src/services/agentSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,28 +180,52 @@ export class AgentSession {
if (existing.success) {
// Metadata already exists, verify workspace path matches
const metadata = existing.data;
// Directory name uses workspace name (not stable ID)
const runtime = createRuntime(
metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
);
const expectedPath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
// For in-place workspaces (projectPath === name), use path directly
// Otherwise reconstruct using runtime's worktree pattern
const isInPlace = metadata.projectPath === metadata.name;
const expectedPath = isInPlace
? metadata.projectPath
: (() => {
const runtime = createRuntime(
metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
);
return runtime.getWorkspacePath(metadata.projectPath, metadata.name);
})();
assert(
expectedPath === normalizedWorkspacePath,
`Existing metadata workspace path mismatch for ${this.workspaceId}: expected ${expectedPath}, got ${normalizedWorkspacePath}`
);
return;
}

// Derive project path from workspace path (parent directory)
const derivedProjectPath = path.dirname(normalizedWorkspacePath);

const derivedProjectName =
projectName && projectName.trim().length > 0
? projectName.trim()
: path.basename(derivedProjectPath) || "unknown";

// Extract name from workspace path (last component)
const workspaceName = path.basename(normalizedWorkspacePath);
// Detect in-place workspace: if workspacePath is not under srcBaseDir,
// it's a direct workspace (e.g., for CLI/benchmarks) rather than a worktree
const srcBaseDir = this.config.srcDir;
const normalizedSrcBaseDir = path.resolve(srcBaseDir);
const isUnderSrcBaseDir = normalizedWorkspacePath.startsWith(normalizedSrcBaseDir + path.sep);

let derivedProjectPath: string;
let workspaceName: string;
let derivedProjectName: string;

if (isUnderSrcBaseDir) {
// Standard worktree mode: workspace is under ~/.cmux/src/project/branch
derivedProjectPath = path.dirname(normalizedWorkspacePath);
workspaceName = path.basename(normalizedWorkspacePath);
derivedProjectName =
projectName && projectName.trim().length > 0
? projectName.trim()
: path.basename(derivedProjectPath) || "unknown";
} else {
// In-place mode: workspace is a standalone directory
// Store the workspace path directly by setting projectPath === name
derivedProjectPath = normalizedWorkspacePath;
workspaceName = normalizedWorkspacePath;
derivedProjectName =
projectName && projectName.trim().length > 0
? projectName.trim()
: path.basename(normalizedWorkspacePath) || "unknown";
}

const metadata: WorkspaceMetadata = {
id: this.workspaceId,
Expand Down
9 changes: 7 additions & 2 deletions src/services/aiService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -519,11 +519,16 @@ export class AIService extends EventEmitter {
return Err({ type: "unknown", raw: `Workspace ${workspaceId} not found in config` });
}

// Get workspace path (directory name uses workspace name)
// Get workspace path - handle both worktree and in-place modes
const runtime = createRuntime(
metadata.runtimeConfig ?? { type: "local", srcBaseDir: this.config.srcDir }
);
const workspacePath = runtime.getWorkspacePath(metadata.projectPath, metadata.name);
// In-place workspaces (CLI/benchmarks) have projectPath === name
// Use path directly instead of reconstructing via getWorkspacePath
const isInPlace = metadata.projectPath === metadata.name;
const workspacePath = isInPlace
? metadata.projectPath
: runtime.getWorkspacePath(metadata.projectPath, metadata.name);

// Build system message from workspace metadata
const systemMessage = await buildSystemMessage(
Expand Down
9 changes: 7 additions & 2 deletions src/services/streamManager.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { EventEmitter } from "events";
import * as path from "path";
import {
streamText,
stepCountIs,
Expand Down Expand Up @@ -982,9 +983,13 @@ export class StreamManager extends EventEmitter {
// Don't block stream completion waiting for directory deletion
// This is especially important for SSH where rm -rf can take 500ms-2s
if (streamInfo.runtimeTempDir) {
// Use parent directory as cwd for safety - if runtimeTempDir is malformed,
// we won't accidentally run rm -rf from root
const tempDirBasename = path.basename(streamInfo.runtimeTempDir);
const tempDirParent = path.dirname(streamInfo.runtimeTempDir);
void streamInfo.runtime
.exec(`rm -rf "${streamInfo.runtimeTempDir}"`, {
cwd: "~",
.exec(`rm -rf "${tempDirBasename}"`, {
cwd: tempDirParent,
timeout: 10,
})
.then(async (result) => {
Expand Down