Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions benchmarks/terminal_bench/cmux-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,6 @@ CMUX_WORKSPACE_ID="${CMUX_WORKSPACE_ID:-cmux-bench}"
CMUX_THINKING_LEVEL="${CMUX_THINKING_LEVEL:-high}"
CMUX_MODE="${CMUX_MODE:-exec}"

ensure_bun() {
if ! command -v bun >/dev/null 2>&1; then
fatal "bun must be installed before running the cmux agent"
fi
}

resolve_project_path() {
if [[ -n "${CMUX_PROJECT_PATH}" ]]; then
if [[ -d "${CMUX_PROJECT_PATH}" ]]; then
Expand All @@ -59,40 +53,27 @@ resolve_project_path() {
ensure_git_repo() {
local project_path=$1

if command -v git >/dev/null 2>&1; then
if git -C "${project_path}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
# Ensure trunk branch exists even on pre-existing repos.
if ! git -C "${project_path}" rev-parse --verify "${CMUX_TRUNK}" >/dev/null 2>&1; then
git -C "${project_path}" checkout -b "${CMUX_TRUNK}" >/dev/null 2>&1 || true
else
git -C "${project_path}" checkout "${CMUX_TRUNK}" >/dev/null 2>&1 || true
fi
return 0
fi
command -v git >/dev/null 2>&1 || return 0

log "initialising git repository at ${project_path}"
if git -C "${project_path}" init --initial-branch="${CMUX_TRUNK}" >/dev/null 2>&1; then
:
else
git -C "${project_path}" init >/dev/null
git -C "${project_path}" checkout -B "${CMUX_TRUNK}" >/dev/null
fi
git -C "${project_path}" config user.name "cmux-bench"
git -C "${project_path}" config user.email "bench@cmux.local"
git -C "${project_path}" add -A >/dev/null
git -C "${project_path}" commit -m "chore: initial snapshot" --allow-empty >/dev/null
git -C "${project_path}" branch -M "${CMUX_TRUNK}" >/dev/null
else
log "git not available; skipping repository initialisation"
if git -C "${project_path}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
git -C "${project_path}" checkout "${CMUX_TRUNK}" 2>/dev/null || \
git -C "${project_path}" checkout -b "${CMUX_TRUNK}" 2>/dev/null || true
return 0
fi

log "initialising git repository at ${project_path}"
git -C "${project_path}" init --initial-branch="${CMUX_TRUNK}" 2>/dev/null || \
(git -C "${project_path}" init && git -C "${project_path}" checkout -B "${CMUX_TRUNK}") >/dev/null
git -C "${project_path}" config user.name "cmux-bench"
git -C "${project_path}" config user.email "bench@cmux.local"
git -C "${project_path}" add -A >/dev/null
git -C "${project_path}" commit -m "chore: initial snapshot" --allow-empty >/dev/null
}

ensure_bun
command -v bun >/dev/null 2>&1 || fatal "bun is not installed"
project_path=$(resolve_project_path)
ensure_git_repo "${project_path}"

bun --version >/dev/null 2>&1 || fatal "bun not available after ensure_bun"

log "starting cmux agent session for ${project_path}"
cd "${CMUX_APP_ROOT}"

Expand Down
76 changes: 27 additions & 49 deletions benchmarks/terminal_bench/cmux_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class CmuxAgent(AbstractInstalledAgent):
"tsconfig.json",
"tsconfig.main.json",
"src",
"dist",
)

_PROVIDER_ENV_KEYS: Sequence[str] = (
Expand Down Expand Up @@ -140,33 +141,22 @@ def _env(self) -> dict[str, str]:
else:
raise ValueError("CMUX_MODE must be one of plan, exec, or execute")

config_root = env["CMUX_CONFIG_ROOT"].strip()
app_root = env["CMUX_APP_ROOT"].strip()
workspace_id = env["CMUX_WORKSPACE_ID"].strip()
project_candidates = env["CMUX_PROJECT_CANDIDATES"].strip()
if not config_root:
raise ValueError("CMUX_CONFIG_ROOT must be set")
if not app_root:
raise ValueError("CMUX_APP_ROOT must be set")
if not workspace_id:
raise ValueError("CMUX_WORKSPACE_ID must be set")
if not project_candidates:
raise ValueError("CMUX_PROJECT_CANDIDATES must be set")
env["CMUX_CONFIG_ROOT"] = config_root
env["CMUX_APP_ROOT"] = app_root
env["CMUX_WORKSPACE_ID"] = workspace_id
env["CMUX_PROJECT_CANDIDATES"] = project_candidates

timeout_value = env.get("CMUX_TIMEOUT_MS")
if timeout_value:
timeout_value = timeout_value.strip()
if not timeout_value.isdigit():
raise ValueError("CMUX_TIMEOUT_MS must be an integer expressed in ms")
env["CMUX_TIMEOUT_MS"] = timeout_value

project_path = env.get("CMUX_PROJECT_PATH")
if project_path is not None and not project_path.strip():
raise ValueError("CMUX_PROJECT_PATH must be non-empty when provided")
# These env vars are all set with defaults above, no need to validate
for key in (
"CMUX_CONFIG_ROOT",
"CMUX_APP_ROOT",
"CMUX_WORKSPACE_ID",
"CMUX_PROJECT_CANDIDATES",
):
env[key] = env[key].strip()

if timeout_value := env.get("CMUX_TIMEOUT_MS"):
if not timeout_value.strip().isdigit():
raise ValueError("CMUX_TIMEOUT_MS must be an integer")

if project_path := env.get("CMUX_PROJECT_PATH"):
if not project_path.strip():
raise ValueError("CMUX_PROJECT_PATH must be non-empty when provided")

return env

Expand All @@ -180,37 +170,25 @@ def perform_task(
session: TmuxSession,
logging_dir=None,
) -> AgentResult:
if not instruction or not instruction.strip():
if not instruction.strip():
raise ValueError("instruction must be a non-empty string")

self._ensure_payload_staged(session)
return super().perform_task(
instruction=instruction, session=session, logging_dir=logging_dir
)
return super().perform_task(instruction, session, logging_dir)

def _ensure_payload_staged(self, session: TmuxSession) -> None:
container_id = getattr(session.container, "id", None)
if container_id and container_id == self._staged_container_id:
if container_id == self._staged_container_id:
return

archive = self._build_archive()
if not self._archive_bytes:
self._archive_bytes = build_app_archive(
self._repo_root, self._INCLUDE_PATHS
)

stage_payload(
session=session,
archive_bytes=archive,
archive_name=self._ARCHIVE_NAME,
runner_path=self._runner_path,
session, self._archive_bytes, self._ARCHIVE_NAME, self._runner_path
)

if container_id:
self._staged_container_id = container_id

def _build_archive(self) -> bytes:
if self._archive_bytes is not None:
return self._archive_bytes

archive = build_app_archive(self._repo_root, self._INCLUDE_PATHS)
self._archive_bytes = archive
return archive
self._staged_container_id = container_id

def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
escaped = shlex.quote(instruction)
Expand Down
23 changes: 3 additions & 20 deletions benchmarks/terminal_bench/cmux_payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@

def build_app_archive(repo_root: Path, include_paths: Iterable[str]) -> bytes:
"""Pack the cmux workspace into a gzipped tarball."""

if not repo_root or not repo_root.exists():
if not repo_root.exists():
raise FileNotFoundError(f"cmux repo root {repo_root} not found")

buffer = io.BytesIO()
Expand All @@ -22,8 +21,6 @@ def build_app_archive(repo_root: Path, include_paths: Iterable[str]) -> bytes:
if not source.exists():
raise FileNotFoundError(f"Required file {source} missing")
archive.add(source, arcname=relative_path, recursive=True)

buffer.seek(0)
return buffer.getvalue()


Expand All @@ -34,27 +31,13 @@ def stage_payload(
runner_path: Path,
) -> None:
"""Copy the cmux bundle and runner into the task container."""

if not archive_bytes:
raise ValueError("archive_bytes must be non-empty")
if not runner_path or not runner_path.is_file():
raise FileNotFoundError(f"cmux runner missing at {runner_path}")

with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file:
temp_file.write(archive_bytes)
temp_path = Path(temp_file.name)

try:
session.copy_to_container(
paths=temp_path,
container_dir="/installed-agent",
container_filename=archive_name,
)
session.copy_to_container(temp_path, "/installed-agent", archive_name)
finally:
temp_path.unlink(missing_ok=True)

session.copy_to_container(
paths=runner_path,
container_dir="/installed-agent",
container_filename=runner_path.name,
)
session.copy_to_container(runner_path, "/installed-agent", runner_path.name)
14 changes: 3 additions & 11 deletions benchmarks/terminal_bench/cmux_setup.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,15 @@ CMUX_APP_ROOT="${CMUX_APP_ROOT:-/opt/cmux-app}"
CMUX_CONFIG_ROOT="${CMUX_CONFIG_ROOT:-/root/.cmux}"
CMUX_AGENT_VERSION="{{ version if version is not none else '' }}"

mkdir -p "$CMUX_APP_ROOT"

rm -rf "${CMUX_APP_ROOT}"
if [[ -n "${CMUX_AGENT_VERSION}" ]]; then
: "${CMUX_AGENT_GIT_URL:?CMUX_AGENT_GIT_URL must be set when version is provided}"
: "${CMUX_AGENT_GIT_URL:?CMUX_AGENT_GIT_URL required when version is set}"
log "cloning cmux from ${CMUX_AGENT_GIT_URL} @ ${CMUX_AGENT_VERSION}"
rm -rf "${CMUX_APP_ROOT}"
git clone --depth 1 --branch "${CMUX_AGENT_VERSION}" "${CMUX_AGENT_GIT_URL}" "${CMUX_APP_ROOT}"
else
ARCHIVE_PATH="/installed-agent/cmux-app.tar.gz"
if [[ ! -s "${ARCHIVE_PATH}" ]]; then
printf 'Expected cmux archive at %s\n' "${ARCHIVE_PATH}" >&2
exit 1
fi
log "extracting cmux archive"
rm -rf "${CMUX_APP_ROOT}"
mkdir -p "${CMUX_APP_ROOT}"
tar -xzf "${ARCHIVE_PATH}" -C "${CMUX_APP_ROOT}"
tar -xzf "/installed-agent/cmux-app.tar.gz" -C "${CMUX_APP_ROOT}"
fi

cd "${CMUX_APP_ROOT}"
Expand Down