From 87020e28b3ada9ec3713ac5584619fe90b0edad0 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 08:54:57 -0500
Subject: [PATCH 1/6] =?UTF-8?q?=F0=9F=A4=96=20Add=20workflow=5Fdispatch=20?=
 =?UTF-8?q?for=20Terminal-Bench?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Manually triggerable workflow for running terminal-bench
- 3 hour timeout for long-running benchmarks
- Configurable dataset, concurrency (default: 4), and livestream (default: true)
- Installs uv for uvx terminal-bench command
- Uploads benchmark results as artifacts
---
 .github/workflows/terminal-bench.yml | 67 ++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 .github/workflows/terminal-bench.yml

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
new file mode 100644
index 000000000..58509b435
--- /dev/null
+++ b/.github/workflows/terminal-bench.yml
@@ -0,0 +1,67 @@
+name: Terminal-Bench
+
+on:
+  workflow_dispatch:
+    inputs:
+      dataset:
+        description: 'Terminal-Bench dataset to use'
+        required: false
+        default: 'terminal-bench-core==0.1.1'
+        type: string
+      concurrency:
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        default: '4'
+        type: string
+      livestream:
+        description: 'Enable livestream mode'
+        required: false
+        default: true
+        type: boolean
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
+
+jobs:
+  benchmark:
+    name: Run Terminal-Bench
+    runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
+    timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Required for git describe to find tags
+
+      - uses: ./.github/actions/setup-cmux
+
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Add uv to PATH
+        run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+      - name: Generate version file
+        run: ./scripts/generate-version.sh
+
+      - name: Run Terminal-Bench
+        run: make benchmark-terminal
+        env:
+          TB_DATASET: ${{ inputs.dataset }}
+          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
+          TB_ARGS: ${{ inputs.extra_args }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: terminal-bench-results
+          path: |
+            terminal-bench-results/
+            *.json
+          if-no-files-found: warn
+

From 5d87533759a590b74ca1ca58cea663927d118cb0 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 08:57:55 -0500
Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=A4=96=20Add=20benchmarks/=5F=5Finit?=
 =?UTF-8?q?=5F=5F.py=20for=20Python=20package=20import?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmarks/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 benchmarks/__init__.py

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..e69de29bb

From 229a216aab6418fe9271edbe4870c7a56249eb95 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 08:58:49 -0500
Subject: [PATCH 3/6] =?UTF-8?q?=F0=9F=A4=96=20Fix=20uv=20PATH=20-=20instal?=
 =?UTF-8?q?ler=20uses=20~/.local/bin=20not=20~/.cargo/bin?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/terminal-bench.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 58509b435..235cf1ca4 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -40,7 +40,7 @@ jobs:
         run: curl -LsSf https://astral.sh/uv/install.sh | sh
 
       - name: Add uv to PATH
-        run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+        run: echo "$HOME/.local/bin" >> $GITHUB_PATH
 
       - name: Generate version file
         run: ./scripts/generate-version.sh

From c024efbcf10fa8c9120dae96c6ccd8d9f3914a6a Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 09:12:53 -0500
Subject: [PATCH 4/6] =?UTF-8?q?=F0=9F=A4=96=20Add=20Python=20formatter=20(?=
 =?UTF-8?q?ruff)=20to=20make=20fmt/fmt-check?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Adds fmt-python and fmt-python-check targets
- Uses uvx ruff format for fast, Black-compatible formatting
- Automatically formats benchmarks/ directory
- Integrated into main fmt and fmt-check targets
---
 fmt.mk | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/fmt.mk b/fmt.mk
index 51f04eaed..95bcdb7cc 100644
--- a/fmt.mk
+++ b/fmt.mk
@@ -3,11 +3,12 @@
 # This file contains all code formatting logic.
 # Included by the main Makefile.
 
-.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check
+.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check fmt-python fmt-python-check
 
 # Centralized patterns - single source of truth
 PRETTIER_PATTERNS := 'src/**/*.{ts,tsx,json}' 'tests/**/*.ts' 'docs/**/*.md' 'package.json' 'tsconfig*.json' 'README.md'
 SHELL_SCRIPTS := scripts
+PYTHON_DIRS := benchmarks
 
 # Always use bun x prettier for reproducibility (uses package.json version)
 PRETTIER := bun x prettier
@@ -15,11 +16,12 @@ PRETTIER := bun x prettier
 # Tool availability checks
 SHFMT := $(shell command -v shfmt 2>/dev/null)
 NIX := $(shell command -v nix 2>/dev/null)
+UVX := $(shell command -v uvx 2>/dev/null)
 
-fmt: fmt-prettier fmt-shell fmt-nix
+fmt: fmt-prettier fmt-shell fmt-python fmt-nix
 	@echo "==> All formatting complete!"
 
-fmt-check: fmt-prettier-check fmt-shell-check fmt-nix-check
+fmt-check: fmt-prettier-check fmt-shell-check fmt-python-check fmt-nix-check
 	@echo "==> All formatting checks passed!"
 
 fmt-prettier:
@@ -48,6 +50,24 @@ else
 	@shfmt -i 2 -ci -bn -d $(SHELL_SCRIPTS)
 endif
 
+fmt-python:
+ifeq ($(UVX),)
+	@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
+	@exit 1
+else
+	@echo "Formatting Python files..."
+	@uvx ruff format $(PYTHON_DIRS)
+endif
+
+fmt-python-check:
+ifeq ($(UVX),)
+	@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
+	@exit 1
+else
+	@echo "Checking Python formatting..."
+	@uvx ruff format --check $(PYTHON_DIRS)
+endif
+
 fmt-nix:
 ifeq ($(NIX),)
 	@echo "Nix not found; skipping Nix formatting"

From 29f3bb21415f411b4ab4475a062bc22842524d6e Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 09:13:36 -0500
Subject: [PATCH 5/6] =?UTF-8?q?=F0=9F=A4=96=20DRY=20uvx=20check=20into=20h?=
 =?UTF-8?q?elper=20target?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fmt.mk | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/fmt.mk b/fmt.mk
index 95bcdb7cc..701ffbf86 100644
--- a/fmt.mk
+++ b/fmt.mk
@@ -50,23 +50,20 @@ else
 	@shfmt -i 2 -ci -bn -d $(SHELL_SCRIPTS)
 endif
 
-fmt-python:
+# Helper target to check for uvx
+.check-uvx:
 ifeq ($(UVX),)
 	@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
 	@exit 1
-else
+endif
+
+fmt-python: .check-uvx
 	@echo "Formatting Python files..."
 	@uvx ruff format $(PYTHON_DIRS)
-endif
 
-fmt-python-check:
-ifeq ($(UVX),)
-	@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
-	@exit 1
-else
+fmt-python-check: .check-uvx
 	@echo "Checking Python formatting..."
 	@uvx ruff format --check $(PYTHON_DIRS)
-endif
 
 fmt-nix:
 ifeq ($(NIX),)

From d8bf3bec66c38170ec030665e48545b8088d3d97 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 17 Oct 2025 09:17:57 -0500
Subject: [PATCH 6/6] =?UTF-8?q?=F0=9F=A4=96=20Install=20uv=20in=20CI=20for?=
 =?UTF-8?q?=20Python=20formatting=20checks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1366eb6f8..a46064577 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,6 +46,12 @@ jobs:
           extra_nix_config: |
             experimental-features = nix-command flakes
 
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Add uv to PATH
+        run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+
       - name: Run static checks
         run: make -j3 static-check
 