diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1366eb6f8..a46064577 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,6 +46,15 @@ jobs:
           extra_nix_config: |
             experimental-features = nix-command flakes
 
+      # Explicit `shell: bash` runs the step with `-o pipefail`, so a failed
+      # download fails the step instead of piping nothing into `sh`.
+      - name: Install uv
+        shell: bash
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Add uv to PATH
+        run: echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
       - name: Run static checks
         run: make -j3 static-check
 
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
new file mode 100644
index 000000000..235cf1ca4
--- /dev/null
+++ b/.github/workflows/terminal-bench.yml
@@ -0,0 +1,72 @@
+# Manually triggered workflow that runs Terminal-Bench through
+# `make benchmark-terminal` and uploads the results as a build artifact.
+name: Terminal-Bench
+
+on:
+  workflow_dispatch:
+    inputs:
+      dataset:
+        description: 'Terminal-Bench dataset to use'
+        required: false
+        default: 'terminal-bench-core==0.1.1'
+        type: string
+      concurrency:
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        default: '4'
+        type: string
+      livestream:
+        description: 'Enable livestream mode'
+        required: false
+        default: true
+        type: boolean
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
+
+jobs:
+  benchmark:
+    name: Run Terminal-Bench
+    runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
+    timeout-minutes: 180 # 3 hours - terminal-bench can take a long time
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Required for git describe to find tags
+
+      - uses: ./.github/actions/setup-cmux
+
+      # Explicit `shell: bash` runs the step with `-o pipefail`, so a failed
+      # download fails the step instead of piping nothing into `sh`.
+      - name: Install uv
+        shell: bash
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Add uv to PATH
+        run: echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Generate version file
+        run: ./scripts/generate-version.sh
+
+      - name: Run Terminal-Bench
+        run: make benchmark-terminal
+        env:
+          TB_DATASET: ${{ inputs.dataset }}
+          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          # '1' when the livestream input is enabled, empty string otherwise.
+          TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
+          TB_ARGS: ${{ inputs.extra_args }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: terminal-bench-results
+          path: |
+            terminal-bench-results/
+            *.json
+          if-no-files-found: warn
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fmt.mk b/fmt.mk
index 51f04eaed..701ffbf86 100644
--- a/fmt.mk
+++ b/fmt.mk
@@ -3,11 +3,12 @@
 # This file contains all code formatting logic.
 # Included by the main Makefile.
 
-.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check
+.PHONY: fmt fmt-check fmt-prettier fmt-prettier-check fmt-shell fmt-shell-check fmt-nix fmt-nix-check fmt-python fmt-python-check .check-uvx
 
 # Centralized patterns - single source of truth
 PRETTIER_PATTERNS := 'src/**/*.{ts,tsx,json}' 'tests/**/*.ts' 'docs/**/*.md' 'package.json' 'tsconfig*.json' 'README.md'
 SHELL_SCRIPTS := scripts
+PYTHON_DIRS := benchmarks
 
 # Always use bun x prettier for reproducibility (uses package.json version)
 PRETTIER := bun x prettier
@@ -15,11 +16,12 @@ PRETTIER := bun x prettier
 # Tool availability checks
 SHFMT := $(shell command -v shfmt 2>/dev/null)
 NIX := $(shell command -v nix 2>/dev/null)
+UVX := $(shell command -v uvx 2>/dev/null)
 
-fmt: fmt-prettier fmt-shell fmt-nix
+fmt: fmt-prettier fmt-shell fmt-python fmt-nix
 	@echo "==> All formatting complete!"
 
-fmt-check: fmt-prettier-check fmt-shell-check fmt-nix-check
+fmt-check: fmt-prettier-check fmt-shell-check fmt-python-check fmt-nix-check
 	@echo "==> All formatting checks passed!"
 
 fmt-prettier:
@@ -48,6 +50,23 @@ else
 	@shfmt -i 2 -ci -bn -d $(SHELL_SCRIPTS)
 endif
 
+# Helper target: fail with an install hint when uvx is not on PATH
+.check-uvx:
+ifeq ($(UVX),)
+	@echo "Error: uvx not found. Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
+	@exit 1
+endif
+
+# Rewrite the Python sources in place with ruff's formatter
+fmt-python: .check-uvx
+	@echo "Formatting Python files..."
+	@uvx ruff format $(PYTHON_DIRS)
+
+# Check Python formatting with ruff's --check mode
+fmt-python-check: .check-uvx
+	@echo "Checking Python formatting..."
+	@uvx ruff format --check $(PYTHON_DIRS)
+
 fmt-nix:
 ifeq ($(NIX),)
 	@echo "Nix not found; skipping Nix formatting"