Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 23 additions & 19 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,34 @@ on:
workflow_call:
inputs:
model_name:
description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
required: false
type: string
thinking_level:
description: "Thinking level (off, low, medium, high)"
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
dataset:
description: "Terminal-Bench dataset to use"
description: 'Terminal-Bench dataset to use'
required: false
type: string
default: "terminal-bench-core==0.1.1"
default: 'terminal-bench-core==0.1.1'
concurrency:
description: "Number of concurrent tasks (--n-concurrent)"
description: 'Number of concurrent tasks (--n-concurrent)'
required: false
type: string
default: "4"
default: '4'
livestream:
description: "Enable livestream mode"
description: 'Enable livestream mode'
required: false
type: boolean
default: true
sample_size:
description: "Number of random tasks to run (empty = all tasks)"
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
extra_args:
description: "Additional arguments to pass to terminal-bench"
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string
secrets:
Expand All @@ -42,34 +42,34 @@ on:
workflow_dispatch:
inputs:
dataset:
description: "Terminal-Bench dataset to use"
description: 'Terminal-Bench dataset to use'
required: false
default: "terminal-bench-core==0.1.1"
default: 'terminal-bench-core==0.1.1'
type: string
concurrency:
description: "Number of concurrent tasks (--n-concurrent)"
description: 'Number of concurrent tasks (--n-concurrent)'
required: false
default: "4"
default: '4'
type: string
livestream:
description: "Enable livestream mode"
description: 'Enable livestream mode'
required: false
default: true
type: boolean
sample_size:
description: "Number of random tasks to run (empty = all tasks)"
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
model_name:
description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
required: false
type: string
thinking_level:
description: "Thinking level (off, low, medium, high)"
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
extra_args:
description: "Additional arguments to pass to terminal-bench"
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string

Expand Down Expand Up @@ -97,6 +97,9 @@ jobs:
- name: Generate version file
run: ./scripts/generate-version.sh

- name: Build dist/
run: make build

- name: Run Terminal-Bench
run: make benchmark-terminal
env:
Expand All @@ -120,7 +123,7 @@ jobs:
cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
echo ""
echo "Per-task summary:"
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
else
echo "No results.json found in runs/"
ls -la runs/
Expand Down Expand Up @@ -148,3 +151,4 @@ jobs:
runs/
if-no-files-found: warn
retention-days: 30

Loading