44 workflow_call :
55 inputs :
66 model_name :
7- description : " Model to use (e.g., anthropic:claude-sonnet-4-5)"
7+ description : ' Model to use (e.g., anthropic:claude-sonnet-4-5)'
88 required : false
99 type : string
1010 thinking_level :
11- description : " Thinking level (off, low, medium, high)"
11+ description : ' Thinking level (off, low, medium, high)'
1212 required : false
1313 type : string
1414 dataset :
15- description : " Terminal-Bench dataset to use"
15+ description : ' Terminal-Bench dataset to use'
1616 required : false
1717 type : string
18- default : " terminal-bench-core==0.1.1"
18+ default : ' terminal-bench-core==0.1.1'
1919 concurrency :
20- description : " Number of concurrent tasks (--n-concurrent)"
20+ description : ' Number of concurrent tasks (--n-concurrent)'
2121 required : false
2222 type : string
23- default : " 4 "
23+ default : ' 4 '
2424 livestream :
25- description : " Enable livestream mode"
25+ description : ' Enable livestream mode'
2626 required : false
2727 type : boolean
2828 default : true
2929 sample_size :
30- description : " Number of random tasks to run (empty = all tasks)"
30+ description : ' Number of random tasks to run (empty = all tasks)'
3131 required : false
3232 type : string
3333 extra_args :
34- description : " Additional arguments to pass to terminal-bench"
34+ description : ' Additional arguments to pass to terminal-bench'
3535 required : false
3636 type : string
3737 secrets :
4242 workflow_dispatch :
4343 inputs :
4444 dataset :
45- description : " Terminal-Bench dataset to use"
45+ description : ' Terminal-Bench dataset to use'
4646 required : false
47- default : " terminal-bench-core==0.1.1"
47+ default : ' terminal-bench-core==0.1.1'
4848 type : string
4949 concurrency :
50- description : " Number of concurrent tasks (--n-concurrent)"
50+ description : ' Number of concurrent tasks (--n-concurrent)'
5151 required : false
52- default : " 4 "
52+ default : ' 4 '
5353 type : string
5454 livestream :
55- description : " Enable livestream mode"
55+ description : ' Enable livestream mode'
5656 required : false
5757 default : true
5858 type : boolean
5959 sample_size :
60- description : " Number of random tasks to run (empty = all tasks)"
60+ description : ' Number of random tasks to run (empty = all tasks)'
6161 required : false
6262 type : string
6363 model_name :
64- description : " Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
64+ description : ' Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
6565 required : false
6666 type : string
6767 thinking_level :
68- description : " Thinking level (off, low, medium, high)"
68+ description : ' Thinking level (off, low, medium, high)'
6969 required : false
7070 type : string
7171 extra_args :
72- description : " Additional arguments to pass to terminal-bench"
72+ description : ' Additional arguments to pass to terminal-bench'
7373 required : false
7474 type : string
7575
9797 - name : Generate version file
9898 run : ./scripts/generate-version.sh
9999
100+ - name : Build dist/
101+ run : make build
102+
100103 - name : Run Terminal-Bench
101104 run : make benchmark-terminal
102105 env :
@@ -120,7 +123,7 @@ jobs:
120123 cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
121124 echo ""
122125 echo "Per-task summary:"
123- cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
126+ cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
124127 else
125128 echo "No results.json found in runs/"
126129 ls -la runs/
@@ -148,3 +151,4 @@ jobs:
148151 runs/
149152 if-no-files-found : warn
150153 retention-days : 30
154+
0 commit comments