From f6dc37bd97319ec8d6ce5f343ca715cab9b113dd Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:14:38 +0000
Subject: [PATCH 1/3] 🤖 fix: replace colons in artifact names with hyphens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Artifact names cannot contain colons due to filesystem restrictions.
Convert model names like 'anthropic:claude-sonnet-4-5' to
'anthropic-claude-sonnet-4-5' in artifact names.

This fixes the nightly benchmark workflow failures where artifacts were
generated successfully but failed to upload.
---
 .github/workflows/terminal-bench.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c8fa90057..c5505e336 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -112,7 +112,8 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}
+          # Replace colons with hyphens to avoid GitHub artifact name restrictions
+          name: terminal-bench-results-${{ inputs.model_name && replace(format('{0}-{1}', inputs.model_name, github.run_id), ':', '-') || format('{0}', github.run_id) }}
         path: |
           runs/
         if-no-files-found: warn
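A note on the expression above: GitHub's workflow expression syntax documents format, contains, join, startsWith, endsWith, toJSON, fromJSON, and hashFiles, but not a replace() function, so the runner may reject this expression. If it does, an equivalent sanitization can be done in a preceding shell step and passed through a step output. The sketch below is illustrative only; the step id set-artifact-name and its name output are assumptions, not part of the existing workflow.

      - name: Compute artifact name
        id: set-artifact-name   # hypothetical step id, used only in this sketch
        if: always()
        run: |
          # Build the raw name, then swap every colon for a hyphen with bash substitution
          RAW="terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}"
          echo "name=${RAW//:/-}" >> "$GITHUB_OUTPUT"

      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.set-artifact-name.outputs.name }}

Either way the intent of the commit is the same: strip colons such as the one in 'anthropic:claude-sonnet-4-5' before the artifact name reaches upload-artifact.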
From ae7b78b324fc558e3a90b52773cbca287737eb17 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:21:53 +0000
Subject: [PATCH 2/3] 🤖 debug: log model name in agentSessionCli for benchmark verification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add console.error logging to confirm which model is being used when
running terminal-bench. This helps verify that model_name is correctly
passed through the workflow -> Makefile -> agent -> CLI chain.

Logging to stderr to avoid interfering with stdout-based result parsing.
---
 src/debug/agentSessionCli.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/debug/agentSessionCli.ts b/src/debug/agentSessionCli.ts
index 66c5b2fc9..0b0c0f429 100644
--- a/src/debug/agentSessionCli.ts
+++ b/src/debug/agentSessionCli.ts
@@ -187,6 +187,12 @@ async function main(): Promise<void> {

   const emitJsonStreaming = values["json-streaming"] === true;
   const suppressHumanOutput = emitJsonStreaming || emitFinalJson;
+
+  // Log model selection for terminal-bench verification
+  if (!suppressHumanOutput) {
+    console.error(`[cmux-cli] Using model: ${model}`);
+  }
+
   const humanStream = process.stdout;
   const writeHuman = (text: string) => {
     if (suppressHumanOutput) {

From 7f6630a8d4b09e2ceb0f43d4c997a60223f86b37 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:23:45 +0000
Subject: [PATCH 3/3] 🤖 feat: print results.json before artifact upload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a step to output the full results.json and per-task summary before
uploading artifacts. This ensures we can see task-level results even if
artifact upload fails.

Output includes:
- Full results.json (with jq formatting if available)
- Per-task summary: task_id: ✓ PASS / ✗ FAIL

This will help verify that different models are producing different
results, not just different timeout patterns.
---
 .github/workflows/terminal-bench.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c5505e336..83905f86c 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -108,6 +108,24 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

+      - name: Print results summary
+        if: always()
+        run: |
+          echo "=== Terminal-Bench Results Summary ==="
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
+            RESULTS_FILE=$(find runs -name 'results.json' | head -1)
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+          else
+            echo "No results.json found in runs/"
+            ls -la runs/
+          fi
+
       - name: Upload benchmark results
         if: always()
         uses: actions/upload-artifact@v4
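A possible extension to the summary step, if an aggregate view is also wanted: the pass count can be derived from the same results.json shape the jq query above assumes (a top-level trials array whose entries carry task_id and a boolean resolved field). That shape is inferred from the query, not confirmed against terminal-bench's schema, so treat this as a sketch; the lines are written at the indentation they would have inside the step's run: block.

          # Aggregate pass/fail count over the assumed trials structure
          RESULTS_FILE=$(find runs -name 'results.json' | head -1)
          jq -r '.trials | "Passed \([.[] | select(.resolved)] | length) of \(length) tasks"' "$RESULTS_FILE" \
            2>/dev/null || echo "Failed to compute aggregate"

Printed next to the per-task lines, this makes it easier to compare runs across models at a glance, which is the verification goal stated in the commit message.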