From f6dc37bd97319ec8d6ce5f343ca715cab9b113dd Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:14:38 +0000
Subject: [PATCH 1/3] 🤖 fix: replace colons in artifact names with hyphens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Artifact names cannot contain colons due to filesystem restrictions.
Convert model names like 'anthropic:claude-sonnet-4-5' to
'anthropic-claude-sonnet-4-5' in artifact names.

This fixes the nightly benchmark workflow failures where artifacts were
generated successfully but failed to upload.
---
 .github/workflows/terminal-bench.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c8fa90057..c5505e336 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -112,7 +112,8 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}
+          # Replace colons with hyphens to avoid GitHub artifact name restrictions
+          name: terminal-bench-results-${{ inputs.model_name && replace(format('{0}-{1}', inputs.model_name, github.run_id), ':', '-') || format('{0}', github.run_id) }}
         path: |
           runs/
         if-no-files-found: warn
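A note on the expression above: GitHub's workflow expression syntax documents format, contains, join, startsWith, endsWith, toJSON, fromJSON, and hashFiles, but not a replace() function, so the runner may reject this expression. If it does, an equivalent sanitization can be done in a preceding shell step and passed through a step output. The sketch below is illustrative only; the step id set-artifact-name and its name output are assumptions, not part of the existing workflow.

      - name: Compute artifact name
        id: set-artifact-name   # hypothetical step id, used only in this sketch
        if: always()
        run: |
          # Build the raw name, then swap every colon for a hyphen with bash substitution
          RAW="terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}"
          echo "name=${RAW//:/-}" >> "$GITHUB_OUTPUT"

      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.set-artifact-name.outputs.name }}

Either way the intent of the commit is the same: strip colons such as the one in 'anthropic:claude-sonnet-4-5' before the artifact name reaches upload-artifact.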
From ae7b78b324fc558e3a90b52773cbca287737eb17 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:21:53 +0000
Subject: [PATCH 2/3] 🤖 debug: log model name in agentSessionCli for benchmark verification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add console.error logging to confirm which model is being used when
running terminal-bench. This helps verify that model_name is correctly
passed through the workflow -> Makefile -> agent -> CLI chain.

Logging to stderr to avoid interfering with stdout-based result parsing.
---
 src/debug/agentSessionCli.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/debug/agentSessionCli.ts b/src/debug/agentSessionCli.ts
index 66c5b2fc9..0b0c0f429 100644
--- a/src/debug/agentSessionCli.ts
+++ b/src/debug/agentSessionCli.ts
@@ -187,6 +187,12 @@ async function main(): Promise<void> {

   const emitJsonStreaming = values["json-streaming"] === true;
   const suppressHumanOutput = emitJsonStreaming || emitFinalJson;
+
+  // Log model selection for terminal-bench verification
+  if (!suppressHumanOutput) {
+    console.error(`[cmux-cli] Using model: ${model}`);
+  }
+
   const humanStream = process.stdout;
   const writeHuman = (text: string) => {
     if (suppressHumanOutput) {

From 7f6630a8d4b09e2ceb0f43d4c997a60223f86b37 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Wed, 29 Oct 2025 19:23:45 +0000
Subject: [PATCH 3/3] 🤖 feat: print results.json before artifact upload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a step to output the full results.json and per-task summary before
uploading artifacts. This ensures we can see task-level results even if
artifact upload fails.

Output includes:
- Full results.json (with jq formatting if available)
- Per-task summary: task_id: ✓ PASS / ✗ FAIL

This will help verify that different models are producing different
results, not just different timeout patterns.
---
 .github/workflows/terminal-bench.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c5505e336..83905f86c 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -108,6 +108,24 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

+      - name: Print results summary
+        if: always()
+        run: |
+          echo "=== Terminal-Bench Results Summary ==="
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
+            RESULTS_FILE=$(find runs -name 'results.json' | head -1)
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+          else
+            echo "No results.json found in runs/"
+            ls -la runs/
+          fi
+
       - name: Upload benchmark results
         if: always()
         uses: actions/upload-artifact@v4
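A possible extension to the summary step, if an aggregate view is also wanted: the pass count can be derived from the same results.json shape the jq query above assumes (a top-level trials array whose entries carry task_id and a boolean resolved field). That shape is inferred from the query, not confirmed against terminal-bench's schema, so treat this as a sketch; the lines are written at the indentation they would have inside the step's run: block.

          # Aggregate pass/fail count over the assumed trials structure
          RESULTS_FILE=$(find runs -name 'results.json' | head -1)
          jq -r '.trials | "Passed \([.[] | select(.resolved)] | length) of \(length) tasks"' "$RESULTS_FILE" \
            2>/dev/null || echo "Failed to compute aggregate"

Printed next to the per-task lines, this makes it easier to compare runs across models at a glance, which is the verification goal stated in the commit message.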