diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index c8fa90057..83905f86c 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -108,11 +108,47 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
+      - name: Print results summary
+        if: always()
+        run: |
+          echo "=== Terminal-Bench Results Summary ==="
+          # Resolve the results file once; tolerate a missing runs/ directory.
+          RESULTS_FILE=$(find runs -name 'results.json' 2>/dev/null | head -1 || true)
+          if [ -n "$RESULTS_FILE" ] && [ -f "$RESULTS_FILE" ]; then
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            jq '.' "$RESULTS_FILE" || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' "$RESULTS_FILE" 2>/dev/null || echo "Failed to parse task details"
+          else
+            echo "No results.json found in runs/"
+            # Best-effort listing: this diagnostic step must not fail when runs/ is absent.
+            ls -la runs/ || true
+          fi
+
+      - name: Compute artifact name
+        id: artifact-name
+        if: always()
+        env:
+          MODEL_NAME: ${{ inputs.model_name }}
+        run: |
+          # Workflow expressions have no replace() function, so sanitize in the shell.
+          # Colons are not allowed in artifact names; swap them for hyphens.
+          if [ -n "$MODEL_NAME" ]; then
+            SUFFIX="${MODEL_NAME//:/-}-${GITHUB_RUN_ID}"
+          else
+            SUFFIX="${GITHUB_RUN_ID}"
+          fi
+          echo "name=terminal-bench-results-${SUFFIX}" >> "$GITHUB_OUTPUT"
+
       - name: Upload benchmark results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: terminal-bench-results-${{ inputs.model_name && format('{0}-{1}', inputs.model_name, github.run_id) || format('{0}', github.run_id) }}
+          # Name is sanitized (colons replaced with hyphens) by the artifact-name step
+          name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
           if-no-files-found: warn
diff --git a/src/debug/agentSessionCli.ts b/src/debug/agentSessionCli.ts
index 66c5b2fc9..0b0c0f429 100644
--- a/src/debug/agentSessionCli.ts
+++ b/src/debug/agentSessionCli.ts
@@ -187,6 +187,12 @@ async function main(): Promise {
 
   const emitJsonStreaming = values["json-streaming"] === true;
   const suppressHumanOutput = emitJsonStreaming || emitFinalJson;
+
+  // Log model selection for terminal-bench verification
+  if (!suppressHumanOutput) {
+    console.error(`[cmux-cli] Using model: ${model}`);
+  }
+
   const humanStream = process.stdout;
const writeHuman = (text: string) => { if (suppressHumanOutput) {