Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions pkg/evaluation/eval_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,23 +156,29 @@ func TestParseJudgeResponse(t *testing.T) {
text string
wantPassed bool
wantReason string
wantErr bool
}{
{"simple pass", `{"result": "pass", "reason": "good"}`, true, "good"},
{"simple fail", `{"result": "fail", "reason": "bad"}`, false, "bad"},
{"pass uppercase", `{"result": "PASS", "reason": "good"}`, true, "good"},
{"fail uppercase", `{"result": "FAIL", "reason": "bad"}`, false, "bad"},
{"pass mixed case", `{"result": "Pass", "reason": "good"}`, true, "good"},
{"invalid json returns false", `not json at all`, false, "failed to parse judge response"},
{"empty result returns false", `{"result": "", "reason": "empty"}`, false, "empty"},
{"missing result field", `{"reason": "no result field"}`, false, "no result field"},
{"simple pass", `{"result": "pass", "reason": "good"}`, true, "good", false},
{"simple fail", `{"result": "fail", "reason": "bad"}`, false, "bad", false},
{"pass uppercase", `{"result": "PASS", "reason": "good"}`, true, "good", false},
{"fail uppercase", `{"result": "FAIL", "reason": "bad"}`, false, "bad", false},
{"pass mixed case", `{"result": "Pass", "reason": "good"}`, true, "good", false},
{"invalid json returns error", `not json at all`, false, "", true},
{"empty result returns false", `{"result": "", "reason": "empty"}`, false, "empty", false},
{"missing result field", `{"reason": "no result field"}`, false, "no result field", false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := parseJudgeResponse(tt.text)
assert.Equal(t, tt.wantPassed, got.passed)
assert.Equal(t, tt.wantReason, got.reason)
passed, reason, err := parseJudgeResponse(tt.text)
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
assert.Equal(t, tt.wantPassed, passed)
assert.Equal(t, tt.wantReason, reason)
}
})
}
}
Expand Down
54 changes: 40 additions & 14 deletions pkg/evaluation/judge.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ package evaluation
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"strings"
"sync"

Expand Down Expand Up @@ -164,18 +167,42 @@ func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (pa
defer stream.Close()

var fullResponse strings.Builder
var streamErr error
for {
resp, err := stream.Recv()
if err != nil {
if !errors.Is(err, io.EOF) {
streamErr = err
}
break
}
for _, choice := range resp.Choices {
fullResponse.WriteString(choice.Delta.Content)
}
}

result := parseJudgeResponse(fullResponse.String())
return result.passed, result.reason, nil
if streamErr != nil {
return false, "", fmt.Errorf("streaming judge response: %w", streamErr)
}

raw := fullResponse.String()
passed, reason, err = parseJudgeResponse(raw)
if err != nil {
slog.Warn("Failed to parse judge response",
"criterion", criterion,
"raw_response", raw,
"error", err,
)
return false, "", fmt.Errorf("parsing judge response (length=%d): %w", len(raw), err)
}

slog.Debug("Judge response parsed successfully",
"criterion", criterion,
"passed", passed,
"reason", reason,
)

return passed, reason, nil
}

// judgeResponse represents the structured response from the judge model.
Expand All @@ -184,23 +211,22 @@ type judgeResponse struct {
Reason string `json:"reason"`
}

// parsedJudgeResult contains the parsed result from the judge.
type parsedJudgeResult struct {
passed bool
reason string
}

func parseJudgeResponse(text string) parsedJudgeResult {
// parseJudgeResponse parses a JSON judge response and returns whether the check
// passed, the reason, and any parse error.
func parseJudgeResponse(text string) (passed bool, reason string, err error) {
text = strings.TrimSpace(text)

var resp judgeResponse
if err := json.Unmarshal([]byte(text), &resp); err != nil {
// With structured output this should not happen, but handle gracefully
return parsedJudgeResult{passed: false, reason: "failed to parse judge response"}
return false, "", fmt.Errorf("invalid JSON: %w", err)
}

return parsedJudgeResult{
passed: strings.EqualFold(resp.Result, "pass"),
reason: resp.Reason,
if resp.Result == "" {
slog.Warn("Judge response has empty result field",
"raw_response", text,
"reason_field", resp.Reason,
)
}

return strings.EqualFold(resp.Result, "pass"), resp.Reason, nil
}