Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions pkg/evaluation/scoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ func computeSummary(results []Result) Summary {
}
}

summary.ToolsTotal += r.ToolCallsExpected
summary.ToolsPassed += r.ToolCallsScore * r.ToolCallsExpected
if r.ToolCallsExpected > 0 {
summary.ToolsF1Sum += r.ToolCallsScore
summary.ToolsCount++
}

summary.HandoffsTotal++
if r.HandoffsMatch {
Expand All @@ -115,24 +117,28 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) {
}

printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal)
printMetricFloat(out, "Tool Calls", summary.ToolsPassed, summary.ToolsTotal)
printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount)
printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Loss of precision when casting Relevance scores from float64 to int

The RelevancePassed and RelevanceTotal fields in the Summary struct are defined as float64 (types.go lines 99-100) and can contain fractional values from accumulated relevance scores. Casting them to int here truncates decimal portions, losing precision.

This is inconsistent with:

  • How the Tool Calls metric preserves float precision (line 120 uses printF1Score)
  • How the underlying values are stored (float64 in the Summary struct)
  • How relevance scores are accumulated (can be fractional)

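For example, a RelevancePassed value of 8.5 would display as 8, making the displayed metric inconsistent with the actual accumulated scores. The percentage is affected too, because printMetric computes the ratio from the already-truncated integers: with a RelevanceTotal of 10, the reported figure drops from 85.0% to 80.0%.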

Suggestion: If you want to maintain float precision for Relevance like Tool Calls, consider creating a similar display function or keeping the previous printMetricFloat approach (this change removes printMetricFloat, so it would need to be retained for Relevance). If integer display is intentional, consider documenting why relevance is treated differently than other float-based metrics.

printMetricFloat(out, "Relevance", summary.RelevancePassed, summary.RelevanceTotal)
printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal))

fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost)
fmt.Fprintf(out, "Total Time: %s\n", duration.Round(time.Second))
}

func printMetric(out io.Writer, label string, passed, total int) {
printMetricFloat(out, label, float64(passed), float64(total))
if total == 0 {
return // Skip metrics with no data
}
ratio := float64(passed) / float64(total)
fmt.Fprintf(out, "%s %14s: %d/%d passed (%.1f%%)\n", statusIcon(ratio), label, passed, total, ratio*100)
}

func printMetricFloat(out io.Writer, label string, passed, total float64) {
if total == 0 {
func printF1Score(out io.Writer, label string, f1Sum float64, count int) {
if count == 0 {
return // Skip metrics with no data
}
ratio := passed / total
fmt.Fprintf(out, "%s %14s: %.0f/%.0f passed (%.1f%%)\n", statusIcon(ratio), label, passed, total, ratio*100)
avgF1 := f1Sum / float64(count)
fmt.Fprintf(out, "%s %14s: %.1f%% avg F1 (%d evals)\n", statusIcon(avgF1), label, avgF1*100, count)
}

func statusIcon(ratio float64) string {
Expand Down
4 changes: 2 additions & 2 deletions pkg/evaluation/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ type Summary struct {
TotalCost float64 `json:"total_cost"`
SizesPassed int `json:"sizes_passed"`
SizesTotal int `json:"sizes_total"`
ToolsPassed float64 `json:"tools_passed"`
ToolsTotal float64 `json:"tools_total"`
ToolsF1Sum float64 `json:"tools_f1_sum"`
ToolsCount int `json:"tools_count"`
HandoffsPassed int `json:"handoffs_passed"`
HandoffsTotal int `json:"handoffs_total"`
RelevancePassed float64 `json:"relevance_passed"`
Expand Down
Loading