
testing: correct benchmark output when running in parallel #28814
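This change splits the single benchmark duration into wall-clock elapsed time and per-goroutine process time, so that ns/op for RunParallel benchmarks reflects the work actually done per operation rather than only elapsed wall-clock time. A minimal sketch of the kind of benchmark affected, mirroring the test added in this PR (the benchmark name is illustrative):

    func BenchmarkSleepParallel(b *testing.B) {
        // Work is spread across parallelism*GOMAXPROCS goroutines, so the
        // wall-clock time per iteration is far smaller than the ~1µs each
        // operation actually spends sleeping.
        b.SetParallelism(4)
        b.RunParallel(func(pb *testing.PB) {
            for pb.Next() {
                time.Sleep(time.Microsecond)
            }
        })
    }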

Closed
wants to merge 1 commit into from
77 changes: 51 additions & 26 deletions src/testing/benchmark.go
@@ -82,19 +82,19 @@ type InternalBenchmark struct {
// affecting benchmark results.
type B struct {
common
importPath string // import path of the package containing the benchmark
context *benchContext
N int
previousN int // number of iterations in the previous run
previousDuration time.Duration // total duration of the previous run
benchFunc func(b *B)
benchTime benchTimeFlag
bytes int64
missingBytes bool // one of the subbenchmarks does not have bytes set.
timerOn bool
showAllocResult bool
result BenchmarkResult
parallelism int // RunParallel creates parallelism*GOMAXPROCS goroutines
importPath string // import path of the package containing the benchmark
context *benchContext
N int
previousN int // number of iterations in the previous run
previousElapsedTime time.Duration // total duration of the previous run
benchFunc func(b *B)
benchTime benchTimeFlag
bytes int64
missingBytes bool // one of the subbenchmarks does not have bytes set.
timerOn bool
showAllocResult bool
result BenchmarkResult
parallelism int // RunParallel creates parallelism*GOMAXPROCS goroutines
// The initial states of memStats.Mallocs and memStats.TotalAlloc.
startAllocs uint64
startBytes uint64
@@ -121,7 +121,8 @@ func (b *B) StartTimer() {
// want to measure.
func (b *B) StopTimer() {
if b.timerOn {
b.duration += time.Since(b.start)
b.elapsedTime += time.Since(b.start)
b.processTime += time.Since(b.start)
runtime.ReadMemStats(&memStats)
b.netAllocs += memStats.Mallocs - b.startAllocs
b.netBytes += memStats.TotalAlloc - b.startBytes
@@ -138,7 +139,8 @@ func (b *B) ResetTimer() {
b.startBytes = memStats.TotalAlloc
b.start = time.Now()
}
b.duration = 0
b.elapsedTime = 0
b.processTime = 0
b.netAllocs = 0
b.netBytes = 0
}
@@ -158,7 +160,7 @@ func (b *B) nsPerOp() int64 {
if b.N <= 0 {
return 0
}
return b.duration.Nanoseconds() / int64(b.N)
return b.processTime.Nanoseconds() / int64(b.N)
}

// runN runs a single benchmark for the specified number of iterations.
@@ -176,7 +178,7 @@ func (b *B) runN(n int) {
b.benchFunc(b)
b.StopTimer()
b.previousN = n
b.previousDuration = b.duration
b.previousElapsedTime = b.elapsedTime
b.raceErrors += race.Errors()
if b.raceErrors > 0 {
b.Errorf("race detected during execution of benchmark")
@@ -312,7 +314,7 @@ func (b *B) launch() {
b.runN(b.benchTime.n)
} else {
d := b.benchTime.d
for n := 1; !b.failed && b.duration < d && n < 1e9; {
for n := 1; !b.failed && b.elapsedTime < d && n < 1e9; {
last := n
// Predict required iterations.
n = int(d.Nanoseconds())
@@ -328,13 +330,14 @@
b.runN(n)
}
}
b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes}
b.result = BenchmarkResult{b.N, b.elapsedTime, b.processTime, b.bytes, b.netAllocs, b.netBytes}
}

// The results of a benchmark run.
type BenchmarkResult struct {
N int // The number of iterations.
T time.Duration // The total time taken.
P time.Duration // Process time taken.
Bytes int64 // Bytes processed in one iteration.
MemAllocs uint64 // The total number of memory allocations.
MemBytes uint64 // The total number of bytes allocated.
@@ -344,7 +347,14 @@ func (r BenchmarkResult) NsPerOp() int64 {
if r.N <= 0 {
return 0
}
return r.T.Nanoseconds() / int64(r.N)
return r.P.Nanoseconds() / int64(r.N)
}

func (r BenchmarkResult) OpsPerSecond() float64 {
if r.T.Seconds() > 0 {
return float64(r.N) / r.T.Seconds()
}
return 0
}

func (r BenchmarkResult) mbPerSec() float64 {
@@ -382,12 +392,14 @@ func (r BenchmarkResult) String() string {
// The format specifiers here make sure that
// the ones digits line up for all three possible formats.
if nsop < 10 {
ns = fmt.Sprintf("%13.2f ns/op", float64(r.T.Nanoseconds())/float64(r.N))
ns = fmt.Sprintf("%13.2f ns/op", float64(r.P.Nanoseconds())/float64(r.N))
} else {
ns = fmt.Sprintf("%12.1f ns/op", float64(r.T.Nanoseconds())/float64(r.N))
ns = fmt.Sprintf("%12.1f ns/op", float64(r.P.Nanoseconds())/float64(r.N))
}
}
return fmt.Sprintf("%8d\t%s%s", r.N, ns, mb)

opss := fmt.Sprintf("%13.2f ops/s", r.OpsPerSecond())
return fmt.Sprintf("%8d\t%s\t%s%s", r.N, ns, opss, mb)
}

// MemString returns r.AllocedBytesPerOp and r.AllocsPerOp in the same format as 'go test'.
@@ -565,7 +577,7 @@ func (b *B) add(other BenchmarkResult) {
// The aggregated BenchmarkResults resemble running all subbenchmarks as
// in sequence in a single benchmark.
r.N = 1
r.T += time.Duration(other.NsPerOp())
r.P += time.Duration(other.NsPerOp())
if other.Bytes == 0 {
// Summing Bytes is meaningless in aggregate if not all subbenchmarks
// set it.
@@ -638,8 +650,8 @@ func (b *B) RunParallel(body func(*PB)) {
// 100µs is enough to amortize the overhead and provide sufficient
// dynamic load balancing.
grain := uint64(0)
if b.previousN > 0 && b.previousDuration > 0 {
grain = 1e5 * uint64(b.previousN) / uint64(b.previousDuration)
if b.previousN > 0 && b.previousElapsedTime > 0 {
grain = 1e5 * uint64(b.previousN) / uint64(b.previousElapsedTime)
}
if grain < 1 {
grain = 1
@@ -654,18 +666,31 @@
numProcs := b.parallelism * runtime.GOMAXPROCS(0)
var wg sync.WaitGroup
wg.Add(numProcs)
// Stop the timer: this is where elapsed time and process time diverge.
// Elapsed time is counted once for the whole parallel section, while
// process time is accumulated separately in each goroutine.
b.StopTimer()
timeReportingMutex := sync.Mutex{}
b.start = time.Now()
for p := 0; p < numProcs; p++ {
go func() {
start := time.Now()
defer wg.Done()
pb := &PB{
globalN: &n,
grain: grain,
bN: uint64(b.N),
}
body(pb)
timeReportingMutex.Lock()
defer timeReportingMutex.Unlock()
b.processTime += time.Since(start)
}()
}
wg.Wait()
b.elapsedTime += time.Since(b.start)
// Normal timing resumes from this point.
b.StartTimer()
if n <= uint64(b.N) && !b.Failed() {
b.Fatal("RunParallel: body exited without pb.Next() == false")
}
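A hedged usage sketch of the result fields touched above: BenchmarkResult.T, the new P field, NsPerOp, and OpsPerSecond come from this diff; the surrounding program is illustrative only.

    package main

    import (
        "fmt"
        "testing"
        "time"
    )

    func main() {
        r := testing.Benchmark(func(b *testing.B) {
            b.RunParallel(func(pb *testing.PB) {
                for pb.Next() {
                    time.Sleep(time.Microsecond) // stand-in for real per-op work
                }
            })
        })
        // r.T is wall-clock time for the run; r.P (added in this change) sums
        // the time spent in each goroutine, and NsPerOp now divides P, not T.
        fmt.Printf("N=%d elapsed=%v process=%v ns/op=%d ops/s=%.0f\n",
            r.N, r.T, r.P, r.NsPerOp(), r.OpsPerSecond())
    }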
18 changes: 18 additions & 0 deletions src/testing/benchmark_test.go
@@ -10,6 +10,7 @@ import (
"sync/atomic"
"testing"
"text/template"
"time"
)

var roundDownTests = []struct {
@@ -94,6 +95,23 @@ func TestRunParallelFail(t *testing.T) {
})
}

// Ensure that elapsed time is calculated properly when running in parallel (per #28813)
func TestRunParallelResults(t *testing.T) {
r := testing.Benchmark(func(b *testing.B) {
b.SetParallelism(4)
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
time.Sleep(1 * time.Microsecond)
}
})
})

// Each iteration should take at least 1 microsecond (1000 ns)
if r.NsPerOp() < 1000 {
t.Errorf("ran too quickly; took %d ns - should be greater than 1000", r.NsPerOp())
}
}

func ExampleB_RunParallel() {
// Parallel benchmark for text/template.Template.Execute on a single object.
testing.Benchmark(func(b *testing.B) {
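Rough arithmetic behind the assertion in TestRunParallelResults, using illustrative numbers (say GOMAXPROCS=4, so 4*4=16 goroutines):

    per-operation sleep       ≈ 1µs (1000 ns), paid inside one goroutine
    wall-clock elapsed time   ≈ N * 1µs / 16, since the 16 goroutines run concurrently
    old ns/op = elapsed / N   ≈ 1000/16 ≈ 62 ns   → would fail the >= 1000 ns check
    new ns/op = process / N   ≈ 1000 ns            (process time sums per-goroutine work)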
25 changes: 13 additions & 12 deletions src/testing/testing.go
@@ -315,15 +315,16 @@ type common struct {
raceErrors int // number of races detected during test
runner string // function name of tRunner running the test

parent *common
level int // Nesting depth of test or benchmark.
creator []uintptr // If level > 0, the stack trace at the point where the parent called t.Run.
name string // Name of test or benchmark.
start time.Time // Time test or benchmark started
duration time.Duration
barrier chan bool // To signal parallel subtests they may start.
signal chan bool // To signal a test is done.
sub []*T // Queue of subtests to be run in parallel.
parent *common
level int // Nesting depth of test or benchmark.
creator []uintptr // If level > 0, the stack trace at the point where the parent called t.Run.
name string // Name of test or benchmark.
start time.Time // Time test or benchmark started
elapsedTime time.Duration // Total real-world time taken to run
processTime time.Duration // Total process time taken to run
barrier chan bool // To signal parallel subtests they may start.
signal chan bool // To signal a test is done.
sub []*T // Queue of subtests to be run in parallel.
}

// Short reports whether the -test.short flag is set.
@@ -750,7 +751,7 @@ func (t *T) Parallel() {
// We don't want to include the time we spend waiting for serial tests
// in the test duration. Record the elapsed time thus far and reset the
// timer afterwards.
t.duration += time.Since(t.start)
t.elapsedTime += time.Since(t.start)

// Add to the list of tests to be released by the parent.
t.parent.sub = append(t.parent.sub, t)
@@ -809,7 +810,7 @@ func tRunner(t *T, fn func(t *T)) {
t.Errorf("race detected during execution of test")
}

t.duration += time.Since(t.start)
t.elapsedTime += time.Since(t.start)
// If the test panicked, print any test output before dying.
err := recover()
signal := true
@@ -1088,7 +1089,7 @@ func (t *T) report() {
if t.parent == nil {
return
}
dstr := fmtDuration(t.duration)
dstr := fmtDuration(t.elapsedTime)
format := "--- %s: %s (%s)\n"
if t.Failed() {
t.flushToParent(format, "FAIL", t.name, dstr)