From 1fe7c9d279a159a9c203185c31ebde2806c099e6 Mon Sep 17 00:00:00 2001 From: DarrylWong Date: Wed, 29 Nov 2023 17:37:40 +0000 Subject: [PATCH] roachtest: no longer show other errors if a test times out When a test times out, it often leads to other failures (i.e. context cancelled). However, these are often just noise and distract from the root cause of the failures, the timeout. This change makes it so when a timeout failure is added, it suppresses future errors from being shown in github posts and the test failure message. They are still logged in their respective failure.log files. Epic: none Release note: none Fixes: #114565 --- pkg/cmd/roachtest/test_impl.go | 26 +++++++++++++++++++++++--- pkg/cmd/roachtest/test_runner.go | 3 +++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index a4fa7a1a221c..18dba3679716 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -106,6 +106,13 @@ type testImpl struct { // referencing 0+ errors. failure captures all the errors failures []failure + // failuresSuppressed indicates if further failures should be added to mu.failures. + failuresSuppressed bool + + // numFailures is the number of failures that have been added via addFailures. + // This can deviate from len(failures) if failures have been suppressed. + numFailures int + // status is a map from goroutine id to status set by that goroutine. A // special goroutine is indicated by runnerID; that one provides the test's // "main status". @@ -393,13 +400,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) { t.mu.Lock() defer t.mu.Unlock() - t.mu.failures = append(t.mu.failures, reportFailure) + if !t.mu.failuresSuppressed { + t.mu.failures = append(t.mu.failures, reportFailure) + } var b strings.Builder formatFailure(&b, reportFailure) msg := b.String() - failureNum := len(t.mu.failures) + t.mu.numFailures++ + failureNum := t.mu.numFailures failureLog := fmt.Sprintf("failure_%d", failureNum) t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg) // Also dump the verbose error (incl. all stack traces) to a log file, in case @@ -425,6 +435,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) { t.mu.output = append(t.mu.output, '\n') } +// suppressFailures will stop future failures from being surfaced to github posting +// or the test logger. It will not stop those failures from being logged in their +// own failure.log files. Used if we are confident on the root cause of a failure and +// want to reduce noise of other failures, i.e. timeouts. +func (t *testImpl) suppressFailures() { + t.mu.Lock() + defer t.mu.Unlock() + t.mu.failuresSuppressed = true +} + // We take the "squashed" error that contains information of all the errors for each failure. func formatFailure(b *strings.Builder, reportFailures ...failure) { for i, failure := range reportFailures { @@ -450,7 +470,7 @@ func (t *testImpl) Failed() bool { } func (t *testImpl) failedRLocked() bool { - return len(t.mu.failures) > 0 + return t.mu.numFailures > 0 } func (t *testImpl) firstFailure() failure { diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index fc4170422373..8dd22b17dac8 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1139,6 +1139,9 @@ func (r *testRunner) runTest( // NB: We're adding the timeout failure intentionally without cancelling the context // to capture as much state as possible during artifact collection. t.addFailure(0, "test timed out (%s)", timeout) + // We suppress other failures from being surfaced to the top as the timeout is always going + // to be the main error and subsequent errors (i.e. context cancelled) add noise. + t.suppressFailures() timedOut = true }