From 1fe7c9d279a159a9c203185c31ebde2806c099e6 Mon Sep 17 00:00:00 2001
From: DarrylWong <darryl@cockroachlabs.com>
Date: Wed, 29 Nov 2023 17:37:40 +0000
Subject: [PATCH] roachtest: no longer show other errors if a test times out

When a test times out, it often leads to other failures
(i.e. context cancelled). However, these are often just
noise and distract from the root cause of the failures,
the timeout.

This change makes it so when a timeout failure is added,
it suppresses future errors from being shown in github posts
and the test failure message. They are still logged in their
respective failure.log files.

Epic: none
Release note: none
Fixes: #114565
---
 pkg/cmd/roachtest/test_impl.go   | 26 +++++++++++++++++++++++---
 pkg/cmd/roachtest/test_runner.go |  3 +++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
index a4fa7a1a221c..18dba3679716 100644
--- a/pkg/cmd/roachtest/test_impl.go
+++ b/pkg/cmd/roachtest/test_impl.go
@@ -106,6 +106,13 @@ type testImpl struct {
 		// referencing 0+ errors. failure captures all the errors
 		failures []failure
 
+		// failuresSuppressed indicates if further failures should be added to mu.failures.
+		failuresSuppressed bool
+
+		// numFailures is the number of failures that have been added via addFailures.
+		// This can deviate from len(failures) if failures have been suppressed.
+		numFailures int
+
 		// status is a map from goroutine id to status set by that goroutine. A
 		// special goroutine is indicated by runnerID; that one provides the test's
 		// "main status".
@@ -393,13 +400,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	t.mu.failures = append(t.mu.failures, reportFailure)
+	if !t.mu.failuresSuppressed {
+		t.mu.failures = append(t.mu.failures, reportFailure)
+	}
 
 	var b strings.Builder
 	formatFailure(&b, reportFailure)
 	msg := b.String()
 
-	failureNum := len(t.mu.failures)
+	t.mu.numFailures++
+	failureNum := t.mu.numFailures
 	failureLog := fmt.Sprintf("failure_%d", failureNum)
 	t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
 	// Also dump the verbose error (incl. all stack traces) to a log file, in case
@@ -425,6 +435,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.output = append(t.mu.output, '\n')
 }
 
+// suppressFailures will stop future failures from being surfaced to github posting
+// or the test logger. It will not stop those failures from being logged in their
+// own failure.log files. Used if we are confident on the root cause of a failure and
+// want to reduce noise of other failures, i.e. timeouts.
+func (t *testImpl) suppressFailures() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mu.failuresSuppressed = true
+}
+
 // We take the "squashed" error that contains information of all the errors for each failure.
 func formatFailure(b *strings.Builder, reportFailures ...failure) {
 	for i, failure := range reportFailures {
@@ -450,7 +470,7 @@ func (t *testImpl) Failed() bool {
 }
 
 func (t *testImpl) failedRLocked() bool {
-	return len(t.mu.failures) > 0
+	return t.mu.numFailures > 0
 }
 
 func (t *testImpl) firstFailure() failure {
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index fc4170422373..8dd22b17dac8 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -1139,6 +1139,9 @@ func (r *testRunner) runTest(
 		// NB: We're adding the timeout failure intentionally without cancelling the context
 		// to capture as much state as possible during artifact collection.
 		t.addFailure(0, "test timed out (%s)", timeout)
+		// We suppress other failures from being surfaced to the top as the timeout is always going
+		// to be the main error and subsequent errors (i.e. context cancelled) add noise.
+		t.suppressFailures()
 		timedOut = true
 	}