roachtest: categorise errors that occur during test teardown
Epic: none
Fixes: #98366

Release note: None
Miral Gadani committed Jun 9, 2023
1 parent b3f95aa commit 1cea00f
Showing 5 changed files with 37 additions and 21 deletions.
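The mechanism running through this commit is small: teardownTest marks every post-test validation error with a reference error (errDuringTeardown), and the GitHub issue poster later recognises that mark, selects the new teardownErr category, and prefixes the issue message with a pointer to teardown.log. Below is a minimal, self-contained sketch of that marking pattern. It assumes the usual github.com/cockroachdb/errors semantics of Mark/Is; the helper name markTeardown is illustrative, mirroring only the marking half of the teardownError closure added in test_runner.go further down (the real helper also reports the error via t.Error).

package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

// Reference error used purely as a marker, mirroring errDuringTeardown below.
var errDuringTeardown = fmt.Errorf("error during test tear down")

// markTeardown attaches the teardown marker to whatever error a post-test
// check returned (illustrative; the real helper also calls t.Error).
func markTeardown(err error) error {
	return errors.Mark(err, errDuringTeardown)
}

func main() {
	// A failed post-test validation, annotated with a detail and then marked.
	err := markTeardown(errors.WithDetail(
		errors.New("consistency check failed"), "illustrative detail"))

	// Later, when choosing an issue category, the marker is recoverable with
	// errors.Is regardless of the wrapping added in between.
	fmt.Println(errors.Is(err, errDuringTeardown)) // true
	fmt.Println(err)                               // consistency check failed
}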
22 changes: 10 additions & 12 deletions pkg/cmd/roachtest/cluster.go
@@ -1603,33 +1603,31 @@ func (c *clusterImpl) HealthStatus(
 	return results, nil
 }
 
-// FailOnInvalidDescriptors fails the test if there exists any descriptors in
+// assertValidDescriptors fails the test if there exists any descriptors in
 // the crdb_internal.invalid_objects virtual table.
-func (c *clusterImpl) FailOnInvalidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) {
+func (c *clusterImpl) assertValidDescriptors(ctx context.Context, db *gosql.DB, t *testImpl) error {
 	t.L().Printf("checking for invalid descriptors")
-	if err := timeutil.RunWithTimeout(
+	return timeutil.RunWithTimeout(
 		ctx, "invalid descriptors check", 1*time.Minute,
 		func(ctx context.Context) error {
 			return roachtestutil.CheckInvalidDescriptors(ctx, db)
 		},
-	); err != nil {
-		t.Errorf("invalid descriptors check failed: %v", err)
-	}
+	)
 }
 
-// FailOnReplicaDivergence fails the test if
+// assertConsistentReplicas fails the test if
 // crdb_internal.check_consistency(true, '', '') indicates that any ranges'
 // replicas are inconsistent with each other.
-func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, db *gosql.DB, t *testImpl) {
+func (c *clusterImpl) assertConsistentReplicas(
+	ctx context.Context, db *gosql.DB, t *testImpl,
+) error {
 	t.L().Printf("checking for replica divergence")
-	if err := timeutil.RunWithTimeout(
+	return timeutil.RunWithTimeout(
 		ctx, "consistency check", 5*time.Minute,
 		func(ctx context.Context) error {
 			return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
 		},
-	); err != nil {
-		t.Errorf("consistency check failed: %v", err)
-	}
+	)
 }
 
 // FetchDmesg grabs the dmesg logs if possible. This requires being able to run
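The substance of this file's change is that the two validation helpers no longer fail the test themselves via t.Errorf; they hand the error back so the caller in teardownTest can annotate, mark, and categorise it. Each check still runs under timeutil.RunWithTimeout. As a rough idea of what that wrapper provides, here is a standard-library approximation — a sketch only, with an assumed runWithTimeout name; the real helper lives in pkg/util/timeutil and differs in detail.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// runWithTimeout runs fn under a derived deadline and labels a timeout with
// the operation name, approximating the timeutil.RunWithTimeout calls above.
func runWithTimeout(
	ctx context.Context, op string, timeout time.Duration, fn func(context.Context) error,
) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	if err := fn(ctx); err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			return fmt.Errorf("operation %q timed out after %s: %w", op, timeout, err)
		}
		return err
	}
	return nil
}

func main() {
	// The caller now decides what a returned error means; nothing here fails a test.
	err := runWithTimeout(context.Background(), "invalid descriptors check", time.Second,
		func(ctx context.Context) error {
			<-ctx.Done() // stand-in for a validation query that outlives the deadline
			return ctx.Err()
		})
	fmt.Println(err)
}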
5 changes: 5 additions & 0 deletions pkg/cmd/roachtest/github.go
@@ -38,6 +38,7 @@ const (
 	otherErr issueCategory = iota
 	clusterCreationErr
 	sshErr
+	teardownErr
 )
 
 func newGithubIssues(disable bool, c *clusterImpl, vmCreateOpts *vm.CreateOpts) *githubIssues {
@@ -128,6 +129,8 @@ func (g *githubIssues) createPostRequest(
 		issueName = "ssh_problem"
 		messagePrefix = fmt.Sprintf("test %s failed due to ", t.Name())
 		infraFlake = true
+	} else if cat == teardownErr {
+		messagePrefix = fmt.Sprintf("test %s failed during teardown (see teardown.log) due to ", t.Name())
 	}
 
 	// Issues posted from roachtest are identifiable as such and
@@ -226,6 +229,8 @@ func (g *githubIssues) MaybePost(t *testImpl, l *logger.Logger, message string)
 		cat = clusterCreationErr
 	} else if failureContainsError(firstFailure, rperrors.ErrSSH255) {
 		cat = sshErr
+	} else if failureContainsError(firstFailure, errDuringTeardown) {
+		cat = teardownErr
 	}
 
 	return g.issuePoster(
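createPostRequest now gives teardown failures a message prefix that points at teardown.log, and MaybePost picks the new teardownErr category by probing the first failure for the errDuringTeardown reference. failureContainsError itself is not part of this diff; under the marking scheme sketched earlier it plausibly reduces to errors.Is against the failure's squashed error. A hypothetical sketch follows — the failure struct and its squashedErr field here are illustrative stand-ins, not the roachtest definitions.

package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

// failure is a stand-in for roachtest's failure struct; only a squashed error
// is modelled here (field name is illustrative).
type failure struct {
	squashedErr error
}

var errDuringTeardown = fmt.Errorf("error during test tear down")

// failureContainsError, hypothetically: does the failure's squashed error
// carry the given reference? errors.Is answers that for wrapped as well as
// Mark-ed errors.
func failureContainsError(f failure, reference error) bool {
	return errors.Is(f.squashedErr, reference)
}

func main() {
	f := failure{squashedErr: errors.Mark(errors.New("dead node(s) detected"), errDuringTeardown)}
	fmt.Println(failureContainsError(f, errDuringTeardown)) // true → cat = teardownErr
}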
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/github_test.go
@@ -147,6 +147,8 @@ func TestCreatePostRequest(t *testing.T) {
 		},
 		//Simulate failure loading TEAMS.yaml
 		{true, false, true, false, "", otherErr, false, false, nil},
+		//Error during teardown
+		{true, false, false, false, "", teardownErr, false, false, nil},
 	}
 
 	reg := makeTestRegistry(spec.GCE, "", "", false, false)
@@ -216,6 +218,8 @@ func TestCreatePostRequest(t *testing.T) {
 		expectedLabel = "T-testeng"
 		expectedName = "ssh_problem"
 		expectedMessagePrefix = "test github_test failed due to "
+	} else if c.category == teardownErr {
+		expectedMessagePrefix = "test github_test failed during teardown (see teardown.log) due to "
 	}
 
 	require.Contains(t, req.MentionOnCreate, expectedTeam)
5 changes: 2 additions & 3 deletions pkg/cmd/roachtest/test_impl.go
@@ -370,8 +370,7 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.output = append(t.mu.output, '\n')
 }
 
-// We take the first error from each failure which is the
-// "squashed" error that contains all information of a failure
+// We take the "squashed" error that contains information of all the errors for each failure.
 func formatFailure(b *strings.Builder, reportFailures ...failure) {
 	for i, failure := range reportFailures {
 		if i > 0 {
@@ -402,7 +401,7 @@ func (t *testImpl) failedRLocked() bool {
 func (t *testImpl) firstFailure() failure {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	if len(t.mu.failures) <= 0 {
+	if len(t.mu.failures) == 0 {
 		return failure{}
 	}
 	return t.mu.failures[0]
22 changes: 16 additions & 6 deletions pkg/cmd/roachtest/test_runner.go
@@ -61,6 +61,9 @@ var (
 	// reference error used when cluster creation fails for a test
 	errClusterProvisioningFailed = fmt.Errorf("cluster could not be created")
 
+	// reference error for any failures during test teardown
+	errDuringTeardown = fmt.Errorf("error during test tear down")
+
 	prometheusNameSpace = "roachtest"
 	// prometheusScrapeInterval should be consistent with the scrape interval defined in
 	// https://grafana.testeng.crdb.io/prometheus/config
@@ -941,7 +944,7 @@ func (r *testRunner) runTest(
 
 	durationStr := fmt.Sprintf("%.2fs", t.duration().Seconds())
 	if t.Failed() {
-		output := fmt.Sprintf("test artifacts and logs in: %s\n%s", t.ArtifactsDir(), t.failureMsg())
+		output := fmt.Sprintf("%s\ntest artifacts and logs in: %s", t.failureMsg(), t.ArtifactsDir())
 
 		if teamCity {
 			// If `##teamcity[testFailed ...]` is not present before `##teamCity[testFinished ...]`,
@@ -1065,7 +1068,7 @@ func (r *testRunner) runTest(
 		if t.Failed() {
 			s = "failure"
 		}
-		t.L().Printf("tearing down after %s; see teardown.log", s)
+		t.L().Printf("tearing down after %s; see teardown.log (subsequent failures may still occur there)", s)
 	case <-time.After(timeout):
 		// NB: We're adding the timeout failure intentionally without cancelling the context
 		// to capture as much state as possible during artifact collection.
@@ -1090,6 +1093,9 @@ func (r *testRunner) teardownTest(
 	ctx context.Context, t *testImpl, c *clusterImpl, timedOut bool,
 ) error {
 
+	teardownError := func(err error) {
+		t.Error(errors.Mark(err, errDuringTeardown))
+	}
 	// We still have to collect artifacts and run post-flight checks, and any of
 	// these might hang. So they go into a goroutine and the main goroutine
 	// abandons them after a timeout. We intentionally don't wait for the
@@ -1151,7 +1157,7 @@
 	if err := c.assertNoDeadNode(ctx, t); err != nil {
 		// Some tests expect dead nodes, so they may opt out of this check.
 		if t.spec.SkipPostValidations&registry.PostValidationNoDeadNodes == 0 {
-			t.Error(err)
+			teardownError(err)
 		} else {
 			t.L().Printf("dead node(s) detected but expected")
 		}
@@ -1161,7 +1167,7 @@
 	// and select the first one that succeeds to run the validation queries
 	statuses, err := c.HealthStatus(ctx, t.L(), c.All())
 	if err != nil {
-		t.Error(errors.WithDetail(err, "Unable to check health status"))
+		teardownError(errors.WithDetail(err, "Unable to check health status"))
 	}
 
 	var db *gosql.DB
@@ -1198,12 +1204,16 @@
 		// If this validation fails due to a timeout, it is very likely that
 		// the replica divergence check below will also fail.
 		if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
-			c.FailOnInvalidDescriptors(ctx, db, t)
+			if err := c.assertValidDescriptors(ctx, db, t); err != nil {
+				teardownError(errors.WithDetail(err, "invalid descriptors check failed"))
+			}
 		}
 		// Detect replica divergence (i.e. ranges in which replicas have arrived
 		// at the same log position with different states).
 		if t.spec.SkipPostValidations&registry.PostValidationReplicaDivergence == 0 {
-			c.FailOnReplicaDivergence(ctx, db, t)
+			if err := c.assertConsistentReplicas(ctx, db, t); err != nil {
+				teardownError(errors.WithDetail(err, "consistency check failed"))
+			}
 		}
 	} else {
 		t.L().Printf("no live node found, skipping validation checks")
