Skip to content

Commit

Permalink
Merge #34711 #34721
Browse files Browse the repository at this point in the history
34711: roachtest: fix flakey follower read roachtest r=ajwerner a=ajwerner

The follower read roachtest sustains reads against followers and verifies that
the 90th %-ile latency never exceeds 20ms. During certain situations like lease
transfer, the latency may jump up. This change now runs the sustained reads for
longer (3 minutes instead of 1) and verifies that the latency is low at least
80% of the time. This should be much less fragile.

Fixes #34701.

Release note: None

34721: server: remove timeout from the problem ranges page r=andreimatei a=andreimatei

This page was using an arbitrary timeout of 3s which was proving
insufficient for a user. Not particularly surprising, since the report
performs RPCs which need to iterate through all the ranges.
No more timeout.

Fixes #34311

Release note: None

Co-authored-by: Andrew Werner <ajwerner@cockroachlabs.com>
Co-authored-by: Andrei Matei <andrei@cockroachlabs.com>
  • Loading branch information
3 people committed Feb 7, 2019
3 parents 4bdf0ad + b490469 + c12ae68 commit 70cd045
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 12 deletions.
18 changes: 11 additions & 7 deletions pkg/cmd/roachtest/follower_reads.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,10 @@ func runFollowerReadsTest(ctx context.Context, t *test, c *cluster) {
t.Fatalf("fewer than 2 follower reads occurred: saw %v before and %v after",
followerReadsBefore, followerReadsAfter)
}
// Run reads for 60s which given the metrics window of 10s should guarantee
// Run reads for 3m which given the metrics window of 10s should guarantee
// that the most recent SQL latency time series data should relate to at least
// some of these reads.
timeoutCtx, cancel := context.WithTimeout(ctx, time.Minute)
timeoutCtx, cancel := context.WithTimeout(ctx, 3*time.Minute)
defer cancel()
g, gCtx = errgroup.WithContext(timeoutCtx)
for i := 0; i < concurrency; i++ {
Expand Down Expand Up @@ -245,8 +245,8 @@ func computeFollowerReadDuration(ctx context.Context, db *gosql.DB) (time.Durati
}

// verifySQLLatency verifies that the client-facing SQL latencies in the 90th
// percentile remain below target latency between start and end ignoring the
// first 20s.
// percentile remain below target latency 80% of the time between start and end
// ignoring the first 20s.
func verifySQLLatency(
ctx context.Context,
c *cluster,
Expand Down Expand Up @@ -279,12 +279,16 @@ func verifySQLLatency(
t.Fatalf("not enough ts data to verify latency")
}
perTenSeconds = perTenSeconds[2:]
var above []time.Duration
for _, dp := range perTenSeconds {
if time.Duration(dp.Value) > targetLatency {
t.Fatalf("latency value %v for ts data point %v is above target latency %v",
time.Duration(dp.Value), dp, targetLatency)
if val := time.Duration(dp.Value); val > targetLatency {
above = append(above, val)
}
}
if permitted := int(.2 * float64(len(perTenSeconds))); len(above) > permitted {
t.Fatalf("%d latency values (%v) are above target latency %v, %d permitted",
len(above), above, targetLatency, permitted)
}
}

const followerReadsMetric = "follower_reads_success_count"
Expand Down
6 changes: 1 addition & 5 deletions pkg/server/problem_ranges.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"context"
"sort"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/storage"
Expand Down Expand Up @@ -49,9 +48,6 @@ func (s *statusServer) ProblemRanges(
}
}

nodeCtx, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
defer cancel()

type nodeResponse struct {
nodeID roachpb.NodeID
resp *serverpb.RangesResponse
Expand All @@ -63,7 +59,7 @@ func (s *statusServer) ProblemRanges(
for nodeID := range isLiveMap {
nodeID := nodeID
if err := s.stopper.RunAsyncTask(
nodeCtx, "server.statusServer: requesting remote ranges",
ctx, "server.statusServer: requesting remote ranges",
func(ctx context.Context) {
status, err := s.dialNode(ctx, nodeID)
var rangesResponse *serverpb.RangesResponse
Expand Down

0 comments on commit 70cd045

Please sign in to comment.