Skip to content

Commit

Permalink
kvserver: fix Replica.tenantLimiter race
Browse files Browse the repository at this point in the history
If a Replica is destroyed while or shortly before Send waits for the
tenant rate limiter, the rate limiter returns a quotapool.ErrClosed
which propagates all the way up to the SQL client. This commit fixes
maybeRateLimitBatch to return the Replica destruction status error
instead, so that the Send stack retries the request.

Release note (bug fix): fixed a race condition in Replica lifecycle
which could result in a failed SQL request in cases where it could be
successfully retried.

Epic: none
  • Loading branch information
pav-kv committed Sep 19, 2023
1 parent 409d938 commit 406552e
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
19 changes: 18 additions & 1 deletion pkg/kv/kvserver/replica_rate_limit.go
Expand Up @@ -16,6 +16,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcostmodel"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/errors"
)

// maybeRateLimitBatch may block the batch waiting to be rate-limited. Note that
Expand All @@ -29,8 +31,23 @@ func (r *Replica) maybeRateLimitBatch(ctx context.Context, ba *kvpb.BatchRequest
if !ok || tenantID == roachpb.SystemTenantID {
return nil
}

// writeMultiplier isn't needed here since it's only used to calculate RUs.
return r.tenantLimiter.Wait(ctx, tenantcostmodel.MakeRequestInfo(ba, 1, 1))
err := r.tenantLimiter.Wait(ctx, tenantcostmodel.MakeRequestInfo(ba, 1, 1))

// For performance reasons, we do not hold any Replica's mutexes while waiting
// on the tenantLimiter, and so we are racing with the Replica lifecycle. The
// Replica can be destroyed and release the limiter before or during the Wait
// call. In this case Wait returns an ErrClosed error. Instead of ErrClosed,
// return the destruction status error which upper layers recognize.
if err != nil && errors.HasType(err, (*quotapool.ErrClosed)(nil)) {
if _, err := r.IsDestroyed(); err != nil {
return err
}
return errors.AssertionFailedf("replica not marked as destroyed but limiter is closed: %v", r)
}

return err
}

// recordImpactOnRateLimiter is used to record a read against the tenant rate
Expand Down
5 changes: 2 additions & 3 deletions pkg/kv/kvserver/replica_rate_limit_test.go
Expand Up @@ -20,7 +20,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/tenantrate"
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities/tenantcapabilitiesauthorizer"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
Expand Down Expand Up @@ -90,7 +89,7 @@ func TestReplicaRateLimit(t *testing.T) {
// Now the rate limiter is saturated. If we try to write a request to the
// replica now, the rate limiter will block it. If this races with a range
// destruction (for example, due to a merge like below), maybeRateLimitBatch()
// returns a quota pool closed error.
// must return a RangeNotFound error.
g := ctxgroup.WithContext(ctx)
g.Go(func() error {
_, pErr := leftRepl.AdminMerge(ctx, kvpb.AdminMergeRequest{
Expand All @@ -101,7 +100,7 @@ func TestReplicaRateLimit(t *testing.T) {
return pErr.GoError()
})
err := put(5 * time.Second)
require.True(t, testutils.IsError(err, "123 pool closed: released"), err)
require.True(t, errors.Is(err, &kvpb.RangeNotFoundError{RangeID: 2, StoreID: 1}), err)

require.NoError(t, g.Wait())
}

0 comments on commit 406552e

Please sign in to comment.