Skip to content

Commit

Permalink
Merge #83194 #84922
Browse files Browse the repository at this point in the history
83194: kvserver: recompute stats after mvcc gc r=lunevalex a=lunevalex

Touched #82920

There is at least one known issue in MVCC stats calculation, and
there may be more. This could lead to the MVCC GC queue spinning on
ranges with bad stats. To prevent the queue from spinning, it should
recompute the stats if it detects that they are wrong. The easiest
mechanism to do that is to check whether the GC score wants to queue this
range again after finishing GC; if it does, it likely indicates something
fishy with the stats.

Release note: Change the MVCC GC queue to recompute MVCC stats on a
range, if after doing a GC run it still thinks there is garbage in
the range.

84922: bazel: bump size of `gc` test r=jlinder a=rickystewart

This has timed out in CI.

Release note: None

Co-authored-by: Alex Lunev <alexl@cockroachlabs.com>
Co-authored-by: Ricky Stewart <ricky@cockroachlabs.com>
  • Loading branch information
3 people committed Jul 22, 2022
3 parents 02727e3 + cda8806 + ca43846 commit d7cf6d2
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 4 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ go_test(
"client_merge_test.go",
"client_metrics_test.go",
"client_migration_test.go",
"client_mvcc_gc_test.go",
"client_protectedts_test.go",
"client_raft_helpers_test.go",
"client_raft_log_queue_test.go",
Expand Down
67 changes: 67 additions & 0 deletions pkg/kv/kvserver/client_mvcc_gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
"context"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/stretchr/testify/require"
)

// TestMVCCGCCorrectStats verifies that the mvcc gc queue corrects stats
// for a range that has bad ones that would unnecessarily trigger the mvcc
// gc queue.
//
// The test writes a handful of keys into a scratch range, overwrites the
// replica's in-memory stats with inflated values, runs a manual MVCC GC pass
// and then checks that KeyBytes and ValBytes are back to their pre-corruption
// values.
func TestMVCCGCCorrectStats(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	serv, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
	s := serv.(*server.TestServer)
	defer s.Stopper().Stop(ctx)

	key, err := s.ScratchRange()
	require.NoError(t, err)
	store, err := s.Stores().GetStore(s.GetFirstStoreID())
	require.NoError(t, err)

	repl := store.LookupReplica(roachpb.RKey(key))
	// Fail with a clear message instead of a nil-pointer panic further down
	// if the scratch range's replica cannot be found on this store.
	require.NotNil(t, repl)
	for i := 0; i < 10; i++ {
		require.NoError(t, store.DB().Put(ctx, key, "foo"))
		key = key.Next()
	}

	// Put some garbage in the stats, so it triggers the mvcc gc queue.
	ms := repl.GetMVCCStats()
	oldKeyBytes := ms.KeyBytes
	oldValBytes := ms.ValBytes
	ms.KeyBytes = 16 * (1 << 20) // 16mb
	ms.ValBytes = 32 * (1 << 20) // 32mb
	ms.GCBytesAge = 48 * (1 << 20) * 100 * int64(time.Hour.Seconds())

	// Guard against a vacuous pass: the injected values must differ from the
	// real ones, otherwise the final equality checks prove nothing.
	require.NotEqual(t, oldKeyBytes, ms.KeyBytes)
	require.NotEqual(t, oldValBytes, ms.ValBytes)

	repl.SetMVCCStatsForTesting(&ms)
	require.NoError(t, store.ManualMVCCGC(repl))

	// Verify that the mvcc gc queue restored the stats.
	newStats := repl.GetMVCCStats()
	require.Equal(t, oldKeyBytes, newStats.KeyBytes)
	require.Equal(t, oldValBytes, newStats.ValBytes)
}
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/gc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ go_library(

go_test(
name = "gc_test",
size = "medium",
size = "large",
srcs = [
"data_distribution_test.go",
"gc_iterator_test.go",
Expand Down
35 changes: 32 additions & 3 deletions pkg/kv/kvserver/mvcc_gc_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"sync/atomic"
"time"

"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
Expand Down Expand Up @@ -614,10 +615,38 @@ func (mgcq *mvccGCQueue) process(
return false, err
}

log.Eventf(ctx, "MVCC stats after GC: %+v", repl.GetMVCCStats())
log.Eventf(ctx, "GC score after GC: %s", makeMVCCGCQueueScore(
ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold))
scoreAfter := makeMVCCGCQueueScore(
ctx, repl, repl.store.Clock().Now(), lastGC, conf.TTL(), canAdvanceGCThreshold)
log.VEventf(ctx, 2, "MVCC stats after GC: %+v", repl.GetMVCCStats())
log.VEventf(ctx, 2, "GC score after GC: %s", scoreAfter)
updateStoreMetricsWithGCInfo(mgcq.store.metrics, info)
// If the score after running through the queue indicates that this
// replica should be re-queued for GC it most likely means that there
// is something wrong with the stats. One such known issue is
// https://github.com/cockroachdb/cockroach/issues/82920. To fix this we
// recompute stats; it's an expensive operation, but it's better to recompute
// them than to spin the GC queue.
// Note: the score is not recomputed as if the GC queue was going to run again,
// because we are reusing the old lastGC and canAdvanceGCThreshold. This helps
// avoid issues with e.g. cooldown timers and focuses the recomputation on the
// difference in stats after GC.

if scoreAfter.ShouldQueue {
// The scores are very long, so splitting into multiple lines manually for
// readability.
log.Infof(ctx, "GC still needed following GC, recomputing MVCC stats")
log.Infof(ctx, "old score %s", r)
log.Infof(ctx, "new score %s", scoreAfter)
req := roachpb.RecomputeStatsRequest{
RequestHeader: roachpb.RequestHeader{Key: desc.StartKey.AsRawKey()},
}
var b kv.Batch
b.AddRawRequest(&req)
err := repl.store.db.Run(ctx, &b)
if err != nil {
log.Errorf(ctx, "failed to recompute stats with error=%s", err)
}
}
return true, nil
}

Expand Down
8 changes: 8 additions & 0 deletions pkg/kv/kvserver/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,14 @@ func (r *Replica) GetMVCCStats() enginepb.MVCCStats {
return *r.mu.state.Stats
}

// SetMVCCStatsForTesting replaces the MVCC stats pointer held on the Replica
// object only. It does not affect the on-disk state and is only safe to use
// for testing purposes.
//
// The caller must not mutate stats after handing it over, since the Replica
// stores the pointer itself rather than a copy.
func (r *Replica) SetMVCCStatsForTesting(stats *enginepb.MVCCStats) {
	// This is a write to r.mu.state, so the exclusive lock is required.
	// Holding only the read lock would let this assignment race with
	// concurrent readers of r.mu.state.Stats.
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.state.Stats = stats
}

// GetMaxSplitQPS returns the Replica's maximum queries/s request rate over a
// configured measurement period. If the Replica has not been recording QPS for
// at least an entire measurement period, the method will return false.
Expand Down

0 comments on commit d7cf6d2

Please sign in to comment.