Merge #119894

119894: kvserver: compute MVCCStats in AdminSplit instead of splitTrigger r=nvanbenschoten,kvoli a=miraradeva This PR includes four commits: - c3edbe5 adds two new stats computation functions to compute the stats wrt only the user data keys in the range, and to compute the stats of all non-user-data keys in the range. These functions are used in the next commit to estimate the stats of the LHS of the split. - 23bf51a includes the main logic for moving some of the stats computation from splitTrigger (where we hold latches) to AdminSplit (where we don't hold latches) to reduce the duration of the critical section of the split. The resulting MVCC stats are not always guaranteed to be 100% correct because they may not account for writes concurrent with the split. - 37598e1 is a followup to 23bf51a that adds some extra protection against creating stats that differ too much from the range's data. It does so by ensuring the pre- and post-split stats for the entire range are close. - b88c331 adds logging and observability around the new stats computation. Co-authored-by: Mira Radeva <mira@cockroachlabs.com>
cockroachdb · Mar 21, 2024 · 1867fa1 · 1867fa1
2 parents ab07424 + af328f4
commit 1867fa1
Show file tree

Hide file tree

Showing 22 changed files with 716 additions and 100 deletions.
diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
@@ -247,6 +247,8 @@
 <tr><td>STORAGE</td><td>kv.replica_read_batch_evaluate.latency</td><td>Execution duration for evaluating a BatchRequest on the read-only path after latches have been acquired.<br/><br/>A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.replica_read_batch_evaluate.without_interleaving_iter</td><td>Number of read-only batches evaluated without an intent interleaving iter.</td><td>Batches</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.replica_write_batch_evaluate.latency</td><td>Execution duration for evaluating a BatchRequest on the read-write path after latches have been acquired.<br/><br/>A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.<br/>Note that the measurement does not include the duration for replicating the evaluated command.</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>kv.split.estimated_stats</td><td>Number of splits that computed estimated MVCC stats.</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>kv.split.total_bytes_estimates</td><td>Number of total bytes difference between the pre-split and post-split MVCC stats.</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>kv.tenant_rate_limit.current_blocked</td><td>Number of requests currently blocked by the rate limiter</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.tenant_rate_limit.num_tenants</td><td>Number of tenants currently being tracked</td><td>Tenants</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>kv.tenant_rate_limit.read_batches_admitted</td><td>Number of read batches admitted by the rate limiter</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>

diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
@@ -337,4 +337,4 @@ trace.snapshot.rate	duration	0s	if non-zero, interval at which background trace
 trace.span_registry.enabled	boolean	true	if set, ongoing traces can be seen at https://<ui>/#/debug/tracez	application
 trace.zipkin.collector	string		the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.	application
 ui.display_timezone	enumeration	etc/utc	the timezone used to format timestamps in the ui [etc/utc = 0, america/new_york = 1]	application
-version	version	1000023.2-upgrading-to-1000024.1-step-020	set the active cluster version in the format '<major>.<minor>'	application
+version	version	1000023.2-upgrading-to-1000024.1-step-022	set the active cluster version in the format '<major>.<minor>'	application
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
@@ -292,6 +292,6 @@
 <tr><td><div id="setting-trace-span-registry-enabled" class="anchored"><code>trace.span_registry.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, ongoing traces can be seen at https://&lt;ui&gt;/#/debug/tracez</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-trace-zipkin-collector" class="anchored"><code>trace.zipkin.collector</code></div></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as &lt;host&gt;:&lt;port&gt;. If no port is specified, 9411 will be used.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-ui-display-timezone" class="anchored"><code>ui.display_timezone</code></div></td><td>enumeration</td><td><code>etc/utc</code></td><td>the timezone used to format timestamps in the ui [etc/utc = 0, america/new_york = 1]</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
-<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000023.2-upgrading-to-1000024.1-step-020</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
+<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000023.2-upgrading-to-1000024.1-step-022</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 </tbody>
 </table>
diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go
@@ -301,6 +301,10 @@ const (
 	// used for allocator decisions before then.
 	V24_1_GossipMaximumIOOverload
 
+	// V24_1_EstimatedMVCCStatsInSplit introduces MVCC stats estimates during range
+	// splits.
+	V24_1_EstimatedMVCCStatsInSplit
+
 	numKeys
 )
 
@@ -367,6 +371,7 @@ var versionTable = [numKeys]roachpb.Version{
 	V24_1_PebbleFormatSyntheticPrefixSuffix:    {Major: 23, Minor: 2, Internal: 16},
 	V24_1_SystemDatabaseSurvivability:          {Major: 23, Minor: 2, Internal: 18},
 	V24_1_GossipMaximumIOOverload:              {Major: 23, Minor: 2, Internal: 20},
+	V24_1_EstimatedMVCCStatsInSplit:            {Major: 23, Minor: 2, Internal: 22},
 }
 
 // Latest is always the highest version key. This is the maximum logical cluster

diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go
@@ -30,6 +30,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
 	"github.com/cockroachdb/cockroach/pkg/util"
@@ -40,6 +41,30 @@ import (
 	"github.com/cockroachdb/redact"
 )
 
+// MaxMVCCStatCountDiff defines the maximum number of units (e.g. keys or
+// intents) that is acceptable for an individual MVCC stat to diverge from the
+// real value when computed during splits. If this threshold is
+// exceeded, the split will fall back to computing 100% accurate stats.
+// It takes effect only if kv.split.estimated_mvcc_stats.enabled is true.
+var MaxMVCCStatCountDiff = settings.RegisterIntSetting(
+	settings.SystemVisible,
+	"kv.split.max_mvcc_stat_count_diff",
+	"defines the max number of units that are acceptable for an individual "+
+		"MVCC stat to diverge; needs kv.split.estimated_mvcc_stats.enabled to be true",
+	5000)
+
+// MaxMVCCStatBytesDiff defines the maximum number of bytes (e.g. keys bytes or
+// intents bytes) that is acceptable for an individual MVCC stat to diverge
+// from the real value when computed during splits. If this threshold is
+// exceeded, the split will fall back to computing 100% accurate stats.
+// It takes effect only if kv.split.estimated_mvcc_stats.enabled is true.
+var MaxMVCCStatBytesDiff = settings.RegisterIntSetting(
+	settings.SystemVisible,
+	"kv.split.max_mvcc_stat_bytes_diff",
+	"defines the max number of bytes that are acceptable for an individual "+
+		"MVCC stat to diverge; needs kv.split.estimated_mvcc_stats.enabled to be true",
+	5120000) // 5.12 MB = 1% of the max range size
+
 func init() {
 	RegisterReadWriteCommand(kvpb.EndTxn, declareKeysEndTxn, EndTxn)
 }
@@ -1032,6 +1057,17 @@ func splitTrigger(
 			"unable to determine whether right hand side of split is empty")
 	}
 
+	// The intentInterleavingIterator doesn't like iterating over spans containing
+	// both local and global keys. Here we only care about global keys.
+	spanWithNoLocals := split.LeftDesc.KeySpan().AsRawSpanWithNoLocals()
+	emptyLHS, err := storage.MVCCIsSpanEmpty(ctx, batch, storage.MVCCIsSpanEmptyOptions{
+		StartKey: spanWithNoLocals.Key, EndKey: spanWithNoLocals.EndKey,
+	})
+	if err != nil {
+		return enginepb.MVCCStats{}, result.Result{}, errors.Wrapf(err,
+			"unable to determine whether left hand side of split is empty")
+	}
+
 	rangeKeyDeltaMS, err := computeSplitRangeKeyStatsDelta(ctx, batch, split.LeftDesc, split.RightDesc)
 	if err != nil {
 		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err,
@@ -1055,12 +1091,19 @@ func splitTrigger(
 	}
 
 	h := splitStatsHelperInput{
-		AbsPreSplitBothStored: currentStats,
-		DeltaBatchEstimated:   bothDeltaMS,
-		DeltaRangeKey:         rangeKeyDeltaMS,
-		PostSplitScanLeftFn:   makeScanStatsFn(ctx, batch, ts, &split.LeftDesc, "left hand side"),
-		PostSplitScanRightFn:  makeScanStatsFn(ctx, batch, ts, &split.RightDesc, "right hand side"),
-		ScanRightFirst:        splitScansRightForStatsFirst || emptyRHS,
+		AbsPreSplitBothStored:    currentStats,
+		DeltaBatchEstimated:      bothDeltaMS,
+		DeltaRangeKey:            rangeKeyDeltaMS,
+		PreSplitLeftUser:         split.PreSplitLeftUserStats,
+		PreSplitStats:            split.PreSplitStats,
+		PostSplitScanLeftFn:      makeScanStatsFn(ctx, batch, ts, &split.LeftDesc, "left hand side", false /* excludeUserSpans */),
+		PostSplitScanRightFn:     makeScanStatsFn(ctx, batch, ts, &split.RightDesc, "right hand side", false /* excludeUserSpans */),
+		PostSplitScanLocalLeftFn: makeScanStatsFn(ctx, batch, ts, &split.LeftDesc, "local left hand side", true /* excludeUserSpans */),
+		ScanRightFirst:           splitScansRightForStatsFirst || emptyRHS,
+		LeftUserIsEmpty:          emptyLHS,
+		RightUserIsEmpty:         emptyRHS,
+		MaxCountDiff:             MaxMVCCStatCountDiff.Get(&rec.ClusterSettings().SV),
+		MaxBytesDiff:             MaxMVCCStatBytesDiff.Get(&rec.ClusterSettings().SV),
 	}
 	return splitTriggerHelper(ctx, rec, batch, h, split, ts)
 }
@@ -1081,9 +1124,14 @@ func makeScanStatsFn(
 	ts hlc.Timestamp,
 	sideDesc *roachpb.RangeDescriptor,
 	sideName string,
+	excludeUserSpans bool,
 ) splitStatsScanFn {
+	computeStatsFn := rditer.ComputeStatsForRange
+	if excludeUserSpans {
+		computeStatsFn = rditer.ComputeStatsForRangeExcludingUser
+	}
 	return func() (enginepb.MVCCStats, error) {
-		sideMS, err := rditer.ComputeStatsForRange(ctx, sideDesc, reader, ts.WallTime)
+		sideMS, err := computeStatsFn(ctx, sideDesc, reader, ts.WallTime)
 		if err != nil {
 			return enginepb.MVCCStats{}, errors.Wrapf(err,
 				"unable to compute stats for %s range after split", sideName)
@@ -1128,7 +1176,44 @@ func splitTriggerHelper(
 	// modifications to the left hand side are allowed after this line and any
 	// modifications to the right hand side are accounted for by updating the
 	// helper's AbsPostSplitRight() reference.
-	h, err := makeSplitStatsHelper(statsInput)
+	var h splitStatsHelper
+	// There are three conditions under which we want to fall back to accurate
+	// stats computation:
+	// 1. There are no pre-computed stats for the LHS. This can happen if
+	// kv.split.estimated_mvcc_stats.enabled is disabled, or if the leaseholder
+	// node is running an older version. Pre-computed stats are necessary for
+	// makeEstimatedSplitStatsHelper to estimate the stats.
+	// Note that PreSplitLeftUserStats can also be equal to enginepb.MVCCStats{}
+	// when the user LHS stats are all zero, but in that case it's ok to fall back
+	// to accurate stats computation because scanning the empty LHS is not
+	// expensive.
+	noPreComputedStats := split.PreSplitLeftUserStats == enginepb.MVCCStats{}
+	// 2. If either side contains no user data; scanning the empty ranges is
+	// cheap.
+	emptyLeftOrRight := statsInput.LeftUserIsEmpty || statsInput.RightUserIsEmpty
+	// 3. If the user pre-split stats differ significantly from the current stats
+	// stored on disk. Note that the current stats on disk were corrected in
+	// AdminSplit, so any differences we see here are due to writes concurrent
+	// with this split (not compounded estimates from previous splits).
+	preComputedStatsDiff := !statsInput.AbsPreSplitBothStored.HasUserDataCloseTo(
+		statsInput.PreSplitStats, statsInput.MaxCountDiff, statsInput.MaxBytesDiff)
+
+	if noPreComputedStats || emptyLeftOrRight || preComputedStatsDiff {
+		var reason redact.RedactableString
+		if noPreComputedStats {
+			reason = "there are no pre-split LHS stats (or they're empty)"
+		} else if emptyLeftOrRight {
+			reason = "the in-split LHS or RHS is empty"
+		} else {
+			reason = redact.Sprintf("the pre-split user stats differ too much "+
+				"from the in-split stats; pre-split: %+v, in-split: %+v",
+				statsInput.PreSplitStats, statsInput.AbsPreSplitBothStored)
+		}
+		log.Infof(ctx, "falling back to accurate stats computation because %v", reason)
+		h, err = makeSplitStatsHelper(statsInput)
+	} else {
+		h, err = makeEstimatedSplitStatsHelper(statsInput)
+	}
 	if err != nil {
 		return enginepb.MVCCStats{}, result.Result{}, err
 	}
@@ -1247,6 +1332,10 @@ func splitTriggerHelper(
 		RHSDelta: *h.AbsPostSplitRight(),
 	}
 
+	pd.Local.Metrics = &result.Metrics{
+		SplitsWithEstimatedStats:     h.splitsWithEstimates,
+		SplitEstimatedTotalBytesDiff: h.estimatedTotalBytesDiff,
+	}
 	deltaPostSplitLeft := h.DeltaPostSplitLeft()
 	return deltaPostSplitLeft, pd, nil
 }

diff --git a/pkg/kv/kvserver/batcheval/result/metrics.go b/pkg/kv/kvserver/batcheval/result/metrics.go
@@ -13,14 +13,16 @@ package result
 // Metrics tracks various counters related to command applications and
 // their outcomes.
 type Metrics struct {
-	LeaseRequestSuccess  int // lease request evaluated successfully
-	LeaseRequestError    int // lease request error at evaluation time
-	LeaseTransferSuccess int // lease transfer evaluated successfully
-	LeaseTransferError   int // lease transfer error at evaluation time
-	ResolveCommit        int // intent commit evaluated successfully
-	ResolveAbort         int // non-poisoning intent abort evaluated successfully
-	ResolvePoison        int // poisoning intent abort evaluated successfully
-	AddSSTableAsWrites   int // AddSSTable requests with IngestAsWrites set
+	LeaseRequestSuccess          int // lease request evaluated successfully
+	LeaseRequestError            int // lease request error at evaluation time
+	LeaseTransferSuccess         int // lease transfer evaluated successfully
+	LeaseTransferError           int // lease transfer error at evaluation time
+	ResolveCommit                int // intent commit evaluated successfully
+	ResolveAbort                 int // non-poisoning intent abort evaluated successfully
+	ResolvePoison                int // poisoning intent abort evaluated successfully
+	AddSSTableAsWrites           int // AddSSTable requests with IngestAsWrites set
+	SplitsWithEstimatedStats     int // Splits that computed stats estimates
+	SplitEstimatedTotalBytesDiff int // Difference between pre- and post-split total bytes.
 }
 
 // Add absorbs the supplied Metrics into the receiver.
@@ -33,4 +35,6 @@ func (mt *Metrics) Add(o Metrics) {
 	mt.ResolveAbort += o.ResolveAbort
 	mt.ResolvePoison += o.ResolvePoison
 	mt.AddSSTableAsWrites += o.AddSSTableAsWrites
+	mt.SplitsWithEstimatedStats += o.SplitsWithEstimatedStats
+	mt.SplitEstimatedTotalBytesDiff += o.SplitEstimatedTotalBytesDiff
 }