Skip to content

Commit

Permalink
kvserver: observability for estimated MVCC stats during splits
Browse files Browse the repository at this point in the history
This patch adds logging and two new metrics to track estimated MVCC
stats computed during splits.

`kv.split.estimated_stats`: the number of splits that computed estimated
MVCC stats, as opposed to 100% accurate ones.

`kv.split.total_bytes_estimates`: the total number of bytes of estimates
introduced by splits. For each split that computes estimated stats, this is
the absolute difference between the total bytes of the stats pre-computed
before the split and the total bytes of the stored stats observed during the
split (while holding latches).

Fixes: #119516

Release note (ops-change): Two new metrics (kv.split.estimated_stats and
kv.split.total_bytes_estimates) were added to track the number of splits
that produce MVCC stats estimates, and the total bytes of estimates
produced.
  • Loading branch information
miraradeva committed Mar 11, 2024
1 parent fe85c85 commit 05daef9
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 12 deletions.
2 changes: 2 additions & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@
<tr><td>STORAGE</td><td>kv.replica_read_batch_evaluate.latency</td><td>Execution duration for evaluating a BatchRequest on the read-only path after latches have been acquired.<br/><br/>A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>kv.replica_read_batch_evaluate.without_interleaving_iter</td><td>Number of read-only batches evaluated without an intent interleaving iter.</td><td>Batches</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>kv.replica_write_batch_evaluate.latency</td><td>Execution duration for evaluating a BatchRequest on the read-write path after latches have been acquired.<br/><br/>A measurement is recorded regardless of outcome (i.e. also in case of an error). If internal retries occur, each instance is recorded separately.<br/>Note that the measurement does not include the duration for replicating the evaluated command.</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>kv.split.estimated_stats</td><td>Number of splits that computed estimated MVCC stats.</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>kv.split.total_bytes_estimates</td><td>Number of total bytes difference between the pre-split and post-split MVCC stats.</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>STORAGE</td><td>kv.tenant_rate_limit.current_blocked</td><td>Number of requests currently blocked by the rate limiter</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>kv.tenant_rate_limit.num_tenants</td><td>Number of tenants currently being tracked</td><td>Tenants</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>kv.tenant_rate_limit.read_batches_admitted</td><td>Number of read batches admitted by the rate limiter</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/batcheval/cmd_end_transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,10 @@ func splitTriggerHelper(
RHSDelta: *h.AbsPostSplitRight(),
}

pd.Local.Metrics = &result.Metrics{
SplitsWithEstimatedStats: h.splitsWithEstimates,
SplitEstimatedTotalBytesDiff: h.estimatedTotalBytesDiff,
}
deltaPostSplitLeft := h.DeltaPostSplitLeft()
return deltaPostSplitLeft, pd, nil
}
Expand Down
20 changes: 12 additions & 8 deletions pkg/kv/kvserver/batcheval/result/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ package result
// Metrics tracks various counters related to command applications and
// their outcomes.
type Metrics struct {
LeaseRequestSuccess int // lease request evaluated successfully
LeaseRequestError int // lease request error at evaluation time
LeaseTransferSuccess int // lease transfer evaluated successfully
LeaseTransferError int // lease transfer error at evaluation time
ResolveCommit int // intent commit evaluated successfully
ResolveAbort int // non-poisoning intent abort evaluated successfully
ResolvePoison int // poisoning intent abort evaluated successfully
AddSSTableAsWrites int // AddSSTable requests with IngestAsWrites set
LeaseRequestSuccess int // lease request evaluated successfully
LeaseRequestError int // lease request error at evaluation time
LeaseTransferSuccess int // lease transfer evaluated successfully
LeaseTransferError int // lease transfer error at evaluation time
ResolveCommit int // intent commit evaluated successfully
ResolveAbort int // non-poisoning intent abort evaluated successfully
ResolvePoison int // poisoning intent abort evaluated successfully
AddSSTableAsWrites int // AddSSTable requests with IngestAsWrites set
SplitsWithEstimatedStats int // Splits that computed stats estimates
SplitEstimatedTotalBytesDiff int // Difference between pre- and post-split total bytes.
}

// Add absorbs the supplied Metrics into the receiver.
Expand All @@ -33,4 +35,6 @@ func (mt *Metrics) Add(o Metrics) {
mt.ResolveAbort += o.ResolveAbort
mt.ResolvePoison += o.ResolvePoison
mt.AddSSTableAsWrites += o.AddSSTableAsWrites
mt.SplitsWithEstimatedStats += o.SplitsWithEstimatedStats
mt.SplitEstimatedTotalBytesDiff += o.SplitEstimatedTotalBytesDiff
}
18 changes: 15 additions & 3 deletions pkg/kv/kvserver/batcheval/split_stats_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
package batcheval

import (
"context"
"math"

"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/log"
)

// splitStatsHelper codifies and explains the stats computations related to a
Expand Down Expand Up @@ -111,8 +115,10 @@ import (
type splitStatsHelper struct {
in splitStatsHelperInput

absPostSplitLeft *enginepb.MVCCStats
absPostSplitRight *enginepb.MVCCStats
absPostSplitLeft *enginepb.MVCCStats
absPostSplitRight *enginepb.MVCCStats
splitsWithEstimates int
estimatedTotalBytesDiff int
}

// splitStatsScanFn scans a post-split keyspace to compute its stats. The
Expand Down Expand Up @@ -267,6 +273,10 @@ func makeEstimatedSplitStatsHelper(input splitStatsHelperInput) (splitStatsHelpe
// compounded estimates from previous splits).
if !h.in.AbsPreSplitBothStored.HasUserDataCloseTo(
h.in.PreSplitStats, h.in.MaxCountDiff, h.in.MaxBytesDiff) {
log.VEventf(context.Background(), 2,
"split falling back to accurate stats computation because of "+
"large difference in pre- and post-split MVCC stats; pre: %v, post: %v",
h.in.PreSplitStats, h.in.AbsPreSplitBothStored)
return makeSplitStatsHelper(input)
}

Expand Down Expand Up @@ -315,7 +325,9 @@ func makeEstimatedSplitStatsHelper(input splitStatsHelperInput) (splitStatsHelpe
if !h.in.RightIsEmpty {
h.absPostSplitRight.ContainsEstimates++
}

h.splitsWithEstimates = 1
h.estimatedTotalBytesDiff = int(math.Abs(
float64(h.in.AbsPreSplitBothStored.Total()) - float64(h.in.PreSplitStats.Total())))
return h, nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/client_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func TestStoreResolveMetrics(t *testing.T) {
// them everywhere.
{
act := fmt.Sprintf("%+v", result.Metrics{})
exp := "{LeaseRequestSuccess:0 LeaseRequestError:0 LeaseTransferSuccess:0 LeaseTransferError:0 ResolveCommit:0 ResolveAbort:0 ResolvePoison:0 AddSSTableAsWrites:0}"
exp := "{LeaseRequestSuccess:0 LeaseRequestError:0 LeaseTransferSuccess:0 LeaseTransferError:0 ResolveCommit:0 ResolveAbort:0 ResolvePoison:0 AddSSTableAsWrites:0 SplitsWithEstimatedStats:0 SplitEstimatedTotalBytesDiff:0}"
if act != exp {
t.Errorf("need to update this test due to added fields: %v", act)
}
Expand Down
27 changes: 27 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2268,6 +2268,20 @@ Note that the measurement does not include the duration for replicating the eval
Unit: metric.Unit_COUNT,
}

metaSplitEstimatedStats = metric.Metadata{
Name: "kv.split.estimated_stats",
Help: "Number of splits that computed estimated MVCC stats.",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}

metaSplitEstimatedTotalBytesDiff = metric.Metadata{
Name: "kv.split.total_bytes_estimates",
Help: "Number of total bytes difference between the pre-split and post-split MVCC stats.",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}

metaStorageFlushUtilization = metric.Metadata{
Name: "storage.flush.utilization",
Help: "The percentage of time the storage engine is actively flushing memtables to disk.",
Expand Down Expand Up @@ -2713,6 +2727,9 @@ type StoreMetrics struct {
ReplicaReadBatchDroppedLatchesBeforeEval *metric.Counter
ReplicaReadBatchWithoutInterleavingIter *metric.Counter

SplitsWithEstimatedStats *metric.Counter
SplitEstimatedTotalBytesDiff *metric.Counter

FlushUtilization *metric.GaugeFloat64
FsyncLatency *metric.ManualWindowHistogram
}
Expand Down Expand Up @@ -3451,6 +3468,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {

ReplicaReadBatchDroppedLatchesBeforeEval: metric.NewCounter(metaReplicaReadBatchDroppedLatchesBeforeEval),
ReplicaReadBatchWithoutInterleavingIter: metric.NewCounter(metaReplicaReadBatchWithoutInterleavingIter),

// Estimated MVCC stats in split.
SplitsWithEstimatedStats: metric.NewCounter(metaSplitEstimatedStats),
SplitEstimatedTotalBytesDiff: metric.NewCounter(metaSplitEstimatedTotalBytesDiff),
}

storeRegistry.AddMetricStruct(sm)
Expand Down Expand Up @@ -3680,6 +3701,12 @@ func (sm *StoreMetrics) handleMetricsResult(ctx context.Context, metric result.M
sm.AddSSTableAsWrites.Inc(int64(metric.AddSSTableAsWrites))
metric.AddSSTableAsWrites = 0

sm.SplitsWithEstimatedStats.Inc(int64(metric.SplitsWithEstimatedStats))
metric.SplitsWithEstimatedStats = 0

sm.SplitEstimatedTotalBytesDiff.Inc(int64(metric.SplitEstimatedTotalBytesDiff))
metric.SplitEstimatedTotalBytesDiff = 0

if metric != (result.Metrics{}) {
log.Fatalf(ctx, "unhandled fields in metrics result: %+v", metric)
}
Expand Down

0 comments on commit 05daef9

Please sign in to comment.