Skip to content

Commit

Permalink
kvserver: Add a metric for in-progress snapshots
Browse files Browse the repository at this point in the history
Fixes: #98242

Knowing how many delegate snapshot requests are currently in progress
is useful for detecting problems, so this change adds a metric for it.
It also renames the existing delegate snapshot stats to use the prefix
`range.snapshots` instead of `range.snapshot`, consistent with other stats.

Epic: none

Release note (ops change): Adds a new stat
range.snapshots.delegate.in-progress and renames two existing stats.
The renamed stats were never part of a release, so it is better to rename
them before 23.1.0 is cut.
range.snapshot.delegate.successes -> range.snapshots.delegate.successes
range.snapshot.delegate.failures -> range.snapshots.delegate.failures
  • Loading branch information
andrewbaptist committed Mar 28, 2023
1 parent 63b683e commit fff395f
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 8 deletions.
18 changes: 13 additions & 5 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -825,15 +825,15 @@ evaluating the network savings of not sending cross region traffic.
Unit: metric.Unit_BYTES,
}
metaDelegateSnapshotSuccesses = metric.Metadata{
Name: "range.snapshot.delegate.successes",
Name: "range.snapshots.delegate.successes",
Help: `Number of snapshots that were delegated to a different node and
resulted in success on that delegate. This does not count self delegated snapshots.
`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
metaDelegateSnapshotFailures = metric.Metadata{
Name: "range.snapshot.delegate.failures",
Name: "range.snapshots.delegate.failures",
Help: `Number of snapshots that were delegated to a different node and
resulted in failure on that delegate. There are numerous reasons a failure can
occur on a delegate such as timeout, the delegate Raft log being too far behind
Expand All @@ -842,6 +842,12 @@ or the delegate being too busy to send.
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
metaDelegateSnapshotInProgress = metric.Metadata{
Name: "range.snapshots.delegate.in-progress",
Help: `Number of delegated snapshots that are currently in-flight.`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}

// Quota pool metrics.
metaRaftQuotaPoolPercentUsed = metric.Metadata{
Expand Down Expand Up @@ -1922,9 +1928,10 @@ type StoreMetrics struct {
RangeSnapshotRecvTotalInProgress *metric.Gauge

// Delegate snapshot metrics. These don't count self-delegated snapshots.
DelegateSnapshotSendBytes *metric.Counter
DelegateSnapshotSuccesses *metric.Counter
DelegateSnapshotFailures *metric.Counter
DelegateSnapshotSendBytes *metric.Counter
DelegateSnapshotSuccesses *metric.Counter
DelegateSnapshotFailures *metric.Counter
DelegateSnapshotInProgress *metric.Gauge

// Raft processing metrics.
RaftTicks *metric.Counter
Expand Down Expand Up @@ -2461,6 +2468,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
DelegateSnapshotSendBytes: metric.NewCounter(metaDelegateSnapshotSendBytes),
DelegateSnapshotSuccesses: metric.NewCounter(metaDelegateSnapshotSuccesses),
DelegateSnapshotFailures: metric.NewCounter(metaDelegateSnapshotFailures),
DelegateSnapshotInProgress: metric.NewGauge(metaDelegateSnapshotInProgress),

// Raft processing metrics.
RaftTicks: metric.NewCounter(metaRaftTicks),
Expand Down
9 changes: 8 additions & 1 deletion pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2844,13 +2844,20 @@ func (r *Replica) sendSnapshotUsingDelegate(
if selfDelegate {
delegateRequest.QueueOnDelegateLen = -1
}
if !selfDelegate {
r.store.Metrics().DelegateSnapshotInProgress.Inc(1)
}

retErr = contextutil.RunWithTimeout(
ctx, "send-snapshot", sendSnapshotTimeout, func(ctx context.Context) error {
// Sending snapshot
return r.store.cfg.Transport.DelegateSnapshot(ctx, delegateRequest)
},
)
if !selfDelegate {
r.store.Metrics().DelegateSnapshotInProgress.Dec(1)
}

// Return once we have success.
if retErr == nil {
if !selfDelegate {
Expand All @@ -2861,7 +2868,7 @@ func (r *Replica) sendSnapshotUsingDelegate(
if !selfDelegate {
r.store.Metrics().DelegateSnapshotFailures.Inc(1)
}
log.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr)
log.KvDistribution.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr)
}
}
return
Expand Down
5 changes: 3 additions & 2 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,8 @@ var charts = []sectionDescription{
"range.snapshots.applied-voter",
"range.snapshots.applied-initial",
"range.snapshots.applied-non-voter",
"range.snapshot.delegate.successes",
"range.snapshot.delegate.failures",
"range.snapshots.delegate.successes",
"range.snapshots.delegate.failures",
},
},
{
Expand All @@ -643,6 +643,7 @@ var charts = []sectionDescription{
"range.snapshots.recv-in-progress",
"range.snapshots.send-total-in-progress",
"range.snapshots.recv-total-in-progress",
"range.snapshots.delegate.in-progress",
},
},
{
Expand Down

0 comments on commit fff395f

Please sign in to comment.