Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
120239: kvclient: add metrics for proxy requests r=erikgrinaker a=andrewbaptist

Proxy behavior was added in a previous commit. This commit adds 4 new
metrics to track the client and server side of proxying and the number
of errors that occured as a result of the proxy. These statistics will
normally be zero or close to zero. While there is a partial partition
the metric will be increased.

Epic: none

Release note: Adds four new metrics: `distsender.rpc.proxy.sent`,
`distsender.rpc.proxy.err`, `distsender.rpc.proxy.forward.sent`,
`distsender.rpc.proxy.forward.err` to track the number and outcome of
proxy requests. Operators should monitor and alert on
`distsender.rpc.proxy.sent` as it indicates there is likely a network
partition in the system.

121021: kvcoord: fix DistSender circuit breaker benchmark `nil` panic r=erikgrinaker a=erikgrinaker

Resolves #121020.
Epic: none
Release note: None

Co-authored-by: Andrew Baptist <baptist@cockroachlabs.com>
Co-authored-by: Erik Grinaker <grinaker@cockroachlabs.com>
  • Loading branch information
3 people committed Mar 25, 2024
3 parents 7674256 + 2fcf790 + 1df21ee commit 00a6257
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 8 deletions.
4 changes: 4 additions & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,10 @@
<tr><td>APPLICATION</td><td>distsender.rpc.merge.sent</td><td>Number of Merge requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.migrate.sent</td><td>Number of Migrate requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.probe.sent</td><td>Number of Probe requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.proxy.err</td><td>Number of attempts by a gateway to proxy a request which resulted in a failure.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.proxy.forward.err</td><td>Number of attempts on a follower replica to proxy a request which resulted in a failure.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.proxy.forward.sent</td><td>Number of attempts on a follower replica to proxy a request to an unreachable leaseholder.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.proxy.sent</td><td>Number of attempts by a gateway to proxy a request to an unreachable leaseholder via a follower replica.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.pushtxn.sent</td><td>Number of PushTxn requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.put.sent</td><td>Number of Put requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.queryintent.sent</td><td>Number of QueryIntent requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
47 changes: 45 additions & 2 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,30 @@ errors as 'roachpb.InternalErrType'.
Measurement: "Errors",
Unit: metric.Unit_COUNT,
}
metaDistSenderProxySentCount = metric.Metadata{
Name: "distsender.rpc.proxy.sent",
Help: "Number of attempts by a gateway to proxy a request to an unreachable leaseholder via a follower replica.",
Measurement: "RPCs",
Unit: metric.Unit_COUNT,
}
metaDistSenderProxyErrCount = metric.Metadata{
Name: "distsender.rpc.proxy.err",
Help: "Number of attempts by a gateway to proxy a request which resulted in a failure.",
Measurement: "RPCs",
Unit: metric.Unit_COUNT,
}
metaDistSenderProxyForwardSentCount = metric.Metadata{
Name: "distsender.rpc.proxy.forward.sent",
Help: "Number of attempts on a follower replica to proxy a request to an unreachable leaseholder.",
Measurement: "RPCs",
Unit: metric.Unit_COUNT,
}
metaDistSenderProxyForwardErrCount = metric.Metadata{
Name: "distsender.rpc.proxy.forward.err",
Help: "Number of attempts on a follower replica to proxy a request which resulted in a failure.",
Measurement: "RPCs",
Unit: metric.Unit_COUNT,
}
metaDistSenderRangefeedTotalRanges = metric.Metadata{
Name: "distsender.rangefeed.total_ranges",
Help: `Number of ranges executing rangefeed
Expand Down Expand Up @@ -379,6 +403,10 @@ type DistSenderMetrics struct {
RangeLookups *metric.Counter
SlowRPCs *metric.Gauge
SlowReplicaRPCs *metric.Counter
ProxySentCount *metric.Counter
ProxyErrCount *metric.Counter
ProxyForwardSentCount *metric.Counter
ProxyForwardErrCount *metric.Counter
MethodCounts [kvpb.NumMethods]*metric.Counter
ErrCounts [kvpb.NumErrors]*metric.Counter
CircuitBreaker DistSenderCircuitBreakerMetrics
Expand Down Expand Up @@ -407,7 +435,7 @@ type DistSenderRangeFeedMetrics struct {
Errors rangeFeedErrorCounters
}

func makeDistSenderMetrics() DistSenderMetrics {
func MakeDistSenderMetrics() DistSenderMetrics {
m := DistSenderMetrics{
BatchCount: metric.NewCounter(metaDistSenderBatchCount),
PartialBatchCount: metric.NewCounter(metaDistSenderPartialBatchCount),
Expand All @@ -428,6 +456,10 @@ func makeDistSenderMetrics() DistSenderMetrics {
SlowRPCs: metric.NewGauge(metaDistSenderSlowRPCs),
SlowReplicaRPCs: metric.NewCounter(metaDistSenderSlowReplicaRPCs),
CircuitBreaker: makeDistSenderCircuitBreakerMetrics(),
ProxySentCount: metric.NewCounter(metaDistSenderProxySentCount),
ProxyErrCount: metric.NewCounter(metaDistSenderProxyErrCount),
ProxyForwardSentCount: metric.NewCounter(metaDistSenderProxyForwardSentCount),
ProxyForwardErrCount: metric.NewCounter(metaDistSenderProxyForwardErrCount),
DistSenderRangeFeedMetrics: makeDistSenderRangeFeedMetrics(),
}
for i := range m.MethodCounts {
Expand Down Expand Up @@ -746,7 +778,7 @@ func NewDistSender(cfg DistSenderConfig) *DistSender {
clock: cfg.Clock,
nodeDescs: cfg.NodeDescs,
nodeIDGetter: nodeIDGetter,
metrics: makeDistSenderMetrics(),
metrics: MakeDistSenderMetrics(),
kvInterceptor: cfg.KVInterceptor,
locality: cfg.Locality,
healthFunc: cfg.HealthFunc,
Expand Down Expand Up @@ -2422,6 +2454,7 @@ func (ds *DistSender) sendToReplicas(
// to the caller so they can try something else.
if ba.ProxyRangeInfo != nil {
log.VEventf(ctx, 3, "processing a proxy request to %v", ba.ProxyRangeInfo)
ds.metrics.ProxyForwardSentCount.Inc(1)
// We don't know who the leaseholder is, and it is likely that the
// client had stale information. Return our information to them through
// a NLHE and let them retry.
Expand All @@ -2434,6 +2467,7 @@ func (ds *DistSender) sendToReplicas(
routing.Desc(),
"client requested a proxy but we can't figure out the leaseholder"),
)
ds.metrics.ProxyForwardErrCount.Inc(1)
return &br, nil
}
if ba.ProxyRangeInfo.Lease.Sequence != routing.Lease().Sequence ||
Expand All @@ -2447,6 +2481,7 @@ func (ds *DistSender) sendToReplicas(
routing.Desc(),
fmt.Sprintf("proxy failed, update client information %v != %v", ba.ProxyRangeInfo, routing)),
)
ds.metrics.ProxyForwardErrCount.Inc(1)
return &br, nil
}

Expand Down Expand Up @@ -2653,6 +2688,7 @@ func (ds *DistSender) sendToReplicas(
rangeInfo := routing.RangeInfo()
requestToSend.ProxyRangeInfo = &rangeInfo
log.VEventf(ctx, 1, "attempt proxy request %v using %v", requestToSend, rangeInfo)
ds.metrics.ProxySentCount.Inc(1)
}

tBegin := timeutil.Now() // for slow log message
Expand Down Expand Up @@ -2694,6 +2730,13 @@ func (ds *DistSender) sendToReplicas(

ds.metrics.updateCrossLocalityMetricsOnReplicaAddressedBatchResponse(comparisonResult, int64(br.Size()))
ds.maybeIncrementErrCounters(br, err)
if err != nil {
if ba.ProxyRangeInfo != nil {
ds.metrics.ProxyErrCount.Inc(1)
} else if requestToSend.ProxyRangeInfo != nil {
ds.metrics.ProxyForwardErrCount.Inc(1)
}
}

if cbErr != nil {
log.VErrEventf(ctx, 2, "circuit breaker error: %s", cbErr)
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/kvclient/kvcoord/dist_sender_circuit_breaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ func BenchmarkDistSenderCircuitBreakersForReplica(b *testing.B) {
kvcoord.CircuitBreakerEnabled.Override(ctx, &st.SV, true)

cbs := kvcoord.NewDistSenderCircuitBreakers(
ambientCtx, stopper, st, nil, kvcoord.DistSenderMetrics{})
ambientCtx, stopper, st, nil, kvcoord.MakeDistSenderMetrics())

replDesc := &roachpb.ReplicaDescriptor{
NodeID: 1,
Expand Down Expand Up @@ -198,7 +198,7 @@ func benchmarkCircuitBreakersTrack(
kvcoord.CircuitBreakerCancellation.Override(ctx, &st.SV, cancel)

cbs := kvcoord.NewDistSenderCircuitBreakers(
ambientCtx, stopper, st, nil, kvcoord.DistSenderMetrics{})
ambientCtx, stopper, st, nil, kvcoord.MakeDistSenderMetrics())

replDesc := &roachpb.ReplicaDescriptor{
NodeID: 1,
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvclient/kvcoord/dist_sender_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5881,7 +5881,7 @@ func TestDistSenderCrossLocalityMetrics(t *testing.T) {
},
} {
t.Run(fmt.Sprintf("%-v", tc.crossLocalityType), func(t *testing.T) {
metrics := makeDistSenderMetrics()
metrics := MakeDistSenderMetrics()
beforeMetrics, err := metrics.getDistSenderCounterMetrics(metricsNames)
if err != nil {
t.Error(err)
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/kvclient/kvcoord/transport_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ func TestSpanImport(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()
metrics := makeDistSenderMetrics()
metrics := MakeDistSenderMetrics()
gt := grpcTransport{
opts: SendOptions{
metrics: &metrics,
Expand Down Expand Up @@ -190,7 +190,7 @@ func TestResponseVerifyFailure(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()
metrics := makeDistSenderMetrics()
metrics := MakeDistSenderMetrics()
gt := grpcTransport{
opts: SendOptions{
metrics: &metrics,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ export default function (props: GraphDashboardProps) {
<LineGraph
title="Unhealthy RPC Connections"
tooltip={`The number of outgoing connections on each node that are in an
unhealthy state.`}
unhealthy state.`}
showMetricsInTooltip={true}
>
<Axis label="connections">
Expand All @@ -202,5 +202,81 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,

<LineGraph
title="Proxy requests"
tooltip={`The number of proxy attempts each gateway node is initiating.`}
showMetricsInTooltip={true}
>
<Axis label="requests">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.distsender.rpc.proxy.sent"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={storeIDsForNode(storeIDsByNodeID, nid)}
tenantSource={tenantSource}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Proxy request errors"
tooltip={`The number of proxy attempts which resulted in an error.`}
showMetricsInTooltip={true}
>
<Axis label="errors">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.distsender.rpc.proxy.err"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={storeIDsForNode(storeIDsByNodeID, nid)}
tenantSource={tenantSource}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Proxy forwards"
tooltip={`The number of proxy requests each server node is attempting to foward.`}
showMetricsInTooltip={true}
>
<Axis label="requests">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.distsender.rpc.proxy.forward.sent"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={storeIDsForNode(storeIDsByNodeID, nid)}
tenantSource={tenantSource}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Proxy forward errors"
tooltip={`The number of proxy forward attempts which resulted in an error.`}
showMetricsInTooltip={true}
>
<Axis label="errors">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.distsender.rpc.proxy.forward.err"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={storeIDsForNode(storeIDsByNodeID, nid)}
tenantSource={tenantSource}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,
];
}

0 comments on commit 00a6257

Please sign in to comment.