Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server,ui: Add debugging for quiesced ranges #26269

Merged
merged 1 commit into from
May 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/server/problem_ranges.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,17 @@ func (s *statusServer) ProblemRanges(
problems.NoLeaseRangeIDs =
append(problems.NoLeaseRangeIDs, info.State.Desc.RangeID)
}
if info.Problems.QuiescentEqualsTicking {
problems.QuiescentEqualsTickingRangeIDs =
append(problems.QuiescentEqualsTickingRangeIDs, info.State.Desc.RangeID)
}
}
sort.Sort(roachpb.RangeIDSlice(problems.UnavailableRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.RaftLeaderNotLeaseHolderRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.NoRaftLeaderRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.NoLeaseRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.UnderreplicatedRangeIDs))
sort.Sort(roachpb.RangeIDSlice(problems.QuiescentEqualsTickingRangeIDs))
response.ProblemsByNodeID[resp.nodeID] = problems
case <-ctx.Done():
return nil, status.Errorf(codes.DeadlineExceeded, ctx.Err().Error())
Expand Down
696 changes: 430 additions & 266 deletions pkg/server/serverpb/status.pb.go

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ message RangeProblems {
bool no_raft_leader = 3;
bool underreplicated = 4;
bool no_lease = 5;

// Quiescent ranges do not tick by definition, but we track this in
// two different ways and suspect that they're getting out of sync.
// This flag is set when the replica's quiescent flag and the store's
// list of ticking replicas disagree, i.e. the replica is reported as
// both quiescent and ticking, or as neither.
bool quiescent_equals_ticking = 6;
}

message RangeStatistics {
Expand Down Expand Up @@ -179,6 +185,7 @@ message RangeInfo {
CommandQueueMetrics cmd_q_global = 12 [ (gogoproto.nullable) = false ];
storage.LeaseStatus lease_status = 13 [ (gogoproto.nullable) = false ];
bool quiescent = 14;
bool ticking = 15;
}

message RangesRequest {
Expand Down Expand Up @@ -505,6 +512,11 @@ message ProblemRangesResponse {
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
repeated int64 quiescent_equals_ticking_range_ids = 7 [
(gogoproto.customname) = "QuiescentEqualsTickingRangeIDs",
(gogoproto.casttype) =
"github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
];
}
reserved 1 to 7;
// NodeID is the node that submitted all the requests.
Expand Down
12 changes: 7 additions & 5 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -1094,16 +1094,18 @@ func (s *statusServer) Ranges(
WritesPerSecond: rep.WritesPerSecond(),
},
Problems: serverpb.RangeProblems{
Unavailable: metrics.Unavailable,
LeaderNotLeaseHolder: metrics.Leader && metrics.LeaseValid && !metrics.Leaseholder,
NoRaftLeader: !storage.HasRaftLeader(raftStatus) && !metrics.Quiescent,
Underreplicated: metrics.Underreplicated,
NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent,
Unavailable: metrics.Unavailable,
LeaderNotLeaseHolder: metrics.Leader && metrics.LeaseValid && !metrics.Leaseholder,
NoRaftLeader: !storage.HasRaftLeader(raftStatus) && !metrics.Quiescent,
Underreplicated: metrics.Underreplicated,
NoLease: metrics.Leader && !metrics.LeaseValid && !metrics.Quiescent,
QuiescentEqualsTicking: metrics.Quiescent == metrics.Ticking,
},
CmdQLocal: serverpb.CommandQueueMetrics(metrics.CmdQMetricsLocal),
CmdQGlobal: serverpb.CommandQueueMetrics(metrics.CmdQMetricsGlobal),
LeaseStatus: metrics.LeaseStatus,
Quiescent: metrics.Quiescent,
Ticking: metrics.Ticking,
}
}

Expand Down
15 changes: 14 additions & 1 deletion pkg/storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -6021,7 +6021,13 @@ type ReplicaMetrics struct {
Leaseholder bool
LeaseType roachpb.LeaseType
LeaseStatus LeaseStatus
Quiescent bool

// Quiescent indicates whether the replica believes itself to be quiesced.
Quiescent bool
// Ticking indicates whether the store is ticking the replica. It should be
// the opposite of Quiescent.
Ticking bool

// Is this the replica which collects per-range metrics? This is done either
// on the leader or, if there is no leader, on the largest live replica ID.
RangeCounter bool
Expand Down Expand Up @@ -6050,6 +6056,10 @@ func (r *Replica) Metrics(
r.cmdQMu.Unlock()
r.mu.RUnlock()

r.store.unquiescedReplicas.Lock()
_, ticking := r.store.unquiescedReplicas.m[r.RangeID]
r.store.unquiescedReplicas.Unlock()

return calcReplicaMetrics(
ctx,
now,
Expand All @@ -6060,6 +6070,7 @@ func (r *Replica) Metrics(
leaseStatus,
r.store.StoreID(),
quiescent,
ticking,
cmdQMetricsLocal,
cmdQMetricsGlobal,
)
Expand All @@ -6084,6 +6095,7 @@ func calcReplicaMetrics(
leaseStatus LeaseStatus,
storeID roachpb.StoreID,
quiescent bool,
ticking bool,
cmdQMetricsLocal CommandQueueMetrics,
cmdQMetricsGlobal CommandQueueMetrics,
) ReplicaMetrics {
Expand All @@ -6099,6 +6111,7 @@ func calcReplicaMetrics(
m.Leaseholder = m.LeaseValid && leaseOwner
m.Leader = isRaftLeader(raftStatus)
m.Quiescent = quiescent
m.Ticking = ticking

// We compute an estimated range count across the cluster by counting the
// first live replica in each descriptor. Note that the first live replica is
Expand Down
3 changes: 2 additions & 1 deletion pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8669,10 +8669,11 @@ func TestReplicaMetrics(t *testing.T) {
// Alternate between quiescent and non-quiescent replicas to test the
// quiescent metric.
c.expected.Quiescent = i%2 == 0
c.expected.Ticking = !c.expected.Quiescent
metrics := calcReplicaMetrics(
context.Background(), hlc.Timestamp{}, config.SystemConfig{},
c.liveness, &c.desc, c.raftStatus, LeaseStatus{},
c.storeID, c.expected.Quiescent, CommandQueueMetrics{}, CommandQueueMetrics{})
c.storeID, c.expected.Quiescent, c.expected.Ticking, CommandQueueMetrics{}, CommandQueueMetrics{})
if c.expected != metrics {
t.Fatalf("unexpected metrics:\n%s", pretty.Diff(c.expected, metrics))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,19 @@ const connectionTableColumns: ConnectionTableColumn[] = [
title: "Underreplicated (or slow)",
extract: (problem) => problem.underreplicated_range_ids.length,
},
{
title: "Quiescent equals ticking",
extract: (problem) => problem.quiescent_equals_ticking_range_ids.length,
},
{
title: "Total",
extract: (problem) => {
return problem.unavailable_range_ids.length +
problem.no_raft_leader_range_ids.length +
problem.no_lease_range_ids.length +
problem.raft_leader_not_lease_holder_range_ids.length +
problem.underreplicated_range_ids.length;
problem.underreplicated_range_ids.length +
problem.quiescent_equals_ticking_range_ids.length;
},
},
{ title: "Error", extract: (problem) => problem.error_message },
Expand Down
5 changes: 5 additions & 0 deletions pkg/ui/src/views/reports/containers/problemRanges/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ class ProblemRanges extends React.Component<ProblemRangesProps, {}> {
problems={problems}
extract={(problem) => problem.underreplicated_range_ids}
/>
<ProblemRangeList
name="Quiescent equals ticking"
problems={problems}
extract={(problem) => problem.quiescent_equals_ticking_range_ids}
/>
</div>
);
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/ui/src/views/reports/containers/range/rangeTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const rangeTableDisplayList: RangeTableRow[] = [
{ variable: "problems", display: "Problems", compareToLeader: true },
{ variable: "raftState", display: "Raft State", compareToLeader: false },
{ variable: "quiescent", display: "Quiescent", compareToLeader: true },
{ variable: "ticking", display: "Ticking", compareToLeader: true },
{ variable: "leaseType", display: "Lease Type", compareToLeader: true },
{ variable: "leaseState", display: "Lease State", compareToLeader: true },
{ variable: "leaseHolder", display: "Lease Holder", compareToLeader: true },
Expand Down Expand Up @@ -204,6 +205,9 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
if (problems.unavailable) {
results = _.concat(results, "Unavailable");
}
if (problems.quiescent_equals_ticking) {
results = _.concat(results, "Quiescent equals ticking");
}
if (awaitingGC) {
results = _.concat(results, "Awaiting GC");
}
Expand Down Expand Up @@ -449,6 +453,7 @@ export default class RangeTable extends React.Component<RangeTableProps, {}> {
problems: this.contentProblems(info.problems, awaitingGC),
raftState: raftState,
quiescent: info.quiescent ? rangeTableQuiescent : rangeTableEmptyContent,
ticking: this.createContent(info.ticking.toString()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be nice to roll this in with the quiescent row. I'm imagining we'd show "quiescent" and empty as the normal states, and "quiescent but ticking" and "not ticking but not quiescent" as error states.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I just wanted to get something in quickly for cyan without having to fiddle with and test all the combinations.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with that; we can just have @BramGruneir come through and add some niceties here.

leaseState: leaseState,
leaseHolder: this.createContent(
Print.ReplicaID(rangeID, lease.replica),
Expand Down