New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
shards allocation health indicator services #83513
Changes from 30 commits
1b73587
65af584
ef2bfaf
bde7eb7
171b819
5f98333
31d698a
6fddaac
75a9730
e7512cb
4f11792
7e7aba5
4419524
8bfdc86
805b7e9
04006fe
b0c8da5
0be8545
fabf72b
bb08288
2a52d81
a9802b5
cf2f4de
07013a7
1f33992
a8dc1a8
8ad8ca0
be0666a
49bcc79
aa0e9c6
d2cc9d3
86264ea
39703b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 83513 | ||
summary: Shards allocation health indicator services | ||
area: Health | ||
type: enhancement | ||
issues: [] |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,210 @@ | ||||||
/* | ||||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||||||
* or more contributor license agreements. Licensed under the Elastic License | ||||||
* 2.0 and the Server Side Public License, v 1; you may not use this file except | ||||||
* in compliance with, at your election, the Elastic License 2.0 or the Server | ||||||
* Side Public License, v 1. | ||||||
*/ | ||||||
|
||||||
package org.elasticsearch.cluster.routing.allocation; | ||||||
|
||||||
import org.elasticsearch.cluster.health.ClusterHealthStatus; | ||||||
import org.elasticsearch.cluster.metadata.NodesShutdownMetadata; | ||||||
import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; | ||||||
import org.elasticsearch.cluster.routing.IndexRoutingTable; | ||||||
import org.elasticsearch.cluster.routing.IndexShardRoutingTable; | ||||||
import org.elasticsearch.cluster.routing.ShardRouting; | ||||||
import org.elasticsearch.cluster.routing.UnassignedInfo; | ||||||
import org.elasticsearch.cluster.service.ClusterService; | ||||||
import org.elasticsearch.health.HealthIndicatorResult; | ||||||
import org.elasticsearch.health.HealthIndicatorService; | ||||||
import org.elasticsearch.health.HealthStatus; | ||||||
import org.elasticsearch.health.SimpleHealthIndicatorDetails; | ||||||
|
||||||
import java.util.Map; | ||||||
import java.util.function.Function; | ||||||
import java.util.stream.Stream; | ||||||
|
||||||
import static java.util.stream.Collectors.joining; | ||||||
import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth; | ||||||
import static org.elasticsearch.health.HealthStatus.GREEN; | ||||||
import static org.elasticsearch.health.HealthStatus.RED; | ||||||
import static org.elasticsearch.health.HealthStatus.YELLOW; | ||||||
import static org.elasticsearch.health.ServerHealthComponents.DATA; | ||||||
|
||||||
/** | ||||||
* This indicator reports health for shards. | ||||||
* <p> | ||||||
* Indicator will report: | ||||||
* * RED when one or more primary shards are not available | ||||||
* * YELLOW when one or more replica shards are not available | ||||||
* * GREEN otherwise | ||||||
* <p> | ||||||
* Each shard needs to be available and replicated in order to guarantee high availability and prevent data loses. | ||||||
* Shards allocated on nodes scheduled for restart (using nodes shutdown API) will not degrade this indicator health. | ||||||
idegtiarenko marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
*/ | ||||||
public class ShardsAvailabilityHealthIndicatorService implements HealthIndicatorService { | ||||||
|
||||||
public static final String NAME = "shards_availability"; | ||||||
|
||||||
private final ClusterService clusterService; | ||||||
|
||||||
public ShardsAvailabilityHealthIndicatorService(ClusterService clusterService) { | ||||||
this.clusterService = clusterService; | ||||||
} | ||||||
|
||||||
@Override | ||||||
public String name() { | ||||||
return NAME; | ||||||
} | ||||||
|
||||||
@Override | ||||||
public String component() { | ||||||
return DATA; | ||||||
} | ||||||
|
||||||
@Override | ||||||
public HealthIndicatorResult calculate() { | ||||||
var state = clusterService.state(); | ||||||
var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY); | ||||||
var status = new ShardAllocationStatus(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we construct this with the shutdown metadata instead, such that we only pass the node shutdowns in the constructor? Would look cleaner to me. Also, can we pass just a
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nodes shutdown is not required for output and only used for calculation. I believe it is better to keep such dependencies as arguments There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not agree, by having the I would still like to see us just pass around There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree with Henning on this |
||||||
|
||||||
for (IndexRoutingTable indexShardRouting : state.routingTable()) { | ||||||
for (IndexShardRoutingTable shardRouting : indexShardRouting) { | ||||||
status.addPrimary(shardRouting.primaryShard(), shutdown); | ||||||
for (ShardRouting replicaShard : shardRouting.replicaShards()) { | ||||||
status.addReplica(replicaShard, shutdown); | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
return createIndicator(status.getStatus(), status.getSummary(), status.getDetails()); | ||||||
} | ||||||
|
||||||
private static class ShardAllocationCounts { | ||||||
private boolean available = true; | ||||||
private int unassigned = 0; | ||||||
private int unassigned_new = 0; | ||||||
private int unassigned_restarting = 0; | ||||||
private int initializing = 0; | ||||||
private int started = 0; | ||||||
private int reallocating = 0; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we rename this to relocating? |
||||||
|
||||||
public void increment(ShardRouting routing, NodesShutdownMetadata metadata) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let us rename |
||||||
boolean isNew = isUnassignedDueToNewInitialization(routing); | ||||||
boolean isRestarting = isUnassignedDueToTimelyRestart(routing, metadata); | ||||||
available &= routing.active() || isRestarting || isNew; | ||||||
|
||||||
switch (routing.state()) { | ||||||
case UNASSIGNED -> { | ||||||
if (isNew) { | ||||||
unassigned_new++; | ||||||
} else if (isRestarting) { | ||||||
unassigned_restarting++; | ||||||
} else { | ||||||
unassigned++; | ||||||
} | ||||||
} | ||||||
case INITIALIZING -> initializing++; | ||||||
case STARTED -> started++; | ||||||
idegtiarenko marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
case RELOCATING -> reallocating++; | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
private static boolean isUnassignedDueToTimelyRestart(ShardRouting routing, NodesShutdownMetadata metadata) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let us rename |
||||||
var info = routing.unassignedInfo(); | ||||||
if (info == null || info.getReason() != UnassignedInfo.Reason.NODE_RESTARTING) { | ||||||
return false; | ||||||
} | ||||||
var shutdown = metadata.getAllNodeMetadataMap().get(info.getLastAllocatedNodeId()); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be better to retrive the Map<String, SingleNodeShutdownMetadata> once (cf Henning's comment) |
||||||
if (shutdown == null || shutdown.getType() != SingleNodeShutdownMetadata.Type.RESTART) { | ||||||
return false; | ||||||
} | ||||||
var now = System.currentTimeMillis(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You may want to pass a LongSupplier to unit test this more easily There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently this is used in a static method that is used in a static inner class. Passing the reference to the LongSupplier might make this code more complex. Currently the unit test injects the time of the entity relative to the current and does not rely on injecting time to the class. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, no need to hold on this. I did not realize it was used in a static context. I'm just advocating on trying to avoid |
||||||
var restartingAllocationDelayExpiration = info.getUnassignedTimeInMillis() + shutdown.getAllocationDelay().getMillis(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use |
||||||
return now <= restartingAllocationDelayExpiration; | ||||||
} | ||||||
|
||||||
private static boolean isUnassignedDueToNewInitialization(ShardRouting routing) { | ||||||
return routing.primary() && routing.active() == false && getInactivePrimaryHealth(routing) == ClusterHealthStatus.YELLOW; | ||||||
} | ||||||
|
||||||
private static class ShardAllocationStatus { | ||||||
private final ShardAllocationCounts primaries = new ShardAllocationCounts(); | ||||||
private final ShardAllocationCounts replicas = new ShardAllocationCounts(); | ||||||
|
||||||
public void addPrimary(ShardRouting routing, NodesShutdownMetadata metadata) { | ||||||
primaries.increment(routing, metadata); | ||||||
} | ||||||
|
||||||
public void addReplica(ShardRouting routing, NodesShutdownMetadata metadata) { | ||||||
replicas.increment(routing, metadata); | ||||||
} | ||||||
|
||||||
public HealthStatus getStatus() { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should ideally also trigger on initializing shards. This may have some complication to integrate with restart though if the operator/orchestration waits for green or yellow I think deferring to a follow-up is fine, but perhaps add a comment here to aid the next reader? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding to the handover document |
||||||
if (primaries.available == false) { | ||||||
return RED; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to special handle new indices similar to A separate count of the "new" unassigned/initializing shards seems necessary. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||||||
} else if (replicas.available == false) { | ||||||
return YELLOW; | ||||||
} else { | ||||||
return GREEN; | ||||||
} | ||||||
} | ||||||
|
||||||
public String getSummary() { | ||||||
var builder = new StringBuilder("This cluster has "); | ||||||
if (primaries.unassigned > 0 | ||||||
|| primaries.unassigned_new > 0 | ||||||
|| primaries.unassigned_restarting > 0 | ||||||
|| replicas.unassigned > 0 | ||||||
|| replicas.unassigned_restarting > 0) { | ||||||
builder.append( | ||||||
Stream.of( | ||||||
createMessage(primaries.unassigned, "unavailable primary", " unavailable primaries"), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: did you consider implement a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but I did not come up with a good way to merge distinct messages. It would require appending primary/primaries/replica/replicas suffixes as well as properly setting comas. |
||||||
createMessage(primaries.unassigned_new, "creating primary", " creating primaries"), | ||||||
createMessage(primaries.unassigned_restarting, "restarting primary", " restarting primaries"), | ||||||
createMessage(replicas.unassigned, "unavailable replica", "unavailable replicas"), | ||||||
createMessage(replicas.unassigned_restarting, "restarting replica", "restarting replicas") | ||||||
).flatMap(Function.identity()).collect(joining(" , ")) | ||||||
).append("."); | ||||||
} else { | ||||||
builder.append("all shards available."); | ||||||
} | ||||||
return builder.toString(); | ||||||
} | ||||||
|
||||||
private static Stream<String> createMessage(int count, String singular, String plural) { | ||||||
return switch (count) { | ||||||
case 0 -> Stream.empty(); | ||||||
case 1 -> Stream.of("1 " + singular); | ||||||
default -> Stream.of(count + " " + plural); | ||||||
}; | ||||||
} | ||||||
|
||||||
public SimpleHealthIndicatorDetails getDetails() { | ||||||
return new SimpleHealthIndicatorDetails( | ||||||
Map.of( | ||||||
"unassigned_primaries", | ||||||
primaries.unassigned, | ||||||
"initializing_primaries", | ||||||
primaries.initializing, | ||||||
"creating_primaries", | ||||||
primaries.unassigned_new, | ||||||
"restarting_primaries", | ||||||
primaries.unassigned_restarting, | ||||||
"started_primaries", | ||||||
primaries.started + primaries.reallocating, | ||||||
"unassigned_replicas", | ||||||
replicas.unassigned, | ||||||
"initializing_replicas", | ||||||
replicas.initializing, | ||||||
"restarting_replicas", | ||||||
replicas.unassigned_restarting, | ||||||
"started_replicas", | ||||||
replicas.started + replicas.reallocating | ||||||
) | ||||||
); | ||||||
} | ||||||
} | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to the line above it is always not null