elastic · idegtiarenko · Mar 7, 2022 · Feb 4, 2022 · Feb 4, 2022 · Feb 10, 2022
diff --git a/docs/changelog/83513.yaml b/docs/changelog/83513.yaml
@@ -0,0 +1,5 @@
+pr: 83513
+summary: Shards allocation health indicator services
+area: Health
+type: enhancement
+issues: []
diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc
@@ -163,6 +163,9 @@ specific index module:
 
     The number of replicas each primary shard has. Defaults to 1.
 
+    WARNING: Configuring it to 0 may lead to temporary availability loss
+    during node restarts or permanent data loss in case of data corruption.
+
 [[dynamic-index-auto-expand-replicas]]
 `index.auto_expand_replicas`::
 Auto-expand the number of replicas based on the number of data nodes in the

diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadata.java
@@ -41,6 +41,7 @@
 public class NodesShutdownMetadata implements Metadata.Custom {
     public static final String TYPE = "node_shutdown";
     public static final Version NODE_SHUTDOWN_VERSION = Version.V_7_13_0;
+    public static final NodesShutdownMetadata EMPTY = new NodesShutdownMetadata(Map.of());
 
     private static final ParseField NODES_FIELD = new ParseField("nodes");
 
@@ -70,17 +71,17 @@ public static NamedDiff<Metadata.Custom> readDiffFrom(StreamInput in) throws IOE
 
     public static Optional<NodesShutdownMetadata> getShutdowns(final ClusterState state) {
         assert state != null : "cluster state should never be null";
-        return Optional.ofNullable(state).map(ClusterState::metadata).map(m -> m.custom(TYPE));
+        return Optional.of(state).map(ClusterState::metadata).map(m -> m.custom(TYPE));
     }
 
     private final Map<String, SingleNodeShutdownMetadata> nodes;
 
     public NodesShutdownMetadata(Map<String, SingleNodeShutdownMetadata> nodes) {
-        this.nodes = nodes;
+        this.nodes = Collections.unmodifiableMap(nodes);
     }
 
     public NodesShutdownMetadata(StreamInput in) throws IOException {
-        this.nodes = in.readMap(StreamInput::readString, SingleNodeShutdownMetadata::new);
+        this(in.readMap(StreamInput::readString, SingleNodeShutdownMetadata::new));
     }
 
     @Override
@@ -92,7 +93,7 @@ public void writeTo(StreamOutput out) throws IOException {
      * @return A map of NodeID to shutdown metadata.
      */
     public Map<String, SingleNodeShutdownMetadata> getAllNodeMetadataMap() {
-        return Collections.unmodifiableMap(nodes);
+        return nodes;
     }
 
     /**

diff --git a/...rg/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java b/...rg/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java
@@ -0,0 +1,210 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.routing.allocation;
+
+import org.elasticsearch.cluster.health.ClusterHealthStatus;
+import org.elasticsearch.cluster.metadata.NodesShutdownMetadata;
+import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
+import org.elasticsearch.cluster.routing.IndexRoutingTable;
+import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.UnassignedInfo;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.health.HealthIndicatorResult;
+import org.elasticsearch.health.HealthIndicatorService;
+import org.elasticsearch.health.HealthStatus;
+import org.elasticsearch.health.SimpleHealthIndicatorDetails;
+
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Stream;
+
+import static java.util.stream.Collectors.joining;
+import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth;
+import static org.elasticsearch.health.HealthStatus.GREEN;
+import static org.elasticsearch.health.HealthStatus.RED;
+import static org.elasticsearch.health.HealthStatus.YELLOW;
+import static org.elasticsearch.health.ServerHealthComponents.DATA;
+
+/**
+ * This indicator reports health for shards.
+ * <p>
+ * Indicator will report:
+ * * RED when one or more primary shards are not available
+ * * YELLOW when one or more replica shards are not available
+ * * GREEN otherwise
+ * <p>
+ * Each shard needs to be available and replicated in order to guarantee high availability and prevent data loss.
+ * Shards allocated on nodes scheduled for restart (using nodes shutdown API) will not degrade this indicator health.
+ */
+public class ShardsAvailabilityHealthIndicatorService implements HealthIndicatorService {
+
+    public static final String NAME = "shards_availability";
+
+    private final ClusterService clusterService;
+
+    public ShardsAvailabilityHealthIndicatorService(ClusterService clusterService) {
+        this.clusterService = clusterService;
+    }
+
+    @Override
+    public String name() {
+        return NAME;
+    }
+
+    @Override
+    public String component() {
+        return DATA;
+    }
+
+    @Override
+    public HealthIndicatorResult calculate() {
+        var state = clusterService.state();
+        var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
+        var status = new ShardAllocationStatus();
+
+        for (IndexRoutingTable indexShardRouting : state.routingTable()) {
+            for (IndexShardRoutingTable shardRouting : indexShardRouting) {
+                status.addPrimary(shardRouting.primaryShard(), shutdown);
+                for (ShardRouting replicaShard : shardRouting.replicaShards()) {
+                    status.addReplica(replicaShard, shutdown);
+                }
+            }
+        }
+
+        return createIndicator(status.getStatus(), status.getSummary(), status.getDetails());
+    }
+
+    private static class ShardAllocationCounts {
+        private boolean available = true;
+        private int unassigned = 0;
+        private int unassigned_new = 0;
+        private int unassigned_restarting = 0;
+        private int initializing = 0;
+        private int started = 0;
+        private int reallocating = 0;
+
+        public void increment(ShardRouting routing, NodesShutdownMetadata metadata) {
+            boolean isNew = isUnassignedDueToNewInitialization(routing);
+            boolean isRestarting = isUnassignedDueToTimelyRestart(routing, metadata);
+            available &= routing.active() || isRestarting || isNew;
+
+            switch (routing.state()) {
+                case UNASSIGNED -> {
+                    if (isNew) {
+                        unassigned_new++;
+                    } else if (isRestarting) {
+                        unassigned_restarting++;
+                    } else {
+                        unassigned++;
+                    }
+                }
+                case INITIALIZING -> initializing++;
+                case STARTED -> started++;
+                case RELOCATING -> reallocating++;
+            }
+        }
+    }
+
+    private static boolean isUnassignedDueToTimelyRestart(ShardRouting routing, NodesShutdownMetadata metadata) {
+        var info = routing.unassignedInfo();
+        if (info == null || info.getReason() != UnassignedInfo.Reason.NODE_RESTARTING) {
+            return false;
+        }
+        var shutdown = metadata.getAllNodeMetadataMap().get(info.getLastAllocatedNodeId());
+        if (shutdown == null || shutdown.getType() != SingleNodeShutdownMetadata.Type.RESTART) {
+            return false;
+        }
+        var now = System.currentTimeMillis();
+        var restartingAllocationDelayExpiration = info.getUnassignedTimeInMillis() + shutdown.getAllocationDelay().getMillis();
+        return now <= restartingAllocationDelayExpiration;
+    }
+
+    private static boolean isUnassignedDueToNewInitialization(ShardRouting routing) {
+        return routing.primary() && routing.active() == false && getInactivePrimaryHealth(routing) == ClusterHealthStatus.YELLOW;
+    }
+
+    private static class ShardAllocationStatus {
+        private final ShardAllocationCounts primaries = new ShardAllocationCounts();
+        private final ShardAllocationCounts replicas = new ShardAllocationCounts();
+
+        public void addPrimary(ShardRouting routing, NodesShutdownMetadata metadata) {
+            primaries.increment(routing, metadata);
+        }
+
+        public void addReplica(ShardRouting routing, NodesShutdownMetadata metadata) {
+            replicas.increment(routing, metadata);
+        }
+
+        public HealthStatus getStatus() {
+            if (primaries.available == false) {
+                return RED;
+            }
+            if (replicas.available == false) {
+                return YELLOW;
+            }
+            return GREEN;
+        }
+
+        public String getSummary() {
+            var builder = new StringBuilder("This cluster has ");
+            if (primaries.unassigned > 0
+                || primaries.unassigned_new > 0
+                || primaries.unassigned_restarting > 0
+                || replicas.unassigned > 0
+                || replicas.unassigned_restarting > 0) {
+                builder.append(
+                    Stream.of(
+                        createMessage(primaries.unassigned, "unavailable primary", "unavailable primaries"),
+                        createMessage(primaries.unassigned_new, "creating primary", "creating primaries"),
+                        createMessage(primaries.unassigned_restarting, "restarting primary", "restarting primaries"),
+                        createMessage(replicas.unassigned, "unavailable replica", "unavailable replicas"),
+                        createMessage(replicas.unassigned_restarting, "restarting replica", "restarting replicas")
+                    ).flatMap(Function.identity()).collect(joining(", "))
+                ).append(".");
+            } else {
+                builder.append("all shards available.");
+            }
+            return builder.toString();
+        }
+
+        private static Stream<String> createMessage(int count, String singular, String plural) {
+            if (count == 0) {
+                return Stream.empty();
+            }
+            var label = count == 1 ? singular : plural;
+            return Stream.of(count + " " + label);
+        }
+
+        public SimpleHealthIndicatorDetails getDetails() {
+            return new SimpleHealthIndicatorDetails(
+                Map.of(
+                    "unassigned_primaries",
+                    primaries.unassigned,
+                    "initializing_primaries",
+                    primaries.initializing,
+                    "creating_primaries",
+                    primaries.unassigned_new,
+                    "restarting_primaries",
+                    primaries.unassigned_restarting,
+                    "started_primaries",
+                    primaries.started + primaries.reallocating,
+                    "unassigned_replicas",
+                    replicas.unassigned,
+                    "initializing_replicas",
+                    replicas.initializing,
+                    "restarting_replicas",
+                    replicas.unassigned_restarting,
+                    "started_replicas",
+                    replicas.started + replicas.reallocating
+                )
+            );
+        }
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/node/Node.java b/server/src/main/java/org/elasticsearch/node/Node.java
@@ -57,6 +57,7 @@
 import org.elasticsearch.cluster.routing.BatchedRerouteService;
 import org.elasticsearch.cluster.routing.RerouteService;
 import org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor;
+import org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.StopWatch;
 import org.elasticsearch.common.breaker.CircuitBreaker;
@@ -1043,7 +1044,8 @@ protected Node(
     private HealthService createHealthService(ClusterService clusterService) {
         var serverHealthIndicatorServices = List.of(
             new InstanceHasMasterHealthIndicatorService(clusterService),
-            new RepositoryIntegrityHealthIndicatorService(clusterService)
+            new RepositoryIntegrityHealthIndicatorService(clusterService),
+            new ShardsAvailabilityHealthIndicatorService(clusterService)
         );
         var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
             .stream()