Fix disk indicator symptoms, impacts and diagnosis (#90262)

This PR fixes the following: **Non-data, non-master calculation** _Bug_: A non-data, non-master node is a node that has at least one role that is not master or data node. _Fix_: A non-data, non-master node is a node that does not contains data and it is not a master. **Dedicated master calculation** _Bug_: A dedicated master node in the context of the disk health indicator is a node that has at least the master role. _Fix_: A dedicated master node in the context of the disk health indicator is a node that has the master role and cannot contain data. **Impact implementation** _Bug_: We list at most 3 impacts, one that is based on blocked indices or indices on unhealthy data nods, one for master and one for the rest. The current implementation wasn't covering this in all cases, for example, if there was a data node and no dedicated master node it wouldn't display the master node impact. _Fix_: Base the calculation of master and other role impacts, on the roles of the affected nodes. **Diagnosis implementation** _Bug_: The code was correct but the tests needed adjusting because they expected 1 extra diagnosis since a master node was identified as both a dedicated master node and a dedicated non-master, non-data node. _Fix_: Fixed the test to respect the hierarchy, data node > dedicated master node > other. **Symptom implementation** _Bug_: There were two messages produced as symptoms, one for the case where we have blocked indices but all the other nodes are healthy, and for all the other cases one that would list the roles that were running out of space. _Fix_: We change them to a messages that might have two parts: - Always mention if there are indices blocked and explain why, which can be one of the following two cases, (i) cluster is recovering **(this includes the case of all nodes appearing healthy because the cluster is moving shards away from the out of space nodes),** (ii) there are unhealthy data nodes that cannot recover without user intervention. - Always mention all the affected roles as symptom. _Examples_ > 3 indices are not allowed to be updated because the cluster was running out of disk space. The cluster is recovering and you should be able to update them within a few minutes. Furthermore 3 more nodes with roles: [master, ingest] are out of disk or running low on disk space. > 3 indices are not allowed to be updated because 2 nodes are out of disk or running low on disk space. Furthermore 3 nodes with roles: [master, ingest] are out of disk or running low on disk space. > 5 nodes with roles: [data, master, ingest] are out of disk or running low on disk space. **Bonus** - We enrich the impact about "other" nodes with the roles. - We enrich the affected indices with listing indicative 10 indices like we do with the shards availability indicator.
elastic · Sep 23, 2022 · db2314f · db2314f
1 parent e1c284a
commit db2314f
Show file tree

Hide file tree

Showing 5 changed files with 315 additions and 251 deletions.
diff --git a/docs/changelog/90262.yaml b/docs/changelog/90262.yaml
@@ -0,0 +1,5 @@
+pr: 90262
+summary: Fix disk indicator impacts and diagnosis
+area: Health
+type: bug
+issues: []
diff --git a/...rg/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java b/...rg/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java
@@ -47,7 +47,6 @@
 
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -72,6 +71,8 @@
 import static org.elasticsearch.health.HealthStatus.GREEN;
 import static org.elasticsearch.health.HealthStatus.RED;
 import static org.elasticsearch.health.HealthStatus.YELLOW;
+import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
+import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indicesComparatorByPriorityAndName;
 
 /**
  * This indicator reports health for shards.
@@ -854,7 +855,7 @@ public List<HealthIndicatorImpact> getImpacts() {
                     "Cannot add data to %d %s [%s]. Searches might return incomplete results.",
                     primaries.indicesWithUnavailableShards.size(),
                     primaries.indicesWithUnavailableShards.size() == 1 ? "index" : "indices",
-                    getTruncatedIndicesString(primaries.indicesWithUnavailableShards, clusterMetadata)
+                    getTruncatedIndices(primaries.indicesWithUnavailableShards, clusterMetadata)
                 );
                 impacts.add(
                     new HealthIndicatorImpact(
@@ -879,7 +880,7 @@ public List<HealthIndicatorImpact> getImpacts() {
                     "Searches might be slower than usual. Fewer redundant copies of the data exist on %d %s [%s].",
                     indicesWithUnavailableReplicasOnly.size(),
                     indicesWithUnavailableReplicasOnly.size() == 1 ? "index" : "indices",
-                    getTruncatedIndicesString(indicesWithUnavailableReplicasOnly, clusterMetadata)
+                    getTruncatedIndices(indicesWithUnavailableReplicasOnly, clusterMetadata)
                 );
                 impacts.add(
                     new HealthIndicatorImpact(NAME, REPLICA_UNASSIGNED_IMPACT_ID, 2, impactDescription, List.of(ImpactArea.SEARCH))
@@ -912,7 +913,10 @@ public List<Diagnosis> getUserActions(boolean explain) {
                         .map(
                             e -> new Diagnosis(
                                 e.getKey(),
-                                e.getValue().stream().sorted(byPriorityThenByName(clusterMetadata)).collect(Collectors.toList())
+                                e.getValue()
+                                    .stream()
+                                    .sorted(indicesComparatorByPriorityAndName(clusterMetadata))
+                                    .collect(Collectors.toList())
                             )
                         )
                         .collect(Collectors.toList());
@@ -922,30 +926,4 @@ public List<Diagnosis> getUserActions(boolean explain) {
             }
         }
     }
-
-    private static String getTruncatedIndicesString(Set<String> indices, Metadata clusterMetadata) {
-        final int maxIndices = 10;
-        String truncatedIndicesString = indices.stream()
-            .sorted(byPriorityThenByName(clusterMetadata))
-            .limit(maxIndices)
-            .collect(joining(", "));
-        if (maxIndices < indices.size()) {
-            truncatedIndicesString = truncatedIndicesString + ", ...";
-        }
-        return truncatedIndicesString;
-    }
-
-    /**
-     * Sorts index names by their priority first, then alphabetically by name. If the priority cannot be determined for an index then
-     * a priority of -1 is used to sort it behind other index names.
-     * @param clusterMetadata Used to look up index priority.
-     * @return Comparator instance
-     */
-    private static Comparator<String> byPriorityThenByName(Metadata clusterMetadata) {
-        // We want to show indices with a numerically higher index.priority first (since lower priority ones might get truncated):
-        return Comparator.comparingInt((String indexName) -> {
-            IndexMetadata indexMetadata = clusterMetadata.index(indexName);
-            return indexMetadata == null ? -1 : indexMetadata.priority();
-        }).reversed().thenComparing(Comparator.naturalOrder());
-    }
 }
diff --git a/server/src/main/java/org/elasticsearch/health/node/DiskHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/health/node/DiskHealthIndicatorService.java
@@ -16,6 +16,7 @@
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.health.Diagnosis;
 import org.elasticsearch.health.HealthIndicatorDetails;
 import org.elasticsearch.health.HealthIndicatorImpact;
@@ -36,6 +37,8 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
+
 public class DiskHealthIndicatorService implements HealthIndicatorService {
     public static final String NAME = "disk";
 
@@ -123,24 +126,19 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
             Set<String> yellowNonDataNonMasterNodes = getNodesWithNonDataNonMasterRoles(nodesReportingYellow, clusterState);
 
             String symptom = getSymptom(
-                clusterHasBlockedIndex,
                 indicesWithBlock,
-                nodesWithBlockedIndices,
+                redDataNodes,
+                yellowDataNodes,
                 nodesReportingRed,
                 nodesReportingYellow,
                 clusterState
             );
             List<HealthIndicatorImpact> impacts = getImpacts(
                 indicesWithBlock,
-                indicesOnRedNodes,
-                indicesOnYellowNodes,
-                nodesWithBlockedIndices,
-                redDataNodes,
-                yellowDataNodes,
-                redMasterNodes,
-                yellowMasterNodes,
-                redNonDataNonMasterNodes,
-                yellowNonDataNonMasterNodes
+                Sets.union(indicesOnYellowNodes, indicesOnRedNodes),
+                nodesReportingRed,
+                nodesReportingYellow,
+                clusterState
             );
             List<Diagnosis> diagnosisList = getDiagnoses(
                 indicesWithBlock,
@@ -160,36 +158,66 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
     }
 
     private String getSymptom(
-        boolean clusterHasBlockedIndex,
         Set<String> blockedIndices,
-        Set<String> nodesWithBlockedIndices,
+        Set<String> redDataNodes,
+        Set<String> yellowDataNodes,
         Set<String> nodesReportingRed,
         Set<String> nodesReportingYellow,
         ClusterState clusterState
     ) {
-        Set<String> allUnhealthyNodes = (Stream.concat(
-            Stream.concat(nodesWithBlockedIndices.stream(), nodesReportingRed.stream()),
-            nodesReportingYellow.stream()
-        )).collect(Collectors.toSet());
-        Set<String> allRolesOnUnhealthyNodes = getRolesOnNodes(allUnhealthyNodes, clusterState).stream()
-            .map(DiscoveryNodeRole::roleName)
-            .collect(Collectors.toSet());
-        final String symptom;
-        if (clusterHasBlockedIndex && allUnhealthyNodes.isEmpty()) {
-            // In this case the disk issue has been resolved but the index block has not been automatically removed yet:
+        Set<String> unhealthyDataNodes = Sets.union(redDataNodes, yellowDataNodes);
+        Set<String> allUnhealthyNodes = Sets.union(nodesReportingRed, nodesReportingYellow);
+
+        String symptom;
+        if (blockedIndices.isEmpty() == false) {
             symptom = String.format(
                 Locale.ROOT,
-                "%d %s blocked and cannot be updated but 0 nodes are currently out of space.",
+                "%d %s not allowed to be updated because ",
                 blockedIndices.size(),
                 blockedIndices.size() == 1 ? "index is" : "indices are"
             );
+            if (unhealthyDataNodes.isEmpty()) {
+                // In this case the disk issue has been resolved but the index block has not been removed yet or the
+                // cluster is still moving shards away from data nodes that are over the high watermark.
+                symptom +=
+                    ("the cluster was running out of disk space. The cluster is recovering and ingest capabilities should be restored "
+                        + "within a few minutes.");
+            } else {
+                symptom += String.format(
+                    Locale.ROOT,
+                    "%d %s out of disk or running low on disk space.",
+                    unhealthyDataNodes.size(),
+                    unhealthyDataNodes.size() == 1 ? "node is" : "nodes are"
+                );
+            }
+            if (unhealthyDataNodes.size() < allUnhealthyNodes.size()) {
+                Set<String> unhealthyNonDataNodes = Sets.difference(allUnhealthyNodes, unhealthyDataNodes);
+                String roles = getRolesOnNodes(unhealthyNonDataNodes, clusterState).stream()
+                    .map(DiscoveryNodeRole::roleName)
+                    .distinct()
+                    .sorted()
+                    .collect(Collectors.joining(", "));
+                symptom += String.format(
+                    Locale.ROOT,
+                    " Furthermore %d node%s with roles: [%s] %s out of disk or running low on disk space.",
+                    unhealthyNonDataNodes.size(),
+                    unhealthyNonDataNodes.size() == 1 ? "" : "s",
+                    roles,
+                    unhealthyNonDataNodes.size() == 1 ? "is" : "are"
+                );
+            }
         } else {
+            String roles = getRolesOnNodes(allUnhealthyNodes, clusterState).stream()
+                .map(DiscoveryNodeRole::roleName)
+                .distinct()
+                .sorted()
+                .collect(Collectors.joining(", "));
             symptom = String.format(
                 Locale.ROOT,
                 "%d node%s with roles: [%s] %s out of disk or running low on disk space.",
                 allUnhealthyNodes.size(),
                 allUnhealthyNodes.size() == 1 ? "" : "s",
-                allRolesOnUnhealthyNodes.stream().sorted().collect(Collectors.joining(", ")),
+                roles,
                 allUnhealthyNodes.size() == 1 ? "is" : "are"
             );
         }
@@ -198,42 +226,44 @@ private String getSymptom(
 
     private List<HealthIndicatorImpact> getImpacts(
         Set<String> indicesWithBlock,
-        Set<String> indicesOnRedNodes,
-        Set<String> indicesOnYellowNodes,
-        Set<String> nodesWithBlockedIndices,
-        Set<String> redDataNodes,
-        Set<String> yellowDataNodes,
-        Set<String> redMasterNodes,
-        Set<String> yellowMasterNodes,
-        Set<String> redNonDataNonMasterNodes,
-        Set<String> yellowNonDataNonMasterNodes
+        Set<String> indicesOnUnhealthyNodes,
+        Set<String> nodesReportingRed,
+        Set<String> nodesReportingYellow,
+        ClusterState clusterState
     ) {
         List<HealthIndicatorImpact> impacts = new ArrayList<>();
-        if (indicesWithBlock.isEmpty() == false
-            || indicesOnRedNodes.isEmpty() == false
-            || nodesWithBlockedIndices.isEmpty() == false
-            || redDataNodes.isEmpty() == false) {
+        if (indicesWithBlock.isEmpty() == false) {
             impacts.add(
                 new HealthIndicatorImpact(
                     NAME,
                     IMPACT_INGEST_UNAVAILABLE_ID,
                     1,
-                    "Cannot insert or update documents in the affected indices.",
+                    String.format(
+                        Locale.ROOT,
+                        "Cannot insert or update documents in the affected indices [%s].",
+                        getTruncatedIndices(indicesWithBlock, clusterState.getMetadata())
+                    ),
                     List.of(ImpactArea.INGEST)
                 )
             );
-        } else if (indicesOnYellowNodes.isEmpty() == false || yellowDataNodes.isEmpty() == false) {
+        } else if (indicesOnUnhealthyNodes.isEmpty() == false) {
             impacts.add(
                 new HealthIndicatorImpact(
                     NAME,
                     IMPACT_INGEST_AT_RISK_ID,
                     1,
-                    "At risk of not being able to insert or update documents in the affected indices.",
+                    String.format(
+                        Locale.ROOT,
+                        "The cluster is at risk of not being able to insert or update documents in the affected indices [%s].",
+                        getTruncatedIndices(indicesOnUnhealthyNodes, clusterState.metadata())
+                    ),
                     List.of(ImpactArea.INGEST)
                 )
             );
         }
-        if (redMasterNodes.isEmpty() == false || yellowMasterNodes.isEmpty() == false) {
+        Set<String> unhealthyNodes = Stream.concat(nodesReportingRed.stream(), nodesReportingYellow.stream()).collect(Collectors.toSet());
+        Set<DiscoveryNodeRole> unhealthyRoles = getRolesOnNodes(unhealthyNodes, clusterState);
+        if (unhealthyRoles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
             impacts.add(
                 new HealthIndicatorImpact(
                     NAME,
@@ -244,13 +274,19 @@ private List<HealthIndicatorImpact> getImpacts(
                 )
             );
         }
-        if (redNonDataNonMasterNodes.isEmpty() == false || yellowNonDataNonMasterNodes.isEmpty() == false) {
+        String unhealthyNonMasterNonDataRoles = unhealthyRoles.stream()
+            .filter(role -> role.canContainData() == false && role.equals(DiscoveryNodeRole.MASTER_ROLE) == false)
+            .map(DiscoveryNodeRole::roleName)
+            .distinct()
+            .sorted()
+            .collect(Collectors.joining(", "));
+        if (unhealthyNonMasterNonDataRoles.isBlank() == false) {
             impacts.add(
                 new HealthIndicatorImpact(
                     NAME,
                     IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID,
                     2,
-                    "Some cluster functionality might be unavailable.",
+                    String.format(Locale.ROOT, "The [%s] functionality might be impaired.", unhealthyNonMasterNonDataRoles),
                     List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
                 )
             );
@@ -376,7 +412,7 @@ static Set<String> getNodesWithMasterRole(Set<String> nodeIds, ClusterState clus
             .values()
             .stream()
             .filter(node -> nodeIds.contains(node.getId()))
-            .filter(node -> node.getRoles().contains(DiscoveryNodeRole.MASTER_ROLE))
+            .filter(node -> node.canContainData() == false && node.isMasterNode())
             .map(DiscoveryNode::getId)
             .collect(Collectors.toSet());
     }
@@ -388,11 +424,7 @@ static Set<String> getNodesWithNonDataNonMasterRoles(Set<String> nodeIds, Cluste
             .values()
             .stream()
             .filter(node -> nodeIds.contains(node.getId()))
-            .filter(
-                node -> node.getRoles()
-                    .stream()
-                    .anyMatch(role -> (role.equals(DiscoveryNodeRole.MASTER_ROLE) || role.canContainData()) == false)
-            )
+            .filter(node -> node.canContainData() == false && node.isMasterNode() == false)
             .map(DiscoveryNode::getId)
             .collect(Collectors.toSet());
     }
@@ -419,7 +451,7 @@ static Set<String> getNodeIdsForIndices(Set<String> indices, ClusterState cluste
 
     /**
      * This method logs if any nodes in the cluster state do not have health info results reported. This is logged at debug level and is
-     * not ordinarly important, but could be useful in tracking down problems where nodes have stopped reporting health node information.
+     * not ordinary important, but could be useful in tracking down problems where nodes have stopped reporting health node information.
      * @param diskHealthInfoMap A map of nodeId to DiskHealthInfo
      */
     private void logMissingHealthInfoData(Map<String, DiskHealthInfo> diskHealthInfoMap, ClusterState clusterState) {
@@ -430,7 +462,7 @@ private void logMissingHealthInfoData(Map<String, DiskHealthInfo> diskHealthInfo
             if (nodeIdsInHealthInfo.containsAll(nodeIdsInClusterState) == false) {
                 String nodesWithMissingData = nodesInClusterState.stream()
                     .filter(node -> nodeIdsInHealthInfo.contains(node.getId()) == false)
-                    .map(node -> String.format(Locale.ROOT, "{%s / %s}", node.getId(), node.getName()))
+                    .map(HealthIndicatorDisplayValues::getNodeName)
                     .collect(Collectors.joining(", "));
                 logger.debug("The following nodes are in the cluster state but not reporting health data: [{}]", nodesWithMissingData);
             }

diff --git a/server/src/main/java/org/elasticsearch/health/node/HealthIndicatorDisplayValues.java b/server/src/main/java/org/elasticsearch/health/node/HealthIndicatorDisplayValues.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.health.node;
+
+import org.elasticsearch.cluster.metadata.IndexMetadata;
+import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+
+import java.util.Comparator;
+import java.util.Locale;
+import java.util.Set;
+
+import static java.util.stream.Collectors.joining;
+
+public class HealthIndicatorDisplayValues {
+
+    public static String getNodeName(DiscoveryNode node) {
+        if (node.getName() != null) {
+            return String.format(Locale.ROOT, "[%s][%s]", node.getId(), node.getName());
+        }
+        return String.format(Locale.ROOT, "[%s]", node.getId());
+    }
+
+    public static String getTruncatedIndices(Set<String> indices, Metadata clusterMetadata) {
+        final int maxIndices = 10;
+        String truncatedIndicesString = indices.stream()
+            .sorted(indicesComparatorByPriorityAndName(clusterMetadata))
+            .limit(maxIndices)
+            .collect(joining(", "));
+        if (maxIndices < indices.size()) {
+            truncatedIndicesString = truncatedIndicesString + ", ...";
+        }
+        return truncatedIndicesString;
+    }
+
+    /**
+     * Sorts index names by their priority first, then alphabetically by name. If the priority cannot be determined for an index then
+     * a priority of -1 is used to sort it behind other index names.
+     * @param clusterMetadata Used to look up index priority.
+     * @return Comparator instance
+     */
+    public static Comparator<String> indicesComparatorByPriorityAndName(Metadata clusterMetadata) {
+        // We want to show indices with a numerically higher index.priority first (since lower priority ones might get truncated):
+        return Comparator.comparingInt((String indexName) -> {
+            IndexMetadata indexMetadata = clusterMetadata.index(indexName);
+            return indexMetadata == null ? -1 : indexMetadata.priority();
+        }).reversed().thenComparing(Comparator.naturalOrder());
+    }
+}