Skip to content

Commit

Permalink
Fix disk indicator symptoms, impacts and diagnosis (#90262)
Browse files Browse the repository at this point in the history
This PR fixes the following:

**Non-data, non-master calculation** _Bug_: A non-data, non-master node
is a node that has at least one role that is not master or data node.
_Fix_: A non-data, non-master node is a node that does not contains data
and it is not a master.

**Dedicated master calculation** _Bug_: A dedicated master node in the
context of the disk health indicator is a node that has at least the
master role. _Fix_: A dedicated master node in the context of the disk
health indicator is a node that has the master role and cannot contain
data.

**Impact implementation** _Bug_: We list at most 3 impacts, one that is
based on blocked indices or indices on unhealthy data nods, one for
master and one for the rest. The current implementation wasn't covering
this in all cases, for example, if there was a data node and no
dedicated master node it wouldn't display the master node impact. _Fix_:
Base the calculation of master and other role impacts, on the roles of
the affected nodes.

**Diagnosis implementation** _Bug_: The code was correct but the tests
needed adjusting because they expected 1 extra diagnosis since a master
node was identified as both a dedicated master node and a dedicated
non-master, non-data node.  _Fix_: Fixed the test to respect the
hierarchy, data node > dedicated master node > other. 

**Symptom implementation** _Bug_: There were two messages produced as
symptoms, one for the case where we have blocked indices but all the
other nodes are healthy, and for all the other cases one that would list
the roles that were running out of space. _Fix_: We change them to a
messages that might have two parts: - Always mention if there are
indices blocked and explain why, which can be one of the following two
cases, (i) cluster is recovering **(this includes the case of all nodes
appearing healthy because the cluster is moving shards away from the out
of space nodes),** (ii) there are unhealthy data nodes that cannot
recover without user intervention. - Always mention all the affected
roles as symptom. _Examples_ > 3 indices are not allowed to be updated
because the cluster was running out of disk space. The cluster is
recovering and you should be able to update them within a few minutes.
Furthermore 3 more nodes with roles: [master, ingest] are out of disk or
running low on disk space.

> 3 indices are not allowed to be updated because 2 nodes are out of
disk or running low on disk space. Furthermore 3 nodes with roles:
[master, ingest] are out of disk or running low on disk space.

> 5 nodes with roles: [data, master, ingest] are out of disk or running
low on disk space.

**Bonus** - We enrich the impact about "other" nodes with the roles. -
We enrich the affected indices with listing indicative 10 indices like
we do with the shards availability indicator.
  • Loading branch information
gmarouli committed Sep 23, 2022
1 parent e1c284a commit db2314f
Show file tree
Hide file tree
Showing 5 changed files with 315 additions and 251 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/90262.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 90262
summary: Fix disk indicator impacts and diagnosis
area: Health
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand All @@ -72,6 +71,8 @@
import static org.elasticsearch.health.HealthStatus.GREEN;
import static org.elasticsearch.health.HealthStatus.RED;
import static org.elasticsearch.health.HealthStatus.YELLOW;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indicesComparatorByPriorityAndName;

/**
* This indicator reports health for shards.
Expand Down Expand Up @@ -854,7 +855,7 @@ public List<HealthIndicatorImpact> getImpacts() {
"Cannot add data to %d %s [%s]. Searches might return incomplete results.",
primaries.indicesWithUnavailableShards.size(),
primaries.indicesWithUnavailableShards.size() == 1 ? "index" : "indices",
getTruncatedIndicesString(primaries.indicesWithUnavailableShards, clusterMetadata)
getTruncatedIndices(primaries.indicesWithUnavailableShards, clusterMetadata)
);
impacts.add(
new HealthIndicatorImpact(
Expand All @@ -879,7 +880,7 @@ public List<HealthIndicatorImpact> getImpacts() {
"Searches might be slower than usual. Fewer redundant copies of the data exist on %d %s [%s].",
indicesWithUnavailableReplicasOnly.size(),
indicesWithUnavailableReplicasOnly.size() == 1 ? "index" : "indices",
getTruncatedIndicesString(indicesWithUnavailableReplicasOnly, clusterMetadata)
getTruncatedIndices(indicesWithUnavailableReplicasOnly, clusterMetadata)
);
impacts.add(
new HealthIndicatorImpact(NAME, REPLICA_UNASSIGNED_IMPACT_ID, 2, impactDescription, List.of(ImpactArea.SEARCH))
Expand Down Expand Up @@ -912,7 +913,10 @@ public List<Diagnosis> getUserActions(boolean explain) {
.map(
e -> new Diagnosis(
e.getKey(),
e.getValue().stream().sorted(byPriorityThenByName(clusterMetadata)).collect(Collectors.toList())
e.getValue()
.stream()
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
.collect(Collectors.toList())
)
)
.collect(Collectors.toList());
Expand All @@ -922,30 +926,4 @@ public List<Diagnosis> getUserActions(boolean explain) {
}
}
}

private static String getTruncatedIndicesString(Set<String> indices, Metadata clusterMetadata) {
final int maxIndices = 10;
String truncatedIndicesString = indices.stream()
.sorted(byPriorityThenByName(clusterMetadata))
.limit(maxIndices)
.collect(joining(", "));
if (maxIndices < indices.size()) {
truncatedIndicesString = truncatedIndicesString + ", ...";
}
return truncatedIndicesString;
}

/**
* Sorts index names by their priority first, then alphabetically by name. If the priority cannot be determined for an index then
* a priority of -1 is used to sort it behind other index names.
* @param clusterMetadata Used to look up index priority.
* @return Comparator instance
*/
private static Comparator<String> byPriorityThenByName(Metadata clusterMetadata) {
// We want to show indices with a numerically higher index.priority first (since lower priority ones might get truncated):
return Comparator.comparingInt((String indexName) -> {
IndexMetadata indexMetadata = clusterMetadata.index(indexName);
return indexMetadata == null ? -1 : indexMetadata.priority();
}).reversed().thenComparing(Comparator.naturalOrder());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
Expand All @@ -36,6 +37,8 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;

public class DiskHealthIndicatorService implements HealthIndicatorService {
public static final String NAME = "disk";

Expand Down Expand Up @@ -123,24 +126,19 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Set<String> yellowNonDataNonMasterNodes = getNodesWithNonDataNonMasterRoles(nodesReportingYellow, clusterState);

String symptom = getSymptom(
clusterHasBlockedIndex,
indicesWithBlock,
nodesWithBlockedIndices,
redDataNodes,
yellowDataNodes,
nodesReportingRed,
nodesReportingYellow,
clusterState
);
List<HealthIndicatorImpact> impacts = getImpacts(
indicesWithBlock,
indicesOnRedNodes,
indicesOnYellowNodes,
nodesWithBlockedIndices,
redDataNodes,
yellowDataNodes,
redMasterNodes,
yellowMasterNodes,
redNonDataNonMasterNodes,
yellowNonDataNonMasterNodes
Sets.union(indicesOnYellowNodes, indicesOnRedNodes),
nodesReportingRed,
nodesReportingYellow,
clusterState
);
List<Diagnosis> diagnosisList = getDiagnoses(
indicesWithBlock,
Expand All @@ -160,36 +158,66 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
}

private String getSymptom(
boolean clusterHasBlockedIndex,
Set<String> blockedIndices,
Set<String> nodesWithBlockedIndices,
Set<String> redDataNodes,
Set<String> yellowDataNodes,
Set<String> nodesReportingRed,
Set<String> nodesReportingYellow,
ClusterState clusterState
) {
Set<String> allUnhealthyNodes = (Stream.concat(
Stream.concat(nodesWithBlockedIndices.stream(), nodesReportingRed.stream()),
nodesReportingYellow.stream()
)).collect(Collectors.toSet());
Set<String> allRolesOnUnhealthyNodes = getRolesOnNodes(allUnhealthyNodes, clusterState).stream()
.map(DiscoveryNodeRole::roleName)
.collect(Collectors.toSet());
final String symptom;
if (clusterHasBlockedIndex && allUnhealthyNodes.isEmpty()) {
// In this case the disk issue has been resolved but the index block has not been automatically removed yet:
Set<String> unhealthyDataNodes = Sets.union(redDataNodes, yellowDataNodes);
Set<String> allUnhealthyNodes = Sets.union(nodesReportingRed, nodesReportingYellow);

String symptom;
if (blockedIndices.isEmpty() == false) {
symptom = String.format(
Locale.ROOT,
"%d %s blocked and cannot be updated but 0 nodes are currently out of space.",
"%d %s not allowed to be updated because ",
blockedIndices.size(),
blockedIndices.size() == 1 ? "index is" : "indices are"
);
if (unhealthyDataNodes.isEmpty()) {
// In this case the disk issue has been resolved but the index block has not been removed yet or the
// cluster is still moving shards away from data nodes that are over the high watermark.
symptom +=
("the cluster was running out of disk space. The cluster is recovering and ingest capabilities should be restored "
+ "within a few minutes.");
} else {
symptom += String.format(
Locale.ROOT,
"%d %s out of disk or running low on disk space.",
unhealthyDataNodes.size(),
unhealthyDataNodes.size() == 1 ? "node is" : "nodes are"
);
}
if (unhealthyDataNodes.size() < allUnhealthyNodes.size()) {
Set<String> unhealthyNonDataNodes = Sets.difference(allUnhealthyNodes, unhealthyDataNodes);
String roles = getRolesOnNodes(unhealthyNonDataNodes, clusterState).stream()
.map(DiscoveryNodeRole::roleName)
.distinct()
.sorted()
.collect(Collectors.joining(", "));
symptom += String.format(
Locale.ROOT,
" Furthermore %d node%s with roles: [%s] %s out of disk or running low on disk space.",
unhealthyNonDataNodes.size(),
unhealthyNonDataNodes.size() == 1 ? "" : "s",
roles,
unhealthyNonDataNodes.size() == 1 ? "is" : "are"
);
}
} else {
String roles = getRolesOnNodes(allUnhealthyNodes, clusterState).stream()
.map(DiscoveryNodeRole::roleName)
.distinct()
.sorted()
.collect(Collectors.joining(", "));
symptom = String.format(
Locale.ROOT,
"%d node%s with roles: [%s] %s out of disk or running low on disk space.",
allUnhealthyNodes.size(),
allUnhealthyNodes.size() == 1 ? "" : "s",
allRolesOnUnhealthyNodes.stream().sorted().collect(Collectors.joining(", ")),
roles,
allUnhealthyNodes.size() == 1 ? "is" : "are"
);
}
Expand All @@ -198,42 +226,44 @@ private String getSymptom(

private List<HealthIndicatorImpact> getImpacts(
Set<String> indicesWithBlock,
Set<String> indicesOnRedNodes,
Set<String> indicesOnYellowNodes,
Set<String> nodesWithBlockedIndices,
Set<String> redDataNodes,
Set<String> yellowDataNodes,
Set<String> redMasterNodes,
Set<String> yellowMasterNodes,
Set<String> redNonDataNonMasterNodes,
Set<String> yellowNonDataNonMasterNodes
Set<String> indicesOnUnhealthyNodes,
Set<String> nodesReportingRed,
Set<String> nodesReportingYellow,
ClusterState clusterState
) {
List<HealthIndicatorImpact> impacts = new ArrayList<>();
if (indicesWithBlock.isEmpty() == false
|| indicesOnRedNodes.isEmpty() == false
|| nodesWithBlockedIndices.isEmpty() == false
|| redDataNodes.isEmpty() == false) {
if (indicesWithBlock.isEmpty() == false) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_INGEST_UNAVAILABLE_ID,
1,
"Cannot insert or update documents in the affected indices.",
String.format(
Locale.ROOT,
"Cannot insert or update documents in the affected indices [%s].",
getTruncatedIndices(indicesWithBlock, clusterState.getMetadata())
),
List.of(ImpactArea.INGEST)
)
);
} else if (indicesOnYellowNodes.isEmpty() == false || yellowDataNodes.isEmpty() == false) {
} else if (indicesOnUnhealthyNodes.isEmpty() == false) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_INGEST_AT_RISK_ID,
1,
"At risk of not being able to insert or update documents in the affected indices.",
String.format(
Locale.ROOT,
"The cluster is at risk of not being able to insert or update documents in the affected indices [%s].",
getTruncatedIndices(indicesOnUnhealthyNodes, clusterState.metadata())
),
List.of(ImpactArea.INGEST)
)
);
}
if (redMasterNodes.isEmpty() == false || yellowMasterNodes.isEmpty() == false) {
Set<String> unhealthyNodes = Stream.concat(nodesReportingRed.stream(), nodesReportingYellow.stream()).collect(Collectors.toSet());
Set<DiscoveryNodeRole> unhealthyRoles = getRolesOnNodes(unhealthyNodes, clusterState);
if (unhealthyRoles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
impacts.add(
new HealthIndicatorImpact(
NAME,
Expand All @@ -244,13 +274,19 @@ private List<HealthIndicatorImpact> getImpacts(
)
);
}
if (redNonDataNonMasterNodes.isEmpty() == false || yellowNonDataNonMasterNodes.isEmpty() == false) {
String unhealthyNonMasterNonDataRoles = unhealthyRoles.stream()
.filter(role -> role.canContainData() == false && role.equals(DiscoveryNodeRole.MASTER_ROLE) == false)
.map(DiscoveryNodeRole::roleName)
.distinct()
.sorted()
.collect(Collectors.joining(", "));
if (unhealthyNonMasterNonDataRoles.isBlank() == false) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID,
2,
"Some cluster functionality might be unavailable.",
String.format(Locale.ROOT, "The [%s] functionality might be impaired.", unhealthyNonMasterNonDataRoles),
List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
)
);
Expand Down Expand Up @@ -376,7 +412,7 @@ static Set<String> getNodesWithMasterRole(Set<String> nodeIds, ClusterState clus
.values()
.stream()
.filter(node -> nodeIds.contains(node.getId()))
.filter(node -> node.getRoles().contains(DiscoveryNodeRole.MASTER_ROLE))
.filter(node -> node.canContainData() == false && node.isMasterNode())
.map(DiscoveryNode::getId)
.collect(Collectors.toSet());
}
Expand All @@ -388,11 +424,7 @@ static Set<String> getNodesWithNonDataNonMasterRoles(Set<String> nodeIds, Cluste
.values()
.stream()
.filter(node -> nodeIds.contains(node.getId()))
.filter(
node -> node.getRoles()
.stream()
.anyMatch(role -> (role.equals(DiscoveryNodeRole.MASTER_ROLE) || role.canContainData()) == false)
)
.filter(node -> node.canContainData() == false && node.isMasterNode() == false)
.map(DiscoveryNode::getId)
.collect(Collectors.toSet());
}
Expand All @@ -419,7 +451,7 @@ static Set<String> getNodeIdsForIndices(Set<String> indices, ClusterState cluste

/**
* This method logs if any nodes in the cluster state do not have health info results reported. This is logged at debug level and is
* not ordinarly important, but could be useful in tracking down problems where nodes have stopped reporting health node information.
* not ordinary important, but could be useful in tracking down problems where nodes have stopped reporting health node information.
* @param diskHealthInfoMap A map of nodeId to DiskHealthInfo
*/
private void logMissingHealthInfoData(Map<String, DiskHealthInfo> diskHealthInfoMap, ClusterState clusterState) {
Expand All @@ -430,7 +462,7 @@ private void logMissingHealthInfoData(Map<String, DiskHealthInfo> diskHealthInfo
if (nodeIdsInHealthInfo.containsAll(nodeIdsInClusterState) == false) {
String nodesWithMissingData = nodesInClusterState.stream()
.filter(node -> nodeIdsInHealthInfo.contains(node.getId()) == false)
.map(node -> String.format(Locale.ROOT, "{%s / %s}", node.getId(), node.getName()))
.map(HealthIndicatorDisplayValues::getNodeName)
.collect(Collectors.joining(", "));
logger.debug("The following nodes are in the cluster state but not reporting health data: [{}]", nodesWithMissingData);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.health.node;

import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNode;

import java.util.Comparator;
import java.util.Locale;
import java.util.Set;

import static java.util.stream.Collectors.joining;

public class HealthIndicatorDisplayValues {

public static String getNodeName(DiscoveryNode node) {
if (node.getName() != null) {
return String.format(Locale.ROOT, "[%s][%s]", node.getId(), node.getName());
}
return String.format(Locale.ROOT, "[%s]", node.getId());
}

public static String getTruncatedIndices(Set<String> indices, Metadata clusterMetadata) {
final int maxIndices = 10;
String truncatedIndicesString = indices.stream()
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
.limit(maxIndices)
.collect(joining(", "));
if (maxIndices < indices.size()) {
truncatedIndicesString = truncatedIndicesString + ", ...";
}
return truncatedIndicesString;
}

/**
* Sorts index names by their priority first, then alphabetically by name. If the priority cannot be determined for an index then
* a priority of -1 is used to sort it behind other index names.
* @param clusterMetadata Used to look up index priority.
* @return Comparator instance
*/
public static Comparator<String> indicesComparatorByPriorityAndName(Metadata clusterMetadata) {
// We want to show indices with a numerically higher index.priority first (since lower priority ones might get truncated):
return Comparator.comparingInt((String indexName) -> {
IndexMetadata indexMetadata = clusterMetadata.index(indexName);
return indexMetadata == null ? -1 : indexMetadata.priority();
}).reversed().thenComparing(Comparator.naturalOrder());
}
}

0 comments on commit db2314f

Please sign in to comment.