Fix NPE when evaluating the disk health for non-data nodes (#92643) (#92648)

Non-data nodes are not part of the `RoutingTable`, so the expression
```
clusterState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0
```
would throw an NPE for dedicated non-data nodes.
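
For context, `RoutingNodes` only indexes nodes that can hold shard data, so `routingNodes.node(nodeId)` returns `null` for a dedicated master. A minimal null-safe sketch of the guard (assuming the Elasticsearch server classes are on the classpath; the committed fix in the diff below is the authoritative version):

```
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRoutingState;

// Sketch: node(nodeId) yields null for nodes absent from the routing table,
// e.g. dedicated master nodes, so guard the lookup before dereferencing it.
static boolean hasRelocatingShards(ClusterState clusterState, String nodeId) {
    RoutingNode routingNode = clusterState.getRoutingNodes().node(nodeId);
    return routingNode != null && routingNode.numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0;
}
```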

(cherry picked from commit 677766d)
Signed-off-by: Andrei Dan <andrei.dan@elastic.co>
andreidan committed Jan 3, 2023
1 parent 1de71c0 commit 169bb72
Showing 3 changed files with 71 additions and 5 deletions.
docs/changelog/92643.yaml (5 additions, 0 deletions)
```
@@ -0,0 +1,5 @@
+pr: 92643
+summary: Fix NPE when evaluating the disk health for non-data nodes
+area: Health
+type: bug
+issues: []
```
server/src/main/java/org/elasticsearch/health/node/LocalHealthMonitor.java
```
@@ -21,6 +21,7 @@
 import org.elasticsearch.cluster.ClusterStateListener;
 import org.elasticsearch.cluster.DiskUsage;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.ClusterSettings;
```
```
@@ -433,9 +434,18 @@ DiskHealthInfo getHealth(HealthMetadata healthMetadata, ClusterState clusterStat
             }
 
             long highThreshold = diskMetadata.getFreeBytesHighWatermark(totalBytes).getBytes();
-            if (usage.getFreeBytes() < highThreshold && hasRelocatingShards(clusterState, node.getId()) == false) {
-                logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
-                return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+            if (usage.getFreeBytes() < highThreshold) {
+                if (node.canContainData()) {
+                    // for data nodes only report YELLOW if shards can't move away from the node
+                    if (DiskCheck.hasRelocatingShards(clusterState, node) == false) {
+                        logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
+                        return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+                    }
+                } else {
+                    // for non-data nodes report YELLOW when the disk high watermark is breached
+                    logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
+                    return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+                }
             }
             return new DiskHealthInfo(HealthStatus.GREEN);
         }
```
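
Taken together, the two branches collapse into one condition: report YELLOW unless the node is a data node whose shards are already relocating away, because relocation can never relieve disk pressure on a node that holds no shards. A behavior-equivalent condensed sketch (not the committed code; types and names as in the hunk above):

```
// Condensed sketch of the new high-watermark check; assumes the same
// imports and nested DiskCheck class as LocalHealthMonitor above.
static DiskHealthInfo highWatermarkHealth(ClusterState clusterState, DiscoveryNode node, DiskUsage usage, long highThreshold) {
    if (usage.getFreeBytes() < highThreshold) {
        // relocation only helps nodes that actually hold shards
        boolean relocationMayHelp = node.canContainData() && DiskCheck.hasRelocatingShards(clusterState, node);
        if (relocationMayHelp == false) {
            return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
        }
    }
    return new DiskHealthInfo(HealthStatus.GREEN);
}
```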
```
@@ -461,8 +471,13 @@ private DiskUsage getDiskUsage() {
             return DiskUsage.findLeastAvailablePath(nodeStats);
         }
 
-        private boolean hasRelocatingShards(ClusterState clusterState, String nodeId) {
-            return clusterState.getRoutingNodes().node(nodeId).shardsWithState(ShardRoutingState.RELOCATING).isEmpty() == false;
+        static boolean hasRelocatingShards(ClusterState clusterState, DiscoveryNode node) {
+            RoutingNode routingNode = clusterState.getRoutingNodes().node(node.getId());
+            if (routingNode == null) {
+                // routing node will be null for non-data nodes
+                return false;
+            }
+            return routingNode.numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0;
         }
     }
 }
```
server/src/test/java/org/elasticsearch/health/node/LocalHealthMonitorTests.java
```
@@ -20,6 +20,7 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
```
```
@@ -43,7 +44,9 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 
+import static org.elasticsearch.action.support.replication.ClusterStateCreationUtils.state;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.nullValue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
```
```
@@ -306,6 +309,49 @@ public void testFrozenRedDiskStatus() {
         assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.RED, DiskHealthInfo.Cause.FROZEN_NODE_OVER_FLOOD_STAGE_THRESHOLD)));
     }
 
+    public void testYellowStatusForNonDataNode() {
+        DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
+            "master-node",
+            "master-node-1",
+            ESTestCase.buildNewFakeTransportAddress(),
+            Collections.emptyMap(),
+            Set.of(DiscoveryNodeRole.MASTER_ROLE),
+            Version.CURRENT
+        );
+        clusterState = ClusterStateCreationUtils.state(
+            dedicatedMasterNode,
+            dedicatedMasterNode,
+            node,
+            new DiscoveryNode[] { node, dedicatedMasterNode }
+        ).copyAndUpdate(b -> b.putCustom(HealthMetadata.TYPE, healthMetadata));
+
+        initializeIncreasedDiskSpaceUsage();
+        LocalHealthMonitor.DiskCheck diskMonitor = new LocalHealthMonitor.DiskCheck(nodeService);
+        DiskHealthInfo diskHealth = diskMonitor.getHealth(healthMetadata, clusterState);
+        assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD)));
+    }
+
+    public void testHasRelocatingShards() {
+        String indexName = "my-index";
+        final ClusterState state = state(indexName, true, ShardRoutingState.RELOCATING);
+        // local node coincides with the node hosting the (relocating) primary shard
+        DiscoveryNode localNode = state.nodes().getLocalNode();
+        assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(state, localNode), is(true));
+
+        DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
+            "master-node",
+            "master-node-1",
+            ESTestCase.buildNewFakeTransportAddress(),
+            Collections.emptyMap(),
+            Set.of(DiscoveryNodeRole.MASTER_ROLE),
+            Version.CURRENT
+        );
+        ClusterState newState = ClusterState.builder(state)
+            .nodes(new DiscoveryNodes.Builder(state.nodes()).add(dedicatedMasterNode))
+            .build();
+        assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(newState, dedicatedMasterNode), is(false));
+    }
+
     private void simulateDiskOutOfSpace() {
         when(
             nodeService.stats(
```
