Skip to content

Commit

Permalink
[HealthAPI] Diagnosis: report typed affected resources (#90653)
Browse files Browse the repository at this point in the history
The health API reports the affected resources in case of an unhealthy
deployment. Until now all indicators reported one type of resource per
diagnosis (index, ILM policy, snapshot repository)

With the introduction of the disk indicator we now have an indicator
that reports multiple types of resources under the same diagnosis (ie.
nodes and indices).

This changes the structure of the `affected_resources` field to
accommodate multiple types of resources:
```
"affected_resources": {
  "nodes": [
    {
      "id": "e1af6F5rTcmgpExkdOMzCg",
      "name": "hot"
    },
    {
      "id": "u_wBVl4ZRne4uZq_ziLsuw",
      "name": "warm"
    }
  ],
  "indices": [
    ".geoip_databases",
    "test_index"
  ]
}
```
  • Loading branch information
andreidan committed Oct 5, 2022
1 parent 77c9fac commit 5d97f0e
Show file tree
Hide file tree
Showing 16 changed files with 315 additions and 96 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
---
"Diagnosis":
- skip:
version: "- 8.3.99"
reason: "diagnosis was only added in 8.4.0"
version: "- 8.5.99"
reason: "diagnosis was redefined in 8.6.0"

- do:
indices.create:
Expand All @@ -24,4 +24,4 @@
- length: { indicators.shards_availability.diagnosis: 1 }
- is_true: indicators.shards_availability.diagnosis.0.affected_resources
- length: { indicators.shards_availability.diagnosis.0.affected_resources: 1 }
- match: { indicators.shards_availability.diagnosis.0.affected_resources.0: "red_index" }
- match: { indicators.shards_availability.diagnosis.0.affected_resources.indices.0: "red_index" }
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
Expand All @@ -40,6 +41,8 @@ public class DiscoveryNode implements Writeable, ToXContentFragment {

static final String COORDINATING_ONLY = "coordinating_only";
public static final Version EXTERNAL_ID_VERSION = Version.V_8_3_0;
public static final Comparator<DiscoveryNode> DISCOVERY_NODE_COMPARATOR = Comparator.comparing(DiscoveryNode::getName)
.thenComparing(DiscoveryNode::getId);

public static boolean hasRole(final Settings settings, final DiscoveryNodeRole role) {
// this method can be called before the o.e.n.NodeRoleSettings.NODE_ROLES_SETTING is initialized
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING;
import static org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.CLUSTER_TOTAL_SHARDS_PER_NODE_SETTING;
import static org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE_SETTING;
import static org.elasticsearch.health.Diagnosis.Resource.Type.INDEX;
import static org.elasticsearch.health.HealthStatus.GREEN;
import static org.elasticsearch.health.HealthStatus.RED;
import static org.elasticsearch.health.HealthStatus.YELLOW;
Expand Down Expand Up @@ -126,7 +127,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
status.getSummary(),
status.getDetails(explain),
status.getImpacts(),
status.getUserActions(explain)
status.getDiagnosis(explain)
);
}

Expand Down Expand Up @@ -890,11 +891,11 @@ public List<HealthIndicatorImpact> getImpacts() {
}

/**
* Summarizes the user actions that are needed to solve unassigned primary and replica shards.
* Returns the diagnosis for unassigned primary and replica shards.
* @param explain true if user actions should be generated, false if they should be omitted.
* @return A summary of user actions. Alternatively, an empty list if none were found or explain is false.
*/
public List<Diagnosis> getUserActions(boolean explain) {
public List<Diagnosis> getDiagnosis(boolean explain) {
if (explain) {
Map<Diagnosis.Definition, Set<String>> actionsToAffectedIndices = new HashMap<>(primaries.userActions);
replicas.userActions.forEach((actionDefinition, indicesWithReplicasUnassigned) -> {
Expand All @@ -913,10 +914,15 @@ public List<Diagnosis> getUserActions(boolean explain) {
.map(
e -> new Diagnosis(
e.getKey(),
e.getValue()
.stream()
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
.collect(Collectors.toList())
List.of(
new Diagnosis.Resource(
INDEX,
e.getValue()
.stream()
.sorted(indicesComparatorByPriorityAndName(clusterMetadata))
.collect(Collectors.toList())
)
)
)
)
.collect(Collectors.toList());
Expand Down
107 changes: 105 additions & 2 deletions server/src/main/java/org/elasticsearch/health/Diagnosis.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@

package org.elasticsearch.health;

import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Objects;

import static org.elasticsearch.health.HealthService.HEALTH_API_ID_PREFIX;

Expand All @@ -23,7 +27,102 @@
* @param definition The definition of the diagnosis (e.g. message, helpURL)
* @param affectedResources Optional list of "things" that are affected by this condition (e.g. shards, indices, or policies).
*/
public record Diagnosis(Definition definition, @Nullable List<String> affectedResources) implements ToXContentObject {
public record Diagnosis(Definition definition, @Nullable List<Resource> affectedResources) implements ToXContentObject {

/**
* Represents a type of affected resource, together with the resources/abstractions that
* are affected.
*/
public static class Resource implements ToXContentFragment {

public static final String ID_FIELD = "id";
public static final String NAME_FIELD = "name";

public enum Type {
INDEX("indices"),
NODE("nodes"),
SLM_POLICY("slm_policy"),
SNAPSHOT_REPOSITORY("snapshot_repository");

private final String displayValue;

Type(String displayValue) {
this.displayValue = displayValue;
}
}

private final Type type;

@Nullable
private Collection<String> values;
@Nullable
private Collection<DiscoveryNode> nodes;

public Resource(Type type, Collection<String> values) {
if (type == Type.NODE) {
throw new IllegalArgumentException("Nodes should be modelled using the dedicated constructor");
}

this.type = type;
this.values = values;
}

public Resource(Collection<DiscoveryNode> nodes) {
this.type = Type.NODE;
this.nodes = nodes;
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
if (nodes != null) {
// we report both node ids and names so we need a bit of structure
builder.startArray(type.displayValue);
for (DiscoveryNode node : nodes) {
builder.startObject();
builder.field(ID_FIELD, node.getId());
if (node.getName() != null) {
builder.field(NAME_FIELD, node.getName());
}
builder.endObject();
}
builder.endArray();
} else {
builder.field(type.displayValue, values);
}
return builder;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Resource resource = (Resource) o;
return type == resource.type && Objects.equals(values, resource.values) && Objects.equals(nodes, resource.nodes);
}

@Override
public int hashCode() {
return Objects.hash(type, values, nodes);
}

public Type getType() {
return type;
}

@Nullable
public Collection<String> getValues() {
return values;
}

@Nullable
public Collection<DiscoveryNode> getNodes() {
return nodes;
}
}

/**
* Details a diagnosis - cause and a potential action that a user could take to clear an issue identified by a {@link HealthService}.
Expand All @@ -44,7 +143,11 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field("action", definition.action);

if (affectedResources != null && affectedResources.size() > 0) {
builder.field("affected_resources", affectedResources);
builder.startObject("affected_resources");
for (Resource affectedResource : affectedResources) {
affectedResource.toXContent(builder, params);
}
builder.endObject();
}

builder.field("help_url", definition.helpURL);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.elasticsearch.cluster.node.DiscoveryNode.DISCOVERY_NODE_COMPARATOR;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.are;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getSortedUniqueValuesString;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indices;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indicesComparatorByPriorityAndName;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularNoun;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularVerb;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.these;
Expand Down Expand Up @@ -134,11 +136,11 @@ static class DiskHealthAnalyzer {

private final ClusterState clusterState;
private final Set<String> blockedIndices;
private final Set<DiscoveryNode> dataNodes = new HashSet<>();
private final List<DiscoveryNode> dataNodes = new ArrayList<>();
// In this context a master node, is a master node that cannot contain data.
private final Map<HealthStatus, Set<DiscoveryNode>> masterNodes = new HashMap<>();
private final Map<HealthStatus, List<DiscoveryNode>> masterNodes = new HashMap<>();
// In this context "other" nodes are nodes that cannot contain data and are not masters.
private final Map<HealthStatus, Set<DiscoveryNode>> otherNodes = new HashMap<>();
private final Map<HealthStatus, List<DiscoveryNode>> otherNodes = new HashMap<>();
private final Set<DiscoveryNodeRole> affectedRoles = new HashSet<>();
private final Set<String> indicesAtRisk;
private final HealthStatus healthStatus;
Expand Down Expand Up @@ -168,11 +170,18 @@ static class DiskHealthAnalyzer {
if (node.canContainData()) {
dataNodes.add(node);
} else if (node.isMasterNode()) {
masterNodes.computeIfAbsent(healthStatus, ignored -> new HashSet<>()).add(node);
masterNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
} else {
otherNodes.computeIfAbsent(healthStatus, ignored -> new HashSet<>()).add(node);
otherNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
}
}
dataNodes.sort(DISCOVERY_NODE_COMPARATOR);
for (List<DiscoveryNode> masterNodes : masterNodes.values()) {
masterNodes.sort(DISCOVERY_NODE_COMPARATOR);
}
for (List<DiscoveryNode> nodes : otherNodes.values()) {
nodes.sort(DISCOVERY_NODE_COMPARATOR);
}
indicesAtRisk = getIndicesForNodes(dataNodes, clusterState);
healthStatus = mostSevereStatusSoFar;
details = createDetails(diskHealthByNode, blockedIndices);
Expand Down Expand Up @@ -317,6 +326,20 @@ private List<Diagnosis> getDiagnoses() {
List<Diagnosis> diagnosisList = new ArrayList<>();
if (hasBlockedIndices() || hasUnhealthyDataNodes()) {
Set<String> affectedIndices = Sets.union(blockedIndices, indicesAtRisk);
List<Diagnosis.Resource> affectedResources = new ArrayList<>();
if (dataNodes.size() > 0) {
Diagnosis.Resource nodeResources = new Diagnosis.Resource(dataNodes);
affectedResources.add(nodeResources);
}
if (affectedIndices.size() > 0) {
Diagnosis.Resource indexResources = new Diagnosis.Resource(
Diagnosis.Resource.Type.INDEX,
affectedIndices.stream()
.sorted(indicesComparatorByPriorityAndName(clusterState.metadata()))
.collect(Collectors.toList())
);
affectedResources.add(indexResources);
}
diagnosisList.add(
new Diagnosis(
new Diagnosis.Definition(
Expand All @@ -336,7 +359,7 @@ private List<Diagnosis> getDiagnoses() {
+ "this. If you have already taken action please wait for the rebalancing to complete.",
"https://ela.st/fix-data-disk"
),
dataNodes.stream().map(DiscoveryNode::getId).sorted().toList()
affectedResources
)
);
}
Expand Down Expand Up @@ -397,7 +420,7 @@ private boolean hasBlockedIndices() {
}

// Non-private for unit testing
static Set<String> getIndicesForNodes(Set<DiscoveryNode> nodes, ClusterState clusterState) {
static Set<String> getIndicesForNodes(List<DiscoveryNode> nodes, ClusterState clusterState) {
RoutingNodes routingNodes = clusterState.getRoutingNodes();
return nodes.stream()
.map(node -> routingNodes.node(node.getId()))
Expand All @@ -416,11 +439,11 @@ private Diagnosis createNonDataNodeDiagnosis(HealthStatus healthStatus, Collecti
"Please add capacity to the current nodes, or replace them with ones with higher capacity.",
isMaster ? "https://ela.st/fix-master-disk" : "https://ela.st/fix-disk-space"
),
nodes.stream().map(DiscoveryNode::getId).sorted().toList()
List.of(new Diagnosis.Resource(nodes))
);
}

private int getUnhealthyNodeSize(Map<HealthStatus, Set<DiscoveryNode>> nodes) {
private int getUnhealthyNodeSize(Map<HealthStatus, List<DiscoveryNode>> nodes) {
return (nodes.containsKey(HealthStatus.RED) ? nodes.get(HealthStatus.RED).size() : 0) + (nodes.containsKey(HealthStatus.YELLOW)
? nodes.get(HealthStatus.YELLOW).size()
: 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.Diagnosis.Resource.Type;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -132,7 +133,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
)
: HealthIndicatorDetails.EMPTY,
impacts,
List.of(new Diagnosis(CORRUPTED_REPOSITORY, corrupted))
List.of(new Diagnosis(CORRUPTED_REPOSITORY, List.of(new Diagnosis.Resource(Type.SNAPSHOT_REPOSITORY, corrupted))))
);
}

Expand Down

0 comments on commit 5d97f0e

Please sign in to comment.