Skip to content

Commit

Permalink
Health API explain query param (#86410)
Browse files Browse the repository at this point in the history
Each health indicator returned by the health API can include details. These details can sometimes be
expensive to compute or transfer. This change allows a user to specify whether the details are generated and
returned. By default, all details are now generated and returned (previously this was the case only if a component
was specified in the request). This behavior can be changed with the `explain` query parameter.
Closes #86215
  • Loading branch information
masseyke committed May 9, 2022
1 parent 699da84 commit 6d975a6
Show file tree
Hide file tree
Showing 16 changed files with 213 additions and 183 deletions.
4 changes: 4 additions & 0 deletions docs/reference/health/health.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ is controlled by the worst component status.
[[health-api-query-params]]
==== {api-query-parms-title}

`explain`::
(Optional, Boolean) If `true`, the response includes additional details that help explain the status of each non-green indicator.
Defaults to `true`.

include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=local]

include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=timeoutparms]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@
"timeout":{
"type":"time",
"description":"Explicit operation timeout"
},
"explain":{
"type":"boolean",
"description":"Include details on returned indicators",
"default":true
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
public void testGetHealthWhenMasterIsElected() throws Exception {
var client = client();

var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request()).get();
var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request(randomBoolean())).get();

assertThat(response.findComponent(CLUSTER_COORDINATION).findIndicator(NAME).status(), equalTo(GREEN));
}
Expand All @@ -66,7 +66,7 @@ public void testGetHealthWhenNoMaster() throws Exception {
ClusterState state = client.admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertTrue(state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID));

var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request()).get();
var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request(randomBoolean())).get();

assertThat(response.findComponent(CLUSTER_COORDINATION).findIndicator(NAME).status(), equalTo(RED));
});
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public void testRepositoryIntegrityHealthIndicator() throws IOException, Interru
}

private void assertSnapshotRepositoryHealth(String message, Client client, HealthStatus status) {
var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request()).actionGet();
var response = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request(randomBoolean())).actionGet();
assertThat(message, response.findComponent(SNAPSHOT).findIndicator(NAME).status(), equalTo(status));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean includeDetails) {
public HealthIndicatorResult calculate(boolean explain) {

DiscoveryNode coordinatingNode = clusterService.localNode();
ClusterState clusterState = clusterService.state();
Expand All @@ -55,7 +55,7 @@ public HealthIndicatorResult calculate(boolean includeDetails) {
HealthStatus instanceHasMasterStatus = masterNode == null ? HealthStatus.RED : HealthStatus.GREEN;
String instanceHasMasterSummary = masterNode == null ? INSTANCE_HAS_MASTER_RED_SUMMARY : INSTANCE_HAS_MASTER_GREEN_SUMMARY;

return createIndicator(instanceHasMasterStatus, instanceHasMasterSummary, includeDetails ? (builder, params) -> {
return createIndicator(instanceHasMasterStatus, instanceHasMasterSummary, explain ? (builder, params) -> {
builder.startObject();
builder.object("coordinating_node", xContentBuilder -> {
builder.field("node_id", coordinatingNode.getId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,26 +111,26 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean includeDetails) {
public HealthIndicatorResult calculate(boolean explain) {
var state = clusterService.state();
var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
var status = new ShardAllocationStatus(state.getMetadata());

for (IndexRoutingTable indexShardRouting : state.routingTable()) {
for (int i = 0; i < indexShardRouting.size(); i++) {
IndexShardRoutingTable shardRouting = indexShardRouting.shard(i);
status.addPrimary(shardRouting.primaryShard(), state, shutdown, includeDetails);
status.addPrimary(shardRouting.primaryShard(), state, shutdown, explain);
for (ShardRouting replicaShard : shardRouting.replicaShards()) {
status.addReplica(replicaShard, state, shutdown, includeDetails);
status.addReplica(replicaShard, state, shutdown, explain);
}
}
}
return createIndicator(
status.getStatus(),
status.getSummary(),
status.getDetails(includeDetails),
status.getDetails(explain),
status.getImpacts(),
status.getUserActions(includeDetails)
status.getUserActions(explain)
);
}

Expand Down Expand Up @@ -307,7 +307,7 @@ private class ShardAllocationCounts {
private final Set<String> indicesWithUnavailableShards = new HashSet<>();
private final Map<UserAction.Definition, Set<String>> userActions = new HashMap<>();

public void increment(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean includeDetails) {
public void increment(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean explain) {
boolean isNew = isUnassignedDueToNewInitialization(routing);
boolean isRestarting = isUnassignedDueToTimelyRestart(routing, shutdowns);
available &= routing.active() || isRestarting || isNew;
Expand All @@ -323,7 +323,7 @@ public void increment(ShardRouting routing, ClusterState state, NodesShutdownMet
unassigned_restarting++;
} else {
unassigned++;
if (includeDetails) {
if (explain) {
diagnoseUnassignedShardRouting(routing, state).forEach(
definition -> addUserAction(definition, routing.getIndexName())
);
Expand Down Expand Up @@ -677,12 +677,12 @@ private class ShardAllocationStatus {
this.clusterMetadata = clusterMetadata;
}

public void addPrimary(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean includeDetails) {
primaries.increment(routing, state, shutdowns, includeDetails);
public void addPrimary(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean explain) {
primaries.increment(routing, state, shutdowns, explain);
}

public void addReplica(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean includeDetails) {
replicas.increment(routing, state, shutdowns, includeDetails);
public void addReplica(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean explain) {
replicas.increment(routing, state, shutdowns, explain);
}

public HealthStatus getStatus() {
Expand Down Expand Up @@ -725,8 +725,8 @@ private static Stream<String> createMessage(int count, String singular, String p
};
}

public HealthIndicatorDetails getDetails(boolean includeDetails) {
if (includeDetails) {
public HealthIndicatorDetails getDetails(boolean explain) {
if (explain) {
return new SimpleHealthIndicatorDetails(
Map.of(
"unassigned_primaries",
Expand Down Expand Up @@ -788,11 +788,11 @@ public List<HealthIndicatorImpact> getImpacts() {

/**
* Summarizes the user actions that are needed to solve unassigned primary and replica shards.
* @param includeDetails true if user actions should be generated, false if they should be omitted.
* @return A summary of user actions. Alternatively, an empty list if none were found or includeDetails is false.
* @param explain true if user actions should be generated, false if they should be omitted.
* @return A summary of user actions. Alternatively, an empty list if none were found or explain is false.
*/
public List<UserAction> getUserActions(boolean includeDetails) {
if (includeDetails) {
public List<UserAction> getUserActions(boolean explain) {
if (explain) {
Map<UserAction.Definition, Set<String>> actionsToAffectedIndices = new HashMap<>(primaries.userActions);
replicas.userActions.forEach((actionDefinition, indicesWithReplicasUnassigned) -> {
Set<String> indicesWithPrimariesUnassigned = actionsToAffectedIndices.get(actionDefinition);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,22 +124,22 @@ public String toString() {
}

public static class Request extends ActionRequest {
private final boolean computeDetails;
private final String componentName;
private final String indicatorName;
private final boolean explain;

public Request() {
public Request(boolean explain) {
// We never compute details if no component name is given because of the runtime cost:
this.computeDetails = false;
this.componentName = null;
this.indicatorName = null;
this.explain = explain;
}

public Request(String componentName, String indicatorName) {
public Request(String componentName, String indicatorName, boolean explain) {
assert componentName != null;
computeDetails = true;
this.componentName = componentName;
this.indicatorName = indicatorName;
this.explain = explain;
}

@Override
Expand Down Expand Up @@ -170,7 +170,7 @@ protected void doExecute(Task task, Request request, ActionListener<Response> li
listener.onResponse(
new Response(
clusterService.getClusterName(),
healthService.getHealth(request.componentName, request.indicatorName, request.computeDetails),
healthService.getHealth(request.componentName, request.indicatorName, request.explain),
request.componentName == null
)
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public interface HealthIndicatorService {

String component();

HealthIndicatorResult calculate(boolean calculateDetails);
HealthIndicatorResult calculate(boolean explain);

/**
* This method creates a HealthIndicatorResult with the given information. Note that it sorts the impacts by severity (the lower the
Expand Down
12 changes: 6 additions & 6 deletions server/src/main/java/org/elasticsearch/health/HealthService.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ public HealthService(
* the given indicatorName.
* @param componentName If not null, only the component with this name is returned
* @param indicatorName If not null, the returned component will only have this indicator
* @param computeDetails Whether to compute the details portion of the component results
* @param explain Whether to compute the details portion of the component results
* @return A list of all HealthComponentResults if componentName is null, or one HealthComponentResult if componentName is not null
* @throws ResourceNotFoundException if a component name is given and the component or indicator are not found
*/
public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nullable String indicatorName, boolean computeDetails) {
public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nullable String indicatorName, boolean explain) {
final boolean shouldDrillDownToIndicatorLevel = indicatorName != null;
final boolean showRolledUpComponentStatus = shouldDrillDownToIndicatorLevel == false;

Expand All @@ -99,10 +99,10 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
List<HealthIndicatorResult> preflightResults;
if (clusterStateRecovered) {
// Determine if cluster is stable enough to calculate health before running other indicators
preflightResults = preflightHealthIndicatorServices.stream().map(service -> service.calculate(computeDetails)).toList();
preflightResults = preflightHealthIndicatorServices.stream().map(service -> service.calculate(explain)).toList();
} else {
// Mark preflight indicators as UNKNOWN
HealthIndicatorDetails details = computeDetails ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails.EMPTY;
HealthIndicatorDetails details = explain ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails.EMPTY;
preflightResults = preflightHealthIndicatorServices.stream()
.map(service -> generateUnknownResult(service, UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED, details))
.toList();
Expand All @@ -120,11 +120,11 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
Stream<HealthIndicatorResult> filteredIndicatorResults;
if (clusterStateRecovered && clusterHealthIsObtainable) {
// Calculate remaining indicators
filteredIndicatorResults = filteredIndicators.map(service -> service.calculate(computeDetails));
filteredIndicatorResults = filteredIndicators.map(service -> service.calculate(explain));
} else {
// Mark remaining indicators as UNKNOWN
String unknownSummary = clusterStateRecovered ? UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED : UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED;
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, clusterStateRecovered, computeDetails);
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, clusterStateRecovered, explain);
filteredIndicatorResults = filteredIndicators.map(service -> generateUnknownResult(service, unknownSummary, unknownDetails));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

public class RestGetHealthAction extends BaseRestHandler {

private static final String EXPLAIN_PARAM = "explain";

@Override
public String getName() {
// TODO: Existing - "cluster_health_action", "cat_health_action"
Expand All @@ -39,9 +41,10 @@ public List<Route> routes() {
protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException {
String componentName = request.param("component");
String indicatorName = request.param("indicator");
boolean explain = request.paramAsBoolean(EXPLAIN_PARAM, true);
GetHealthAction.Request getHealthRequest = componentName == null
? new GetHealthAction.Request()
: new GetHealthAction.Request(componentName, indicatorName);
? new GetHealthAction.Request(explain)
: new GetHealthAction.Request(componentName, indicatorName, explain);
return channel -> client.execute(GetHealthAction.INSTANCE, getHealthRequest, new RestToXContentListener<>(channel));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean includeDetails) {
public HealthIndicatorResult calculate(boolean explain) {
var snapshotMetadata = clusterService.state().metadata().custom(RepositoriesMetadata.TYPE, RepositoriesMetadata.EMPTY);

if (snapshotMetadata.repositories().isEmpty()) {
Expand All @@ -82,9 +82,7 @@ public HealthIndicatorResult calculate(boolean includeDetails) {
return createIndicator(
GREEN,
"No corrupted repositories.",
includeDetails
? new SimpleHealthIndicatorDetails(Map.of("total_repositories", totalRepositories))
: HealthIndicatorDetails.EMPTY,
explain ? new SimpleHealthIndicatorDetails(Map.of("total_repositories", totalRepositories)) : HealthIndicatorDetails.EMPTY,
Collections.emptyList(),
Collections.emptyList()
);
Expand All @@ -93,7 +91,7 @@ public HealthIndicatorResult calculate(boolean includeDetails) {
return createIndicator(
RED,
createCorruptedRepositorySummary(corrupted),
includeDetails
explain
? new SimpleHealthIndicatorDetails(
Map.of(
"total_repositories",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean calculateDetails) {
public HealthIndicatorResult calculate(boolean explain) {
return null;
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean calculateDetails) {
public HealthIndicatorResult calculate(boolean explain) {
return result;
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,37 +51,37 @@ public String component() {
}

@Override
public HealthIndicatorResult calculate(boolean includeDetails) {
public HealthIndicatorResult calculate(boolean explain) {
var ilmMetadata = clusterService.state().metadata().custom(IndexLifecycleMetadata.TYPE, IndexLifecycleMetadata.EMPTY);
if (ilmMetadata.getPolicyMetadatas().isEmpty()) {
return createIndicator(
GREEN,
"No policies configured",
createDetails(includeDetails, ilmMetadata),
createDetails(explain, ilmMetadata),
Collections.emptyList(),
Collections.emptyList()
);
} else if (ilmMetadata.getOperationMode() != OperationMode.RUNNING) {
return createIndicator(
YELLOW,
"ILM is not running",
createDetails(includeDetails, ilmMetadata),
createDetails(explain, ilmMetadata),
Collections.emptyList(),
Collections.emptyList()
);
} else {
return createIndicator(
GREEN,
"ILM is running",
createDetails(includeDetails, ilmMetadata),
createDetails(explain, ilmMetadata),
Collections.emptyList(),
Collections.emptyList()
);
}
}

private static HealthIndicatorDetails createDetails(boolean includeDetails, IndexLifecycleMetadata metadata) {
if (includeDetails) {
private static HealthIndicatorDetails createDetails(boolean explain, IndexLifecycleMetadata metadata) {
if (explain) {
return new SimpleHealthIndicatorDetails(
Map.of("ilm_status", metadata.getOperationMode(), "policies", metadata.getPolicies().size())
);
Expand Down

0 comments on commit 6d975a6

Please sign in to comment.