Skip to content

Commit

Permalink
Add preflight checks to Health API to ensure health is obtainable (#8…
Browse files Browse the repository at this point in the history
…6404)

This PR introduces the concept of preflight health indicator services to the new health service. Preflight indicators are
structurally identical to regular indicators, but they are executed first when calculating health and conditionally block
downstream indicators from running when the cluster state is unstable or unknown.
  • Loading branch information
jbaiera committed May 5, 2022
1 parent 3ef46b0 commit 8c03df6
Show file tree
Hide file tree
Showing 4 changed files with 519 additions and 44 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/86404.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 86404
summary: Add preflight checks to Health API to ensure health is obtainable
area: Health
type: enhancement
issues: []
140 changes: 135 additions & 5 deletions server/src/main/java/org/elasticsearch/health/HealthService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,71 @@
package org.elasticsearch.health;

import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.gateway.GatewayService;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.util.function.Predicate.isEqual;
import static java.util.stream.Collectors.collectingAndThen;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toMap;

/**
* This service collects health indicators from all modules and plugins of elasticsearch
*/
public class HealthService {

// Visible for testing
// Summary placed on UNKNOWN results when the cluster state is recovered but the preflight results are not all GREEN.
static final String UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED = "Could not determine indicator state. Cluster state is not stable. Check "
+ "details for critical issues keeping this indicator from running.";
// Summary placed on UNKNOWN results (preflight and regular indicators alike) when the cluster state is not yet recovered.
static final String UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED =
"Could not determine indicator state. The current node handling the health request is not ready to assess the health of the "
+ "cluster. Try again later or execute the health API against a different node.";

/**
* Detail map key that contains the reasons a result was marked as UNKNOWN. The value stored under this key is itself a map:
* either the non-GREEN preflight indicator names mapped to their status, or {@link #CLUSTER_STATE_RECOVERED} mapped to false.
*/
private static final String REASON = "reasons";

// Key nested under REASON that records that the cluster state has not been recovered yet.
private static final String CLUSTER_STATE_RECOVERED = "cluster_state_recovered";
// Pre-built details ({"reasons": {"cluster_state_recovered": false}}) reused for every UNKNOWN result produced while the
// cluster state is not recovered and details were requested.
private static final SimpleHealthIndicatorDetails DETAILS_UNKNOWN_STATE_NOT_RECOVERED = new SimpleHealthIndicatorDetails(
Map.of(REASON, Map.of(CLUSTER_STATE_RECOVERED, false))
);

// Indicators evaluated first by getHealth; any non-GREEN result among them turns the regular indicators into UNKNOWN results.
private final List<HealthIndicatorService> preflightHealthIndicatorServices;
// Regular indicators; only calculated when the cluster state is recovered and all preflight results are GREEN.
private final List<HealthIndicatorService> healthIndicatorServices;
// Supplies the cluster state that getHealth inspects for the STATE_NOT_RECOVERED global block.
private final ClusterService clusterService;

/**
 * Creates a new HealthService.
 *
 * Accepts a list of regular indicator services and a list of preflight indicator services. Preflight indicators are run first and
 * represent serious cascading health problems. If any of these preflight indicators are not GREEN status, all remaining indicators are
 * likely to be degraded in some way or will not be able to calculate their state correctly. The remaining health indicators will return
 * UNKNOWN statuses in this case.
 *
 * @param preflightHealthIndicatorServices indicators that are run first and represent a serious cascading health problem.
 * @param healthIndicatorServices indicators that are run if the preflight indicators return GREEN results.
 * @param clusterService provides the current cluster state, which is checked for the STATE_NOT_RECOVERED global block before any
 *                       indicator is calculated.
 */
public HealthService(
    List<HealthIndicatorService> preflightHealthIndicatorServices,
    List<HealthIndicatorService> healthIndicatorServices,
    ClusterService clusterService
) {
    // The lists are stored as passed in (no defensive copy); callers are expected not to mutate them afterwards.
    this.preflightHealthIndicatorServices = preflightHealthIndicatorServices;
    this.healthIndicatorServices = healthIndicatorServices;
    this.clusterService = clusterService;
}

/**
Expand All @@ -47,11 +90,52 @@ public HealthService(List<HealthIndicatorService> healthIndicatorServices) {
public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nullable String indicatorName, boolean computeDetails) {
final boolean shouldDrillDownToIndicatorLevel = indicatorName != null;
final boolean showRolledUpComponentStatus = shouldDrillDownToIndicatorLevel == false;

// Is the cluster state recovered? If not, ALL indicators should return UNKNOWN
boolean clusterStateRecovered = clusterService.state()
.getBlocks()
.hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK) == false;

List<HealthIndicatorResult> preflightResults;
if (clusterStateRecovered) {
// Determine if cluster is stable enough to calculate health before running other indicators
preflightResults = preflightHealthIndicatorServices.stream().map(service -> service.calculate(computeDetails)).toList();
} else {
// Mark preflight indicators as UNKNOWN
HealthIndicatorDetails details = computeDetails ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails.EMPTY;
preflightResults = preflightHealthIndicatorServices.stream()
.map(service -> generateUnknownResult(service, UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED, details))
.toList();
}

// If any of these are not GREEN, then we cannot obtain health from other indicators
boolean clusterHealthIsObtainable = preflightResults.isEmpty()
|| preflightResults.stream().map(HealthIndicatorResult::status).allMatch(isEqual(HealthStatus.GREEN));

// Filter remaining indicators by component name and indicator name if present before calculating their results
Stream<HealthIndicatorService> filteredIndicators = healthIndicatorServices.stream()
.filter(service -> componentName == null || service.component().equals(componentName))
.filter(service -> indicatorName == null || service.name().equals(indicatorName));

Stream<HealthIndicatorResult> filteredIndicatorResults;
if (clusterStateRecovered && clusterHealthIsObtainable) {
// Calculate remaining indicators
filteredIndicatorResults = filteredIndicators.map(service -> service.calculate(computeDetails));
} else {
// Mark remaining indicators as UNKNOWN
String unknownSummary = clusterStateRecovered ? UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED : UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED;
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, clusterStateRecovered, computeDetails);
filteredIndicatorResults = filteredIndicators.map(service -> generateUnknownResult(service, unknownSummary, unknownDetails));
}

// Filter the cluster indicator results by component name and indicator name if present
Stream<HealthIndicatorResult> filteredPreflightResults = preflightResults.stream()
.filter(result -> componentName == null || result.component().equals(componentName))
.filter(result -> indicatorName == null || result.name().equals(indicatorName));

// Combine indicator results
List<HealthComponentResult> components = List.copyOf(
healthIndicatorServices.stream()
.filter(service -> componentName == null || service.component().equals(componentName))
.filter(service -> indicatorName == null || service.name().equals(indicatorName))
.map(service -> service.calculate(computeDetails))
Stream.concat(filteredPreflightResults, filteredIndicatorResults)
.collect(
groupingBy(
HealthIndicatorResult::component,
Expand All @@ -76,6 +160,52 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
return components;
}

/**
 * Builds the details attached to UNKNOWN indicator results when the regular indicators could not be calculated.
 *
 * @param preflightResults results of the preflight indicators that gate the regular indicators.
 * @param clusterStateRecovered whether the cluster state has been recovered; when false the fixed "not recovered" details are used
 *                              instead of inspecting the preflight results.
 * @param computeDetails whether details were requested at all.
 * @return an empty detail set if computeDetails is false; otherwise details whose "reasons" entry names either the missing cluster
 *         state recovery or every non-GREEN preflight indicator together with its status.
 */
private HealthIndicatorDetails healthUnknownReason(
    List<HealthIndicatorResult> preflightResults,
    boolean clusterStateRecovered,
    boolean computeDetails
) {
    assert clusterStateRecovered == false || preflightResults.isEmpty() == false
        : "Requires at least one non-GREEN preflight result or cluster state not recovered";

    // Nothing to explain when the caller did not ask for details
    if (computeDetails == false) {
        return HealthIndicatorDetails.EMPTY;
    }

    // Without a recovered cluster state the preflight results carry no extra information
    if (clusterStateRecovered == false) {
        return DETAILS_UNKNOWN_STATE_NOT_RECOVERED;
    }

    // Report each preflight indicator that is not GREEN, keyed by indicator name
    Map<String, String> nonGreenPreflightStatuses = preflightResults.stream()
        .filter(preflight -> HealthStatus.GREEN.equals(preflight.status()) == false)
        .collect(toMap(HealthIndicatorResult::name, preflight -> preflight.status().xContentValue()));
    assert nonGreenPreflightStatuses.isEmpty() == false : "Requires at least one non-GREEN preflight result";
    return new SimpleHealthIndicatorDetails(Map.of(REASON, nonGreenPreflightStatuses));
}

/**
* Generates an UNKNOWN result for an indicator by delegating to the indicator's own {@code createIndicator} method, so the
* result is built by the same service it describes.
* @param indicatorService the indicator to generate a result for
* @param summary the summary to include for the UNKNOWN result
* @param details the details to include on the result
* @return A result with the UNKNOWN status, the given summary and details, and two empty lists for the remaining arguments
*/
private HealthIndicatorResult generateUnknownResult(
HealthIndicatorService indicatorService,
String summary,
HealthIndicatorDetails details
) {
// NOTE(review): the two empty lists are presumably the result's impacts and user actions; confirm against createIndicator.
return indicatorService.createIndicator(HealthStatus.UNKNOWN, summary, details, Collections.emptyList(), Collections.emptyList());
}

// Non-private for testing purposes
static HealthComponentResult createComponentFromIndicators(List<HealthIndicatorResult> indicators, boolean showComponentSummary) {
assert indicators.size() > 0 : "Component should not be non empty";
Expand Down
11 changes: 9 additions & 2 deletions server/src/main/java/org/elasticsearch/node/Node.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.gateway.MetaStateService;
import org.elasticsearch.gateway.PersistedClusterStateService;
import org.elasticsearch.health.HealthIndicatorService;
import org.elasticsearch.health.HealthService;
import org.elasticsearch.http.HttpServerTransport;
import org.elasticsearch.index.IndexSettingProviders;
Expand Down Expand Up @@ -1039,16 +1040,22 @@ protected Node(
}

/**
 * Builds the HealthService from the server's built-in indicators plus any indicators contributed by HealthPlugin plugins.
 *
 * @param clusterService passed to the built-in indicators and to the HealthService itself
 * @param clusterModule supplies the allocation service needed by the shards availability indicator
 * @return a HealthService with the has-master indicator as its only preflight indicator
 */
private HealthService createHealthService(ClusterService clusterService, ClusterModule clusterModule) {
    // The has-master indicator is registered as a preflight check only; listing it among the regular
    // indicators as well would calculate and report it twice.
    List<HealthIndicatorService> preflightHealthIndicatorServices = Collections.singletonList(
        new InstanceHasMasterHealthIndicatorService(clusterService)
    );
    var serverHealthIndicatorServices = List.of(
        new RepositoryIntegrityHealthIndicatorService(clusterService),
        new ShardsAvailabilityHealthIndicatorService(clusterService, clusterModule.getAllocationService())
    );
    var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
        .stream()
        .flatMap(plugin -> plugin.getHealthIndicatorServices().stream())
        .toList();
    return new HealthService(
        preflightHealthIndicatorServices,
        concatLists(serverHealthIndicatorServices, pluginHealthIndicatorServices),
        clusterService
    );
}

private RecoveryPlannerService getRecoveryPlannerService(
Expand Down

0 comments on commit 8c03df6

Please sign in to comment.