Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.13] Make Health API more resilient to multi-version clusters (#105789) #105903

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/105789.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 105789
summary: Make Health API more resilient to multi-version clusters
area: Health
type: bug
issues:
- 90183
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
---
"cluster health basic test":
- skip:
version: all
reason: "AwaitsFix https://github.com/elastic/elasticsearch/issues/90183"
# version: "- 8.3.99"
# reason: "health was only added in 8.2.0, and master_is_stable in 8.4.0"
version: "- 8.3.99"
reason: "health was only added in 8.2.0, and master_is_stable in 8.4.0"

- do:
health_report: { }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,21 @@
import org.elasticsearch.features.NodeFeature;

import java.util.Map;
import java.util.Set;

/**
 * Declares the {@link NodeFeature}s that the health subsystem publishes, so other components can
 * ask whether every node in the cluster understands a given part of the Health API.
 */
public class HealthFeatures implements FeatureSpecification {

    /** Historical feature, mapped to {@code Version.V_8_5_0} in {@link #getHistoricalFeatures()}. */
    public static final NodeFeature SUPPORTS_HEALTH = new NodeFeature("health.supports_health");

    /** Historical feature, mapped to {@code Version.V_8_8_0} in {@link #getHistoricalFeatures()}. */
    public static final NodeFeature SUPPORTS_SHARDS_CAPACITY_INDICATOR = new NodeFeature("health.shards_capacity_indicator");

    /** Regular (non-historical) feature returned by {@link #getFeatures()}. */
    public static final NodeFeature SUPPORTS_EXTENDED_REPOSITORY_INDICATOR = new NodeFeature("health.extended_repository_indicator");

    /**
     * @return the features this specification declares directly (not tied to a historical version)
     */
    @Override
    public Set<NodeFeature> getFeatures() {
        return Set.of(SUPPORTS_EXTENDED_REPOSITORY_INDICATOR);
    }

    /**
     * @return the features that are tied to the release version listed next to them
     */
    @Override
    public Map<NodeFeature, Version> getHistoricalFeatures() {
        // A single return: the superseded one-entry map that preceded this line made it unreachable.
        return Map.of(SUPPORTS_HEALTH, Version.V_8_5_0, SUPPORTS_SHARDS_CAPACITY_INDICATOR, Version.V_8_8_0);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -71,9 +73,11 @@ public class DiskHealthIndicatorService implements HealthIndicatorService {
private static final String IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID = "cluster_functionality_unavailable";

private final ClusterService clusterService;
private final FeatureService featureService;

/**
 * Creates the disk health indicator.
 *
 * @param clusterService source of the current cluster state
 * @param featureService used to check whether the whole cluster supports
 *                       {@code HealthFeatures.SUPPORTS_HEALTH} when no disk health info is available
 */
public DiskHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
    this.clusterService = clusterService;
    this.featureService = featureService;
}

@Override
Expand All @@ -83,8 +87,18 @@ public String name() {

@Override
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
ClusterState clusterState = clusterService.state();
Map<String, DiskHealthInfo> diskHealthInfoMap = healthInfo.diskInfoByNode();
if (diskHealthInfoMap == null || diskHealthInfoMap.isEmpty()) {
if (featureService.clusterHasFeature(clusterState, HealthFeatures.SUPPORTS_HEALTH) == false) {
return createIndicator(
HealthStatus.GREEN,
"No disk usage data available. The cluster currently has mixed versions (an upgrade may be in progress).",
HealthIndicatorDetails.EMPTY,
List.of(),
List.of()
);
}
/*
* If there is no disk health info, that either means that a new health node was just elected, or something is seriously
* wrong with health data collection on the health node. Either way, we immediately return UNKNOWN. If there are at least
Expand All @@ -98,7 +112,6 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
Collections.emptyList()
);
}
ClusterState clusterState = clusterService.state();
logNodesMissingHealthInfo(diskHealthInfoMap, clusterState);

DiskHealthAnalyzer diskHealthAnalyzer = new DiskHealthAnalyzer(diskHealthInfoMap, clusterState);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -90,9 +92,11 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
);

private final ClusterService clusterService;
private final FeatureService featureService;

/**
 * Creates the shards-capacity health indicator.
 *
 * @param clusterService source of the current cluster state
 * @param featureService used to check whether the whole cluster supports
 *                       {@code HealthFeatures.SUPPORTS_SHARDS_CAPACITY_INDICATOR} when shard-limit
 *                       metadata is missing
 */
public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
    this.clusterService = clusterService;
    this.featureService = featureService;
}

@Override
Expand All @@ -105,6 +109,15 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
var state = clusterService.state();
var healthMetadata = HealthMetadata.getFromClusterState(state);
if (healthMetadata == null || healthMetadata.getShardLimitsMetadata() == null) {
if (featureService.clusterHasFeature(state, HealthFeatures.SUPPORTS_SHARDS_CAPACITY_INDICATOR) == false) {
return createIndicator(
HealthStatus.GREEN,
"No shard limits configured yet. The cluster currently has mixed versions (an upgrade may be in progress).",
HealthIndicatorDetails.EMPTY,
List.of(),
List.of()
);
}
return unknownIndicator();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1189,9 +1189,9 @@ private Module loadDiagnosticServices(

var serverHealthIndicatorServices = Stream.of(
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
new RepositoryIntegrityHealthIndicatorService(clusterService),
new DiskHealthIndicatorService(clusterService),
new ShardsCapacityHealthIndicatorService(clusterService)
new RepositoryIntegrityHealthIndicatorService(clusterService, featureService),
new DiskHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService, featureService)
);
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
.flatMap(plugin -> plugin.getHealthIndicatorServices().stream());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import org.elasticsearch.cluster.metadata.RepositoriesMetadata;
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -59,6 +61,8 @@ public class RepositoryIntegrityHealthIndicatorService implements HealthIndicato
public static final String NO_REPOS_CONFIGURED = "No snapshot repositories configured.";
public static final String ALL_REPOS_HEALTHY = "All repositories are healthy.";
public static final String NO_REPO_HEALTH_INFO = "No repository health info.";
public static final String MIXED_VERSIONS =
"No repository health info. The cluster currently has mixed versions (an upgrade may be in progress).";

public static final List<HealthIndicatorImpact> IMPACTS = List.of(
new HealthIndicatorImpact(
Expand Down Expand Up @@ -95,9 +99,11 @@ public class RepositoryIntegrityHealthIndicatorService implements HealthIndicato
);

private final ClusterService clusterService;
private final FeatureService featureService;

/**
 * Creates the repository-integrity health indicator.
 *
 * @param clusterService source of the current cluster state
 * @param featureService used to check whether the whole cluster supports
 *                       {@code HealthFeatures.SUPPORTS_EXTENDED_REPOSITORY_INDICATOR} when no
 *                       per-node repository health is available
 */
public RepositoryIntegrityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
    this.clusterService = clusterService;
    this.featureService = featureService;
}

@Override
Expand Down Expand Up @@ -128,7 +134,7 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
/**
* Analyzer for the cluster's repositories health; aids in constructing a {@link HealthIndicatorResult}.
*/
static class RepositoryHealthAnalyzer {
class RepositoryHealthAnalyzer {
private final ClusterState clusterState;
private final int totalRepositories;
private final List<String> corruptedRepositories;
Expand All @@ -137,6 +143,7 @@ static class RepositoryHealthAnalyzer {
private final Set<String> invalidRepositories = new HashSet<>();
private final Set<String> nodesWithInvalidRepos = new HashSet<>();
private final HealthStatus healthStatus;
private boolean clusterHasFeature = true;

private RepositoryHealthAnalyzer(
ClusterState clusterState,
Expand Down Expand Up @@ -167,7 +174,15 @@ private RepositoryHealthAnalyzer(
|| invalidRepositories.isEmpty() == false) {
healthStatus = YELLOW;
} else if (repositoriesHealthByNode.isEmpty()) {
// No node reported repository health. Store the real answer to "does every node support the
// extended repository indicator?" so the flag means what its name says. getSymptom() reports
// MIXED_VERSIONS for a GREEN status only when this flag is false; storing the negated value
// made a mixed-version cluster report ALL_REPOS_HEALTHY and left MIXED_VERSIONS unreachable.
clusterHasFeature = featureService.clusterHasFeature(clusterState, HealthFeatures.SUPPORTS_EXTENDED_REPOSITORY_INDICATOR);
if (clusterHasFeature == false) {
    // Mixed-version cluster: missing data is expected, so do not alarm the user.
    healthStatus = GREEN;
} else {
    // Every node supports the indicator, yet none reported anything.
    healthStatus = UNKNOWN;
}
} else {
healthStatus = GREEN;
}
Expand All @@ -179,7 +194,7 @@ public HealthStatus getHealthStatus() {

public String getSymptom() {
if (healthStatus == GREEN) {
return ALL_REPOS_HEALTHY;
return clusterHasFeature ? ALL_REPOS_HEALTHY : MIXED_VERSIONS;
} else if (healthStatus == UNKNOWN) {
return NO_REPO_HEALTH_INFO;
}
Expand Down
Loading