Skip to content

Commit

Permalink
Add ids to health API diagnoses and impacts (#90072)
Browse files Browse the repository at this point in the history
This PR adds a hierarchical ID to each diagnosis and impact reported by the health API. 
Each ID starts with generic Elasticsearch information, followed by the indicator the entry 
belongs to, and finally the kind of resource it is, its identifier, and any additional 
information pertaining to that identifier (such as the tier it corresponds to).
  • Loading branch information
jbaiera committed Sep 20, 2022
1 parent 2566cd1 commit 3df7e38
Show file tree
Hide file tree
Showing 15 changed files with 182 additions and 27 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/90072.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 90072
summary: Add IDs to health API diagnoses and impacts
area: Health
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
public static final String GET_HELP_GUIDE = "https://ela.st/getting-help";
public static final Diagnosis CONTACT_SUPPORT_USER_ACTION = new Diagnosis(
new Diagnosis.Definition(
NAME,
"contact_support",
"The Elasticsearch cluster does not have a stable master node.",
"Get help at " + GET_HELP_GUIDE,
Expand All @@ -61,6 +62,11 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
private static final String CLUSTER_FORMATION = "cluster_formation";
private static final String CLUSTER_FORMATION_MESSAGE = "cluster_formation_message";

// Impact IDs: each is passed as the id argument of one HealthIndicatorImpact in UNSTABLE_MASTER_IMPACTS.
// Id of the severity-1 impact on ImpactArea.INGEST.
public static final String INGEST_DISABLED_IMPACT_ID = "ingest_disabled";
// Id of the severity-1 impact on ImpactArea.DEPLOYMENT_MANAGEMENT.
public static final String AUTOMATION_DISABLED_IMPACT_ID = "automation_disabled";
// Id of the severity-3 impact on ImpactArea.BACKUP.
public static final String BACKUP_DISABLED_IMPACT_ID = "backup_disabled";

// Impacts of having an unstable master:
private static final String UNSTABLE_MASTER_INGEST_IMPACT = "The cluster cannot create, delete, or rebalance indices, and cannot "
+ "insert or update documents.";
Expand All @@ -73,9 +79,15 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
* This is the list of the impacts to be reported when the master node is determined to be unstable.
*/
private static final List<HealthIndicatorImpact> UNSTABLE_MASTER_IMPACTS = List.of(
new HealthIndicatorImpact(1, UNSTABLE_MASTER_INGEST_IMPACT, List.of(ImpactArea.INGEST)),
new HealthIndicatorImpact(1, UNSTABLE_MASTER_DEPLOYMENT_MANAGEMENT_IMPACT, List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)),
new HealthIndicatorImpact(3, UNSTABLE_MASTER_BACKUP_IMPACT, List.of(ImpactArea.BACKUP))
new HealthIndicatorImpact(NAME, INGEST_DISABLED_IMPACT_ID, 1, UNSTABLE_MASTER_INGEST_IMPACT, List.of(ImpactArea.INGEST)),
new HealthIndicatorImpact(
NAME,
AUTOMATION_DISABLED_IMPACT_ID,
1,
UNSTABLE_MASTER_DEPLOYMENT_MANAGEMENT_IMPACT,
List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
),
new HealthIndicatorImpact(NAME, BACKUP_DISABLED_IMPACT_ID, 3, UNSTABLE_MASTER_BACKUP_IMPACT, List.of(ImpactArea.BACKUP))
);

public StableMasterHealthIndicatorService(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,13 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
);
}

// Impact IDs: each is passed as the id argument of a HealthIndicatorImpact built in getImpacts().
// Id of the severity-1 impact (INGEST and SEARCH areas) describing indices with unavailable primary shards.
public static final String PRIMARY_UNASSIGNED_IMPACT_ID = "primary_unassigned";
// Id of the severity-2 impact (SEARCH area) describing indices where only replica shards are unavailable.
public static final String REPLICA_UNASSIGNED_IMPACT_ID = "replica_unassigned";

public static final String RESTORE_FROM_SNAPSHOT_ACTION_GUIDE = "http://ela.st/restore-snapshot";
public static final Diagnosis.Definition ACTION_RESTORE_FROM_SNAPSHOT = new Diagnosis.Definition(
NAME,
"restore_from_snapshot",
"Elasticsearch isn't allowed to allocate some shards because there are no copies of the shards in the cluster. Elasticsearch will "
+ "allocate these shards when nodes holding good copies of the data join the cluster.",
Expand All @@ -140,6 +145,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String DIAGNOSE_SHARDS_ACTION_GUIDE = "http://ela.st/diagnose-shards";
public static final Diagnosis.Definition ACTION_CHECK_ALLOCATION_EXPLAIN_API = new Diagnosis.Definition(
NAME,
"explain_allocations",
"Elasticsearch isn't allowed to allocate some shards from these indices to any of the nodes in the cluster.",
"Diagnose the issue by calling the allocation explain API for an index [GET _cluster/allocation/explain]. Choose a node to which "
Expand All @@ -150,6 +156,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String FIX_DELAYED_SHARDS_GUIDE = "http://ela.st/fix-delayed-shard-allocation";
public static final Diagnosis.Definition DIAGNOSIS_WAIT_FOR_OR_FIX_DELAYED_SHARDS = new Diagnosis.Definition(
NAME,
"delayed_shard_allocations",
"Elasticsearch is not allocating some shards because they are marked for delayed allocation. Shards that have become "
+ "unavailable are usually marked for delayed allocation because it is more efficient to wait and see if the shards return "
Expand All @@ -160,6 +167,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String ENABLE_INDEX_ALLOCATION_GUIDE = "http://ela.st/fix-index-allocation";
public static final Diagnosis.Definition ACTION_ENABLE_INDEX_ROUTING_ALLOCATION = new Diagnosis.Definition(
NAME,
"enable_index_allocations",
"Elasticsearch isn't allowed to allocate some shards from these indices because allocation for those shards has been disabled at "
+ "the index level.",
Expand All @@ -172,6 +180,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
);
public static final String ENABLE_CLUSTER_ALLOCATION_ACTION_GUIDE = "http://ela.st/fix-cluster-allocation";
public static final Diagnosis.Definition ACTION_ENABLE_CLUSTER_ROUTING_ALLOCATION = new Diagnosis.Definition(
NAME,
"enable_cluster_allocations",
"Elasticsearch isn't allowed to allocate some shards from these indices because allocation for those shards has been disabled at "
+ "the cluster level.",
Expand All @@ -189,7 +198,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"enable_data_tiers_" + tier,
NAME,
"enable_data_tiers:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices because the indices expect to be allocated to "
+ "data tier nodes, but there were not any nodes with the expected tiers found in the cluster.",
"Add nodes with the [" + tier + "] role to the cluster.",
Expand All @@ -200,6 +210,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String INCREASE_SHARD_LIMIT_ACTION_GUIDE = "http://ela.st/index-total-shards";
public static final Diagnosis.Definition ACTION_INCREASE_SHARD_LIMIT_INDEX_SETTING = new Diagnosis.Definition(
NAME,
"increase_shard_limit_index_setting",
"Elasticsearch isn't allowed to allocate some shards from these indices to any data nodes because each node has reached the index "
+ "shard limit. ",
Expand All @@ -215,7 +226,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"increase_shard_limit_index_setting_" + tier,
NAME,
"increase_shard_limit_index_setting:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices because each node in the ["
+ tier
+ "] tier has reached the index shard limit. ",
Expand All @@ -229,6 +241,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String INCREASE_CLUSTER_SHARD_LIMIT_ACTION_GUIDE = "http://ela.st/cluster-total-shards";
public static final Diagnosis.Definition ACTION_INCREASE_SHARD_LIMIT_CLUSTER_SETTING = new Diagnosis.Definition(
NAME,
"increase_shard_limit_cluster_setting",
"Elasticsearch isn't allowed to allocate some shards from these indices to any data nodes because each node has reached the "
+ "cluster shard limit.",
Expand All @@ -244,7 +257,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"increase_shard_limit_cluster_setting_" + tier,
NAME,
"increase_shard_limit_cluster_setting:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices because each node in the ["
+ tier
+ "] tier has reached the cluster shard limit. ",
Expand All @@ -258,6 +272,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String MIGRATE_TO_TIERS_ACTION_GUIDE = "http://ela.st/migrate-to-tiers";
public static final Diagnosis.Definition ACTION_MIGRATE_TIERS_AWAY_FROM_REQUIRE_DATA = new Diagnosis.Definition(
NAME,
"migrate_data_tiers_require_data",
"Elasticsearch isn't allowed to allocate some shards from these indices to any nodes in the desired data tiers because the "
+ "indices are configured with allocation filter rules that are incompatible with the nodes in this tier.",
Expand All @@ -274,7 +289,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"migrate_data_tiers_require_data_" + tier,
NAME,
"migrate_data_tiers_require_data:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices to any nodes in the ["
+ tier
+ "] data tier because the indices are configured with allocation filter rules that are incompatible with the "
Expand All @@ -290,6 +306,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
);

public static final Diagnosis.Definition ACTION_MIGRATE_TIERS_AWAY_FROM_INCLUDE_DATA = new Diagnosis.Definition(
NAME,
"migrate_data_tiers_include_data",
"Elasticsearch isn't allowed to allocate some shards from these indices to any nodes in the desired data tiers because the "
+ "indices are configured with allocation filter rules that are incompatible with the nodes in this tier. ",
Expand All @@ -306,7 +323,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"migrate_data_tiers_include_data_" + tier,
NAME,
"migrate_data_tiers_include_data:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices to any nodes in the ["
+ tier
+ "] data tier because the indices are configured with allocation filter rules that are incompatible with the "
Expand All @@ -323,6 +341,7 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {

public static final String TIER_CAPACITY_ACTION_GUIDE = "http://ela.st/tier-capacity";
public static final Diagnosis.Definition ACTION_INCREASE_NODE_CAPACITY = new Diagnosis.Definition(
NAME,
"increase_node_capacity_for_allocations",
"Elasticsearch isn't allowed to allocate some shards from these indices because there are not enough nodes in the cluster to "
+ "allocate each shard copy on a different node.",
Expand All @@ -335,7 +354,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
Collectors.toUnmodifiableMap(
tier -> tier,
tier -> new Diagnosis.Definition(
"increase_tier_capacity_for_allocations_" + tier,
NAME,
"increase_tier_capacity_for_allocations:tier:" + tier,
"Elasticsearch isn't allowed to allocate some shards from these indices to any of the nodes in the desired data tier "
+ "because there are not enough nodes in the ["
+ tier
Expand Down Expand Up @@ -836,7 +856,15 @@ public List<HealthIndicatorImpact> getImpacts() {
primaries.indicesWithUnavailableShards.size() == 1 ? "index" : "indices",
getTruncatedIndicesString(primaries.indicesWithUnavailableShards, clusterMetadata)
);
impacts.add(new HealthIndicatorImpact(1, impactDescription, List.of(ImpactArea.INGEST, ImpactArea.SEARCH)));
impacts.add(
new HealthIndicatorImpact(
NAME,
PRIMARY_UNASSIGNED_IMPACT_ID,
1,
impactDescription,
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
)
);
}
/*
* It is possible that we're working with an intermediate cluster state, and that for an index we have no primary but a replica
Expand All @@ -853,7 +881,9 @@ public List<HealthIndicatorImpact> getImpacts() {
indicesWithUnavailableReplicasOnly.size() == 1 ? "index" : "indices",
getTruncatedIndicesString(indicesWithUnavailableReplicasOnly, clusterMetadata)
);
impacts.add(new HealthIndicatorImpact(2, impactDescription, List.of(ImpactArea.SEARCH)));
impacts.add(
new HealthIndicatorImpact(NAME, REPLICA_UNASSIGNED_IMPACT_ID, 2, impactDescription, List.of(ImpactArea.SEARCH))
);
}
return impacts;
}
Expand Down
8 changes: 6 additions & 2 deletions server/src/main/java/org/elasticsearch/health/Diagnosis.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import java.io.IOException;
import java.util.List;

import static org.elasticsearch.health.HealthService.HEALTH_API_ID_PREFIX;

/**
* Details a potential issue that was diagnosed by a {@link HealthService}.
*
Expand All @@ -26,16 +28,18 @@ public record Diagnosis(Definition definition, @Nullable List<String> affectedRe
/**
* Details a diagnosis - cause and a potential action that a user could take to clear an issue identified by a {@link HealthService}.
*
* @param id A unique identifier
* @param indicatorName The name of the health indicator service that will generate this diagnosis
* @param id An identifier for this diagnosis that is unique within the health indicator that generates it
* @param cause A description of the cause of the problem
* @param action A description of the action to be taken to remedy the problem
* @param helpURL Optional evergreen url to a help document
*/
public record Definition(String id, String cause, String action, String helpURL) {}
public record Definition(String indicatorName, String id, String cause, String action, String helpURL) {}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("id", HEALTH_API_ID_PREFIX + definition.indicatorName + ":diagnosis:" + definition.id);
builder.field("cause", definition.cause);
builder.field("action", definition.action);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
import java.io.IOException;
import java.util.List;

public record HealthIndicatorImpact(int severity, String impactDescription, List<ImpactArea> impactAreas) implements ToXContentObject {
import static org.elasticsearch.health.HealthService.HEALTH_API_ID_PREFIX;

public record HealthIndicatorImpact(String indicatorName, String id, int severity, String impactDescription, List<ImpactArea> impactAreas)
implements
ToXContentObject {

public HealthIndicatorImpact {
if (severity < 0) {
Expand All @@ -32,6 +36,7 @@ public record HealthIndicatorImpact(int severity, String impactDescription, List
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("id", HEALTH_API_ID_PREFIX + indicatorName + ":impact:" + id);
builder.field("severity", severity);
builder.field("description", impactDescription);
builder.startArray("impact_areas");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ public class HealthService {
static final String UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED = "Could not determine health status. Check details on critical issues "
+ "preventing the health status from reporting.";

/**
 * Common prefix of the ids written by the health API. {@code Diagnosis} and {@code HealthIndicatorImpact} prepend it when
 * serializing their "id" field, producing ids of the form {@code elasticsearch:health:<indicator name>:diagnosis:<id>} and
 * {@code elasticsearch:health:<indicator name>:impact:<id>} respectively.
 */
public static final String HEALTH_API_ID_PREFIX = "elasticsearch:health:";

/**
* Detail map key that contains the reasons a result was marked as UNKNOWN
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ public class RepositoryIntegrityHealthIndicatorService implements HealthIndicato
public static final String NAME = "repository_integrity";

public static final String HELP_URL = "https://ela.st/fix-repository-integrity";

// Impact ID: passed as the id argument of the severity-1 HealthIndicatorImpact created in calculate().
public static final String REPOSITORY_CORRUPTED_IMPACT_ID = "repository_corruption";

public static final Diagnosis.Definition CORRUPTED_REPOSITORY = new Diagnosis.Definition(
"corrupt-repo-integrity",
NAME,
"corrupt_repo_integrity",
"Multiple clusters are writing to the same repository.",
"Remove the repository from the other cluster(s), or mark it as read-only in the other cluster(s), and then re-add the repository"
+ " to this cluster.",
Expand Down Expand Up @@ -100,6 +104,8 @@ public HealthIndicatorResult calculate(boolean explain, HealthInfo healthInfo) {
}
List<HealthIndicatorImpact> impacts = Collections.singletonList(
new HealthIndicatorImpact(
NAME,
REPOSITORY_CORRUPTED_IMPACT_ID,
1,
String.format(
Locale.ROOT,
Expand Down

0 comments on commit 3df7e38

Please sign in to comment.