Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@
import static org.elasticsearch.health.HealthStatus.YELLOW;
import static org.elasticsearch.xpack.core.ilm.LifecycleOperationMetadata.currentILMMode;
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.RuleConfig.Builder.actionRule;
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRule;
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRuleFullChecks;
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.StepRule.stepRuleOnlyCheckRetries;

/**
* This indicator reports health for index lifecycle management component.
Expand Down Expand Up @@ -103,13 +104,14 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
);

private static final TimeValue ONE_DAY = TimeValue.timeValueDays(1);
private static final long MAX_RETRIES = 100;

static final Map<String, RuleConfig> RULES_BY_ACTION_CONFIG = Map.of(
RolloverAction.NAME,
actionRule(RolloverAction.NAME).stepRules(
stepRule(WaitForActiveShardsStep.NAME, ONE_DAY),
stepRule(WaitForRolloverReadyStep.NAME, ONE_DAY),
stepRule(RolloverStep.NAME, ONE_DAY)
stepRuleFullChecks(WaitForActiveShardsStep.NAME, ONE_DAY, MAX_RETRIES),
stepRuleOnlyCheckRetries(WaitForRolloverReadyStep.NAME, MAX_RETRIES),
stepRuleFullChecks(RolloverStep.NAME, ONE_DAY, MAX_RETRIES)
),
//
MigrateAction.NAME,
Expand All @@ -118,32 +120,27 @@ public class IlmHealthIndicatorService implements HealthIndicatorService {
SearchableSnapshotAction.NAME,
actionRule(SearchableSnapshotAction.NAME).maxTimeOnAction(ONE_DAY)
.stepRules(
stepRule(WaitForDataTierStep.NAME, ONE_DAY),
stepRule(WaitForIndexColorStep.NAME, ONE_DAY),
// The no-follower step is added here because an `UnfollowAction` is added before the `shrinkAction` in the follower cluster
stepRule(WaitForNoFollowersStep.NAME, ONE_DAY)
stepRuleFullChecks(WaitForDataTierStep.NAME, ONE_DAY, MAX_RETRIES),
stepRuleFullChecks(WaitForIndexColorStep.NAME, ONE_DAY, MAX_RETRIES),
stepRuleOnlyCheckRetries(WaitForNoFollowersStep.NAME, MAX_RETRIES)
),
//
DeleteAction.NAME,
actionRule(DeleteAction.NAME).stepRules(stepRule(DeleteStep.NAME, ONE_DAY)),
actionRule(DeleteAction.NAME).stepRules(stepRuleFullChecks(DeleteStep.NAME, ONE_DAY, MAX_RETRIES)),
//
ShrinkAction.NAME,
actionRule(ShrinkAction.NAME).maxTimeOnAction(ONE_DAY)
.stepRules(
// The no-follower step is added here because an `unfollowAction` is added before the `shrinkAction` in the follower
// cluster.
stepRule(WaitForNoFollowersStep.NAME, ONE_DAY)
),
.stepRules(stepRuleOnlyCheckRetries(WaitForNoFollowersStep.NAME, MAX_RETRIES)),
//
AllocateAction.NAME,
actionRule(AllocateAction.NAME).maxTimeOnAction(ONE_DAY).noStepRules(),
//
ForceMergeAction.NAME,
actionRule(ForceMergeAction.NAME).maxTimeOnAction(ONE_DAY)
.stepRules(
stepRule(WaitForIndexColorStep.NAME, ONE_DAY),
stepRule(ForceMergeStep.NAME, ONE_DAY),
stepRule(SegmentCountStep.NAME, ONE_DAY)
stepRuleFullChecks(WaitForIndexColorStep.NAME, ONE_DAY, MAX_RETRIES),
stepRuleFullChecks(ForceMergeStep.NAME, ONE_DAY, MAX_RETRIES),
stepRuleFullChecks(SegmentCountStep.NAME, ONE_DAY, MAX_RETRIES)
)
//
// The next rule has to be commented out because of this issue https://github.com/elastic/elasticsearch/issues/96705
Expand Down Expand Up @@ -409,17 +406,33 @@ public boolean test(Long now, IndexMetadata indexMetadata) {
* @param maxTimeOn Maximum time that an index should spend on this step.
* @param maxRetries Maximum number of times that a step should be retried.
*/
public record StepRule(String step, TimeValue maxTimeOn, long maxRetries) implements RuleConfig {
static StepRule stepRule(String name, TimeValue maxTimeOn) {
return new StepRule(name, maxTimeOn, 100);
public record StepRule(String step, TimeValue maxTimeOn, Long maxRetries) implements RuleConfig {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we override the canonical constructor and add a validation that at least one of maxTimeOn or maxRetries is non-null? Otherwise, if both are null, throw an IllegalArgumentException?


/**
 * Compact canonical constructor: rejects a rule that defines neither a time limit nor a retry limit.
 *
 * @throws IllegalArgumentException if both {@code maxTimeOn} and {@code maxRetries} are {@code null}
 */
public StepRule {
    if (maxTimeOn == null && maxRetries == null) {
        // The message must name the actual record components so the caller can find the offending argument.
        throw new IllegalArgumentException("At least one of [maxTimeOn or maxRetries] must be defined.");
    }
}

/**
 * Creates a rule for the given step that carries both a maximum time and a maximum retry count.
 *
 * @param name       name of the step the rule applies to
 * @param maxTimeOn  maximum time that an index should spend on this step
 * @param maxRetries maximum number of times that the step should be retried
 * @return a {@link StepRule} with both limits set
 */
static StepRule stepRuleFullChecks(String name, TimeValue maxTimeOn, long maxRetries) {
    // Box explicitly: the record component is a nullable Long.
    final Long retryLimit = Long.valueOf(maxRetries);
    return new StepRule(name, maxTimeOn, retryLimit);
}

/**
 * Creates a rule for the given step that carries only a maximum time; the retry limit is left undefined.
 *
 * @param name      name of the step the rule applies to
 * @param maxTimeOn maximum time that an index should spend on this step
 * @return a {@link StepRule} whose {@code maxRetries} is {@code null}
 */
static StepRule stepRuleOnlyCheckPassedTime(String name, TimeValue maxTimeOn) {
    // A null retry limit means "no retry limit defined" for this rule.
    final Long noRetryLimit = null;
    return new StepRule(name, maxTimeOn, noRetryLimit);
}

/**
 * Creates a rule for the given step that carries only a maximum retry count; the time limit is left undefined.
 *
 * @param name       name of the step the rule applies to
 * @param maxRetries maximum number of times that the step should be retried
 * @return a {@link StepRule} whose {@code maxTimeOn} is {@code null}
 */
static StepRule stepRuleOnlyCheckRetries(String name, long maxRetries) {
    // Box explicitly: the record component is a nullable Long. The null second argument leaves maxTimeOn undefined.
    final Long retryLimit = Long.valueOf(maxRetries);
    return new StepRule(name, null, retryLimit);
}

@Override
public boolean test(Long now, IndexMetadata indexMetadata) {
var failedStepRetryCount = indexMetadata.getLifecycleExecutionState().failedStepRetryCount();
return step.equals(indexMetadata.getLifecycleExecutionState().step())
&& (maxTimeOn.compareTo(RuleConfig.getElapsedTime(now, indexMetadata.getLifecycleExecutionState().stepTime())) < 0
|| (failedStepRetryCount != null && failedStepRetryCount > maxRetries));
&& (maxTimeOn != null
&& maxTimeOn.compareTo(RuleConfig.getElapsedTime(now, indexMetadata.getLifecycleExecutionState().stepTime())) < 0
|| (maxRetries != null && failedStepRetryCount != null && failedStepRetryCount > maxRetries));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public void testActionRuleConfig() {
public void testStepRuleConfig() {
var stepName = randomAlphaOfLength(30);
var maxTimeOn = TimeValue.parseTimeValue(randomTimeValue(), "");
var maxRetries = randomIntBetween(11, 100);
var maxRetries = randomLongBetween(11, 100);
var rule = new IlmHealthIndicatorService.StepRule(stepName, maxTimeOn, maxRetries);
var now = System.currentTimeMillis();

Expand Down Expand Up @@ -98,13 +98,14 @@ public void testRuleConfigBuilder() {
var now = System.currentTimeMillis();
var lastExecutionTime = System.currentTimeMillis() - TimeValue.timeValueDays(2).millis();
var maxTimeOnStep = TimeValue.timeValueDays(1);
var maxRetries = randomLongBetween(10, 1000);
var expectedAction = "some-action";
var rules = IlmHealthIndicatorService.RuleConfig.Builder.actionRule(expectedAction)
.maxTimeOnAction(TimeValue.timeValueDays(1))
.stepRules(
IlmHealthIndicatorService.StepRule.stepRule("step-1", maxTimeOnStep),
IlmHealthIndicatorService.StepRule.stepRule("step-2", maxTimeOnStep),
IlmHealthIndicatorService.StepRule.stepRule("step-3", maxTimeOnStep)
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-1", maxTimeOnStep, maxRetries),
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-2", maxTimeOnStep, maxRetries),
IlmHealthIndicatorService.StepRule.stepRuleFullChecks("step-3", maxTimeOnStep, maxRetries)
);

// An unknown action should not satisfy the conditions
Expand Down