Skip to content

Commit

Permalink
[7.17] ILM step retry safe refresh of the cached phase (#82613) (#82684)
Browse files Browse the repository at this point in the history
When ILM retries a step (moving from the ERROR step back to the
failed_step) we always refreshed the ILM cached phase (the use case here
was that the policy might've been changed for an index due to
accidentally configuring the wrong policy).

In the case when the `failed_step` doesn't exist in the policy though,
refreshing the cached phase would block ILM as the retried step (the
`failed_step`) would not be recognized anymore.

This commit changes retrying an ILM step to only refresh the cached
phase if the failed_step's action and phase are still present in the
policy. If the action or even phase were removed, ILM will honour the
cached phase.

(cherry picked from commit e456eb7)
Signed-off-by: Andrei Dan <andrei.dan@elastic.co>
  • Loading branch information
andreidan committed Jan 17, 2022
1 parent 7e7a5b0 commit 8534ec5
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,37 @@ public void testRetryFreezeDeleteAction() throws Exception {
assertBusy(() -> assertThat(getOnlyIndexSettings(client(), index).get("index.frozen"), equalTo("true")));
}

/**
 * Creates an index managed by a policy that only has a delete phase, waits until ILM reports at least
 * one retry of a failed step, then overwrites the policy so it only has a hot phase with rollover.
 * After the read-only block is lifted the index is expected to be deleted, i.e. ILM keeps executing
 * the phase it had cached rather than the updated policy.
 */
public void testUpdatePolicyToNotContainFailedStep() throws Exception {
    createNewSingletonPolicy(client(), policy, "delete", new DeleteAction(true));

    // NOTE(review): the read-only setting is presumably what makes the delete step fail - confirm
    Settings.Builder readOnlyIndexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
        .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
        .put(IndexMetadata.SETTING_READ_ONLY, true)
        .put("index.lifecycle.name", policy);
    createIndexWithSettings(client(), index, alias, readOnlyIndexSettings);

    // wait (up to 30 seconds) for ILM to report that the failed step was retried at least once
    assertBusy(() -> {
        Integer retryCount = (Integer) explainIndex(client(), index).get(FAILED_STEP_RETRY_COUNT_FIELD);
        assertThat(retryCount, greaterThanOrEqualTo(1));
    }, 30, TimeUnit.SECONDS);
    assertTrue(indexExists(index));

    // updating the policy to not contain the delete phase at all
    createNewSingletonPolicy(client(), policy, "hot", new RolloverAction(null, null, null, 1L));

    // ILM must honour the cached delete phase and eventually delete the index
    Request liftReadOnlyBlock = new Request("PUT", index + "/_settings");
    liftReadOnlyBlock.setJsonEntity("{\"index.blocks.read_only\":false}");
    assertOK(client().performRequest(liftReadOnlyBlock));

    assertBusy(() -> assertFalse(indexExists(index)));
}

public void testAllocateOnlyAllocation() throws Exception {
createIndexWithSettings(
client(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
Expand Down Expand Up @@ -246,13 +247,23 @@ static ClusterState moveClusterStateToPreviouslyFailedStep(

LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas()
.get(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexMetadata.getSettings()));
LifecycleExecutionState nextStepState = IndexLifecycleTransition.updateExecutionStateToStep(

Map<String, Phase> policyPhases = policyMetadata.getPolicy().getPhases();

// we only refresh the cached phase if the failed step's action is still present in the underlying policy
// as otherwise ILM would block due to not recognizing the next step as part of the policy.
// if the policy was updated to not contain the action or even phase, we honour the cached phase as it is and do not refresh it
boolean forcePhaseDefinitionRefresh = policyPhases.get(nextStepKey.getPhase()) != null
&& policyPhases.get(nextStepKey.getPhase()).getActions().get(nextStepKey.getAction()) != null;

final LifecycleExecutionState nextStepState = IndexLifecycleTransition.updateExecutionStateToStep(
policyMetadata,
lifecycleState,
nextStepKey,
nowSupplier,
true
forcePhaseDefinitionRefresh
);

LifecycleExecutionState.Builder retryStepState = LifecycleExecutionState.builder(nextStepState);
retryStepState.setIsAutoRetryableError(lifecycleState.isAutoRetryableError());
Integer currentRetryCount = lifecycleState.getFailedStepRetryCount();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,74 @@ public void testMoveClusterStateToPreviouslyFailedStepAsAutomaticRetry() {
assertThat(executionState.getFailedStepRetryCount(), is(1));
}

/**
 * Builds an index whose execution state is in the error step of the hot/rollover action with a cached
 * phase definition that still contains rollover, while the policy stored in the cluster state only
 * contains set_priority. Moving back to the previously failed step must leave the cached phase
 * definition untouched and set the current step to the failed step.
 */
public void testMoveToFailedStepDoesntRefreshCachedPhaseWhenUnsafe() {
    final String policyName = "my-policy";

    // cached phase definition: hot phase with both rollover and set_priority
    String cachedPhaseDefinition = String.join(
        "\n",
        "{",
        " \"policy\" : \"my-policy\",",
        " \"phase_definition\" : {",
        " \"min_age\" : \"20m\",",
        " \"actions\" : {",
        " \"rollover\" : {",
        " \"max_age\" : \"5s\"",
        " },",
        " \"set_priority\" : {",
        " \"priority\" : 150",
        " }",
        " }",
        " },",
        " \"version\" : 1,",
        " \"modified_date_in_millis\" : 1578521007076",
        "}"
    ) + "\n";
    String stepThatFailed = "check-rollover-ready";

    // the phase definition contains the rollover action, but the actual policy does not contain rollover anymore
    LifecycleExecutionState.Builder erroredState = LifecycleExecutionState.builder();
    erroredState.setPhase("hot");
    erroredState.setAction("rollover");
    erroredState.setStep(ErrorStep.NAME);
    erroredState.setFailedStep(stepThatFailed);
    erroredState.setPhaseDefinition(cachedPhaseDefinition);

    IndexMetadata initialIndexMetadata = buildIndexMetadata(policyName, erroredState);
    String indexName = initialIndexMetadata.getIndex().getName();

    // the policy as currently stored: a hot phase that only contains set_priority
    Map<String, LifecycleAction> hotActions = new HashMap<>();
    hotActions.put("set_priority", new SetPriorityAction(100));
    LifecyclePolicy storedPolicy = new LifecyclePolicy(
        policyName,
        Collections.singletonMap("hot", new Phase("hot", TimeValue.ZERO, hotActions))
    );

    List<LifecyclePolicyMetadata> policyMetadatas = new ArrayList<>();
    policyMetadatas.add(
        new LifecyclePolicyMetadata(storedPolicy, Collections.emptyMap(), randomNonNegativeLong(), randomNonNegativeLong())
    );

    PolicyStepsRegistry stepsRegistry = createOneStepPolicyStepRegistry(
        policyName,
        new ErrorStep(new Step.StepKey("hot", RolloverAction.NAME, ErrorStep.NAME))
    );

    ClusterState stateBeforeRetry = buildClusterState(
        indexName,
        Settings.builder().put(LifecycleSettings.LIFECYCLE_NAME, policyName),
        erroredState.build(),
        policyMetadatas
    );
    ClusterState stateAfterRetry = IndexLifecycleTransition.moveClusterStateToPreviouslyFailedStep(
        stateBeforeRetry,
        indexName,
        ESTestCase::randomNonNegativeLong,
        stepsRegistry,
        false
    );

    LifecycleExecutionState retriedState = LifecycleExecutionState.fromIndexMetadata(
        stateAfterRetry.metadata().index(indexName)
    );
    assertThat(
        "we musn't refresh the cache definition if the failed step is not part of the real policy anymore",
        retriedState.getPhaseDefinition(),
        is(cachedPhaseDefinition)
    );
    assertThat(retriedState.getStep(), is(stepThatFailed));
}

public void testRefreshPhaseJson() {
LifecycleExecutionState.Builder exState = LifecycleExecutionState.builder()
.setPhase("hot")
Expand Down

0 comments on commit 8534ec5

Please sign in to comment.