Skip to content

Commit

Permalink
[7.17] WaitForSnapshotStep verifies if the index belongs to the lat…
Browse files Browse the repository at this point in the history
…est snapshot of that SLM policy (#100911) (#101030)

* `WaitForSnapshotStep` verifies if the index belongs to the latest snapshot of that SLM policy (#100911)

The `WaitForSnapshotStep` used to check if the SLM policy has been
executed after the index has entered the delete phase, but it did not
check if the SLM policy included this index.

The result of this is that if the user used an SLM policy that did not
include this index, when the index would enter the
`WaitForSnapshotStep`, it would wait for a snapshot to be taken, a
snapshot that would not include the index, and then ILM would delete the
index.

See the exact reproduction path:
#57809

**Solution** This PR, after it finds a successful SLM run, it verifies
if the snapshot taken by SLM contains this index. If not it throws an
error, otherwise it proceeds.

ILM explain will report:

```
"step_info": {
        "type": "illegal_state_exception",
        "reason": "the last successful snapshot of policy 'hourly-snapshots' does not include index '.ds-my-other-stream-2023.10.16-000001'"
      }
```

**Backwards compatibility concerns** In this PR, the
`WaitForSnapshotStep` changed from `ClusterStateWaitStep` to
`AsyncWaitStep`. We do not think this is gonna cause an issue. This was
tested manually by the following steps: - Run a master node with the old
version. - When ILM is executing `wait-for-snapshot`, we shutdown the
node - We start the node again with the new version os ES - ES was able
to pick up the step and continue with the new code.

We believe that this covers bwc concerns.

Fixes: #57809
(cherry picked from commit 5697fcf)
  • Loading branch information
gmarouli committed Oct 18, 2023
1 parent 28c0147 commit b249462
Show file tree
Hide file tree
Showing 4 changed files with 282 additions and 50 deletions.
7 changes: 7 additions & 0 deletions docs/changelog/100911.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pr: 100911
summary: '`WaitForSnapshotStep` verifies if the index belongs to the latest snapshot
of that SLM policy'
area: ILM+SLM
type: bug
issues:
- 57809
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public WaitForSnapshotAction(StreamInput in) throws IOException {
@Override
public List<Step> toSteps(Client client, String phase, StepKey nextStepKey) {
StepKey waitForSnapshotKey = new StepKey(phase, NAME, WaitForSnapshotStep.NAME);
return Collections.singletonList(new WaitForSnapshotStep(waitForSnapshotKey, nextStepKey, policy));
return Collections.singletonList(new WaitForSnapshotStep(waitForSnapshotKey, nextStepKey, client, policy));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.Index;
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xpack.core.slm.SnapshotLifecycleMetadata;
Expand All @@ -20,44 +24,53 @@
import java.util.Objects;

/***
* A step that waits for snapshot to be taken by SLM to ensure we have backup before we delete the index.
* It will signal error if it can't get data needed to do the check (action time from ILM and SLM metadata)
* and will only return success if execution of SLM policy took place after index entered the wait for snapshot action.
* A step that waits for snapshot to be taken by SLM that includes the index in question to ensure we have backup
* before we delete the index. It will signal error if it can't get data needed to do the check (action time from ILM
* and SLM metadata) and will only return success if execution of SLM policy took place after index entered the wait
* for snapshot action and the latest successful snapshot includes the index.
*/
public class WaitForSnapshotStep extends ClusterStateWaitStep {
public class WaitForSnapshotStep extends AsyncWaitStep {

static final String NAME = "wait-for-snapshot";
private static final Logger logger = LogManager.getLogger(WaitForSnapshotStep.class);

private static final String MESSAGE_FIELD = "message";
private static final String POLICY_NOT_EXECUTED_MESSAGE = "waiting for policy '%s' to be executed since %s";
private static final String POLICY_NOT_FOUND_MESSAGE = "configured policy '%s' not found";
private static final String INDEX_NOT_INCLUDED_IN_SNAPSHOT_MESSAGE =
"the last successful snapshot of policy '%s' does not include index '%s'";

private static final String UNEXPECTED_SNAPSHOT_STATE_MESSAGE =
"unexpected number of snapshots retrieved for repository '%s' and snapshot '%s' (expected 1, found %d)";
private static final String NO_INDEX_METADATA_MESSAGE = "no index metadata found for index '%s'";
private static final String NO_ACTION_TIME_MESSAGE = "no information about ILM action start in index metadata for index '%s'";

private final String policy;

WaitForSnapshotStep(StepKey key, StepKey nextStepKey, String policy) {
super(key, nextStepKey);
WaitForSnapshotStep(StepKey key, StepKey nextStepKey, Client client, String policy) {
super(key, nextStepKey, client);
this.policy = policy;
}

@Override
public Result isConditionMet(Index index, ClusterState clusterState) {
IndexMetadata indexMetadata = clusterState.metadata().index(index);
public void evaluateCondition(Metadata metadata, Index index, Listener listener, TimeValue masterTimeout) {
IndexMetadata indexMetadata = metadata.index(index);
if (indexMetadata == null) {
throw error(NO_INDEX_METADATA_MESSAGE, index.getName());
listener.onFailure(error(NO_INDEX_METADATA_MESSAGE, index.getName()));
return;
}

Long actionTime = LifecycleExecutionState.fromIndexMetadata(indexMetadata).getActionTime();

if (actionTime == null) {
throw error(NO_ACTION_TIME_MESSAGE, index.getName());
listener.onFailure(error(NO_ACTION_TIME_MESSAGE, index.getName()));
return;
}

SnapshotLifecycleMetadata snapMeta = clusterState.metadata().custom(SnapshotLifecycleMetadata.TYPE);
SnapshotLifecycleMetadata snapMeta = metadata.custom(SnapshotLifecycleMetadata.TYPE);
if (snapMeta == null || snapMeta.getSnapshotConfigurations().containsKey(policy) == false) {
throw error(POLICY_NOT_FOUND_MESSAGE, policy);
listener.onFailure(error(POLICY_NOT_FOUND_MESSAGE, policy));
return;
}
SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy);
if (snapPolicyMeta.getLastSuccess() == null
Expand All @@ -79,15 +92,32 @@ public Result isConditionMet(Index index, ClusterState clusterState) {
snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()
);
}
return new Result(false, notExecutedMessage(actionTime));
listener.onResponse(false, notExecutedMessage(actionTime));
return;
}
logger.debug(
"executing policy because snapshot start time {} is after action time {}, snapshot timestamp is {}",
snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(),
actionTime,
snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()
);
return new Result(true, null);
String snapshotName = snapPolicyMeta.getLastSuccess().getSnapshotName();
String repositoryName = snapPolicyMeta.getPolicy().getRepository();
GetSnapshotsRequest request = new GetSnapshotsRequest().repositories(repositoryName)
.snapshots(new String[] { snapshotName })
.verbose(false);
getClient().admin().cluster().getSnapshots(request, ActionListener.wrap(response -> {
if (response.getSnapshots().size() != 1) {
listener.onFailure(error(UNEXPECTED_SNAPSHOT_STATE_MESSAGE, repositoryName, snapshotName, response.getSnapshots().size()));
} else {
if (response.getSnapshots().get(0).indices().contains(index.getName())) {
listener.onResponse(true, null);
} else {
listener.onFailure(error(INDEX_NOT_INCLUDED_IN_SNAPSHOT_MESSAGE, policy, index.getName()));
}
}
}, listener::onFailure));

}

public String getPolicy() {
Expand Down

0 comments on commit b249462

Please sign in to comment.