Skip to content

Commit

Permalink
[7.17] Retry clean and create snapshot if it already exists (#84829) (#…
Browse files Browse the repository at this point in the history
…93724)

During a master restart or failover the step CreateSnapshotStep of the action SearchableSnapshotAction can be invoked twice. When this occurs we mark the creation as incomplete and we follow the existing mechanism of returning back to CleanupSnapshotStep.

Closes #83694

(cherry picked from commit e03c2b0)
Signed-off-by: Andrei Dan <andrei.dan@elastic.co>

Co-authored-by: Andrei Dan <andrei.dan@elastic.co>
  • Loading branch information
andreidan committed Feb 14, 2023
1 parent a79da4f commit 686f8bc
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 11 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/84829.yaml
@@ -0,0 +1,6 @@
pr: 84829
summary: "Retry clean and create snapshot if it already exists #83694"
area: ILM+SLM
type: bug
issues:
- 83694
Expand Up @@ -30,5 +30,4 @@ public InvalidSnapshotNameException(StreamInput in) throws IOException {
public RestStatus status() {
return RestStatus.BAD_REQUEST;
}

}
Expand Up @@ -155,6 +155,8 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
public static final String UPDATE_SNAPSHOT_STATUS_ACTION_NAME = "internal:cluster/snapshot/update_snapshot_status";

public static final String NO_FEATURE_STATES_VALUE = "none";
public static final String SNAPSHOT_ALREADY_IN_PROGRESS_EXCEPTION_DESC = "snapshot with the same name is already in-progress";
public static final String SNAPSHOT_ALREADY_EXISTS_EXCEPTION_DESC = "snapshot with the same name already exists";

private final ClusterService clusterService;

Expand Down Expand Up @@ -616,7 +618,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, final Cl

private static void ensureSnapshotNameNotRunning(SnapshotsInProgress runningSnapshots, String repositoryName, String snapshotName) {
if (runningSnapshots.forRepo(repositoryName).stream().anyMatch(s -> s.snapshot().getSnapshotId().getName().equals(snapshotName))) {
throw new InvalidSnapshotNameException(repositoryName, snapshotName, "snapshot with the same name is already in-progress");
throw new InvalidSnapshotNameException(repositoryName, snapshotName, SNAPSHOT_ALREADY_IN_PROGRESS_EXCEPTION_DESC);
}
}

Expand Down Expand Up @@ -742,11 +744,7 @@ private static void ensureNoCleanupInProgress(
private static void ensureSnapshotNameAvailableInRepo(RepositoryData repositoryData, String snapshotName, Repository repository) {
// check if the snapshot name already exists in the repository
if (repositoryData.getSnapshotIds().stream().anyMatch(s -> s.getName().equals(snapshotName))) {
throw new InvalidSnapshotNameException(
repository.getMetadata().name(),
snapshotName,
"snapshot with the same name already exists"
);
throw new InvalidSnapshotNameException(repository.getMetadata().name(), snapshotName, SNAPSHOT_ALREADY_EXISTS_EXCEPTION_DESC);
}
}

Expand Down Expand Up @@ -1070,7 +1068,7 @@ protected void doRun() {
throw new InvalidSnapshotNameException(
repository.getMetadata().name(),
snapshotName,
"snapshot with the same name already exists"
SNAPSHOT_ALREADY_EXISTS_EXCEPTION_DESC
);
}
if (clusterState.nodes().getMinNodeVersion().onOrAfter(NO_REPO_INITIALIZE_VERSION) == false) {
Expand Down
Expand Up @@ -16,7 +16,9 @@
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.Strings;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.snapshots.InvalidSnapshotNameException;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotsService;

import java.util.Locale;
import java.util.Objects;
Expand Down Expand Up @@ -62,7 +64,31 @@ public void onResponse(Boolean complete) {

@Override
public void onFailure(Exception e) {
listener.onFailure(e);
if (e instanceof InvalidSnapshotNameException && e.getMessage() != null) {
// this is not pretty, but we'd like for 7.17 to avoid introducing a custom exception to match the "snapshot already
// exists" scenario (like we did for the 8.x line)
// the InvalidSnapshotNameException message is in a custom format and the parts of the message are not exposed however,
// we're matching on entire phrases so we will not inadvertently match different scenarios than the two we're
// targeting here
if (e.getMessage().endsWith(SnapshotsService.SNAPSHOT_ALREADY_EXISTS_EXCEPTION_DESC)
|| e.getMessage().endsWith(SnapshotsService.SNAPSHOT_ALREADY_IN_PROGRESS_EXCEPTION_DESC)) {
// we treat a snapshot that was already created before this step as an incomplete snapshot. This scenario is
// triggered by a master restart or a failover which can result in a double invocation of this step.
logger.warn(
"snapshot [{}] is already in-progress or in-use for index [{}], ILM will attempt to clean it up and "
+ "recreate it",
((InvalidSnapshotNameException) e).getSnapshotName(),
indexMetadata.getIndex().getName()
);
// note we're going thorugh the success branch here (ie. this is not listener.onComplete)
// to mark the step as onResponseResult=false (this way our next step key will be nextKeyOnIncomplete)
onResponse(false);
} else {
listener.onFailure(e);
}
} else {
listener.onFailure(e);
}
}
});
}
Expand Down
Expand Up @@ -17,6 +17,8 @@
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.snapshots.InvalidSnapshotNameException;
import org.elasticsearch.snapshots.SnapshotsService;
import org.elasticsearch.test.client.NoOpClient;
import org.elasticsearch.xpack.core.ilm.Step.StepKey;

Expand Down Expand Up @@ -48,18 +50,22 @@ protected CreateSnapshotStep copyInstance(CreateSnapshotStep instance) {
@Override
public CreateSnapshotStep mutateInstance(CreateSnapshotStep instance) {
StepKey key = instance.getKey();
StepKey nextKeyOnCompleteResponse = instance.getNextKeyOnComplete();
StepKey nextKeyOnIncompleteResponse = instance.getNextKeyOnIncomplete();
switch (between(0, 1)) {
switch (between(0, 2)) {
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKeyOnCompleteResponse = randomStepKey();
break;
case 2:
nextKeyOnIncompleteResponse = randomStepKey();
break;
default:
throw new AssertionError("Illegal randomisation branch");
}
return new CreateSnapshotStep(key, randomStepKey(), nextKeyOnIncompleteResponse, instance.getClient());
return new CreateSnapshotStep(key, nextKeyOnCompleteResponse, nextKeyOnIncompleteResponse, instance.getClient());
}

public void testPerformActionFailure() {
Expand Down Expand Up @@ -218,6 +224,42 @@ public void onFailure(Exception e) {
assertThat(incompleteStep.getNextStepKey(), is(nextKeyOnIncomplete));
}
}

{
try (NoOpClient client = new NoOpClient(getTestName())) {
StepKey nextKeyOnComplete = randomStepKey();
StepKey nextKeyOnIncomplete = randomStepKey();
CreateSnapshotStep doubleInvocationStep = new CreateSnapshotStep(
randomStepKey(),
nextKeyOnComplete,
nextKeyOnIncomplete,
client
) {
@Override
void createSnapshot(IndexMetadata indexMetadata, ActionListener<Boolean> listener) {
listener.onFailure(
new InvalidSnapshotNameException(
repository,
snapshotName,
SnapshotsService.SNAPSHOT_ALREADY_EXISTS_EXCEPTION_DESC
)
);
}
};
doubleInvocationStep.performAction(indexMetadata, clusterState, null, new ActionListener<Void>() {
@Override
public void onResponse(Void unused) {

}

@Override
public void onFailure(Exception e) {

}
});
assertThat(doubleInvocationStep.getNextStepKey(), is(nextKeyOnIncomplete));
}
}
}

private NoOpClient getCreateSnapshotRequestAssertingClient(String expectedRepoName, String expectedSnapshotName, String indexName) {
Expand Down

0 comments on commit 686f8bc

Please sign in to comment.