Skip to content

Commit

Permalink
Fix Tripped Assertion on Failed Snapshot Clone Cleanup (#75582) (#75590)
Browse files Browse the repository at this point in the history
We have to put the entry into `endingSnapshots`, otherwise we trip
an assertion about listener consistency with the cluster state if
there's other concurrent operations.
  • Loading branch information
original-brownbear committed Aug 15, 2021
1 parent a680400 commit 8fe2ee7
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,75 @@ public void testDoesNotStartOnBrokenSourceSnapshot() throws Exception {
);
}

public void testSnapshotQueuedAfterCloneFromBrokenSourceSnapshot() throws Exception {
internalCluster().startMasterOnlyNode();
final String dataNode = internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");
final String testIndex = "index-test";
createIndexWithContent(testIndex);

final String sourceSnapshot = "source-snapshot";
blockDataNode(repoName, dataNode);
final Client masterClient = internalCluster().masterClient();
final ActionFuture<CreateSnapshotResponse> sourceSnapshotFuture = masterClient.admin()
.cluster()
.prepareCreateSnapshot(repoName, sourceSnapshot)
.setWaitForCompletion(true)
.execute();
awaitNumberOfSnapshotsInProgress(1);
waitForBlock(dataNode, repoName);
internalCluster().restartNode(dataNode);
ensureGreen();
assertThat(sourceSnapshotFuture.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
final String sourceSnapshotHealthy = "source-snapshot-healthy";
createFullSnapshot(repoName, "source-snapshot-healthy");

final ActionFuture<CreateSnapshotResponse> sn1 = startFullSnapshot(repoName, "concurrent-snapshot-1");
final ActionFuture<AcknowledgedResponse> clone1 = startClone(
masterClient,
repoName,
sourceSnapshotHealthy,
"target-snapshot-1",
testIndex
);
final ActionFuture<CreateSnapshotResponse> sn2 = startFullSnapshot(repoName, "concurrent-snapshot-2");
final ActionFuture<AcknowledgedResponse> clone2 = startClone(
masterClient,
repoName,
sourceSnapshotHealthy,
"target-snapshot-2",
testIndex
);
final ActionFuture<CreateSnapshotResponse> sn3 = startFullSnapshot(repoName, "concurrent-snapshot-3");
final ActionFuture<AcknowledgedResponse> clone3 = startClone(
masterClient,
repoName,
sourceSnapshotHealthy,
"target-snapshot-3",
testIndex
);
final SnapshotException sne = expectThrows(
SnapshotException.class,
() -> startClone(masterClient, repoName, sourceSnapshot, "target-snapshot", testIndex).actionGet(
TimeValue.timeValueSeconds(30L)
)
);
assertThat(
sne.getMessage(),
containsString(
"Can't clone index [" + getRepositoryData(repoName).resolveIndexId(testIndex) + "] because its snapshot was not successful."
)
);

assertSuccessful(sn1);
assertSuccessful(sn2);
assertSuccessful(sn3);
assertAcked(clone1.get());
assertAcked(clone2.get());
assertAcked(clone3.get());
}

public void testStartSnapshotWithSuccessfulShardClonePendingFinalization() throws Exception {
final String masterName = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
final String dataNode = internalCluster().startDataOnlyNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,7 @@ private void startCloning(Repository repository, SnapshotsInProgress.Entry clone
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
// Exception handler for IO exceptions with loading index and repo metadata
final Consumer<Exception> onFailure = e -> {
endingSnapshots.add(targetSnapshot);
initializingClones.remove(targetSnapshot);
logger.info(() -> new ParameterizedMessage("Failed to start snapshot clone [{}]", cloneEntry), e);
removeFailedSnapshotFromClusterState(targetSnapshot, e, null, null);
Expand Down Expand Up @@ -1246,6 +1247,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
@Override
public void onFailure(Exception e) {
logger.warn(() -> new ParameterizedMessage("failed to create snapshot [{}]", snapshot.snapshot().getSnapshotId()), e);
endingSnapshots.add(snapshot.snapshot());
removeFailedSnapshotFromClusterState(
snapshot.snapshot(),
e,
Expand Down Expand Up @@ -1829,6 +1831,7 @@ private static Tuple<Set<String>, Set<String>> indicesWithMissingShards(
*/
private void endSnapshot(SnapshotsInProgress.Entry entry, Metadata metadata, @Nullable RepositoryData repositoryData) {
final Snapshot snapshot = entry.snapshot();
final boolean newFinalization = endingSnapshots.add(snapshot);
if (entry.repositoryStateId() == RepositoryData.UNKNOWN_REPO_GEN) {
logger.debug("[{}] was aborted before starting", snapshot);
removeFailedSnapshotFromClusterState(
Expand All @@ -1844,7 +1847,6 @@ private void endSnapshot(SnapshotsInProgress.Entry entry, Metadata metadata, @Nu
removeFailedSnapshotFromClusterState(entry.snapshot(), new SnapshotException(entry.snapshot(), entry.failure()), null, null);
return;
}
final boolean newFinalization = endingSnapshots.add(snapshot);
final String repoName = snapshot.getRepository();
if (tryEnterRepoLoop(repoName)) {
if (repositoryData == null) {
Expand Down Expand Up @@ -2418,6 +2420,8 @@ private void removeFailedSnapshotFromClusterState(
@Override
public ClusterState execute(ClusterState currentState) {
final ClusterState updatedState = stateWithoutFailedSnapshot(currentState, snapshot);
assert updatedState == currentState || endingSnapshots.contains(snapshot)
: "did not track [" + snapshot + "] in ending snapshots while removing it from the cluster state";
// now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them
return updateWithSnapshots(
updatedState,
Expand Down

0 comments on commit 8fe2ee7

Please sign in to comment.