Skip to content

Commit

Permalink
Fix Snapshot Getting Stuck if Snapshot Queued after Delete has Shard …
Browse files Browse the repository at this point in the history
…in State MISSING (#78587) (#78589)

As the title says: if we reassign a shard to `MISSING`, then we must keep assigning tasks for that shard
in the case of queued clones, and/or keep marking that shard `MISSING` in subsequently queued snapshots as well.
  • Loading branch information
original-brownbear committed Oct 3, 2021
1 parent 7264d12 commit 0940c63
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -1735,6 +1735,120 @@ public void testQueuedAfterFailedShardSnapshot() throws Exception {
assertEquals(snapshotsStatusResponse1, snapshotsStatusResponse3);
}

public void testCloneQueuedAfterMissingShard() throws Exception {
final String master = internalCluster().startMasterOnlyNode();
final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
final String index1 = "index-1";
final String index2 = "index-2";
createIndexWithContent(index1, dataNodes.get(0), dataNodes.get(1));
createIndexWithContent(index2, dataNodes.get(1), dataNodes.get(0));

final String repository = "test-repo";
createRepository(repository, "mock");
final String snapshotToDelete = "snapshot-to-delete";
createFullSnapshot(repository, snapshotToDelete);
final String cloneSource = "source-snapshot";
createFullSnapshot(repository, cloneSource);

internalCluster().stopNode(dataNodes.get(0));

blockMasterOnWriteIndexFile(repository);
final ActionFuture<AcknowledgedResponse> deleteFuture = clusterAdmin().prepareDeleteSnapshot(repository, snapshotToDelete)
.execute();
awaitNDeletionsInProgress(1);

final ActionFuture<CreateSnapshotResponse> snapshot1 = startFullSnapshot(repository, "snapshot-1", true);
awaitNumberOfSnapshotsInProgress(1);

final ActionFuture<AcknowledgedResponse> cloneFuture = clusterAdmin().prepareCloneSnapshot(
repository,
cloneSource,
"target-snapshot"
).setIndices(index1).execute();
awaitNumberOfSnapshotsInProgress(2);

unblockNode(repository, master);
assertAcked(deleteFuture.get());
assertAcked(cloneFuture.get());
awaitNoMoreRunningOperations();
assertThat(snapshot1.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
}

public void testSnapshotQueuedAfterMissingShard() throws Exception {
final String master = internalCluster().startMasterOnlyNode();
final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
final String index1 = "index-1";
final String index2 = "index-2";
createIndexWithContent(index1, dataNodes.get(0), dataNodes.get(1));
createIndexWithContent(index2, dataNodes.get(1), dataNodes.get(0));

final String repository = "test-repo";
createRepository(repository, "mock");
final String snapshotToDelete = "snapshot-to-delete";
createFullSnapshot(repository, snapshotToDelete);

internalCluster().stopNode(dataNodes.get(0));

blockMasterOnWriteIndexFile(repository);
final ActionFuture<AcknowledgedResponse> deleteFuture = startDeleteSnapshot(repository, snapshotToDelete);
awaitNDeletionsInProgress(1);

final ActionFuture<CreateSnapshotResponse> snapshot1 = startFullSnapshot(repository, "snapshot-1", true);
awaitNumberOfSnapshotsInProgress(1);

final ActionFuture<CreateSnapshotResponse> snapshot2 = startFullSnapshot(repository, "snapshot-2", true);
awaitNumberOfSnapshotsInProgress(2);

unblockNode(repository, master);
assertAcked(deleteFuture.get());
awaitNoMoreRunningOperations();
assertThat(snapshot1.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertThat(snapshot2.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
}

public void testSnapshotAndCloneQueuedAfterMissingShard() throws Exception {
final String master = internalCluster().startMasterOnlyNode();
final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
final String index1 = "index-1";
final String index2 = "index-2";
createIndexWithContent(index1, dataNodes.get(0), dataNodes.get(1));
createIndexWithContent(index2, dataNodes.get(1), dataNodes.get(0));

final String repository = "test-repo";
createRepository(repository, "mock");
final String snapshotToDelete = "snapshot-to-delete";
createFullSnapshot(repository, snapshotToDelete);
final String cloneSource = "source-snapshot";
createFullSnapshot(repository, cloneSource);

internalCluster().stopNode(dataNodes.get(0));

blockMasterOnWriteIndexFile(repository);
final ActionFuture<AcknowledgedResponse> deleteFuture = clusterAdmin().prepareDeleteSnapshot(repository, snapshotToDelete)
.execute();
awaitNDeletionsInProgress(1);

final ActionFuture<CreateSnapshotResponse> snapshot1 = startFullSnapshot(repository, "snapshot-1", true);
awaitNumberOfSnapshotsInProgress(1);

final ActionFuture<CreateSnapshotResponse> snapshot2 = startFullSnapshot(repository, "snapshot-2", true);
awaitNumberOfSnapshotsInProgress(2);

final ActionFuture<AcknowledgedResponse> cloneFuture = clusterAdmin().prepareCloneSnapshot(
repository,
cloneSource,
"target-snapshot"
).setIndices(index1).execute();
awaitNumberOfSnapshotsInProgress(3);

unblockNode(repository, master);
assertAcked(deleteFuture.get());
assertAcked(cloneFuture.get());
awaitNoMoreRunningOperations();
assertThat(snapshot1.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertThat(snapshot2.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
}

private static void assertSnapshotStatusCountOnRepo(String otherBlockedRepoName, int count) {
final SnapshotsStatusResponse snapshotsStatusResponse = client().admin()
.cluster()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3335,7 +3335,9 @@ private SnapshotsInProgress updatedSnapshotsInProgress(ClusterState currentState
: "Missing assignment for [" + sid + "]";
updatedAssignmentsBuilder.put(sid, ShardSnapshotStatus.MISSING);
} else {
markShardReassigned(shardId, reassignedShardIds);
if (updated.isActive()) {
markShardReassigned(shardId, reassignedShardIds);
}
updatedAssignmentsBuilder.put(sid, updated);
}
}
Expand Down

0 comments on commit 0940c63

Please sign in to comment.