Skip to content

Commit

Permalink
Fix Partial Snapshots Recording Spurious Errors (#69150) (#69373)
Browse files Browse the repository at this point in the history
If an index is deleted while a partial snapshot is running the behavior
was not deterministic.
If an index was deleted just as one of its shard snapshots was about to
start then it would be recorded as a shard snapshot failure in the
snapshot result and the snapshot would show up as `PARTIAL`.
If the index delete however happened after the shard had been
snapshotted, then the snapshot would show `SUCCESS`.
In both cases however, the snapshot would contain the exact same data
because the deleted index would become part of the final snapshot.
Also, it was confusing that in the `PARTIAL` case, there would be errors
recorded for shards the indices of which would not be part of the
snapshot.

This commit makes it such that not only are indices filtered from the
list of indices in a snapshot but also from the shard snapshot errors
in a snapshot entry so that the snapshot always shows up as `SUCCESS`
because concurrent index deletes are not a failure but allowed in
partial snapshots.

Closes #69014

Co-authored-by: Tamara Braun <summerwind.1005@gmail.com>
  • Loading branch information
original-brownbear and TommyWind committed Feb 23, 2021
1 parent db58272 commit 293436c
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -823,8 +823,8 @@ public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception {
assertAcked(client().admin().indices().prepareDelete("index-two"));
unblockNode(repoName, masterNode);

assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS));
assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS));
assertAcked(deleteFuture.get());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1017,11 +1017,34 @@ public void testSnapshotDeleteRelocatingPrimaryIndex() throws Exception {

awaitNoMoreRunningOperations();
SnapshotInfo snapshotInfo = getSnapshot(repoName, "test-snap");
assertThat(snapshotInfo.state(), equalTo(SnapshotState.PARTIAL));
assertThat(snapshotInfo.shardFailures().size(), greaterThan(0));
assertThat(snapshotInfo.state(), equalTo(SnapshotState.SUCCESS));
logger.info("--> done");
}

public void testPartialSnapshotsDoNotRecordDeletedShardFailures() throws Exception {
internalCluster().startMasterOnlyNodes(1);
final String dataNode = internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");

final String firstIndex = "index-one";
createIndexWithContent(firstIndex);
final String secondIndex = "index-two";
createIndexWithContent(secondIndex);

final String snapshot = "snapshot-one";
blockDataNode(repoName, dataNode);
final ActionFuture<CreateSnapshotResponse> snapshotResponse = startFullSnapshot(repoName, snapshot, true);
waitForBlock(dataNode, repoName);

assertAcked(client().admin().indices().prepareDelete(firstIndex));

unblockNode(repoName, dataNode);

SnapshotInfo snapshotInfo = assertSuccessful(snapshotResponse);
assertThat(snapshotInfo.shardFailures(), empty());
}

public void testGetReposWithWildcard() {
internalCluster().startMasterOnlyNode();
List<RepositoryMetadata> repositoryMetadata = client().admin().cluster().prepareGetRepositories("*").get().repositories();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -816,15 +816,16 @@ public void testPartialSnapshotsOfSystemIndexRemovesFeatureState() throws Except
});
}

@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/69014")
public void testParallelIndexDeleteRemovesFeatureState() throws Exception {
final String indexToBeDeleted = SystemIndexTestPlugin.SYSTEM_INDEX_NAME;
final String fullIndexName = AnotherSystemIndexTestPlugin.SYSTEM_INDEX_NAME;
final String nonsystemIndex = "nonsystem-idx";

final int nodesInCluster = internalCluster().size();
// Stop one data node so we only have one data node to start with
internalCluster().stopNode(dataNodes.get(1));
dataNodes.remove(1);
ensureStableCluster(nodesInCluster - 1);

createRepositoryNoVerify(REPO_NAME, "mock");

Expand Down Expand Up @@ -866,7 +867,7 @@ public void testParallelIndexDeleteRemovesFeatureState() throws Exception {
unblockNode(REPO_NAME, dataNodes.get(1));

logger.info("--> Repo unblocked, checking that snapshot finished...");
CreateSnapshotResponse createSnapshotResponse = createSnapshotFuture.actionGet();
CreateSnapshotResponse createSnapshotResponse = createSnapshotFuture.get();
logger.info(createSnapshotResponse.toString());
assertThat(createSnapshotResponse.status(), equalTo(RestStatus.OK));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1628,9 +1628,16 @@ private void finalizeSnapshotEntry(SnapshotsInProgress.Entry entry, Metadata met
final String failure = entry.failure();
final Snapshot snapshot = entry.snapshot();
logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
final ShardGenerations shardGenerations = buildGenerations(entry, metadata);
final List<String> finalIndices = shardGenerations.indices().stream().map(IndexId::getName).collect(Collectors.toList());
final Set<String> indexNames = new HashSet<>(finalIndices);
ArrayList<SnapshotShardFailure> shardFailures = new ArrayList<>();
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
ShardId shardId = shardStatus.key;
if (indexNames.contains(shardId.getIndexName()) == false) {
assert entry.partial() : "only ignoring shard failures for concurrently deleted indices for partial snapshots";
continue;
}
ShardSnapshotStatus status = shardStatus.value;
final ShardState state = status.state();
if (state.failed()) {
Expand All @@ -1641,7 +1648,6 @@ private void finalizeSnapshotEntry(SnapshotsInProgress.Entry entry, Metadata met
assert state == ShardState.SUCCESS;
}
}
final ShardGenerations shardGenerations = buildGenerations(entry, metadata);
final String repository = snapshot.getRepository();
final StepListener<Metadata> metadataListener = new StepListener<>();
final Repository repo = repositoriesService.repository(snapshot.getRepository());
Expand Down Expand Up @@ -1671,9 +1677,6 @@ private void finalizeSnapshotEntry(SnapshotsInProgress.Entry entry, Metadata met
}
metadataListener.whenComplete(meta -> {
final Metadata metaForSnapshot = metadataForSnapshot(entry, meta);
final List<String> finalIndices = shardGenerations.indices().stream()
.map(IndexId::getName)
.collect(Collectors.toList());
final SnapshotInfo snapshotInfo = new SnapshotInfo(snapshot.getSnapshotId(),
finalIndices,
entry.partial() ? entry.dataStreams().stream()
Expand Down

0 comments on commit 293436c

Please sign in to comment.