Skip to content

Commit

Permalink
Fix Concurrent Snapshot Repository Remove Issues (#74880) (#75122)
Browse files Browse the repository at this point in the history
We need to fail repository operations as soon as a repository has been shut down
to avoid getting into broken cluster state situations where operations are created
in the cluster state but the repository they target was removed concurrently.

closes #74858
  • Loading branch information
original-brownbear committed Jul 8, 2021
1 parent 5b5dca2 commit 54459eb
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.elasticsearch.indices.recovery.RecoveryState;
import org.elasticsearch.node.Node;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.repositories.RepositoryException;
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
import org.elasticsearch.rest.AbstractRestChannel;
import org.elasticsearch.rest.RestRequest;
Expand Down Expand Up @@ -89,6 +90,7 @@
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFutureThrows;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.either;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
Expand Down Expand Up @@ -1115,6 +1117,45 @@ public void testGetReposWithWildcard() {
assertThat(repositoryMetadata, empty());
}

public void testConcurrentSnapshotAndRepoDelete() throws Exception {
internalCluster().startMasterOnlyNodes(1);
internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "fs");

// create a few snapshots so deletes will run for a while
final int snapshotCount = randomIntBetween(10, 25);
final List<String> snapshotNames = createNSnapshots(repoName, snapshotCount);

// concurrently trigger repository and snapshot deletes
final List<ActionFuture<AcknowledgedResponse>> deleteFutures = new ArrayList<>(snapshotCount);
final ActionFuture<AcknowledgedResponse> deleteRepoFuture = clusterAdmin().prepareDeleteRepository(repoName).execute();
for (String snapshotName : snapshotNames) {
deleteFutures.add(clusterAdmin().prepareDeleteSnapshot(repoName, snapshotName).execute());
}

try {
assertAcked(deleteRepoFuture.actionGet());
} catch (Exception e) {
assertThat(
e.getMessage(),
containsString(
"trying to modify or unregister repository [test-repo] that is currently used (snapshot deletion is in progress)"
)
);
}
for (ActionFuture<AcknowledgedResponse> deleteFuture : deleteFutures) {
try {
assertAcked(deleteFuture.actionGet());
} catch (RepositoryException e) {
assertThat(
e.getMessage(),
either(containsString("[test-repo] repository is not in started state")).or(containsString("[test-repo] missing"))
);
}
}
}

private long calculateTotalFilesSize(List<Path> files) {
return files.stream().mapToLong(f -> {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,11 @@ private void cleanupRepo(String repositoryName, ActionListener<RepositoryCleanup

@Override
public ClusterState execute(ClusterState currentState) {
final RepositoryCleanupInProgress repositoryCleanupInProgress =
currentState.custom(RepositoryCleanupInProgress.TYPE, RepositoryCleanupInProgress.EMPTY);
SnapshotsService.ensureRepositoryExists(repositoryName, currentState);
final RepositoryCleanupInProgress repositoryCleanupInProgress = currentState.custom(
RepositoryCleanupInProgress.TYPE,
RepositoryCleanupInProgress.EMPTY
);
if (repositoryCleanupInProgress.hasCleanupInProgress()) {
throw new IllegalStateException(
"Cannot cleanup [" + repositoryName + "] - a repository cleanup is already in-progress in ["
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,10 @@ protected BlobStore getBlobStore() {
protected BlobContainer blobContainer() {
assertSnapshotOrGenericThread();

if (lifecycle.started() == false) {
throw notStartedException();
}

BlobContainer blobContainer = this.blobContainer.get();
if (blobContainer == null) {
synchronized (lock) {
Expand Down Expand Up @@ -583,7 +587,7 @@ public BlobStore blobStore() {
store = blobStore.get();
if (store == null) {
if (lifecycle.started() == false) {
throw new RepositoryException(metadata.name(), "repository is not in started state");
throw notStartedException();
}
try {
store = createBlobStore();
Expand Down Expand Up @@ -1365,6 +1369,11 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
// master-eligible or not.
assert clusterService.localNode().isMasterNode() : "should only load repository data on master nodes";

if (lifecycle.started() == false) {
listener.onFailure(notStartedException());
return;
}

if (latestKnownRepoGen.get() == RepositoryData.CORRUPTED_REPO_GEN) {
listener.onFailure(corruptedStateException(null, null));
return;
Expand Down Expand Up @@ -1396,6 +1405,10 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
}
}

private RepositoryException notStartedException() {
return new RepositoryException(metadata.name(), "repository is not in started state");
}

// Listener used to ensure that repository data is only initialized once in the cluster state by #initializeRepoGenerationTracking
private ListenableActionFuture<RepositoryData> repoDataInitialized;

Expand Down Expand Up @@ -2042,9 +2055,11 @@ private ClusterState updateRepositoryGenerationsIfNecessary(ClusterState state,
}

private RepositoryMetadata getRepoMetadata(ClusterState state) {
final RepositoryMetadata repositoryMetadata =
state.getMetadata().<RepositoriesMetadata>custom(RepositoriesMetadata.TYPE).repository(metadata.name());
assert repositoryMetadata != null;
final RepositoryMetadata repositoryMetadata = state.getMetadata()
.<RepositoriesMetadata>custom(RepositoriesMetadata.TYPE)
.repository(metadata.name());
assert repositoryMetadata != null || lifecycle.stoppedOrClosed()
: "did not find metadata for repo [" + metadata.name() + "] in state [" + lifecycleState() + "]";
return repositoryMetadata;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ public void createSnapshotLegacy(final CreateSnapshotRequest request, final Acti
@Override
public ClusterState execute(ClusterState currentState) {
validate(repositoryName, snapshotName, currentState);
ensureRepositoryExists(repositoryName, currentState);
SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(SnapshotDeletionsInProgress.TYPE);
if (deletionsInProgress != null && deletionsInProgress.hasDeletionsInProgress()) {
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName,
Expand Down Expand Up @@ -413,6 +414,7 @@ public void createSnapshot(final CreateSnapshotRequest request, final ActionList

@Override
public ClusterState execute(ClusterState currentState) {
ensureRepositoryExists(repositoryName, currentState);
ensureSnapshotNameAvailableInRepo(repositoryData, snapshotName, repository);
final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
final List<SnapshotsInProgress.Entry> runningSnapshots = snapshots.entries();
Expand Down Expand Up @@ -557,6 +559,7 @@ public void cloneSnapshot(CloneSnapshotRequest request, ActionListener<Void> lis

@Override
public ClusterState execute(ClusterState currentState) {
ensureRepositoryExists(repositoryName, currentState);
ensureSnapshotNameAvailableInRepo(repositoryData, snapshotName, repository);
ensureNoCleanupInProgress(currentState, repositoryName, snapshotName);
final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
Expand Down Expand Up @@ -808,6 +811,15 @@ private void ensureBelowConcurrencyLimit(String repository, String name, Snapsho
}
}

/**
* Throws {@link RepositoryMissingException} if no repository by the given name is found in the given cluster state.
*/
public static void ensureRepositoryExists(String repoName, ClusterState state) {
if (state.metadata().custom(RepositoriesMetadata.TYPE, RepositoriesMetadata.EMPTY).repository(repoName) == null) {
throw new RepositoryMissingException(repoName);
}
}

/**
* Validates snapshot request
*
Expand Down Expand Up @@ -2038,6 +2050,7 @@ public ClusterState execute(ClusterState currentState) throws Exception {
+ MULTI_DELETE_VERSION + "] but cluster contained node of version [" + currentState.nodes().getMinNodeVersion()
+ "]");
}
ensureRepositoryExists(repoName, currentState);
final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
final List<SnapshotsInProgress.Entry> snapshotEntries = findInProgressSnapshots(snapshots, snapshotNames, repoName);
final List<SnapshotId> snapshotIds = matchingSnapshotIds(
Expand Down

0 comments on commit 54459eb

Please sign in to comment.