Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure error handler is called during SLM retention callback failure #55252

Merged
merged 2 commits into from
Apr 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.LatchedActionListener;
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.OriginSettingClient;
Expand Down Expand Up @@ -252,14 +251,13 @@ void getAllRetainableSnapshots(Collection<String> repositories, ActionListener<M
if (repositories.isEmpty()) {
// Skip retrieving anything if there are no repositories to fetch
listener.onResponse(Collections.emptyMap());
return;
}

client.admin().cluster()
.prepareGetSnapshots(repositories.toArray(Strings.EMPTY_ARRAY))
.setIgnoreUnavailable(true)
.execute(new ActionListener<>() {
@Override
public void onResponse(final GetSnapshotsResponse resp) {
.execute(ActionListener.wrap(resp -> {
if (logger.isTraceEnabled()) {
logger.trace("retrieved snapshots: {}",
repositories.stream()
Expand All @@ -276,14 +274,11 @@ public void onResponse(final GetSnapshotsResponse resp) {
.collect(Collectors.toList()));
});
listener.onResponse(snapshots);
}

@Override
public void onFailure(Exception e) {
},
e -> {
logger.debug(new ParameterizedMessage("unable to retrieve snapshots for [{}] repositories", repositories), e);
errorHandler.accept(e);
}
});
}));
}

static String getPolicyId(SnapshotInfo snapshotInfo) {
Expand Down Expand Up @@ -424,26 +419,21 @@ void deleteSnapshot(String slmPolicy, String repo, SnapshotId snapshot, Snapshot
logger.info("[{}] snapshot retention deleting snapshot [{}]", repo, snapshot);
CountDownLatch latch = new CountDownLatch(1);
client.admin().cluster().prepareDeleteSnapshot(repo, snapshot.getName())
.execute(new LatchedActionListener<>(new ActionListener<>() {
@Override
public void onResponse(AcknowledgedResponse acknowledgedResponse) {
.execute(new LatchedActionListener<>(ActionListener.wrap(acknowledgedResponse -> {
if (acknowledgedResponse.isAcknowledged()) {
logger.debug("[{}] snapshot [{}] deleted successfully", repo, snapshot);
} else {
logger.warn("[{}] snapshot [{}] delete issued but the request was not acknowledged", repo, snapshot);
}
slmStats.snapshotDeleted(slmPolicy);
listener.onResponse(acknowledgedResponse);
}

@Override
public void onFailure(Exception e) {
},
e -> {
logger.warn(new ParameterizedMessage("[{}] failed to delete snapshot [{}] for retention",
repo, snapshot), e);
slmStats.snapshotDeleteFailure(slmPolicy);
listener.onFailure(e);
}
}, latch));
}), latch));
try {
// Deletes cannot occur simultaneously, so wait for this
// deletion to complete before attempting the next one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

package org.elasticsearch.xpack.slm;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRequest;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.action.admin.cluster.snapshots.delete.DeleteSnapshotRequest;
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest;
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterName;
Expand Down Expand Up @@ -45,6 +52,7 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -53,6 +61,7 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.LongSupplier;
Expand Down Expand Up @@ -375,6 +384,114 @@ public void testOkToDeleteSnapshots() {
assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(true));
}

public void testErrStillRunsFailureHandlerWhenRetrieving() throws Exception {
ThreadPool threadPool = new TestThreadPool("slm-test");
final String policyId = "policy";
final String repoId = "repo";
try (ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool);
Client noOpClient = new NoOpClient("slm-test") {

@Override
@SuppressWarnings("unchecked")
protected <Request extends ActionRequest, Response extends ActionResponse>
void doExecute(ActionType<Response> action, Request request, ActionListener<Response> listener) {
if (request instanceof GetSnapshotsRequest) {
logger.info("--> called");
listener.onResponse((Response) new GetSnapshotsResponse(
Collections.singleton(GetSnapshotsResponse.Response.snapshots(repoId, Collections.emptyList()))));
} else {
super.doExecute(action, request, listener);
}
}
}) {
SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));

ClusterState state = createState(policy);
ClusterServiceUtils.setState(clusterService, state);

SnapshotRetentionTask task = new SnapshotRetentionTask(noOpClient, clusterService,
System::nanoTime,
new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC,
(historyItem) -> fail("should never write history")),
threadPool);

AtomicReference<Exception> errHandlerCalled = new AtomicReference<>(null);
task.getAllRetainableSnapshots(Collections.singleton(repoId), new ActionListener<>() {
@Override
public void onResponse(Map<String, List<SnapshotInfo>> stringListMap) {
logger.info("--> forcing failure");
throw new ElasticsearchException("forced failure");
}

@Override
public void onFailure(Exception e) {
fail("we have another err handler that should have been called");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this confusing, but I see it's just how getAllRetainableSnapshots does the listener callback, so maybe it's a discussion we should have on a separate ocasion

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we can do that maybe in future work since it's technically not really a bugfix.

}
}, errHandlerCalled::set);

assertNotNull(errHandlerCalled.get());
assertThat(errHandlerCalled.get().getMessage(), equalTo("forced failure"));
} finally {
threadPool.shutdownNow();
threadPool.awaitTermination(10, TimeUnit.SECONDS);
}
}

public void testErrStillRunsFailureHandlerWhenDeleting() throws Exception {
ThreadPool threadPool = new TestThreadPool("slm-test");
try (ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool);
Client noOpClient = new NoOpClient("slm-test") {

@Override
@SuppressWarnings("unchecked")
protected <Request extends ActionRequest, Response extends ActionResponse>
void doExecute(ActionType<Response> action, Request request, ActionListener<Response> listener) {
if (request instanceof DeleteSnapshotRequest) {
logger.info("--> called");
listener.onResponse((Response) new AcknowledgedResponse(true));
} else {
super.doExecute(action, request, listener);
}
}
}) {
final String policyId = "policy";
final String repoId = "repo";
SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));

ClusterState state = createState(policy);
ClusterServiceUtils.setState(clusterService, state);

SnapshotRetentionTask task = new SnapshotRetentionTask(noOpClient, clusterService,
System::nanoTime,
new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC,
(historyItem) -> fail("should never write history")),
threadPool);

AtomicBoolean onFailureCalled = new AtomicBoolean(false);
AtomicReference<Exception> errHandlerCalled = new AtomicReference<>(null);
task.deleteSnapshot("policy", "foo", new SnapshotId("name", "uuid"),
new SnapshotLifecycleStats(0, 0, 0, 0, new HashMap<>()), new ActionListener<>() {
@Override
public void onResponse(AcknowledgedResponse acknowledgedResponse) {
logger.info("--> forcing failure");
throw new ElasticsearchException("forced failure");
}

@Override
public void onFailure(Exception e) {
onFailureCalled.set(true);
}
});

assertThat(onFailureCalled.get(), equalTo(true));
} finally {
threadPool.shutdownNow();
threadPool.awaitTermination(10, TimeUnit.SECONDS);
}
}

public void testSkipWhileStopping() throws Exception {
doTestSkipDuringMode(OperationMode.STOPPING);
}
Expand Down