Skip to content

Commit

Permalink
Stop Serializing Exceptions in SnapshotInfo (#57866) (#57899)
Browse files Browse the repository at this point in the history
In ff9e8c6 we changed the format
used when serializing snapshot failures in the cluster state and
`SnapshotInfo`. This turned them from a short string holding all the
nested exception messages into a multi kb stacktrace in many cases.
This is not great if you snapshot a large number of shards that all fail
for example and massively blows up the size of the GET snapshots response
if there are snapshots with failures in there.
This change reverts to the format used for exceptions before the above commit.

Also, this change short circuits logging and serialization of the failure
for an aborted snapshot where we don't care about the specific message at all
and aligns the message to "aborted" in all cases (current if we aborted before any IO,
it would have been "aborted" and an exception when aborting later during IO).
  • Loading branch information
original-brownbear committed Jun 9, 2020
1 parent fab2522 commit fd271fc
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.endsWith;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
Expand Down Expand Up @@ -3180,7 +3181,7 @@ public void testSnapshotCanceledOnRemovedShard() throws Exception {
SnapshotInfo snapshotInfo = waitForCompletion(repo, snapshot, TimeValue.timeValueSeconds(60));
assertEquals(1, snapshotInfo.shardFailures().size());
assertEquals(0, snapshotInfo.shardFailures().get(0).shardId());
assertThat(snapshotInfo.shardFailures().get(0).reason(), containsString("IndexShardSnapshotFailedException[Aborted]"));
assertThat(snapshotInfo.shardFailures().get(0).reason(), is("aborted"));
}

public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
Expand Down Expand Up @@ -3222,7 +3223,7 @@ public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
assertThat(getFailureCount("test-repo"), greaterThan(0L));
assertThat(createSnapshotResponse.getSnapshotInfo().shardFailures().size(), greaterThan(0));
for (SnapshotShardFailure shardFailure : createSnapshotResponse.getSnapshotInfo().shardFailures()) {
assertThat(shardFailure.reason(), containsString("Random IOException"));
assertThat(shardFailure.reason(), endsWith("; nested: IOException[Random IOException]"));
}
}
} catch (SnapshotException | RepositoryException ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
import org.elasticsearch.repositories.RepositoryVerificationException;
import org.elasticsearch.snapshots.SnapshotCreationException;
import org.elasticsearch.repositories.ShardGenerations;
import org.elasticsearch.snapshots.AbortedSnapshotException;
import org.elasticsearch.snapshots.SnapshotException;
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo;
Expand Down Expand Up @@ -1700,7 +1701,7 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
for (String fileName : fileNames) {
if (snapshotStatus.isAborted()) {
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName);
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
throw new AbortedSnapshotException();
}

logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
Expand Down Expand Up @@ -2148,7 +2149,7 @@ private void checkAborted() {
if (snapshotStatus.isAborted()) {
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId,
snapshotId, fileInfo.physicalName());
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
throw new AbortedSnapshotException();
}
}
};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.snapshots;

public final class AbortedSnapshotException extends RuntimeException {
public AbortedSnapshotException() {
super("aborted");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.IndexCommit;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRequestValidationException;
Expand Down Expand Up @@ -302,16 +301,44 @@ public void onResponse(String newGeneration) {

@Override
public void onFailure(Exception e) {
final String failure = ExceptionsHelper.stackTrace(e);
final String failure;
if (e instanceof AbortedSnapshotException) {
failure = "aborted";
logger.debug(() -> new ParameterizedMessage("[{}][{}] aborted shard snapshot", shardId, snapshot), e);
} else {
failure = summarizeFailure(e);
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
}
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
notifyFailedSnapshotShard(snapshot, shardId, failure);
}
});
}
});
}

//package private for testing
static String summarizeFailure(Throwable t) {
if (t.getCause() == null) {
return t.getClass().getSimpleName() + "[" + t.getMessage() + "]";
} else {
StringBuilder sb = new StringBuilder();
while (t != null) {
sb.append(t.getClass().getSimpleName());
if (t.getMessage() != null) {
sb.append("[");
sb.append(t.getMessage());
sb.append("]");
}
t = t.getCause();
if (t != null) {
sb.append("; nested: ");
}
}
return sb.toString();
}
}

/**
* Creates shard snapshot
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,24 @@
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.EqualsHashCodeTestUtils;

import java.io.IOException;

import static org.hamcrest.Matchers.is;

public class SnapshotShardsServiceTests extends ESTestCase {

public void testSummarizeFailure() {
final RuntimeException wrapped = new RuntimeException("wrapped");
assertThat(SnapshotShardsService.summarizeFailure(wrapped), is("RuntimeException[wrapped]"));
final RuntimeException wrappedWithNested = new RuntimeException("wrapped", new IOException("nested"));
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithNested),
is("RuntimeException[wrapped]; nested: IOException[nested]"));
final RuntimeException wrappedWithTwoNested =
new RuntimeException("wrapped", new IOException("nested", new IOException("root")));
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithTwoNested),
is("RuntimeException[wrapped]; nested: IOException[nested]; nested: IOException[root]"));
}

public void testEqualsAndHashcodeUpdateIndexShardSnapshotStatusRequest() {
EqualsHashCodeTestUtils.checkEqualsAndHashCode(
new SnapshotShardsService.UpdateIndexShardSnapshotStatusRequest(
Expand Down

0 comments on commit fd271fc

Please sign in to comment.