Skip to content

Commit

Permalink
Avoid capturing SnapshotsInProgress$Entry in queue (#88707) (#88724)
Browse files Browse the repository at this point in the history
Today each time there's shards to snapshot we enqueue a lambda which
captures the current `SnapshotsInProgress$Entry`. This is a pretty
heavyweight object, possibly several MB in size, most of which is not
necessary to capture, and with concurrent snapshots across thousands of
shards we may enqueue many hundreds of slightly different such objects.
With this commit we compute a more efficient representation of the work
to be done by each task in the queue instead.

Relates #77466
  • Loading branch information
DaveCTurner committed Jul 22, 2022
1 parent c7f4454 commit 8c118e1
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 47 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/88707.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 88707
summary: Avoid capturing `SnapshotsInProgress$Entry` in queue
area: Snapshot/Restore
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -226,7 +227,24 @@ private void startNewSnapshots(List<SnapshotsInProgress.Entry> snapshotsInProgre
}
if (startedShards != null && startedShards.isEmpty() == false) {
shardSnapshots.computeIfAbsent(snapshot, s -> new HashMap<>()).putAll(startedShards);
startNewShards(entry, startedShards);

final List<Runnable> shardSnapshotTasks = new ArrayList<>(startedShards.size());
for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : startedShards.entrySet()) {
final ShardId shardId = shardEntry.getKey();
final IndexShardSnapshotStatus snapshotStatus = shardEntry.getValue();
final IndexId indexId = entry.indices().get(shardId.getIndexName());
assert indexId != null;
assert SnapshotsService.useShardGenerations(entry.version())
|| ShardGenerations.fixShardGeneration(snapshotStatus.generation()) == null
: "Found non-null, non-numeric shard generation ["
+ snapshotStatus.generation()
+ "] for snapshot with old-format compatibility";
shardSnapshotTasks.add(
newShardSnapshotTask(shardId, snapshot, indexId, entry.userMetadata(), snapshotStatus, entry.version())
);
}

threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(() -> shardSnapshotTasks.forEach(Runnable::run));
}
} else if (entryState == State.ABORTED) {
// Abort all running shards for this snapshot
Expand All @@ -249,53 +267,47 @@ private void startNewSnapshots(List<SnapshotsInProgress.Entry> snapshotsInProgre
}
}

private void startNewShards(SnapshotsInProgress.Entry entry, Map<ShardId, IndexShardSnapshotStatus> startedShards) {
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(() -> {
final Snapshot snapshot = entry.snapshot();
for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : startedShards.entrySet()) {
final ShardId shardId = shardEntry.getKey();
final IndexShardSnapshotStatus snapshotStatus = shardEntry.getValue();
final IndexId indexId = entry.indices().get(shardId.getIndexName());
assert indexId != null;
assert SnapshotsService.useShardGenerations(entry.version())
|| ShardGenerations.fixShardGeneration(snapshotStatus.generation()) == null
: "Found non-null, non-numeric shard generation ["
+ snapshotStatus.generation()
+ "] for snapshot with old-format compatibility";
snapshot(shardId, snapshot, indexId, entry.userMetadata(), snapshotStatus, entry.version(), new ActionListener<>() {
@Override
public void onResponse(ShardSnapshotResult shardSnapshotResult) {
final ShardGeneration newGeneration = shardSnapshotResult.getGeneration();
assert newGeneration != null;
assert newGeneration.equals(snapshotStatus.generation());
if (logger.isDebugEnabled()) {
final IndexShardSnapshotStatus.Copy lastSnapshotStatus = snapshotStatus.asCopy();
logger.debug(
"[{}][{}] completed snapshot to [{}] with status [{}] at generation [{}]",
shardId,
snapshot,
snapshot.getRepository(),
lastSnapshotStatus,
snapshotStatus.generation()
);
}
notifySuccessfulSnapshotShard(snapshot, shardId, shardSnapshotResult);
}
private Runnable newShardSnapshotTask(
final ShardId shardId,
final Snapshot snapshot,
final IndexId indexId,
final Map<String, Object> userMetadata,
final IndexShardSnapshotStatus snapshotStatus,
final Version entryVersion
) {
// separate method to make sure this lambda doesn't capture any heavy local objects like a SnapshotsInProgress.Entry
return () -> snapshot(shardId, snapshot, indexId, userMetadata, snapshotStatus, entryVersion, new ActionListener<>() {
@Override
public void onResponse(ShardSnapshotResult shardSnapshotResult) {
final ShardGeneration newGeneration = shardSnapshotResult.getGeneration();
assert newGeneration != null;
assert newGeneration.equals(snapshotStatus.generation());
if (logger.isDebugEnabled()) {
final IndexShardSnapshotStatus.Copy lastSnapshotStatus = snapshotStatus.asCopy();
logger.debug(
"[{}][{}] completed snapshot to [{}] with status [{}] at generation [{}]",
shardId,
snapshot,
snapshot.getRepository(),
lastSnapshotStatus,
snapshotStatus.generation()
);
}
notifySuccessfulSnapshotShard(snapshot, shardId, shardSnapshotResult);
}

@Override
public void onFailure(Exception e) {
final String failure;
if (e instanceof AbortedSnapshotException) {
failure = "aborted";
logger.debug(() -> format("[%s][%s] aborted shard snapshot", shardId, snapshot), e);
} else {
failure = summarizeFailure(e);
logger.warn(() -> format("[%s][%s] failed to snapshot shard", shardId, snapshot), e);
}
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
notifyFailedSnapshotShard(snapshot, shardId, failure, snapshotStatus.generation());
}
});
@Override
public void onFailure(Exception e) {
final String failure;
if (e instanceof AbortedSnapshotException) {
failure = "aborted";
logger.debug(() -> format("[%s][%s] aborted shard snapshot", shardId, snapshot), e);
} else {
failure = summarizeFailure(e);
logger.warn(() -> format("[%s][%s] failed to snapshot shard", shardId, snapshot), e);
}
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
notifyFailedSnapshotShard(snapshot, shardId, failure, snapshotStatus.generation());
}
});
}
Expand Down

0 comments on commit 8c118e1

Please sign in to comment.