Skip to content

Commit

Permalink
More detailed logging for snapshot-based recovery (#93469)
Browse files Browse the repository at this point in the history
The logic for planning a snapshot-based recovery is a little intricate
and lacks much logging, making it hard to determine the reason why a
snapshot-based recovery might not occur in production. This commit adds
some extra logging showing details of the branches that ES takes.
  • Loading branch information
DaveCTurner committed Feb 6, 2023
1 parent d3d1bbc commit 1fb3a1b
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public ShardSnapshotsService(
}

public void fetchLatestSnapshotsForShard(ShardId shardId, ActionListener<Optional<ShardSnapshot>> listener) {
assert shardId != null : "SharId was null but a value was expected";
assert shardId != null : "ShardId was null but a value was expected";

final RepositoriesMetadata currentReposMetadata = clusterService.state()
.metadata()
Expand Down Expand Up @@ -104,24 +104,30 @@ public void fetchLatestSnapshotsForShard(ShardId shardId, ActionListener<Optiona
client.execute(
GetShardSnapshotAction.INSTANCE,
request,
new ThreadedActionListener<>(threadPool.generic(), listener.map(this::fetchSnapshotFiles))
new ThreadedActionListener<>(
threadPool.generic(),
listener.map(shardSnapshotResponse -> fetchSnapshotFiles(shardId, shardSnapshotResponse))
)
);
}

private Optional<ShardSnapshot> fetchSnapshotFiles(GetShardSnapshotResponse shardSnapshotResponse) {
private Optional<ShardSnapshot> fetchSnapshotFiles(ShardId shardId, GetShardSnapshotResponse shardSnapshotResponse) {
assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.GENERIC);

final Optional<ShardSnapshotInfo> latestShardSnapshotOpt = shardSnapshotResponse.getLatestShardSnapshot();
if (latestShardSnapshotOpt.isEmpty()) {
logger.debug("{} no latest shard snapshot found", shardId);
return Optional.empty();
}

final ShardSnapshotInfo latestShardSnapshot = latestShardSnapshotOpt.get();
try {
final Snapshot snapshot = latestShardSnapshot.getSnapshot();
logger.debug("{} considering recovery from [{}][{}]", shardId, snapshot.getRepository(), snapshot.getSnapshotId());

final Repository repository = repositoriesService.repository(snapshot.getRepository());
if (repository instanceof BlobStoreRepository == false) {
logger.debug("{} not recovering from snapshot in non-blobstore repository [{}]", shardId, snapshot.getRepository());
return Optional.empty();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

package org.elasticsearch.repositories;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.cluster.metadata.IndexMetadata;
Expand All @@ -28,6 +30,9 @@
import java.util.Optional;

public class IndexSnapshotsService {

private static final Logger logger = LogManager.getLogger(IndexSnapshotsService.class);

private static final Comparator<Tuple<SnapshotId, RepositoryData.SnapshotDetails>> START_TIME_COMPARATOR = Comparator.<
Tuple<SnapshotId, RepositoryData.SnapshotDetails>>comparingLong(pair -> pair.v2().getStartTimeMillis()).thenComparing(Tuple::v1);

Expand Down Expand Up @@ -62,6 +67,7 @@ public void getLatestSuccessfulSnapshotForShard(

repositoryDataStepListener.whenComplete(repositoryData -> {
if (repositoryData.hasIndex(indexName) == false) {
logger.debug("{} repository [{}] has no snapshots of this index", shardId, repositoryName);
listener.onResponse(Optional.empty());
return;
}
Expand All @@ -81,11 +87,13 @@ public void getLatestSuccessfulSnapshotForShard(
// have the start/end date populated in RepositoryData. We could fetch all the backups and find out if there is
// a valid candidate, but for simplicity we just consider that we couldn't find any valid snapshot. Existing
// snapshots start/end timestamps should appear in the RepositoryData eventually.
logger.debug("{} could not determine latest snapshot of this shard in repository [{}]", shardId, repositoryName);
listener.onResponse(Optional.empty());
return;
}

final SnapshotId snapshotId = latestSnapshotId.get();
logger.debug("{} fetching details of [{}][{}]", shardId, repositoryName, snapshotId);
repository.getSnapshotInfo(
snapshotId,
snapshotInfoStepListener.map(
Expand All @@ -100,6 +108,7 @@ public void getLatestSuccessfulSnapshotForShard(

if (snapshotInfo == null || snapshotInfo.state() != SnapshotState.SUCCESS) {
// We couldn't find a valid candidate
logger.debug("{} failed to retrieve snapshot details from [{}]", shardId, repositoryName);
listener.onResponse(Optional.empty());
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.Strings;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.indices.recovery.RecoverySettings;
Expand Down Expand Up @@ -68,6 +70,7 @@ public void computeRecoveryPlan(
latestSnapshotOpt -> ActionListener.completeWith(
listener,
() -> computeRecoveryPlanWithSnapshots(
shardId,
shardStateIdentifier,
sourceMetadata,
targetMetadata,
Expand All @@ -80,6 +83,7 @@ public void computeRecoveryPlan(
}

private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
ShardId shardId,
@Nullable String shardStateIdentifier,
Store.MetadataSnapshot sourceMetadata,
Store.MetadataSnapshot targetMetadata,
Expand All @@ -91,6 +95,7 @@ private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
List<StoreFileMetadata> filesMissingInTarget = concatLists(sourceTargetDiff.missing, sourceTargetDiff.different);

if (latestSnapshotOpt.isEmpty()) {
logger.debug("{} no snapshot suitable for recovery found, falling back to recovery from the primary", shardId);
// If we couldn't find any valid snapshots, fallback to the source
return getRecoveryPlanUsingSourceNode(sourceMetadata, sourceTargetDiff, filesMissingInTarget, startingSeqNo, translogOps);
}
Expand All @@ -102,6 +107,18 @@ private ShardRecoveryPlan computeRecoveryPlanWithSnapshots(
&& latestSnapshot.hasDifferentPhysicalFiles(sourceMetadata)
&& isSnapshotVersionCompatible(latestSnapshot)
&& sourceTargetDiff.identical.isEmpty()) {

logger.debug(
() -> Strings.format(
"%s primary has changed since snapshot completed, but snapshot still looks ok for recovery: %s",
shardId,
latestSnapshot.getSnapshotFiles()
.stream()
.map(BlobStoreIndexShardSnapshot.FileInfo::physicalName)
.collect(Collectors.joining(", ", "[", "]"))
)
);

// Use the current primary as a fallback if the download fails half-way
ShardRecoveryPlan fallbackPlan = getRecoveryPlanUsingSourceNode(
sourceMetadata,
Expand Down Expand Up @@ -141,6 +158,18 @@ && isSnapshotVersionCompatible(latestSnapshot)
);
}

logger.debug(
() -> Strings.format(
"%s attempting snapshot-based recovery of %s based on %s",
shardId,
snapshotFilesToRecover.getSnapshotFiles()
.stream()
.map(BlobStoreIndexShardSnapshot.FileInfo::physicalName)
.collect(Collectors.joining(", ", "[", "]")),
snapshotDiff
)
);

return new ShardRecoveryPlan(
snapshotFilesToRecover,
concatLists(snapshotDiff.missing, snapshotDiff.different),
Expand Down Expand Up @@ -183,6 +212,7 @@ private ShardRecoveryPlan getRecoveryPlanUsingSourceNode(

private void fetchLatestSnapshotsIgnoringErrors(ShardId shardId, boolean useSnapshots, Consumer<Optional<ShardSnapshot>> listener) {
if (useSnapshots == false) {
logger.debug("{} recovery will not attempt to use snapshots", shardId);
listener.accept(Optional.empty());
return;
}
Expand Down

0 comments on commit 1fb3a1b

Please sign in to comment.