Skip to content

Commit

Permalink
Reduce logging for cache file synchronization failures (#85331) (#85668)
Browse files Browse the repository at this point in the history
The searchable snapshot cache on cold tier synchronizes cache files 
every 60 seconds on disk. But if the synchronization fails due to disk 
being full (or any other problem that can arise at the file system level) 
the scheduled task logs a failure for every cache file to synchronize, 
filling up the disk even more since that be hundreds of files.

This pull request reduce the logs emitted in such cases. Only one 
error message is logged per shard (actually per cache directory, 
each shard has its own cache dir) every 10 minutes.
  • Loading branch information
tlrx committed Apr 4, 2022
1 parent 95433cb commit dc95e72
Showing 1 changed file with 29 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ public class CacheService extends AbstractLifecycleComponent {
private final ThreadPool threadPool;
private final ConcurrentLinkedQueue<CacheFileEvent> cacheFilesEventsQueue;
private final CacheFile.ModificationListener cacheFilesListener;
private final Map<Path, Long> cacheFilesSyncExceptionsLogs;
private final Map<Path, Long> cacheDirsSyncExceptionsLogs;
private final AtomicLong numberOfCacheFilesEvents;
private final CacheSynchronizationTask cacheSyncTask;
private final TimeValue cacheSyncStopTimeout;
Expand Down Expand Up @@ -165,6 +167,8 @@ public CacheService(
this.numberOfCacheFilesEvents = new AtomicLong();
this.cacheFilesEventsQueue = new ConcurrentLinkedQueue<>();
this.cacheFilesListener = new CacheFileModificationListener();
this.cacheFilesSyncExceptionsLogs = new HashMap<>();
this.cacheDirsSyncExceptionsLogs = new HashMap<>();
final ClusterSettings clusterSettings = clusterService.getClusterSettings();
this.maxCacheFilesToSyncAtOnce = SNAPSHOT_CACHE_MAX_FILES_TO_SYNC_AT_ONCE_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(SNAPSHOT_CACHE_MAX_FILES_TO_SYNC_AT_ONCE_SETTING, this::setMaxCacheFilesToSyncAtOnce);
Expand Down Expand Up @@ -214,6 +218,8 @@ protected void doStop() {
} catch (Exception e) {
logger.warn("failed to close persistent cache", e);
} finally {
cacheFilesSyncExceptionsLogs.clear();
cacheDirsSyncExceptionsLogs.clear();
if (acquired) {
cacheSyncLock.unlock();
}
Expand Down Expand Up @@ -530,6 +536,12 @@ public void synchronizeCache() {
final long startTimeNanos = threadPool.relativeTimeInNanos();
final long maxCacheFilesToSync = Math.min(numberOfCacheFilesEvents.get(), this.maxCacheFilesToSyncAtOnce);

if (cacheFilesSyncExceptionsLogs.isEmpty() == false || cacheDirsSyncExceptionsLogs.isEmpty() == false) {
final long expiredTimeNanos = Math.min(0L, startTimeNanos - TimeUnit.MINUTES.toNanos(10L));
cacheFilesSyncExceptionsLogs.values().removeIf(lastLogTimeNanos -> expiredTimeNanos >= lastLogTimeNanos);
cacheDirsSyncExceptionsLogs.values().removeIf(lastLogTimeNanos -> expiredTimeNanos >= lastLogTimeNanos);
}

long updates = 0L;
long deletes = 0L;
long errors = 0L;
Expand All @@ -549,6 +561,7 @@ public void synchronizeCache() {
assert numberOfEvents >= 0L : numberOfEvents;

final CacheFile cacheFile = event.value;
final Path cacheDir = cacheFile.getFile().toAbsolutePath().getParent();
try {
switch (event.type) {
case DELETE -> {
Expand All @@ -564,7 +577,6 @@ public void synchronizeCache() {
ranges.size()
);
if (ranges.isEmpty() == false) {
final Path cacheDir = cacheFile.getFile().toAbsolutePath().getParent();
boolean shouldPersist = cacheDirs.contains(cacheDir);
if (shouldPersist == false) {
try {
Expand All @@ -573,10 +585,12 @@ public void synchronizeCache() {
cacheDirs.add(cacheDir);
shouldPersist = true;
} catch (Exception e) {
logger.warn(
() -> new ParameterizedMessage("failed to synchronize cache directory [{}]", cacheDir),
e
);
if (cacheDirsSyncExceptionsLogs.putIfAbsent(cacheDir, startTimeNanos) == null) {
logger.warn(
() -> new ParameterizedMessage("failed to synchronize cache directory [{}]", cacheDir),
e
);
}
assert e instanceof IOException : e;
shouldPersist = false;
}
Expand All @@ -590,14 +604,16 @@ public void synchronizeCache() {
default -> throw new IllegalArgumentException("Unknown cache file event [" + event + ']');
}
} catch (Exception e) {
logger.warn(
() -> new ParameterizedMessage(
"failed to process [{}] for cache file [{}]",
event.type,
cacheFile.getFile().getFileName()
),
e
);
if (cacheFilesSyncExceptionsLogs.putIfAbsent(cacheDir, startTimeNanos) == null) {
logger.warn(
() -> new ParameterizedMessage(
"failed to process [{}] for cache file [{}]",
event.type,
cacheFile.getFile().getFileName()
),
e
);
}
assert e instanceof IOException : e;
errors += 1L;
}
Expand Down

0 comments on commit dc95e72

Please sign in to comment.