Skip to content

Commit

Permalink
dcache-resilience: avoid spamming alarms with abort messages
Browse files Browse the repository at this point in the history
Motivation:

When a file replication operation is fatally aborted, an alarm
notification is raised.   This notification currently is keyed
to the file's pnfsid.

While such failures are usually infrequent, under certain conditions
(e.g., network congestion failures, no more available pools for further copies),
whole successions of such failures can occur, in which cases hundreds
or even thousands of alarms are sent.  Though this is not a problem
for the server to handle, it is unfriendly to the user, both when
scanning alarms using the various web interfaces, and potentially
when receiving email alerts (since the key for the alarm is the
pnfsid, each alarm will generate an email alert as well).

Modification:

Key the alarm to the file's storage unit instead of the pnfsid.
Add a suffix to the key based on an hourly timestamp.
The alarm will thus be incremented during the hour but a new
alarm will be created every hour; in this way, those receiving
email alerts will receive them once an hour.   Log this
alarm to the domain log at WARN level.

Create a new logger and appender for resilience, and log the
full file information to that at the ERROR level.

Result:

Friendlier behavior wrt replication failure (fatal abort) alarms.

Target: master
Require-notes: yes
Require-book: no
Request: 4.0
Request: 3.2
Request: 3.1
Request: 3.0
Request: 2.16
Acked-by: Paul
  • Loading branch information
alrossi committed Feb 27, 2018
1 parent eb11d09 commit d272111
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ private void postProcess(FileOperation operation) {
.map(poolInfoMap::getPool)
.collect(Collectors.toSet());
completionHandler.taskAborted(operation.getPnfsId(),
poolInfoMap.getUnit(operation.getStorageUnit()),
tried,
operation.getRetried(),
maxRetries,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Instant;
import java.util.Set;

import diskCacheV111.util.CacheException;
import diskCacheV111.util.PnfsId;

import org.dcache.alarms.AlarmMarkerFactory;
import org.dcache.alarms.PredefinedAlarm;
import org.dcache.pool.migration.Task;
Expand All @@ -74,13 +76,21 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import org.dcache.resilience.util.CacheExceptionUtils;
import org.dcache.resilience.util.ExceptionMessage;

import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;

/**
* <p>Implements the handling of pnfsid task termination.
* Also implements the migration task termination logic.</p>
*/
public final class FileTaskCompletionHandler implements TaskCompletionHandler {
static final String ABORT_REPLICATION_LOG_MESSAGE
= "Aborting replication for {}; pools tried: {}; {}";
= "Storage unit {}: aborted replication for {}; pools tried: {}; {}";

static final String ABORT_REPLICATION_ALARM_MESSAGE
= "There are files in storage unit {} for which replication "
+ "has been aborted; please consult the resilience-specific "
+ "log or 'history errors' for details.";

static final String VERIFY_FAILURE_MESSAGE
= "Processing for %s failed during verify. %s%s";
Expand All @@ -95,13 +105,17 @@ public final class FileTaskCompletionHandler implements TaskCompletionHandler {
private static final Logger LOGGER
= LoggerFactory.getLogger(FileTaskCompletionHandler.class);

private static final Logger ABORTED_LOGGER
= LoggerFactory.getLogger("org.dcache.resilience-log");

private FileOperationMap map;

public void setMap(FileOperationMap map) {
this.map = map;
}

public void taskAborted(PnfsId pnfsId,
String storageUnit,
Set<String> triedSources,
int retried,
int maxRetries,
Expand All @@ -112,11 +126,27 @@ public void taskAborted(PnfsId pnfsId,
maxRetries), e);
}

LOGGER.error(AlarmMarkerFactory.getMarker(
Instant ref = Instant.now()
.with(SECOND_OF_MINUTE, 0)
.with(MINUTE_OF_HOUR, 0);

/*
* Alarm notification is keyed to the storage group, so as to avoid
* spamming the server or email forwarding. The alarm key changes every hour.
* This guarantees that a new alarm is registered each hour.
* Send this at warn level, so it is possible to throttle repeated
* messages in the domain log.
*/
LOGGER.warn(AlarmMarkerFactory.getMarker(
PredefinedAlarm.FAILED_REPLICATION,
pnfsId.toString()),
ABORT_REPLICATION_LOG_MESSAGE, pnfsId,
triedSources, new ExceptionMessage(e));
storageUnit, "ABORT_REPLICATION-" + ref),
ABORT_REPLICATION_ALARM_MESSAGE, storageUnit);

/*
* Full info on the file is logged to the ".resilience" log.
*/
ABORTED_LOGGER.error(ABORT_REPLICATION_LOG_MESSAGE, storageUnit, pnfsId,
triedSources, new ExceptionMessage(e));
}

@Override
Expand Down Expand Up @@ -196,4 +226,4 @@ public void taskFailed(Task task, int rc, String msg) {
public void taskFailedPermanently(Task task, int rc, String msg) {
taskFailed(task, rc, msg);
}
}
}
22 changes: 22 additions & 0 deletions skel/etc/logback.xml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,17 @@
</then>
</if>

<appender name="resilience" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>${dcache.log.dir}/${dcache.domain.name}.resilience</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>${dcache.log.dir}/${dcache.domain.name}.resilience.%d{yyyy-MM-dd}.gz</fileNamePattern>
<maxHistory>${dcache.log.resilience.max-history}</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${dcache.log.format.file}</pattern>
</encoder>
</appender>

<root>
<appender-ref ref="stdout"/>
<appender-ref ref="pinboard"/>
Expand All @@ -159,6 +170,9 @@
<appender-ref ref="zookeeper"/>
</logger>

<logger name="org.dcache.resilience-log" additivity="false">
<appender-ref ref="resilience"/>
</logger>

<!-- Nothing is logged to this logger. Its sole purpose is to list
all appenders available; this ensures that the appenders are
Expand All @@ -174,6 +188,7 @@
<appender-ref ref="remote"/>
</then>
</if>
<appender-ref ref="resilience"/>
</logger>

<turboFilter class="dmg.util.logback.CellThresholdFilter">
Expand Down Expand Up @@ -271,5 +286,12 @@
<logger>org.dcache.zookeeper</logger>
<level>${dcache.log.level.zookeeper}</level>
</threshold>

<threshold>
<appender>resilience</appender>
<logger>org.dcache.resilience-log</logger>
<level>error</level>
</threshold>

</turboFilter>
</configuration>
3 changes: 3 additions & 0 deletions skel/share/defaults/dcache.properties
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ dcache.log.access.max-history=30
# How many days to keep zookeeper logs
dcache.log.zookeeper.max-history=30

# How many days to keep resilience logs
dcache.log.resilience.max-history=30

# Host on which the remote log server will run
# relative to this dCache installation
#
Expand Down

0 comments on commit d272111

Please sign in to comment.