Skip to content

Commit

Permalink
dcache-resilience: propagate file op error to pool op and display
Browse files Browse the repository at this point in the history
Motivation:

To make it clear to admins whether a pool scan completed cleanly, i.e., without file operation errors.

Modification:

Add an error counter to the pool operation.  When the
file operation terminates, increment this counter if
it has failed.

Add a message to the end of the pool operation output
indicating the number of errors encountered (thus far).

The counter is reset to 0 at the next change of pool
status or scan.

The date format has also been shortened on the
operation output lines.

Result:

Inspection of the pool operations using 'pool ls'
now reveals whether the most recent scan completed
without file operation errors or not.

NOTE: As this is a new feature, I am not sure if we should
backport.  One could argue that it addresses a potential
issue which is dangerous, and thus merits inclusion
in the stable branches.

Target: master
Acked-by: Tigran
  • Loading branch information
alrossi committed Jul 3, 2018
1 parent 9914d17 commit 6546110
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 17 deletions.
Expand Up @@ -148,7 +148,7 @@ public final class FileOperation {
"%s (%s %s)(%s %s)(parent %s, count %s, retried %s)";
private static final String TO_HISTORY_STRING =
"%s (%s %s)(%s %s)(parent %s, retried %s) %s";
private static final String FORMAT_STR = "E MMM dd HH:mm:ss zzz yyyy";
private static final String FORMAT_STR = "yyyy/MM/dd HH:mm:ss";

/*
* Hidden marker for null, used with int->Integer autoboxing.
Expand Down
Expand Up @@ -1088,7 +1088,13 @@ private void remove(PnfsId pnfsId, boolean failed) {
+ "queue has no parent: {}; "
+ "this is a bug.", operation);
}
poolTaskCompletionHandler.childTerminated(parent, pnfsId);

if (failed) {
poolTaskCompletionHandler.childTerminatedWithFailure(parent,
pnfsId);
} else {
poolTaskCompletionHandler.childTerminated(parent, pnfsId);
}
}

history.add(operation.toHistoryString(), failed);
Expand Down
Expand Up @@ -105,6 +105,7 @@ enum NextAction {

private int children;
private int completed;
private int failed;

PoolOperation() {
forceScan = false;
Expand All @@ -117,6 +118,7 @@ enum NextAction {
currStatus = PoolStatusForResilience.UNINITIALIZED;
children = 0;
completed = 0;
failed = 0;
}

public String toString() {
Expand All @@ -127,10 +129,10 @@ public String toString() {
FileOperation.getFormattedDateFromMillis(lastUpdate),
FileOperation.getFormattedDateFromMillis(lastScan),
lastStatus, currStatus, state,
exception == null ? "" : new ExceptionMessage(exception));
exception == null ? getFailedMessage() :
new ExceptionMessage(exception));
}


/**
 * Tells whether this pool operation is currently in the
 * {@code EXCLUDED} state.
 *
 * @return true only when the state equals {@code State.EXCLUDED}.
 */
public synchronized boolean isExcluded() {
    return State.EXCLUDED == state;
}
Expand Down Expand Up @@ -187,34 +189,51 @@ synchronized NextAction getNextAction(PoolStatusForResilience incoming) {
}
}

synchronized void incrementCompleted() {
LOGGER.debug("entering incrementCompleted, state {}, children {}, completed = {}.",
state, children, completed );
synchronized void incrementCompleted(boolean failed) {
LOGGER.trace("entering incrementCompleted, state {}, failed {}, "
+ "children {}, completed = {}.",
state, failed, children, completed );
if (state == State.RUNNING) {
++completed;
if (failed) {
++this.failed;
}
}
LOGGER.debug("leaving incrementCompleted, state {}, children {}, completed = {}.",
LOGGER.trace("leaving incrementCompleted, state {}, failed {}, "
+ "children {}, completed = {}.",
state, children, completed );
}

synchronized boolean isComplete() {
boolean isComplete = children > 0 && children == completed;
LOGGER.debug("isComplete {}, children {}, completed = {}.",
LOGGER.trace("isComplete {}, children {}, completed = {}.",
isComplete, children, completed );
return isComplete;
}

/**
 * @return the current value of the failed-children counter.
 */
synchronized int failedChildren() {
    return this.failed;
}

/**
 * Zeroes both the children and the completed counters.  The failure
 * counter is not touched here.
 */
synchronized void resetChildren() {
    completed = 0;
    children = 0;
}

/**
 * Zeroes the failed-children counter.
 */
synchronized void resetFailed() {
    this.failed = 0;
}

/**
 * Stores the number of child operations, but only while this
 * operation is in the {@code RUNNING} state; otherwise the call is
 * a no-op.
 *
 * @param children the child count to record.
 */
synchronized void setChildren(int children) {
    if (state != State.RUNNING) {
        return;
    }
    this.children = children;
}

/**
 * Builds the failure summary text.
 *
 * @return the empty string when the failure counter is zero,
 *         otherwise the counter followed by a fixed suffix.
 */
private String getFailedMessage() {
    if (failed == 0) {
        return "";
    }
    return failed + " file operations failed";
}

private String getFormattedPercentDone() {
String percent = children == 0 ?
"?" :
Expand Down
Expand Up @@ -631,6 +631,7 @@ public void update(PoolStateUpdate update) {
update.pool, operation);
queue.remove(update.pool);
operation.resetChildren();
operation.resetFailed();
operation.lastUpdate = System.currentTimeMillis();
operation.state = State.WAITING;
operation.group = update.group;
Expand Down Expand Up @@ -659,13 +660,13 @@ public void update(String pool, int children) {
/**
* <p>Called by the {@link FileOperationMap} when a child operation completes.</p>
*/
public void update(String pool, PnfsId pnfsId) {
public void update(String pool, PnfsId pnfsId, boolean failed) {
LOGGER.debug("Parent {}, child operation for {} has completed.", pool,
pnfsId);
lock.lock();
try {
PoolOperation operation = get(pool);
operation.incrementCompleted();
operation.incrementCompleted(failed);
if (operation.isComplete()) {
terminate(pool, operation);
condition.signalAll();
Expand Down Expand Up @@ -839,6 +840,7 @@ private boolean doScan(PoolStateUpdate update, boolean bypassStateCheck) {
}

operation.exception = null;
operation.resetFailed();
operation.task = null;
waiting.put(update.pool, operation);
return true;
Expand Down Expand Up @@ -870,12 +872,14 @@ private void reset(String pool, PoolOperation operation) {
operation.resetChildren();
if (poolInfoMap.isResilientPool(pool)) {
idle.put(pool, operation);
} else if (operation.state == State.FAILED) {
} else if (operation.state == State.FAILED || operation.failedChildren() > 0) {
String message = operation.exception == null ? "" : "exception: " +
new ExceptionMessage(operation.exception);
LOGGER.error(AlarmMarkerFactory.getMarker(
PredefinedAlarm.FAILED_REPLICATION, pool),
"{} was removed from resilient group but final scan "
+ "failed: {}.", pool,
new ExceptionMessage(operation.exception));
+ "{}; {} failed file operations.",
pool, message, operation.failedChildren());
}
}

Expand Down Expand Up @@ -921,6 +925,8 @@ private void scanIdle() {
i.remove();
operation.forceScan = true;
operation.state = State.WAITING;
operation.resetFailed();
operation.exception = null;
/*
* This is a periodic scan, so check for repartitioning.
*/
Expand Down Expand Up @@ -992,7 +998,6 @@ private void submit(String pool, PoolOperation operation) {
operation.lastUpdate = System.currentTimeMillis();
operation.lastStatus = operation.currStatus;
operation.task.setErrorHandler(e -> update(pool, 0, e));
operation.resetChildren();
running.put(pool, operation);
LOGGER.trace("Submitting pool scan task for {}.", pool);
operation.task.submit();
Expand Down
Expand Up @@ -71,7 +71,11 @@ public class PoolTaskCompletionHandler {
private PoolOperationMap map;

/**
 * Notifies the pool operation map that a child file operation
 * belonging to the given pool has terminated without failure.
 *
 * @param pool   name of the parent pool.
 * @param pnfsId id of the file whose operation terminated.
 */
public void childTerminated(String pool, PnfsId pnfsId) {
    // Exactly one notification per child, flagged as not failed.
    map.update(pool, pnfsId, false);
}

/**
 * Notifies the pool operation map that a child file operation
 * belonging to the given pool has terminated with a failure.
 *
 * @param pool   name of the parent pool.
 * @param pnfsId id of the file whose operation terminated.
 */
public void childTerminatedWithFailure(String pool, PnfsId pnfsId) {
    final boolean failed = true;
    map.update(pool, pnfsId, failed);
}

public void setMap(PoolOperationMap map) {
Expand Down
Expand Up @@ -417,7 +417,7 @@ private void givenRestartsAreNotHandled() {

/**
 * Test step: reports one successful child completion to the pool
 * operation map for each of the {@code children} configured in
 * this test.
 *
 * @param pool name of the pool whose children are reported.
 */
private void whenAllChildrenCompleteFor(String pool) {
    for (int c = 0; c < children; c++) {
        // One update per child; the file id is irrelevant here.
        poolOperationMap.update(pool, (PnfsId) null, false);
    }
}

Expand Down

0 comments on commit 6546110

Please sign in to comment.