Skip to content

Commit

Permalink
pinmanager: provide more details information on timeout
Browse files Browse the repository at this point in the history
Motivation:

Issue #6251 describes a problem with pinmanager where requests are
caught in a retry loop until the pin request message timeout results in
the request being terminated.

Unfortunately, the current error message provides insufficient
information to learn what is going wrong.  It indicates that the request
has been retried until it eventually fails, but not why.

Modification:

In lieu of a full pin-request activity log, we can throw a
CacheException with a message that includes why the final retry was
attempted, the one that resulted in the request exceeding the message
lifetime.

This is in keeping with how explicit failures are handled, where more
detailed information is provided.

Result:

A more detailed pinmanager log entry is provided for pin requests that
are caught in a retry loop, and eventually time out.

Target: master
Request: 7.2
Request: 7.1
Request: 7.0
Request: 6.2
Requires-notes: yes
Requires-book: no
Patch: https://rb.dcache.org/r/13266/
Acked-by: Tigran Mkrtchyan
  • Loading branch information
paulmillar authored and mksahakyan committed Nov 24, 2021
1 parent f429b91 commit 0097c28
Showing 1 changed file with 14 additions and 13 deletions.
Expand Up @@ -209,17 +209,18 @@ private void enforceLifetimeLimit(PinManagerPinMessage message) {
return RequestContainerV5.allStatesExceptStage;
}

private void retry(final PinTask task, long delay) {
private void retry(final PinTask task, long delay, String reason) {
if (!task.isValidIn(delay)) {
fail(task, CacheException.TIMEOUT, "Pin request TTL exceeded");
fail(task, CacheException.TIMEOUT, "Pin request TTL exceeded: " + reason);
} else {
_scheduledExecutor.schedule(() -> {
try {
rereadNameSpaceEntry(task);
} catch (CacheException e) {
fail(task, e.getRc(), e.getMessage());
fail(task, e.getRc(), "Retry (because " + reason + ") failed: " + e.getMessage());
} catch (RuntimeException e) {
fail(task, CacheException.UNEXPECTED_SYSTEM_EXCEPTION, e.toString());
fail(task, CacheException.UNEXPECTED_SYSTEM_EXCEPTION,
"Retry (because " + reason + ") failed: " + e);
}
}, delay, MILLISECONDS);
}
Expand Down Expand Up @@ -280,7 +281,7 @@ public void noroute(CellPath path) {
* expect this to be a transient
* problem and retry in a moment.
*/
retry(task, RETRY_DELAY);
retry(task, RETRY_DELAY, "no route to PnfsManager at " + path);
}

@Override
Expand All @@ -289,7 +290,7 @@ public void timeout(String error) {
* expect this to be a transient
* problem and retry in a moment.
*/
retry(task, SMALL_DELAY);
retry(task, SMALL_DELAY, "timeout with PnfsManager: " + error);
}
}, _executor);
}
Expand Down Expand Up @@ -359,7 +360,7 @@ public void failure(int rc, Object error) {
* refresh of the request.
* Retry right away.
*/
retry(task, 0);
retry(task, 0, "PoolManager requested a retry");
break;
case CacheException.FILE_NOT_IN_REPOSITORY:
case CacheException.PERMISSION_DENIED:
Expand All @@ -370,7 +371,7 @@ public void failure(int rc, Object error) {
* but for the time being the retry is dealed with
* by pin manager.
*/
retry(task, RETRY_DELAY);
retry(task, RETRY_DELAY, "PoolManager request failed: " + error);
break;
}
}
Expand All @@ -382,7 +383,7 @@ public void noroute(CellPath path) {
* to be transient and retry in
* a moment.
*/
retry(task, RETRY_DELAY);
retry(task, RETRY_DELAY, "no route to PoolManager at " + path);
}

@Override
Expand All @@ -392,7 +393,7 @@ public void timeout(String message) {
* transient and retry in a
* moment.
*/
retry(task, SMALL_DELAY);
retry(task, SMALL_DELAY, "timeout with PoolManager: " + message);
}
}, _executor);
}
Expand Down Expand Up @@ -440,7 +441,7 @@ public void failure(int rc, Object error) {
* it a chance to be updated and
* then retry.
*/
retry(task, RETRY_DELAY);
retry(task, RETRY_DELAY, "pool " + poolName + " declared itself disabled");
break;
case CacheException.FILE_NOT_IN_REPOSITORY:
/* Pnfs manager had stale location
Expand All @@ -449,7 +450,7 @@ public void failure(int rc, Object error) {
* this error, so we retry in a
* moment.
*/
retry(task, SMALL_DELAY);
retry(task, SMALL_DELAY, "file not on pool " + poolName);
break;
default:
fail(task, rc, error.toString());
Expand All @@ -463,7 +464,7 @@ public void noroute(CellPath path) {
* pool manager a moment to notice this
* and then retry.
*/
retry(task, RETRY_DELAY);
retry(task, RETRY_DELAY, "no route to pool at " + poolAddress);
}

@Override
Expand Down

0 comments on commit 0097c28

Please sign in to comment.