From 0097c289b13937b6a2e073b8486c8a29e3ccb207 Mon Sep 17 00:00:00 2001 From: Paul Millar Date: Tue, 16 Nov 2021 12:49:14 +0100 Subject: [PATCH] pinmanager: provide more details information on timeout Motivation: Issue #6251 describes a problem with pinmanager where requests are caught in a retry loop until the pin request message timeout results in the request being terminated. Unfortunately, the current error message provides insufficient information to learn what is going wrong. It indicates that the request has been retried until it eventually fails, but not why. Modification: In lieu of a full pin-request activity log, we can throw a CacheException with a message that includes why the final retry was attempted, the one that resulted in the request exceeding the message lifetime. This is in keeping with how explicit failures are handled, where more detailed information is provided. Result: A more detailed pinmanager log entry is provided for pin requests that are caught in a retry loop, and eventually time out. Target: master Request: 7.2 Request: 7.1 Request: 7.0 Request: 6.2 Requires-notes: yes Requires-book: no Patch: https://rb.dcache.org/r/13266/ Acked-by: Tigran Mkrtchyan --- .../pinmanager/PinRequestProcessor.java | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/modules/dcache/src/main/java/org/dcache/pinmanager/PinRequestProcessor.java b/modules/dcache/src/main/java/org/dcache/pinmanager/PinRequestProcessor.java index fa37e8bf8c4..13cf5a4bf47 100644 --- a/modules/dcache/src/main/java/org/dcache/pinmanager/PinRequestProcessor.java +++ b/modules/dcache/src/main/java/org/dcache/pinmanager/PinRequestProcessor.java @@ -209,17 +209,18 @@ private void enforceLifetimeLimit(PinManagerPinMessage message) { return RequestContainerV5.allStatesExceptStage; } - private void retry(final PinTask task, long delay) { + private void retry(final PinTask task, long delay, String reason) { if (!task.isValidIn(delay)) { - fail(task, CacheException.TIMEOUT, "Pin request TTL exceeded"); + fail(task, CacheException.TIMEOUT, "Pin request TTL exceeded: " + reason); } else { _scheduledExecutor.schedule(() -> { try { rereadNameSpaceEntry(task); } catch (CacheException e) { - fail(task, e.getRc(), e.getMessage()); + fail(task, e.getRc(), "Retry (because " + reason + ") failed: " + e.getMessage()); } catch (RuntimeException e) { - fail(task, CacheException.UNEXPECTED_SYSTEM_EXCEPTION, e.toString()); + fail(task, CacheException.UNEXPECTED_SYSTEM_EXCEPTION, + "Retry (because " + reason + ") failed: " + e); } }, delay, MILLISECONDS); } @@ -280,7 +281,7 @@ public void noroute(CellPath path) { * expect this to be a transient * problem and retry in a moment. */ - retry(task, RETRY_DELAY); + retry(task, RETRY_DELAY, "no route to PnfsManager at " + path); } @Override @@ -289,7 +290,7 @@ public void timeout(String error) { * expect this to be a transient * problem and retry in a moment. */ - retry(task, SMALL_DELAY); + retry(task, SMALL_DELAY, "timeout with PnfsManager: " + error); } }, _executor); } @@ -359,7 +360,7 @@ public void failure(int rc, Object error) { * refresh of the request. * Retry right away. */ - retry(task, 0); + retry(task, 0, "PoolManager requested a retry"); break; case CacheException.FILE_NOT_IN_REPOSITORY: case CacheException.PERMISSION_DENIED: @@ -370,7 +371,7 @@ public void failure(int rc, Object error) { * but for the time being the retry is dealed with * by pin manager. */ - retry(task, RETRY_DELAY); + retry(task, RETRY_DELAY, "PoolManager request failed: " + error); break; } } @@ -382,7 +383,7 @@ public void noroute(CellPath path) { * to be transient and retry in * a moment. */ - retry(task, RETRY_DELAY); + retry(task, RETRY_DELAY, "no route to PoolManager at " + path); } @Override @@ -392,7 +393,7 @@ public void timeout(String message) { * transient and retry in a * moment. */ - retry(task, SMALL_DELAY); + retry(task, SMALL_DELAY, "timeout with PoolManager: " + message); } }, _executor); } @@ -440,7 +441,7 @@ public void failure(int rc, Object error) { * it a chance to be updated and * then retry. */ - retry(task, RETRY_DELAY); + retry(task, RETRY_DELAY, "pool " + poolName + " declared itself disabled"); break; case CacheException.FILE_NOT_IN_REPOSITORY: /* Pnfs manager had stale location @@ -449,7 +450,7 @@ public void failure(int rc, Object error) { * this error, so we retry in a * moment. */ - retry(task, SMALL_DELAY); + retry(task, SMALL_DELAY, "file not on pool " + poolName); break; default: fail(task, rc, error.toString()); @@ -463,7 +464,7 @@ public void noroute(CellPath path) { * pool manager a moment to notice this * and then retry. */ - retry(task, RETRY_DELAY); + retry(task, RETRY_DELAY, "no route to pool at " + poolAddress); } @Override