Skip to content

Commit

Permalink
doors: Fix timeout handling during retries
Browse files Browse the repository at this point in the history
Motivation:

Doors will retry pool selection and mover startup several times (subject to a
door-specific policy). A timeout limits the total duration and this timeout is
applied as an upper bound for pool selection and mover submission; yet it
wasn't applied for pnfs manager interactions that happen during the retry loop.

This could in some cases cause errors in pool manager about discarded messages
with a negative TTL.

Modification:

Applies the total timeout as an upper bound to the pnfs manager interaction.

Result:

Greatly reduced risk of ending up with a negative TTL.

Target: trunk
Require-notes: yes
Require-book: no
Request: 2.13
Acked-by: Paul Millar <paul.millar@desy.de>
Patch: https://rb.dcache.org/r/8492/
  • Loading branch information
gbehrmann committed Aug 24, 2015
1 parent 1ec6400 commit 73b2d94
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 3 deletions.
11 changes: 10 additions & 1 deletion modules/dcache/src/main/java/diskCacheV111/util/PnfsHandler.java
Expand Up @@ -216,14 +216,23 @@ public <T extends PnfsMessage> T request(T msg)
* Sends a message to the pnfs manager and returns a promise of a future reply.
*/
public <T extends PnfsMessage> ListenableFuture<T> requestAsync(T msg)
{
// Convenience overload: falls back to the cell stub's own timeout and
// hands the actual work to the timeout-aware variant.
checkState(_cellStub != null, "Missing endpoint");
long stubTimeout = _cellStub.getTimeoutInMillis();
return requestAsync(msg, stubTimeout);
}

/**
* Sends a message to the pnfs manager and returns a promise of a future reply.
*
* The message is flagged as requiring a reply and, when this handler carries a
* subject, that subject is attached to the message before it is sent.
*
* @param msg the request to send; it is modified in place as described above
* @param timeout timeout handed to the cell stub for this request; the
*                single-argument overload supplies the stub's timeout in
*                milliseconds, so the same unit is presumably expected here
* @return a future on the reply, as returned by the cell stub
*/
public <T extends PnfsMessage> ListenableFuture<T> requestAsync(T msg, long timeout)
{
// Fails fast when no cell stub has been configured.
checkState(_cellStub != null, "Missing endpoint");

msg.setReplyRequired(true);
if (_subject != null) {
msg.setSubject(_subject);
}
return _cellStub.send(msg);
return _cellStub.send(msg, timeout);
}

public PnfsCreateEntryMessage createPnfsDirectory(String path)
Expand Down
14 changes: 12 additions & 2 deletions modules/dcache/src/main/java/org/dcache/util/Transfer.java
Expand Up @@ -695,6 +695,11 @@ public final void readNameSpaceEntry(boolean allowWrite)
* @param allowWrite whether the file may be opened for writing
*/
public ListenableFuture<Void> readNameSpaceEntryAsync(boolean allowWrite)
{
// No explicit bound given: use the pnfs handler's own timeout and
// delegate to the timeout-aware variant.
long pnfsTimeout = _pnfs.getPnfsTimeout();
return readNameSpaceEntryAsync(allowWrite, pnfsTimeout);
}

private ListenableFuture<Void> readNameSpaceEntryAsync(boolean allowWrite, long timeout)
{
Set<FileAttribute> attr = EnumSet.of(PNFSID, TYPE, STORAGEINFO, SIZE);
attr.addAll(_additionalAttributes);
Expand All @@ -714,7 +719,7 @@ public ListenableFuture<Void> readNameSpaceEntryAsync(boolean allowWrite)
}
request.setAccessMask(mask);
request.setUpdateAtime(true);
ListenableFuture<PnfsGetFileAttributes> reply = _pnfs.requestAsync(request);
ListenableFuture<PnfsGetFileAttributes> reply = _pnfs.requestAsync(request, timeout);

setStatusUntil("PnfsManager: Fetching storage info", reply);

Expand Down Expand Up @@ -1123,6 +1128,11 @@ private static long getTimeoutFor(CellStub stub, long deadline)
return Math.min(subWithInfinity(deadline, System.currentTimeMillis()), stub.getTimeoutInMillis());
}

private static long getTimeoutFor(PnfsHandler pnfs, long deadline)
{
// Time left until the deadline, measured from the current wall clock.
long remaining = subWithInfinity(deadline, System.currentTimeMillis());
// The handler's own timeout acts as the upper bound.
long configured = pnfs.getPnfsTimeout();
return Math.min(remaining, configured);
}

/**
* Select a pool and start a mover. Failed attempts are handled
* according to the {@link TransferRetryPolicy}. Note, that there
Expand All @@ -1148,7 +1158,7 @@ public ListenableFuture<Void> selectPoolAndStartMoverAsync(String queue, Transfe
AsyncFunction<Void, Void> startMover =
ignored -> startMoverAsync(queue, getTimeoutFor(_pool, deadLine));
AsyncFunction<Void, Void> readNameSpaceEntry =
ignored -> readNameSpaceEntryAsync(false);
ignored -> readNameSpaceEntryAsync(false, getTimeoutFor(_pnfs, deadLine));

FutureFallback<Void> retry =
new FutureFallback<Void>()
Expand Down

0 comments on commit 73b2d94

Please sign in to comment.