Skip to content

Commit

Permalink
pool: describe why migration job was cancelled
Browse files Browse the repository at this point in the history
Motivation:

We have had reports (see #6557) where a migration job was cancelled;
however, the reason the job was cancelled is not clear.  Currently, the
pool logs only `Task was cancelled`.

Modification:

Update PoolMigrationCancelMessage to include a reason (as a simple
String), explaining the motivation behind cancelling the migration job.

The controlling job is updated to populate this explanation.  Note that
this information is already available if the task is explicitly
cancelled (i.e., outside of the FSM).

The target pool is updated to log the explanation from the
PoolMigrationCancelMessage if one is provided.

Result:

The pool now provides more information when a migration job is
cancelled.

Target: master
Request: 8.0
Request: 7.2
Requires-notes: yes
Requires-book: no
Patch: https://rb.dcache.org/r/13501/
Acked-by: Lea Morschel
  • Loading branch information
paulmillar committed Mar 31, 2022
1 parent 6af465f commit 9187b08
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 7 deletions.
Expand Up @@ -25,6 +25,7 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import javax.annotation.Nullable;
import org.dcache.pool.PoolDataBeanProvider;
import org.dcache.pool.classic.ChecksumModule;
import org.dcache.pool.migration.json.MigrationData;
Expand Down Expand Up @@ -152,7 +153,7 @@ public Message messageArrived(PoolMigrationCancelMessage message)

UUID uuid = message.getUUID();
Request request = _requests.get(uuid);
if (request == null || !request.cancel()) {
if (request == null || !request.cancel(message.getReason())) {
throw new CacheException(CacheException.INVALID_ARGS,
"No such request");
}
Expand All @@ -176,6 +177,7 @@ private class Request implements CacheFileAvailable, Runnable {
private final boolean _isMetaOnly;
private Integer _companion;
private Future<?> _updateTask;
private String _whyCancel;

public Request(CellPath requestor, PoolMigrationCopyReplicaMessage message) {
_requestor = requestor;
Expand Down Expand Up @@ -225,12 +227,17 @@ public synchronized void start()
}
}

public synchronized boolean cancel() {
/**
* Builds the text of the error reported when this request is cancelled.
*
* @return "Task was cancelled" when no reason has been recorded, otherwise
*         that text followed by ": " and the recorded reason
*/
private String cancelMessage() {
return _whyCancel == null ? "Task was cancelled" : ("Task was cancelled: " + _whyCancel);
}

public synchronized boolean cancel(@Nullable String why) {
_whyCancel = why;
if (_companion != null) {
return _p2p.cancel(_companion);
} else if (_updateTask != null && _updateTask.cancel(true)) {
if (_requests.remove(_uuid) != null) {
finished(new CacheException("Task was cancelled"));
finished(new CacheException(cancelMessage()));
}
return true;
}
Expand Down Expand Up @@ -306,7 +313,7 @@ public void run() {
finished(new DiskErrorCacheException(
"I/O error during checksum calculation: " + messageOrClassName(e)));
} catch (InterruptedException e) {
finished(new CacheException("Task was cancelled"));
finished(new CacheException(cancelMessage()));
} catch (IllegalTransitionException e) {
finished(new CacheException("Cannot update file in state " + e.getSourceState()));
} catch (CacheException | NoSuchAlgorithmException | RuntimeException e) {
Expand Down
Expand Up @@ -2,6 +2,7 @@

import diskCacheV111.util.PnfsId;
import java.util.UUID;
import javax.annotation.Nullable;

/**
* MigrationModuleServer message to request that a transfer is aborted.
Expand All @@ -10,7 +11,15 @@ public class PoolMigrationCancelMessage extends PoolMigrationMessage {

private static final long serialVersionUID = -7995913634698011318L;

public PoolMigrationCancelMessage(UUID uuid, String pool, PnfsId pnfsId) {
private final String reason;

/**
* Creates a cancel request that may carry an explanation for the cancellation.
*
* @param uuid forwarded unchanged to the superclass constructor
* @param pool forwarded unchanged to the superclass constructor
* @param pnfsId forwarded unchanged to the superclass constructor
* @param reason why the transfer is being cancelled, or {@code null} if no
*               explanation is available
*/
public PoolMigrationCancelMessage(UUID uuid, String pool, PnfsId pnfsId, @Nullable String reason) {
super(uuid, pool, pnfsId);
this.reason = reason;
}

/**
* Returns the explanation supplied when this message was constructed.
*
* @return the reason for the cancellation, or {@code null} if none was given
*/
@Nullable
public String getReason() {
return reason;
}
}
Expand Up @@ -292,10 +292,15 @@ synchronized void initiateCopy() {
* FSM Action
*/
synchronized void cancelCopy() {
// Use the recorded cancel reason when one is present; otherwise pass null,
// meaning the cancel message carries no explanation.
cancelCopy(_cancelReason.orElse(null));
}

synchronized void cancelCopy(String reason) {
CellStub.addCallback(_parameters.pool.send(_target,
new PoolMigrationCancelMessage(_uuid,
_source,
getPnfsId())),
getPnfsId(),
reason)),
new Callback<>("cancel_"), _parameters.executor);
}

Expand Down
Expand Up @@ -220,7 +220,8 @@ Entry
{
// No reply, but message could have been
// received anyway, so try to cancel it.
cancelCopy();
cancelCopy("Timeout waiting for target pool "
+ ctxt.getTarget());
}
cancel
Cancelling
Expand Down

0 comments on commit 9187b08

Please sign in to comment.