Skip to content

Commit

Permalink
cells: Fix deadlock during startup
Browse files Browse the repository at this point in the history
The patch fixes a regression introduced in 2.8. The regression may cause a
deadlock in cells if they receive a message during startup.  This may happen
when restarting individual services in a busy dCache. It is unlikely to happen
when "cold starting" dCache.

The problem is caused by AbstractCell running most cell initialization on the
cell message thread (to ensure the correct context is used). If a message is
received before this task is created, the message will block the message thread
waiting for the cell startup to complete. However, the cell startup cannot
complete because the initialization task cannot be executed while the message
thread is blocked.

The patch solves this by returning a NoRouteToCellException when receiving a
message before cell startup has completed. Thus the blocking behaviour is
removed and the deadlock is avoided.

An unused shutdown gate is removed too.

Target: trunk
Require-notes: yes
Require-book: no
Request: 2.10
Request: 2.9
Request: 2.8
Acked-by: Paul Millar <paul.millar@desy.de>
Patch: https://rb.dcache.org/r/7187/
  • Loading branch information
gbehrmann committed Aug 8, 2014
1 parent 4b10fe5 commit 61d7402
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
19 changes: 16 additions & 3 deletions modules/cells/src/main/java/dmg/cells/nucleus/CellAdapter.java
Expand Up @@ -78,7 +78,6 @@ public class CellAdapter extends CommandInterpreter
private final CellNucleus _nucleus;
private final Gate _readyGate = new Gate(false);
private final Gate _startGate = new Gate(false);
private final Gate _shutdownGate = new Gate(false);
private final Args _args;
private boolean _useInterpreter = true;
private boolean _returnCommandException = true;
Expand Down Expand Up @@ -707,8 +706,8 @@ public String call() throws IOException
public void prepareRemoval(KillEvent ce)
{
_log.info("CellAdapter : prepareRemoval : waiting for gate to open");
_startGate.check();
_readyGate.check();
_shutdownGate.open();
cleanUp();
dumpPinboard();
_log.info("CellAdapter : prepareRemoval : done");
Expand Down Expand Up @@ -777,10 +776,24 @@ public String getInfo() {
*/
@Override
public void messageArrived(MessageEvent me) {
_startGate.check();
if (me instanceof LastMessageEvent) {
_log.info("messageArrived : LastMessageEvent (opening gate)");
_readyGate.open();
} else if (!_startGate.isOpen()) {
CellMessage msg = me.getMessage();
if (!msg.isReply()) {
try {
NoRouteToCellException e =
new NoRouteToCellException(msg.getUOID(),
msg.getDestinationPath(),
getCellName() + " is still initializing.");
msg.revertDirection();
msg.setMessageObject(e);
_nucleus.sendMessage(msg, true, true);
} catch (NoRouteToCellException e) {
_log.warn("PANIC : Problem returning answer : " + e);
}
}
} else {
CellMessage msg = me.getMessage();
Serializable obj = msg.getMessageObject();
Expand Down
5 changes: 5 additions & 0 deletions modules/cells/src/main/java/dmg/util/Gate.java
Expand Up @@ -25,6 +25,11 @@ public synchronized Object check(){
}

}

/**
 * Reports whether this gate is currently open, without blocking the caller.
 *
 * The method is synchronized on the gate, as are check() and open(), so the
 * flag is read under the same monitor that open() holds when setting it.
 *
 * @return the current value of the open flag
 */
public synchronized boolean isOpen() {
return _isOpen;
}

public synchronized void open(){
_isOpen = true ;
notifyAll() ;
Expand Down

0 comments on commit 61d7402

Please sign in to comment.