Skip to content

Commit

Permalink
Discovery: a more lenient wait for joinThread when stopping
Browse files Browse the repository at this point in the history
When a node stops, we cancel any ongoing join process. With #8327, we improved this logic so that we wait for the join to complete before shutting down the node. In our tests we typically shut down an entire cluster at once, which makes it very likely for nodes to be joining while shutting down. This introduces a race condition where joinThread.interrupt can happen before the thread starts waiting on pings, which causes the shutdown logic to be slow. This commit improves this by repeatedly trying to stop the thread in smaller waits.

Another side effect of the change is that we are now more likely to ping ourselves while shutting down, which results in an ugly warn-level log. We now log all remote exceptions during pings at debug level.

Closes #8359
  • Loading branch information
bleskes committed Nov 6, 2014
1 parent c473976 commit 83d9dab
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 9 deletions.
17 changes: 8 additions & 9 deletions src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
Expand Up @@ -253,6 +253,7 @@ public void onFailure(String source, @org.elasticsearch.common.Nullable Throwabl


@Override @Override
protected void doStop() throws ElasticsearchException { protected void doStop() throws ElasticsearchException {
joinThreadControl.stop();
pingService.stop(); pingService.stop();
masterFD.stop("zen disco stop"); masterFD.stop("zen disco stop");
nodesFD.stop(); nodesFD.stop();
Expand Down Expand Up @@ -282,7 +283,6 @@ protected void doStop() throws ElasticsearchException {
} }
} }
} }
joinThreadControl.stop();
} }


@Override @Override
Expand Down Expand Up @@ -1354,15 +1354,14 @@ public void stop() {
running.set(false); running.set(false);
Thread joinThread = currentJoinThread.getAndSet(null); Thread joinThread = currentJoinThread.getAndSet(null);
if (joinThread != null) { if (joinThread != null) {
try { for (int i = 0; i < 10 && joinThread.isAlive(); i++) {
joinThread.interrupt(); joinThread.interrupt();
} catch (Exception e) { try {
// ignore joinThread.join(200);
} } catch (InterruptedException e) {
try { Thread.currentThread().interrupt();
joinThread.join(10000); return;
} catch (InterruptedException e) { }
Thread.currentThread().interrupt();
} }
} }
} }
Expand Down
Expand Up @@ -373,6 +373,9 @@ public void run() {
} catch (ConnectTransportException e) { } catch (ConnectTransportException e) {
// can't connect to the node - this is a more common path! // can't connect to the node - this is a more common path!
logger.trace("[{}] failed to connect to {}", e, sendPingsHandler.id(), finalNodeToSend); logger.trace("[{}] failed to connect to {}", e, sendPingsHandler.id(), finalNodeToSend);
} catch (RemoteTransportException e) {
// something went wrong on the other side
logger.debug("[{}] received a remote error as a response to ping {}", e, sendPingsHandler.id(), finalNodeToSend);
} catch (Throwable e) { } catch (Throwable e) {
logger.warn("[{}] failed send ping to {}", e, sendPingsHandler.id(), finalNodeToSend); logger.warn("[{}] failed send ping to {}", e, sendPingsHandler.id(), finalNodeToSend);
} finally { } finally {
Expand Down

0 comments on commit 83d9dab

Please sign in to comment.