Skip to content

Commit

Permalink
Capture thread dump on ShardLockObtainFailedException (#93458)
Browse files Browse the repository at this point in the history
We sometimes see a `ShardLockObtainFailedException` when a shard failed
to shut down as fast as we expected, often because a node left and
rejoined the cluster. Sometimes this is because it was held open by
ongoing scrolls or PITs, but other times it may be because the shutdown
process itself is too slow. With this commit we add the ability to
capture and log a thread dump at the time of the failure to give us more
information about where the shutdown process might be running slowly.

Relates #93226
  • Loading branch information
DaveCTurner committed Feb 2, 2023
1 parent 7adeb48 commit 4c68382
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 43 deletions.
36 changes: 36 additions & 0 deletions docs/reference/modules/discovery/fault-detection.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,39 @@ checks are `transport_worker` and `cluster_coordination` threads, for which
there should never be a long wait. There may also be evidence of long waits for
threads in the {es} logs. Refer to <<modules-network-threading-model>> for more
information.

===== Diagnosing `ShardLockObtainFailedException` failures

If a node leaves and rejoins the cluster then {es} will usually shut down and
re-initialize its shards. If the shards do not shut down quickly enough then
{es} may fail to re-initialize them due to a `ShardLockObtainFailedException`.

To gather more information about the reason for shards shutting down slowly,
configure the following logger:

[source,yaml]
----
logger.org.elasticsearch.env.NodeEnvironment: DEBUG
----

When this logger is enabled, {es} will attempt to run the
<<cluster-nodes-hot-threads>> API whenever it encounters a
`ShardLockObtainFailedException`. The results are gzip-compressed, base64-encoded, and
split into chunks to avoid truncation:

[source,text]
----
[DEBUG][o.e.e.NodeEnvironment ] [master] hot threads while failing to obtain shard lock for [index][0] [part 1]: H4sIAAAAAAAA/x...
[DEBUG][o.e.e.NodeEnvironment ] [master] hot threads while failing to obtain shard lock for [index][0] [part 2]: p7x3w1hmOQVtuV...
[DEBUG][o.e.e.NodeEnvironment ] [master] hot threads while failing to obtain shard lock for [index][0] [part 3]: v7uTboMGDbyOy+...
[DEBUG][o.e.e.NodeEnvironment ] [master] hot threads while failing to obtain shard lock for [index][0] [part 4]: 4tse0RnPnLeDNN...
[DEBUG][o.e.e.NodeEnvironment ] [master] hot threads while failing to obtain shard lock for [index][0] (gzip compressed, base64-encoded, and split into 4 parts on preceding log lines)
----

To reconstruct the output, strip the log prefixes, concatenate the parts in
order, base64-decode the result, and decompress it using `gzip`. For instance,
on Unix-like systems:

[source,sh]
----
cat shardlock.log | sed -e 's/.*://' | base64 --decode | gzip --decompress
----
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public enum ReferenceDocs {
DISCOVERY_TROUBLESHOOTING,
UNSTABLE_CLUSTER_TROUBLESHOOTING,
LAGGING_NODE_TROUBLESHOOTING,
SHARD_LOCK_TROUBLESHOOTING,
CONCURRENT_REPOSITORY_WRITERS,
ARCHIVE_INDICES,
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
Expand Down
57 changes: 46 additions & 11 deletions server/src/main/java/org/elasticsearch/env/NodeEnvironment.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

package org.elasticsearch.env;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.util.Strings;
Expand All @@ -25,8 +26,10 @@
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.logging.ChunkedLoggingStream;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
Expand All @@ -49,12 +52,15 @@
import org.elasticsearch.index.store.FsDirectoryFactory;
import org.elasticsearch.monitor.fs.FsInfo;
import org.elasticsearch.monitor.fs.FsProbe;
import org.elasticsearch.monitor.jvm.HotThreads;
import org.elasticsearch.monitor.jvm.JvmInfo;
import org.elasticsearch.xcontent.NamedXContentRegistry;

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.DirectoryStream;
import java.nio.file.FileStore;
Expand Down Expand Up @@ -926,6 +932,38 @@ public Set<ShardId> lockedShards() {
}
}

// Throttle for the hot-threads capture below: at most one thread dump is taken per minute.
// The semaphore ensures only one caller at a time attempts a capture; concurrent callers skip.
private final Semaphore shardLockHotThreadsPermit = new Semaphore(1);
// Earliest System.nanoTime() at which the next thread dump may be captured.
// Only read/written while holding shardLockHotThreadsPermit.
private long nextShardLockHotThreadsNanos = Long.MIN_VALUE;

/**
 * Best-effort capture of a thread dump when a shard lock could not be obtained, to help
 * diagnose why the shard is shutting down slowly. The dump is written at DEBUG level via a
 * chunked logging stream (compressed, encoded, and split across multiple log lines), so this
 * is a no-op unless DEBUG logging is enabled for this logger. Failures to capture or write
 * the dump are logged at ERROR and never propagated to the caller.
 *
 * @param shardId the shard whose lock could not be obtained; included in the log prefix
 * @param message the lock-failure detail message; included in the log prefix
 */
private void maybeLogThreadDump(ShardId shardId, String message) {
// Cheap early exit: the whole capture is pointless if DEBUG output would be discarded.
if (logger.isDebugEnabled() == false) {
return;
}

final var prefix = format("hot threads while failing to obtain shard lock for %s: %s", shardId, message);
// tryAcquire is non-blocking: if another thread is already capturing a dump, skip entirely.
if (shardLockHotThreadsPermit.tryAcquire()) {
try {
final var now = System.nanoTime();
// Rate-limit: skip if the previous capture was less than 60 seconds ago.
if (now <= nextShardLockHotThreadsNanos) {
return;
}
nextShardLockHotThreadsNanos = now + TimeUnit.SECONDS.toNanos(60);
// Capture up to 500 threads, including idle ones, for a complete picture of the node.
final var hotThreads = new HotThreads().busiestThreads(500).ignoreIdleThreads(false).detect();
try (
var stream = ChunkedLoggingStream.create(logger, Level.DEBUG, prefix, ReferenceDocs.SHARD_LOCK_TROUBLESHOOTING);
var writer = new OutputStreamWriter(stream, StandardCharsets.UTF_8)
) {
writer.write(hotThreads);
}
} catch (Exception e) {
// Best effort only: diagnostics must never turn the lock failure into a different failure.
logger.error(format("could not obtain %s", prefix), e);
} finally {
shardLockHotThreadsPermit.release();
}
}
}

private final class InternalShardLock {
/*
* This class holds a mutex for exclusive access and timeout / wait semantics
Expand Down Expand Up @@ -975,18 +1013,15 @@ void acquire(long timeoutInMillis, final String details) throws ShardLockObtainF
setDetails(details);
} else {
final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
throw new ShardLockObtainFailedException(
shardId,
"obtaining shard lock for ["
+ details
+ "] timed out after ["
+ timeoutInMillis
+ "ms], lock already held for ["
+ lockDetails.v2()
+ "] with age ["
+ TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1())
+ "ms]"
final var message = format(
"obtaining shard lock for [%s] timed out after [%dms], lock already held for [%s] with age [%dms]",
details,
timeoutInMillis,
lockDetails.v2(),
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1())
);
maybeLogThreadDump(shardId, message);
throw new ShardLockObtainFailedException(shardId, message);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"DISCOVERY_TROUBLESHOOTING": "discovery-troubleshooting.html",
"UNSTABLE_CLUSTER_TROUBLESHOOTING": "cluster-fault-detection.html#cluster-fault-detection-troubleshooting",
"LAGGING_NODE_TROUBLESHOOTING": "cluster-fault-detection.html#_diagnosing_lagging_nodes",
"SHARD_LOCK_TROUBLESHOOTING": "cluster-fault-detection.html#_diagnosing_shardlockobtainfailedexception_failures",
"CONCURRENT_REPOSITORY_WRITERS": "add-repository.html",
"ARCHIVE_INDICES": "archive-indices.html"
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/
package org.elasticsearch.env;

import org.apache.logging.log4j.Level;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
Expand Down Expand Up @@ -34,7 +35,9 @@
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.MockLogAppender;
import org.elasticsearch.test.NodeRoles;
import org.elasticsearch.test.junit.annotations.TestLogging;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -114,44 +117,63 @@ public void testSegmentInfosTracing() {
}
}

// using a literal string here because the logger is mentioned in the docs, and therefore must only be changed with care
private static final String NODE_ENVIRONMENT_LOGGER_NAME = "org.elasticsearch.env.NodeEnvironment";

@TestLogging(reason = "test includes assertions about DEBUG logging", value = NODE_ENVIRONMENT_LOGGER_NAME + ":DEBUG")
public void testShardLock() throws Exception {
final NodeEnvironment env = newNodeEnvironment();
try (var env = newNodeEnvironment()) {

Index index = new Index("foo", "fooUUID");
ShardLock fooLock = env.shardLock(new ShardId(index, 0), "1");
assertEquals(new ShardId(index, 0), fooLock.getShardId());
Index index = new Index("foo", "fooUUID");

try {
env.shardLock(new ShardId(index, 0), "2");
fail("shard is locked");
} catch (ShardLockObtainFailedException ex) {
// expected
}
for (Path path : env.indexPaths(index)) {
Files.createDirectories(path.resolve("0"));
Files.createDirectories(path.resolve("1"));
}
try {
env.lockAllForIndex(index, idxSettings, "3", randomIntBetween(0, 10));
fail("shard 0 is locked");
} catch (ShardLockObtainFailedException ex) {
// expected
}
var appender = new MockLogAppender();
appender.addExpectation(
new MockLogAppender.SeenEventExpectation(
"hot threads logging",
NODE_ENVIRONMENT_LOGGER_NAME,
Level.DEBUG,
"hot threads while failing to obtain shard lock for [foo][0]: obtaining shard lock for [2] timed out after *"
)
);
appender.addExpectation(
new MockLogAppender.UnseenEventExpectation(
"second attempt should be suppressed due to throttling",
NODE_ENVIRONMENT_LOGGER_NAME,
Level.DEBUG,
"hot threads while failing to obtain shard lock for [foo][0]: obtaining shard lock for [3] timed out after *"
)
);

fooLock.close();
// can lock again?
env.shardLock(new ShardId(index, 0), "4").close();
try (var ignored = appender.capturing(NodeEnvironment.class); var lock = env.shardLock(new ShardId(index, 0), "1")) {
assertEquals(new ShardId(index, 0), lock.getShardId());

List<ShardLock> locks = env.lockAllForIndex(index, idxSettings, "5", randomIntBetween(0, 10));
try {
env.shardLock(new ShardId(index, 0), "6");
fail("shard is locked");
} catch (ShardLockObtainFailedException ex) {
// expected
expectThrows(ShardLockObtainFailedException.class, () -> env.shardLock(new ShardId(index, 0), "2"));

for (Path path : env.indexPaths(index)) {
Files.createDirectories(path.resolve("0"));
Files.createDirectories(path.resolve("1"));
}
expectThrows(
ShardLockObtainFailedException.class,
() -> env.lockAllForIndex(index, idxSettings, "3", randomIntBetween(0, 10))
);

appender.assertAllExpectationsMatched();
}

// can lock again?
env.shardLock(new ShardId(index, 0), "4").close();

List<ShardLock> locks = new ArrayList<>();
try {
locks.addAll(env.lockAllForIndex(index, idxSettings, "5", randomIntBetween(0, 10)));
expectThrows(ShardLockObtainFailedException.class, () -> env.shardLock(new ShardId(index, 0), "6"));
} finally {
IOUtils.close(locks);
}

assertTrue("LockedShards: " + env.lockedShards(), env.lockedShards().isEmpty());
}
IOUtils.close(locks);
assertTrue("LockedShards: " + env.lockedShards(), env.lockedShards().isEmpty());
env.close();
}

public void testAvailableIndexFolders() throws Exception {
Expand Down

0 comments on commit 4c68382

Please sign in to comment.