Skip to content

Commit

Permalink
[Broker] Make health check fail if dead locked threads are detected (a…
Browse files Browse the repository at this point in the history
…pache#15155)

* [Broker] Make health check fail if dead locked threads are detected

* Add unit test for detecting a dead lock

* Use lockInterruptibly to unlock the deadlock and wait for threads to finish

* Add test for testing the deadlock detection overhead

(cherry picked from commit df0c110)
  • Loading branch information
lhotari committed Apr 22, 2022
1 parent f5adc17 commit 6efc6a6
Show file tree
Hide file tree
Showing 4 changed files with 374 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ static String buildThreadDump() {

static String buildDeadlockInfo() {
ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
long[] threadIds = threadBean.findMonitorDeadlockedThreads();
long[] threadIds = threadBean.findDeadlockedThreads();
if (threadIds != null && threadIds.length > 0) {
StringWriter stringWriter = new StringWriter();
PrintWriter out = new PrintWriter(stringWriter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,17 @@

import static org.apache.pulsar.broker.service.BrokerService.BROKER_SERVICE_CONFIGURATION_PATH;

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledFuture;
import java.util.stream.Collectors;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.TimeUnit;

Expand Down Expand Up @@ -58,6 +63,7 @@
import org.apache.pulsar.common.policies.data.NamespaceOwnershipStatus;
import org.apache.pulsar.common.util.ObjectMapperFactory;
import org.apache.pulsar.zookeeper.ZooKeeperDataCache;
import org.apache.pulsar.common.util.ThreadDumpUtil;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.ZooDefs;
import org.slf4j.Logger;
Expand All @@ -75,6 +81,10 @@
public class BrokersBase extends AdminResource {
private static final Logger LOG = LoggerFactory.getLogger(BrokersBase.class);
private int serviceConfigZkVersion = -1;
// log a full thread dump when a deadlock is detected in healthcheck once every 10 minutes
// to prevent excessive logging
private static final long LOG_THREADDUMP_INTERVAL_WHEN_DEADLOCK_DETECTED = 600000L;
private volatile long threadDumpLoggedTimestamp;

@GET
@Path("/{cluster}")
Expand Down Expand Up @@ -298,6 +308,9 @@ public void isReady(@Suspended AsyncResponse asyncResponse) {
@ApiResponse(code = 500, message = "Internal server error")})
public void healthcheck(@Suspended AsyncResponse asyncResponse) throws Exception {
validateSuperUserAccess();

checkDeadlockedThreads();

String heartbeatNamespace = NamespaceService.getHeartbeatNamespace(
pulsar().getAdvertisedAddress(), pulsar().getConfiguration());
String topic = String.format("persistent://%s/healthcheck", heartbeatNamespace);
Expand Down Expand Up @@ -374,6 +387,26 @@ public void healthcheck(@Suspended AsyncResponse asyncResponse) throws Exception
});
}

private void checkDeadlockedThreads() {
ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
long[] threadIds = threadBean.findDeadlockedThreads();
if (threadIds != null && threadIds.length > 0) {
ThreadInfo[] threadInfos = threadBean.getThreadInfo(threadIds, false, false);
String threadNames = Arrays.stream(threadInfos)
.map(threadInfo -> threadInfo.getThreadName() + "(tid=" + threadInfo.getThreadId() + ")").collect(
Collectors.joining(", "));
if (System.currentTimeMillis() - threadDumpLoggedTimestamp
> LOG_THREADDUMP_INTERVAL_WHEN_DEADLOCK_DETECTED) {
threadDumpLoggedTimestamp = System.currentTimeMillis();
LOG.error("Deadlocked threads detected. {}\n{}", threadNames,
ThreadDumpUtil.buildThreadDiagnosticString());
} else {
LOG.error("Deadlocked threads detected. {}", threadNames);
}
throw new IllegalStateException("Deadlocked threads detected. " + threadNames);
}
}

private void healthcheckReadLoop(CompletableFuture<Reader<String>> readerFuture,
CompletableFuture<?> completablePromise,
String messageStr) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pulsar.broker.admin;

import com.google.common.collect.Sets;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.time.Duration;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Phaser;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.pulsar.broker.auth.MockedPulsarServiceBaseTest;
import org.apache.pulsar.client.admin.PulsarAdminException;
import org.apache.pulsar.common.policies.data.ClusterData;
import org.apache.pulsar.common.policies.data.TenantInfo;
import org.apache.pulsar.compaction.Compactor;
import org.awaitility.Awaitility;
import org.junit.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;

@Test(groups = "broker-admin")
@Slf4j
public class AdminApiHealthCheckTest extends MockedPulsarServiceBaseTest {

private final ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();

@BeforeMethod
@Override
public void setup() throws Exception {
resetConfig();
super.internalSetup();
ClusterData clusterData = new ClusterData();
clusterData.setServiceUrl(pulsar.getWebServiceAddress());
admin.clusters().createCluster("test", clusterData);
TenantInfo tenantInfo = new TenantInfo(
Sets.newHashSet("role1", "role2"), Sets.newHashSet("test"));
admin.tenants().createTenant("pulsar", tenantInfo);
admin.namespaces().createNamespace("pulsar/system", Sets.newHashSet("test"));
admin.tenants().createTenant("public", tenantInfo);
admin.namespaces().createNamespace("public/default", Sets.newHashSet("test"));
}

@AfterMethod(alwaysRun = true)
@Override
public void cleanup() throws Exception {
super.internalCleanup();
}

@Test
public void testHealthCheckup() throws Exception {
final int times = 30;
CompletableFuture<Void> future = new CompletableFuture<>();
pulsar.getExecutor().execute(() -> {
try {
for (int i = 0; i < times; i++) {
admin.brokers().healthcheck();
}
future.complete(null);
}catch (PulsarAdminException e) {
future.completeExceptionally(e);
}
});
for (int i = 0; i < times; i++) {
admin.brokers().healthcheck();
}
// To ensure we don't have any subscription
final String testHealthCheckTopic = String.format("persistent://pulsar/test/localhost:%s/healthcheck",
pulsar.getConfig().getWebServicePort().get());
Awaitility.await().untilAsserted(() -> {
Assert.assertFalse(future.isCompletedExceptionally());
});
Awaitility.await().untilAsserted(() ->
Assert.assertTrue(admin.topics()
.getSubscriptions(testHealthCheckTopic).stream()
// All system topics are using compaction, even though is not explicitly set in the policies.
.filter(v -> !v.equals(Compactor.COMPACTION_SUBSCRIPTION))
.findAny()
.isPresent()
));
}

@Test(expectedExceptions= PulsarAdminException.class)
public void testHealthCheckupDetectsDeadlock() throws Exception {
// simulate a deadlock in the Test JVM
// the broker used in unit tests runs in the test JVM and the
// healthcheck implementation should detect this deadlock
Lock lock1 = new ReentrantReadWriteLock().writeLock();
Lock lock2 = new ReentrantReadWriteLock().writeLock();
final Phaser phaser = new Phaser(3);
Thread thread1=new Thread(() -> {
phaser.arriveAndAwaitAdvance();
try {
deadlock(lock1, lock2, 1000L);
} finally {
phaser.arriveAndDeregister();
}
}, "deadlockthread-1");
Thread thread2=new Thread(() -> {
phaser.arriveAndAwaitAdvance();
try {
deadlock(lock2, lock1, 2000L);
} finally {
phaser.arriveAndDeregister();
}
}, "deadlockthread-2");
thread1.start();
thread2.start();
phaser.arriveAndAwaitAdvance();
Thread.sleep(5000L);

try {
admin.brokers().healthcheck();
} finally {
// unlock the deadlock
thread1.interrupt();
thread2.interrupt();
// wait for deadlock threads to finish
phaser.arriveAndAwaitAdvance();
// wait for deadlocked status to clear before continuing
Awaitility.await().atMost(Duration.ofSeconds(10))
.until(() -> threadBean.findDeadlockedThreads() == null);
}
}

private void deadlock(Lock lock1, Lock lock2, long millis) {
try {
lock1.lockInterruptibly();
try {
Thread.sleep(millis);
lock2.lockInterruptibly();
lock2.unlock();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
lock1.unlock();
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}

@Test(timeOut = 5000L)
public void testDeadlockDetectionOverhead() {
for (int i=0; i < 1000; i++) {
long[] threadIds = threadBean.findDeadlockedThreads();
// assert that there's no deadlock
Assert.assertNull(threadIds);
}
}
}
Loading

0 comments on commit 6efc6a6

Please sign in to comment.