Skip to content

Commit

Permalink
Fix deadlock bug exposed by a test (#89934) (#89942)
Browse files Browse the repository at this point in the history
A new test exposed a very rare bug where the
file settings service was in the middle of processing
the file when the node closed. This terminated the
cluster state update task, but nobody unlocked the
latch await. The fix allows the stop operation to
properly terminate the watcher thread.
  • Loading branch information
grcevski committed Sep 8, 2022
1 parent 3a6152f commit ffc9ff1
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 5 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/89934.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 89934
summary: Fix deadlock bug exposed by the test
area: Infra/Core
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class FileSettingsService extends AbstractLifecycleComponent implements C

private WatchService watchService; // null;
private CountDownLatch watcherThreadLatch;
private volatile CountDownLatch processingLatch;

private volatile FileUpdateState fileUpdateState = null;
private volatile WatchKey settingsDirWatchKey = null;
Expand Down Expand Up @@ -312,7 +313,15 @@ synchronized void startWatcher(ClusterState clusterState, boolean onStartup) {
settingsDirWatchKey = enableSettingsWatcher(settingsDirWatchKey, settingsDir);

if (watchedFileChanged(path)) {
processFileSettings(path, (e) -> logger.error("Error processing operator settings json file", e)).await();
processingLatch = processFileSettings(
path,
(e) -> logger.error("Error processing operator settings json file", e)
);
// After we get and set the processing latch, we need to check if stop wasn't
// invoked in the meantime. Stop will invalidate all watch keys.
if (configDirWatchKey != null) {
processingLatch.await();
}
}
} catch (IOException e) {
logger.warn("encountered I/O error while watching file settings", e);
Expand All @@ -339,6 +348,9 @@ synchronized void stopWatcher() {
cleanupWatchKeys();
fileUpdateState = null;
watchService.close();
if (processingLatch != null) {
processingLatch.countDown();
}
if (watcherThreadLatch != null) {
watcherThreadLatch.await();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class FileSettingsServiceTests extends ESTestCase {
private Environment env;
private ClusterService clusterService;
private FileSettingsService fileSettingsService;
private ReservedClusterStateService controller;
private ThreadPool threadpool;

@Before
Expand Down Expand Up @@ -86,10 +87,7 @@ public void setUp() throws Exception {

ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);

ReservedClusterStateService controller = new ReservedClusterStateService(
clusterService,
List.of(new ReservedClusterSettingsAction(clusterSettings))
);
controller = new ReservedClusterStateService(clusterService, List.of(new ReservedClusterSettingsAction(clusterSettings)));

fileSettingsService = new FileSettingsService(clusterService, controller, env);
}
Expand Down Expand Up @@ -217,4 +215,93 @@ public void testInitialFile() throws Exception {
service.stop();
service.close();
}

@SuppressWarnings("unchecked")
public void testStopWorksInMiddleOfProcessing() throws Exception {
var spiedController = spy(controller);
var fsService = new FileSettingsService(clusterService, spiedController, env);

FileSettingsService service = spy(fsService);
CountDownLatch processFileLatch = new CountDownLatch(1);
CountDownLatch deadThreadLatch = new CountDownLatch(1);

doAnswer((Answer<Void>) invocation -> {
processFileLatch.countDown();
new Thread(() -> {
// Simulate a thread that never comes back and decrements the
// countdown latch in FileSettingsService.processFileSettings
try {
deadThreadLatch.await();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}).start();
return null;
}).when(spiedController).process(any(String.class), any(XContentParser.class), any(Consumer.class));

service.start();
assertTrue(service.watching());

Files.createDirectories(service.operatorSettingsDir());

// Make some fake settings file to cause the file settings service to process it
Files.write(service.operatorSettingsFile(), "{}".getBytes(StandardCharsets.UTF_8));

// we need to wait a bit, on MacOS it may take up to 10 seconds for the Java watcher service to notice the file,
// on Linux is instantaneous. Windows is instantaneous too.
processFileLatch.await(30, TimeUnit.SECONDS);

// Stopping the service should interrupt the watcher thread, we should be able to stop
service.stop();
assertFalse(service.watching());
service.close();
// let the deadlocked thread end, so we can cleanly exit the test
deadThreadLatch.countDown();
}

@SuppressWarnings("unchecked")
public void testStopWorksIfProcessingDidntReturnYet() throws Exception {
var spiedController = spy(controller);
var fsService = new FileSettingsService(clusterService, spiedController, env);

FileSettingsService service = spy(fsService);
CountDownLatch processFileLatch = new CountDownLatch(1);
CountDownLatch deadThreadLatch = new CountDownLatch(1);

doAnswer((Answer<Void>) invocation -> {
processFileLatch.countDown();
// allow the other thread to continue, but hold on a bit to avoid
// setting the count-down latch in the main watcher loop.
Thread.sleep(1_000);
new Thread(() -> {
// Simulate a thread that never comes back and decrements the
// countdown latch in FileSettingsService.processFileSettings
try {
deadThreadLatch.await();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}).start();
return null;
}).when(spiedController).process(any(String.class), any(XContentParser.class), any(Consumer.class));

service.start();
assertTrue(service.watching());

Files.createDirectories(service.operatorSettingsDir());

// Make some fake settings file to cause the file settings service to process it
Files.write(service.operatorSettingsFile(), "{}".getBytes(StandardCharsets.UTF_8));

// we need to wait a bit, on MacOS it may take up to 10 seconds for the Java watcher service to notice the file,
// on Linux is instantaneous. Windows is instantaneous too.
processFileLatch.await(30, TimeUnit.SECONDS);

// Stopping the service should interrupt the watcher thread, we should be able to stop
service.stop();
assertFalse(service.watching());
service.close();
// let the deadlocked thread end, so we can cleanly exit the test
deadThreadLatch.countDown();
}
}

0 comments on commit ffc9ff1

Please sign in to comment.