From 373f437f33c1a78a3f6cba460d8da2f57908582e Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Fri, 19 Sep 2025 15:58:43 +0200 Subject: [PATCH] Fix race in FileSettingsServiceIT.testSettingsAppliedOnStart (#134368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was failing very very rarely due to unfortunate timing conditions. Cluster state changes are applied to all nodes prior to being published on the master node itself. However, the cluster state listener was previously attached to the data node, allowing for a very short time window where the state update wasn't visible on the master node itself when checking in `assertClusterStateSaveOK`. This changes the test to attach the listener to the master node itself preventing above condition. I was initially worried it might be attached too late in cases, but I couldn't reproduce any more issues this way. > According to the dashboard, this started to fail on Monday (13/07). It definitely does not look like a test failure, so I'm assigning a medium priority, which we could raise if we discover this is a new bug. I couldn't find any related commit that might have caused this. Still wondering why this started failing around that time 🤔 Fixes #131210 (cherry picked from commit a29392c54def8d96c5a969810554cede95309242) # Conflicts: # muted-tests.yml --- .../service/FileSettingsServiceIT.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java b/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java index b86031ce96bf3..b1d335d0d79f2 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java @@ -188,7 +188,7 @@ public void clusterChanged(ClusterChangedEvent event) { return new Tuple<>(savedClusterState, metadataVersion); } - private Tuple setupClusterStateListener(String node, long version) { + private Tuple setupClusterStateListener(String node, long fileSettingsVersion) { ClusterService clusterService = internalCluster().clusterService(node); CountDownLatch savedClusterState = new CountDownLatch(1); AtomicLong metadataVersion = new AtomicLong(-1); @@ -196,10 +196,15 @@ private Tuple setupClusterStateListener(String node, @Override public void clusterChanged(ClusterChangedEvent event) { ReservedStateMetadata reservedState = event.state().metadata().reservedStateMetadata().get(FileSettingsService.NAMESPACE); - if (reservedState != null && reservedState.version() == version) { + if (reservedState != null && reservedState.version() == fileSettingsVersion) { clusterService.removeListener(this); metadataVersion.set(event.state().metadata().version()); savedClusterState.countDown(); + logger.info( + "done waiting for file settings [version: {}, metadata version: {}]", + event.state().version(), + event.state().metadata().version() + ); } } }); @@ -265,15 +270,16 @@ public void testSettingsAppliedOnStart() throws Exception { FileSettingsService dataFileSettingsService = internalCluster().getInstance(FileSettingsService.class, dataNode); assertFalse(dataFileSettingsService.watching()); - var savedClusterState = setupClusterStateListener(dataNode, versionCounter.incrementAndGet()); + long expectedVersion = versionCounter.incrementAndGet(); // In internal cluster tests, the nodes share the config directory, so when we write with the data node path // the master will pick it up on start - writeJSONFile(dataNode, testJSON, logger, versionCounter.get()); + writeJSONFile(dataNode, testJSON, logger, expectedVersion); logger.info("--> start master node"); final String masterNode = internalCluster().startMasterOnlyNode(); awaitMasterNode(internalCluster().getNonMasterNodeName(), masterNode); + var savedClusterState = setupClusterStateListener(masterNode, expectedVersion); FileSettingsService masterFileSettingsService = internalCluster().getInstance(FileSettingsService.class, masterNode);