elastic · DaveCTurner · Jul 7, 2020 · Jul 4, 2019 · Feb 22, 2020 · Feb 23, 2020
diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterInfo.java b/server/src/main/java/org/elasticsearch/cluster/ClusterInfo.java
@@ -44,9 +44,10 @@ public class ClusterInfo implements ToXContentFragment, Writeable {
     final ImmutableOpenMap<String, Long> shardSizes;
     public static final ClusterInfo EMPTY = new ClusterInfo();
     final ImmutableOpenMap<ShardRouting, String> routingToDataPath;
+    final ImmutableOpenMap<String, Boolean> nodeAllPathsWritable;
 
     protected ClusterInfo() {
-       this(ImmutableOpenMap.of(), ImmutableOpenMap.of(), ImmutableOpenMap.of(), ImmutableOpenMap.of());
+       this(ImmutableOpenMap.of(), ImmutableOpenMap.of(), ImmutableOpenMap.of(), ImmutableOpenMap.of(), ImmutableOpenMap.of());
     }
 
     /**
@@ -60,24 +61,27 @@ protected ClusterInfo() {
      */
     public ClusterInfo(ImmutableOpenMap<String, DiskUsage> leastAvailableSpaceUsage,
             ImmutableOpenMap<String, DiskUsage> mostAvailableSpaceUsage, ImmutableOpenMap<String, Long> shardSizes,
-            ImmutableOpenMap<ShardRouting, String> routingToDataPath) {
+            ImmutableOpenMap<ShardRouting, String> routingToDataPath, ImmutableOpenMap<String, Boolean> nodeAllPathsWritable) {
         this.leastAvailableSpaceUsage = leastAvailableSpaceUsage;
         this.shardSizes = shardSizes;
         this.mostAvailableSpaceUsage = mostAvailableSpaceUsage;
         this.routingToDataPath = routingToDataPath;
+        this.nodeAllPathsWritable = nodeAllPathsWritable;
     }
 
     public ClusterInfo(StreamInput in) throws IOException {
         Map<String, DiskUsage> leastMap = in.readMap(StreamInput::readString, DiskUsage::new);
         Map<String, DiskUsage> mostMap = in.readMap(StreamInput::readString, DiskUsage::new);
+        Map<String, Boolean> allPathsWritable = in.readMap(StreamInput::readString, StreamInput::readBoolean);
         Map<String, Long> sizeMap = in.readMap(StreamInput::readString, StreamInput::readLong);
         Map<ShardRouting, String> routingMap = in.readMap(ShardRouting::new, StreamInput::readString);
-
         ImmutableOpenMap.Builder<String, DiskUsage> leastBuilder = ImmutableOpenMap.builder();
         this.leastAvailableSpaceUsage = leastBuilder.putAll(leastMap).build();
         ImmutableOpenMap.Builder<String, DiskUsage> mostBuilder = ImmutableOpenMap.builder();
         this.mostAvailableSpaceUsage = mostBuilder.putAll(mostMap).build();
         ImmutableOpenMap.Builder<String, Long> sizeBuilder = ImmutableOpenMap.builder();
+        ImmutableOpenMap.Builder<String, Boolean> allPathsWritableBuilder = ImmutableOpenMap.builder();
+        this.nodeAllPathsWritable = allPathsWritableBuilder.putAll(allPathsWritable).build();
         this.shardSizes = sizeBuilder.putAll(sizeMap).build();
         ImmutableOpenMap.Builder<ShardRouting, String> routingBuilder = ImmutableOpenMap.builder();
         this.routingToDataPath = routingBuilder.putAll(routingMap).build();
@@ -95,6 +99,11 @@ public void writeTo(StreamOutput out) throws IOException {
             out.writeString(c.key);
             c.value.writeTo(out);
         }
+        out.writeVInt(this.nodeAllPathsWritable.size());
+        for (ObjectObjectCursor<String, Boolean> c : this.nodeAllPathsWritable) {
+            out.writeString(c.key);
+            out.writeBoolean(c.value);
+        }
         out.writeVInt(this.shardSizes.size());
         for (ObjectObjectCursor<String, Long> c : this.shardSizes) {
             out.writeString(c.key);
@@ -127,6 +136,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
                         }
                     }
                     builder.endObject(); // end "most_available"
+                    builder.field("all_path_writable", this.nodeAllPathsWritable.get(c.key));
                 }
                 builder.endObject(); // end $nodename
             }
@@ -161,6 +171,11 @@ public ImmutableOpenMap<String, DiskUsage> getNodeMostAvailableDiskUsages() {
         return this.mostAvailableSpaceUsage;
     }
 
+    /**
+     * Returns a mapping from node id to a flag indicating whether all of that node's paths are writable.
+     */
+    public ImmutableOpenMap<String, Boolean> getNodeAllPathsWritable() { return this.nodeAllPathsWritable; }
+
     /**
      * Returns the shard size for the given shard routing or <code>null</code> it that metric is not available.
      */

diff --git a/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java b/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java
@@ -44,6 +44,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
+import org.elasticsearch.monitor.fs.FsHealthService;
 import org.elasticsearch.monitor.fs.FsInfo;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.ReceiveTimeoutTransportException;
@@ -80,6 +81,7 @@ public class InternalClusterInfoService implements ClusterInfoService, LocalNode
 
     private volatile ImmutableOpenMap<String, DiskUsage> leastAvailableSpaceUsages;
     private volatile ImmutableOpenMap<String, DiskUsage> mostAvailableSpaceUsages;
+    private volatile ImmutableOpenMap<String, Boolean> allPathsWritable;
     private volatile ImmutableOpenMap<ShardRouting, String> shardRoutingToDataPath;
     private volatile ImmutableOpenMap<String, Long> shardSizes;
     private volatile boolean isMaster = false;
@@ -94,6 +96,7 @@ public InternalClusterInfoService(Settings settings, ClusterService clusterServi
         this.leastAvailableSpaceUsages = ImmutableOpenMap.of();
         this.mostAvailableSpaceUsages = ImmutableOpenMap.of();
         this.shardRoutingToDataPath = ImmutableOpenMap.of();
+        this.allPathsWritable = ImmutableOpenMap.of();
         this.shardSizes = ImmutableOpenMap.of();
         this.clusterService = clusterService;
         this.threadPool = threadPool;
@@ -105,16 +108,16 @@ public InternalClusterInfoService(Settings settings, ClusterService clusterServi
         clusterSettings.addSettingsUpdateConsumer(INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING, this::setFetchTimeout);
         clusterSettings.addSettingsUpdateConsumer(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING, this::setUpdateFrequency);
         clusterSettings.addSettingsUpdateConsumer(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,
-                                                  this::setEnabled);
+                                                  FsHealthService.ENABLED_SETTING, this::setEnabled);
 
         // Add InternalClusterInfoService to listen for Master changes
         this.clusterService.addLocalNodeMasterListener(this);
         // Add to listen for state changes (when nodes are added)
         this.clusterService.addListener(this);
     }
 
-    private void setEnabled(boolean enabled) {
-        this.enabled = enabled;
+    private void setEnabled(boolean diskThresholdEnabled, boolean fsHealthEnabled) {
+        this.enabled = diskThresholdEnabled || fsHealthEnabled;
     }
 
     private void setFetchTimeout(TimeValue fetchTimeout) {
@@ -200,7 +203,7 @@ public void clusterChanged(ClusterChangedEvent event) {
 
     @Override
     public ClusterInfo getClusterInfo() {
-        return new ClusterInfo(leastAvailableSpaceUsages, mostAvailableSpaceUsages, shardSizes, shardRoutingToDataPath);
+        return new ClusterInfo(leastAvailableSpaceUsages, mostAvailableSpaceUsages, shardSizes, shardRoutingToDataPath, allPathsWritable);
     }
 
     /**
@@ -242,7 +245,7 @@ public void run() {
      */
     protected CountDownLatch updateNodeStats(final ActionListener<NodesStatsResponse> listener) {
         final CountDownLatch latch = new CountDownLatch(1);
-        final NodesStatsRequest nodesStatsRequest = new NodesStatsRequest("data:true");
+        final NodesStatsRequest nodesStatsRequest = new NodesStatsRequest();
         nodesStatsRequest.clear();
         nodesStatsRequest.fs(true);
         nodesStatsRequest.timeout(fetchTimeout);
@@ -293,10 +296,12 @@ public final ClusterInfo refresh() {
             public void onResponse(NodesStatsResponse nodesStatsResponse) {
                 ImmutableOpenMap.Builder<String, DiskUsage> leastAvailableUsagesBuilder = ImmutableOpenMap.builder();
                 ImmutableOpenMap.Builder<String, DiskUsage> mostAvailableUsagesBuilder = ImmutableOpenMap.builder();
-                fillDiskUsagePerNode(logger, adjustNodesStats(nodesStatsResponse.getNodes()),
-                    leastAvailableUsagesBuilder, mostAvailableUsagesBuilder);
+                ImmutableOpenMap.Builder<String, Boolean> allPathsWritableBuilder = ImmutableOpenMap.builder();
+                fillDiskStatsPerNode(logger, adjustNodesStats(nodesStatsResponse.getNodes()), clusterService,
+                    leastAvailableUsagesBuilder, mostAvailableUsagesBuilder, allPathsWritableBuilder);
                 leastAvailableSpaceUsages = leastAvailableUsagesBuilder.build();
                 mostAvailableSpaceUsages = mostAvailableUsagesBuilder.build();
+                allPathsWritable = allPathsWritableBuilder.build();
             }
 
             @Override
@@ -396,51 +401,72 @@ static void buildShardLevelInfo(Logger logger, ShardStats[] stats, ImmutableOpen
         }
     }
 
-    static void fillDiskUsagePerNode(Logger logger, List<NodeStats> nodeStatsArray,
+    /**
+     * Populates per-node disk statistics from a list of node stats.
+     * <p>
+     * The writability flag is recorded for every node that returned FS stats. The least/most available disk usages
+     * (chosen by available bytes) are recorded for data nodes only, and a usage is skipped when its path reports a
+     * negative total size. Nodes without FS stats are logged and skipped.
+     *
+     * @param logger                  logger used for the warning and trace output
+     * @param nodeStatsArray          the per-node stats to process
+     * @param clusterService          used to look up which nodes are data nodes in the current cluster state
+     * @param newLeastAvaiableUsages  receives, per node id, the usage of the path with the least available space
+     * @param newMostAvaiableUsages   receives, per node id, the usage of the path with the most available space
+     * @param allPathsWritableBuilder receives, per node id, the writability flag taken from the node's total FS stats
+     */
+    static void fillDiskStatsPerNode(Logger logger, List<NodeStats> nodeStatsArray, ClusterService clusterService,
             ImmutableOpenMap.Builder<String, DiskUsage> newLeastAvaiableUsages,
-            ImmutableOpenMap.Builder<String, DiskUsage> newMostAvaiableUsages) {
+            ImmutableOpenMap.Builder<String, DiskUsage> newMostAvaiableUsages,
+            ImmutableOpenMap.Builder<String, Boolean> allPathsWritableBuilder) {
         for (NodeStats nodeStats : nodeStatsArray) {
             if (nodeStats.getFs() == null) {
                 logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().getName());
             } else {
                 FsInfo.Path leastAvailablePath = null;
                 FsInfo.Path mostAvailablePath = null;
-                for (FsInfo.Path info : nodeStats.getFs()) {
-                    if (leastAvailablePath == null) {
-                        assert mostAvailablePath == null;
-                        mostAvailablePath = leastAvailablePath = info;
-                    } else if (leastAvailablePath.getAvailable().getBytes() > info.getAvailable().getBytes()) {
-                        leastAvailablePath = info;
-                    } else if (mostAvailablePath.getAvailable().getBytes() < info.getAvailable().getBytes()) {
-                        mostAvailablePath = info;
-                    }
-                }
                 String nodeId = nodeStats.getNode().getId();
                 String nodeName = nodeStats.getNode().getName();
-                if (logger.isTraceEnabled()) {
-                    logger.trace("node: [{}], most available: total disk: {}," +
-                            " available disk: {} / least available: total disk: {}, available disk: {}",
+                Boolean allPathsWritable = nodeStats.getFs().getTotal().isWritable();
+                // Disk usages only matter for nodes that hold data. getMasterNodes() lists master-eligible nodes, so
+                // checking it would also drop combined master/data nodes; test for data nodes instead.
+                if (clusterService.state().getNodes().getDataNodes().containsKey(nodeId)) {
+                    for (FsInfo.Path info : nodeStats.getFs()) {
+                        if (leastAvailablePath == null) {
+                            assert mostAvailablePath == null;
+                            mostAvailablePath = leastAvailablePath = info;
+                        } else if (leastAvailablePath.getAvailable().getBytes() > info.getAvailable().getBytes()) {
+                            leastAvailablePath = info;
+                        } else if (mostAvailablePath.getAvailable().getBytes() < info.getAvailable().getBytes()) {
+                            mostAvailablePath = info;
+                        }
+                    }
+                    if (logger.isTraceEnabled()) {
+                        logger.trace("node: [{}], most available: total disk: {}," +
+                                " available disk: {} / least available: total disk: {}, available disk: {}",
-                            nodeId, mostAvailablePath.getTotal(), leastAvailablePath.getAvailable(),
+                            nodeId, mostAvailablePath.getTotal(), mostAvailablePath.getAvailable(),
                             leastAvailablePath.getTotal(), leastAvailablePath.getAvailable());
-                }
-                if (leastAvailablePath.getTotal().getBytes() < 0) {
-                    if (logger.isTraceEnabled()) {
-                        logger.trace("node: [{}] least available path has less than 0 total bytes of disk [{}], skipping",
+                    }
+                    if (leastAvailablePath.getTotal().getBytes() < 0) {
+                        if (logger.isTraceEnabled()) {
+                            logger.trace("node: [{}] least available path has less than 0 total bytes of disk [{}], skipping",
                                 nodeId, leastAvailablePath.getTotal().getBytes());
+                        }
+                    } else {
+                        newLeastAvaiableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, leastAvailablePath.getPath(),
+                            leastAvailablePath.getTotal().getBytes(), leastAvailablePath.getAvailable().getBytes()));
                     }
-                } else {
-                    newLeastAvaiableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, leastAvailablePath.getPath(),
-                        leastAvailablePath.getTotal().getBytes(), leastAvailablePath.getAvailable().getBytes()));
-                }
-                if (mostAvailablePath.getTotal().getBytes() < 0) {
-                    if (logger.isTraceEnabled()) {
-                        logger.trace("node: [{}] most available path has less than 0 total bytes of disk [{}], skipping",
+                    if (mostAvailablePath.getTotal().getBytes() < 0) {
+                        if (logger.isTraceEnabled()) {
+                            logger.trace("node: [{}] most available path has less than 0 total bytes of disk [{}], skipping",
                                 nodeId, mostAvailablePath.getTotal().getBytes());
+                        }
+                    } else {
+                        newMostAvaiableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, mostAvailablePath.getPath(),
+                            mostAvailablePath.getTotal().getBytes(), mostAvailablePath.getAvailable().getBytes()));
                     }
-                } else {
-                    newMostAvaiableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, mostAvailablePath.getPath(),
-                        mostAvailablePath.getTotal().getBytes(), mostAvailablePath.getAvailable().getBytes()));
                 }
+                allPathsWritableBuilder.put(nodeId, allPathsWritable);
 
             }
         }

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -30,6 +30,7 @@
 import org.elasticsearch.cluster.ClusterStateTaskConfig;
 import org.elasticsearch.cluster.ClusterStateUpdateTask;
 import org.elasticsearch.cluster.LocalClusterUpdateTask;
+import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState;
 import org.elasticsearch.cluster.coordination.CoordinationMetaData.VotingConfigExclusion;
@@ -67,6 +68,8 @@
 import org.elasticsearch.discovery.PeerFinder;
 import org.elasticsearch.discovery.SeedHostsProvider;
 import org.elasticsearch.discovery.SeedHostsResolver;
+import org.elasticsearch.monitor.fs.FsReadOnlyMonitor;
+import org.elasticsearch.monitor.fs.FsService;
 import org.elasticsearch.threadpool.Scheduler;
 import org.elasticsearch.threadpool.ThreadPool.Names;
 import org.elasticsearch.transport.TransportResponse.Empty;
@@ -149,6 +152,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
     private Optional<Join> lastJoin;
     private JoinHelper.JoinAccumulator joinAccumulator;
     private Optional<CoordinatorPublication> currentPublication = Optional.empty();
+    private final FsService fsService;
+    private final FsReadOnlyMonitor fsReadOnlyMonitor;
 
     /**
      * @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
@@ -158,7 +163,8 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
                        NamedWriteableRegistry namedWriteableRegistry, AllocationService allocationService, MasterService masterService,
                        Supplier<CoordinationState.PersistedState> persistedStateSupplier, SeedHostsProvider seedHostsProvider,
                        ClusterApplier clusterApplier, Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators, Random random,
-                       RerouteService rerouteService, ElectionStrategy electionStrategy) {
+                       RerouteService rerouteService, ElectionStrategy electionStrategy, FsService fsService,
+                       ClusterInfoService clusterInfoService) {
         this.settings = settings;
         this.transportService = transportService;
         this.masterService = masterService;
@@ -168,7 +174,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
         this.electionStrategy = electionStrategy;
         this.joinHelper = new JoinHelper(settings, allocationService, masterService, transportService,
             this::getCurrentTerm, this::getStateForMasterService, this::handleJoinRequest, this::joinLeaderInTerm, this.onJoinValidators,
-            rerouteService);
+            rerouteService, fsService);
         this.persistedStateSupplier = persistedStateSupplier;
         this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings);
         this.lastKnownLeader = Optional.empty();
@@ -178,7 +184,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
         this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
         this.random = random;
         this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
-        this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy);
+        this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy, fsService);
         configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider);
         this.peerFinder = new CoordinatorPeerFinder(settings, transportService,
             new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver);
@@ -196,6 +202,10 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
             transportService::getLocalNode);
         this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(settings, this::getClusterFormationState,
             transportService.getThreadPool(), joinHelper::logLastFailedJoinAttempt);
+        //TODO check if FsReadOnlyMonitor and LagDetector can be implemented as a part of a common interface
+        this.fsReadOnlyMonitor = new FsReadOnlyMonitor(settings, clusterSettings, this::getStateForMasterService, transportService::getLocalNode,
+            this::removeNode, clusterInfoService);
+        this.fsService = fsService;
     }
 
     private ClusterFormationState getClusterFormationState() {
@@ -1171,6 +1181,14 @@ public void run() {
                             return;
                         }
 
+                        // Do not start a pre-voting round while the local filesystem is reported as not writable.
+                        // Boolean.FALSE.equals(...) avoids an identity comparison on a possibly-boxed value.
+                        if (Boolean.FALSE.equals(fsService.stats().getTotal().isWritable())) {
+                            logger.debug("skip prevoting as local node is not writable: {}",
+                                lastAcceptedState.coordinationMetaData());
+                            return;
+                        }
+
                         if (prevotingRound != null) {
                             prevotingRound.close();
                         }