
Adds resiliency to read-only filesystems #45286 #52680

Merged — 50 commits merged on Jul 7, 2020
Changes from 7 commits
Commits (50)
569e8cc
Merge pull request #2 from elastic/master
Bukhtawar Jul 4, 2019
64815f1
Merge remote-tracking branch 'upstream/master'
Bukhtawar Feb 22, 2020
b598944
[Initial DRAFT] Adds a FsHealthService that periodically tries to wr…
Bukhtawar Feb 23, 2020
d4fb892
Test case addition and PR comments
Bukhtawar Mar 25, 2020
38f1a4e
Merge remote-tracking branch 'upstream/master'
Bukhtawar Mar 25, 2020
f3ac906
Merge branch 'master' into ro-fs-handling
Bukhtawar Mar 25, 2020
79948f3
Changes for FsHealthService and tests
Bukhtawar Mar 25, 2020
20d9ba2
Review comments for simplication and better tests
Bukhtawar May 3, 2020
fa3ed38
Merge remote-tracking branch 'upstream/master'
Bukhtawar May 3, 2020
1646319
Merge branch 'master' into ro-fs-handling
Bukhtawar May 3, 2020
5305ebb
Fixing tests and check styles
Bukhtawar May 3, 2020
26fbce7
FsHealthService comments on slow IO
Bukhtawar May 5, 2020
8a86051
Restricting FS health checks to IOExceptions
Bukhtawar May 11, 2020
c9dd1a7
Addressing comments on logging and tests
Bukhtawar May 20, 2020
c99a68e
Minor edits
Bukhtawar May 20, 2020
545eaf5
Merge branch 'master' into ro-fs-handling
Bukhtawar May 27, 2020
86fa7c9
Updated the exception id
Bukhtawar May 27, 2020
8102c81
Merge branch 'master' into ro-fs-handling
Bukhtawar Jun 4, 2020
043db93
Fix merge conflict
DaveCTurner Jun 16, 2020
bbf5517
Fix spacing in StatusInfo#toString
DaveCTurner Jun 18, 2020
1459937
Tidy 'skip prevoting' log message
DaveCTurner Jun 18, 2020
8eb5e20
Tidy response messages in FollowersChecker
DaveCTurner Jun 18, 2020
2095d82
Tidy log message in JoinHelper
DaveCTurner Jun 18, 2020
39a0565
Tidy message in PreVoteCollector
DaveCTurner Jun 18, 2020
136bc44
Tidy info messages
DaveCTurner Jun 18, 2020
1ab13b2
Tidy tracing messages
DaveCTurner Jun 18, 2020
4143f8f
Tidy warn/error messages
DaveCTurner Jun 18, 2020
1d9a7ab
Fix up tests
DaveCTurner Jun 18, 2020
f222529
Fix too-short delay
DaveCTurner Jun 18, 2020
befd822
Minor fixes to Follower and FsHealthService
Bukhtawar Jun 18, 2020
061dd33
Fix assertions
Bukhtawar Jun 18, 2020
cda2179
Leader checks
Bukhtawar Jun 18, 2020
4d83de0
Leader check tests
Bukhtawar Jun 19, 2020
e41392f
cluster reduce stabilization time after unhealthy node
Bukhtawar Jun 19, 2020
67d49bb
Minor fix up
Bukhtawar Jun 19, 2020
fa3cc69
ClusterFormationFailureHelper changes and more tests
Bukhtawar Jun 19, 2020
89035fb
Minor changes to LeaderChecker
Bukhtawar Jun 21, 2020
adbe670
Pass StatusInfo to ClusterFormationState and simplify message
DaveCTurner Jun 24, 2020
fdcdf45
Whitespace
DaveCTurner Jun 24, 2020
deafeca
Imports
DaveCTurner Jun 24, 2020
1120428
Fixing Random
Bukhtawar Jun 24, 2020
23bc4e5
Merge remote-tracking branch 'upstream/master'
Bukhtawar Jun 24, 2020
06b14b8
Merge branch 'master' into ro-fs-handling
Bukhtawar Jun 24, 2020
56fb9b3
ForbiddenApis for charset
Bukhtawar Jun 24, 2020
0d7b72f
Fix logger
Bukhtawar Jun 24, 2020
f390ed8
Merge remote-tracking branch 'upstream/master' into ro-fs-handling
Bukhtawar Jun 24, 2020
f44cf0d
NPE handling
Bukhtawar Jun 29, 2020
97a4c02
Merge remote-tracking branch 'upstream/master' into ro-fs-handling
Bukhtawar Jun 29, 2020
54d7c98
Merge remote-tracking branch 'upstream/master' into ro-fs-handling
Bukhtawar Jul 2, 2020
aae5142
Merge remote-tracking branch 'upstream/master' into ro-fs-handling
Bukhtawar Jul 3, 2020
Diff view
@@ -24,6 +24,7 @@
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
@@ -67,6 +68,7 @@
import org.elasticsearch.discovery.PeerFinder;
import org.elasticsearch.discovery.SeedHostsProvider;
import org.elasticsearch.discovery.SeedHostsResolver;
import org.elasticsearch.monitor.fs.FsService;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.TransportResponse.Empty;
@@ -149,6 +151,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
private Optional<Join> lastJoin;
private JoinHelper.JoinAccumulator joinAccumulator;
private Optional<CoordinatorPublication> currentPublication = Optional.empty();
private final FsService fsService;

/**
* @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
@@ -158,7 +161,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
NamedWriteableRegistry namedWriteableRegistry, AllocationService allocationService, MasterService masterService,
Supplier<CoordinationState.PersistedState> persistedStateSupplier, SeedHostsProvider seedHostsProvider,
ClusterApplier clusterApplier, Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators, Random random,
RerouteService rerouteService, ElectionStrategy electionStrategy) {
RerouteService rerouteService, ElectionStrategy electionStrategy, FsService fsService, Client nodeClient) {
this.settings = settings;
this.transportService = transportService;
this.masterService = masterService;
@@ -168,7 +171,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
this.electionStrategy = electionStrategy;
this.joinHelper = new JoinHelper(settings, allocationService, masterService, transportService,
this::getCurrentTerm, this::getStateForMasterService, this::handleJoinRequest, this::joinLeaderInTerm, this.onJoinValidators,
rerouteService);
rerouteService, fsService);
this.persistedStateSupplier = persistedStateSupplier;
this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings);
this.lastKnownLeader = Optional.empty();
@@ -178,14 +181,16 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
this.random = random;
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy);
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy,
fsService);
configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider);
this.peerFinder = new CoordinatorPeerFinder(settings, transportService,
new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver);
this.publicationHandler = new PublicationTransportHandler(transportService, namedWriteableRegistry,
this::handlePublishRequest, this::handleApplyCommit);
this.leaderChecker = new LeaderChecker(settings, transportService, this::onLeaderFailure);
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode);
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode,
nodeClient);
this.nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor(allocationService, logger);
this.clusterApplier = clusterApplier;
masterService.setClusterStateSupplier(this::getStateForMasterService);
@@ -196,6 +201,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
transportService::getLocalNode);
this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(settings, this::getClusterFormationState,
transportService.getThreadPool(), joinHelper::logLastFailedJoinAttempt);
this.fsService = fsService;
}

private ClusterFormationState getClusterFormationState() {
@@ -1173,6 +1179,12 @@ public void run() {
return;
}

if(fsService.stats().getTotal().isWritable() == Boolean.FALSE){
Contributor Author

I left out the spaces assuming checkstyle would catch it, but unfortunately it didn't. I'll fix the whitespace.

logger.warn("skip prevoting as local node is not writable: {}",
Contributor

A warning here isn't helpful: we should be logging the failure elsewhere, so this will simply result in confusion.

Suggested change
logger.warn("skip prevoting as local node is not writable: {}",
logger.debug("skip prevoting as local node is not writable: {}",

Also, we have this generic NodeHealthService but the log message is very specific: local node is not writeable. Maybe the NodeHealthService should describe the problem rather than returning a simple boolean.

lastAcceptedState.coordinationMetaData());
return;
}

if (prevotingRound != null) {
prevotingRound.close();
}
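The reviewer's suggestion above — that the generic NodeHealthService should describe the problem rather than return a plain boolean — is roughly the direction later commits in this PR take (see "Pass StatusInfo to ClusterFormationState and simplify message" in the commit list). The following standalone sketch only illustrates that idea; the names echo the PR's later StatusInfo, but the shape shown here is an assumption, not the merged implementation.

// Hypothetical, self-contained sketch of a health service that reports a status
// plus a human-readable description instead of a bare boolean. Illustrative only.
public final class HealthStatusSketch {

    enum Status { HEALTHY, UNHEALTHY }

    // Immutable status-plus-reason pair, roughly what a StatusInfo-style type carries.
    static final class StatusInfo {
        private final Status status;
        private final String info;

        StatusInfo(Status status, String info) {
            this.status = status;
            this.info = info;
        }

        Status getStatus() { return status; }
        String getInfo() { return info; }

        @Override
        public String toString() {
            return "status[" + status + "], info[" + info + "]";
        }
    }

    interface NodeHealthService {
        StatusInfo getHealth();
    }

    public static void main(String[] args) {
        // A check that reports *why* the node is unhealthy, e.g. which path failed a write
        // (the path shown here is made up for the example).
        NodeHealthService healthService = () ->
            new StatusInfo(Status.UNHEALTHY, "health check failed on [/var/data/nodes/0]");

        StatusInfo health = healthService.getHealth();
        if (health.getStatus() == Status.UNHEALTHY) {
            // The caller can log the service's own description instead of a
            // hard-coded "local node is not writable" message.
            System.out.println("skipping pre-voting: " + health.getInfo());
        }
    }
}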
@@ -22,6 +22,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.coordination.Coordinator.Mode;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
@@ -39,10 +40,10 @@
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.transport.TransportRequestOptions.Type;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.HashSet;
@@ -52,6 +53,7 @@
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.function.Supplier;

import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;

@@ -94,12 +96,13 @@ public class FollowersChecker {
private final Set<DiscoveryNode> faultyNodes = new HashSet<>();

private final TransportService transportService;
private final NodeFsHealthChecker nodeFsHealthChecker;

private volatile FastResponseState fastResponseState;

public FollowersChecker(Settings settings, TransportService transportService,
Consumer<FollowerCheckRequest> handleRequestAndUpdateState,
BiConsumer<DiscoveryNode, String> onNodeFailure) {
BiConsumer<DiscoveryNode, String> onNodeFailure, Client nodeClient) {
this.settings = settings;
this.transportService = transportService;
this.handleRequestAndUpdateState = handleRequestAndUpdateState;
@@ -118,6 +121,7 @@ public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connecti
handleDisconnectedNode(node);
}
});
nodeFsHealthChecker = new NodeFsHealthChecker(settings, transportService, nodeClient, this::failFollower, this::followers);
}

/**
@@ -139,9 +143,37 @@ public void setCurrentNodes(DiscoveryNodes discoveryNodes) {
followerChecker.start();
}
});
nodeFsHealthChecker.start();
}
}

private Set<DiscoveryNode> followers(){
return followerCheckers.keySet();
}

private void failFollower(DiscoveryNode discoveryNode, String reason, Supplier<Boolean> supplier) {
transportService.getThreadPool().generic().execute(new Runnable() {
@Override
public void run() {
synchronized (mutex) {
if (supplier.get() == false) {
logger.trace("{} no longer running, not marking faulty", discoveryNode);
return;
}
logger.debug("{} marking node as faulty", discoveryNode);
faultyNodes.add(discoveryNode);
followerCheckers.remove(discoveryNode);
}
onNodeFailure.accept(discoveryNode, reason);
}

@Override
public String toString() {
return "detected failure of " + discoveryNode;
}
});
}

/**
* Clear the set of known nodes, stopping all checks.
*/
@@ -351,28 +383,10 @@ public String executor() {
}

void failNode(String reason) {
transportService.getThreadPool().generic().execute(new Runnable() {
@Override
public void run() {
synchronized (mutex) {
if (running() == false) {
logger.trace("{} no longer running, not marking faulty", FollowerChecker.this);
return;
}
logger.debug("{} marking node as faulty", FollowerChecker.this);
faultyNodes.add(discoveryNode);
followerCheckers.remove(discoveryNode);
}
onNodeFailure.accept(discoveryNode, reason);
}

@Override
public String toString() {
return "detected failure of " + discoveryNode;
}
});
failFollower(discoveryNode, reason, () -> running());
}


private void scheduleNextWakeUp() {
transportService.getThreadPool().schedule(new Runnable() {
@Override
@@ -42,6 +42,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.DiscoveryModule;
import org.elasticsearch.monitor.fs.FsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.TransportChannel;
@@ -88,6 +89,8 @@ public class JoinHelper {

@Nullable // if using single-node discovery
private final TimeValue joinTimeout;
private final FsService fsService;


private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());

@@ -96,9 +99,10 @@ public class JoinHelper {
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
BiConsumer<JoinRequest, JoinCallback> joinHandler, Function<StartJoinRequest, Join> joinLeaderInTerm,
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService) {
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService, FsService fsService) {
this.masterService = masterService;
this.transportService = transportService;
this.fsService = fsService;
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings);
this.joinTaskExecutor = new JoinTaskExecutor(allocationService, logger, rerouteService) {

@@ -232,6 +236,10 @@ void logLastFailedJoinAttempt() {

public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin) {
assert destination.isMasterNode() : "trying to join master-ineligible " + destination;
if (fsService.stats().getTotal().isWritable() == Boolean.FALSE) {
logger.warn("All paths are not writable. Blocking join request");
Contributor

A warning here isn't helpful: we should be logging the failure elsewhere, so this will simply result in confusion.

Suggested change
logger.warn("All paths are not writable. Blocking join request");
logger.debug("All paths are not writable. Blocking join request");

Also, we have this generic NodeHealthService but the log message is very specific: all paths are not writable. Maybe the NodeHealthService should describe the problem rather than returning a simple boolean.

return;
}
final JoinRequest joinRequest = new JoinRequest(transportService.getLocalNode(), term, optionalJoin);
final Tuple<DiscoveryNode, JoinRequest> dedupKey = Tuple.tuple(destination, joinRequest);
if (pendingOutgoingJoins.add(dedupKey)) {
@@ -0,0 +1,131 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsRequest;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.TriConsumer;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.monitor.fs.FsHealthService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;
import org.elasticsearch.transport.TransportService;

import java.util.Set;
import java.util.function.Supplier;

public class NodeFsHealthChecker {
Contributor

This doesn't seem necessary; it's enough for followers to reject today's health checks.

Contributor Author

Agree. That simplifies a great deal


private static final Logger logger = LogManager.getLogger(FollowersChecker.class);

public static final Setting<TimeValue> FS_HEALTH_CHECK_INTERVAL_SETTING =
Setting.timeSetting("cluster.fault_detection.fs_health_check.interval",
TimeValue.timeValueMillis(5000), TimeValue.timeValueMillis(100), Setting.Property.NodeScope);

private final TimeValue fsHealthCheckInterval;
private final boolean fsHealthCheckEnabled;
private final TriConsumer<DiscoveryNode, String, Supplier<Boolean>> failFollower;
private final TransportService transportService;
private final Supplier<Set<DiscoveryNode>> followerNodesSupplier;
private final Client nodeClient;

public NodeFsHealthChecker(Settings settings, TransportService transportService, Client nodeClient, TriConsumer<DiscoveryNode,
String, Supplier<Boolean>> failFollower, Supplier<Set<DiscoveryNode>> followerNodesSupplier){
fsHealthCheckEnabled = FsHealthService.ENABLED_SETTING.get(settings);
fsHealthCheckInterval = FS_HEALTH_CHECK_INTERVAL_SETTING.get(settings);
this.followerNodesSupplier = followerNodesSupplier;
this.failFollower = failFollower;
this.transportService = transportService;
this.nodeClient = nodeClient;
}


void start() {
handleWakeUp();
}


private void scheduleNextWakeUp() {
if (followerNodesSupplier.get().isEmpty() == false) {
transportService.getThreadPool().schedule(new Runnable() {
@Override
public void run() {
handleWakeUp();
}

@Override
public String toString() {
return NodeFsHealthChecker.this + "::handleWakeUp";
}
}, fsHealthCheckInterval, ThreadPool.Names.SAME);
}
}


private void handleWakeUp() {
if (fsHealthCheckEnabled && followerNodesSupplier.get().isEmpty() == false) {
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest().clear().fs(true).timeout(fsHealthCheckInterval);
NodesStatsResponse nodesStatsResponse = fetchNodeStats(nodesStatsRequest);
if(nodesStatsResponse == null){
return;
}
for (NodeStats nodeStats : nodesStatsResponse.getNodes()) {
if (nodeStats.getFs() == null) {
logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().getName());
} else {
if (nodeStats.getFs().getTotal().isWritable() == Boolean.FALSE) {
failFollower.apply(nodeStats.getNode(), "read-only-file-system", () ->
followerNodesSupplier.get().contains(nodeStats.getNode()));
}
}
}
}
scheduleNextWakeUp();
}

private NodesStatsResponse fetchNodeStats(NodesStatsRequest nodeStatsRequest) {
NodesStatsResponse nodesStatsResponse = null;
try {
nodesStatsResponse = nodeClient.admin().cluster().nodesStats(nodeStatsRequest).actionGet();
} catch (Exception e){
if (e instanceof ReceiveTimeoutTransportException) {
logger.error("NodeStatsRequest timed out for FollowerChecker", e);
} else {
if (e instanceof ClusterBlockException) {
if (logger.isTraceEnabled()) {
logger.trace("Failed to execute NodeStatsRequest for FollowerChecker", e);
}
} else {
logger.warn("Failed to execute NodeStatsRequest for FollowerChecker", e);
}
}
}
return nodesStatsResponse;
}
}
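In the exchange on NodeFsHealthChecker above, the author agrees that the leader-side stats polling can go: it is enough for a follower to fail the leader's existing checks when its own health check reports a problem. The standalone sketch below illustrates that alternative under stated assumptions; the names (NodeHealthService, NodeHealthCheckFailureException, handleFollowerCheck) are chosen for the example and are not lifted from the merged diff.

// Hypothetical, self-contained sketch: a follower rejects the leader's check when
// its local health check reports the node as unhealthy. Illustrative only.
public final class FollowerCheckRejectionSketch {

    enum Status { HEALTHY, UNHEALTHY }

    // Thrown by a follower to fail the leader's check instead of answering it.
    static final class NodeHealthCheckFailureException extends RuntimeException {
        NodeHealthCheckFailureException(String message) { super(message); }
    }

    interface NodeHealthService {
        Status getStatus();
        String describe();
    }

    // Follower-side handling of a leader check: a healthy node responds normally,
    // an unhealthy node fails the check, so the leader removes it through its
    // usual failure path rather than via a separate NodesStats poll.
    static void handleFollowerCheck(NodeHealthService healthService) {
        if (healthService.getStatus() == Status.UNHEALTHY) {
            throw new NodeHealthCheckFailureException(
                "rejecting leader check, node is unhealthy [" + healthService.describe() + "]");
        }
        System.out.println("follower check handled normally");
    }

    public static void main(String[] args) {
        NodeHealthService unhealthy = new NodeHealthService() {
            @Override public Status getStatus() { return Status.UNHEALTHY; }
            @Override public String describe() { return "health check failed on [/var/data/nodes/0]"; }
        };
        try {
            handleFollowerCheck(unhealthy);
        } catch (NodeHealthCheckFailureException e) {
            System.out.println("leader sees: " + e.getMessage());
        }
    }
}

Failing the existing check keeps failure detection on a single code path and avoids an extra polling loop on the elected master, which is the simplification the comments above are pointing at.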