New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds resiliency to read-only filesystems #45286 #52680
Changes from 7 commits
569e8cc
64815f1
b598944
d4fb892
38f1a4e
f3ac906
79948f3
20d9ba2
fa3ed38
1646319
5305ebb
26fbce7
8a86051
c9dd1a7
c99a68e
545eaf5
86fa7c9
8102c81
043db93
bbf5517
1459937
8eb5e20
2095d82
39a0565
136bc44
1ab13b2
4143f8f
1d9a7ab
f222529
befd822
061dd33
cda2179
4d83de0
e41392f
67d49bb
fa3cc69
89035fb
adbe670
fdcdf45
deafeca
1120428
23bc4e5
06b14b8
56fb9b3
0d7b72f
f390ed8
f44cf0d
97a4c02
54d7c98
aae5142
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -24,6 +24,7 @@ | |||||
import org.apache.logging.log4j.message.ParameterizedMessage; | ||||||
import org.apache.lucene.util.SetOnce; | ||||||
import org.elasticsearch.action.ActionListener; | ||||||
import org.elasticsearch.client.Client; | ||||||
import org.elasticsearch.cluster.ClusterChangedEvent; | ||||||
import org.elasticsearch.cluster.ClusterName; | ||||||
import org.elasticsearch.cluster.ClusterState; | ||||||
|
@@ -67,6 +68,7 @@ | |||||
import org.elasticsearch.discovery.PeerFinder; | ||||||
import org.elasticsearch.discovery.SeedHostsProvider; | ||||||
import org.elasticsearch.discovery.SeedHostsResolver; | ||||||
import org.elasticsearch.monitor.fs.FsService; | ||||||
import org.elasticsearch.threadpool.Scheduler; | ||||||
import org.elasticsearch.threadpool.ThreadPool.Names; | ||||||
import org.elasticsearch.transport.TransportResponse.Empty; | ||||||
|
@@ -149,6 +151,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery | |||||
private Optional<Join> lastJoin; | ||||||
private JoinHelper.JoinAccumulator joinAccumulator; | ||||||
private Optional<CoordinatorPublication> currentPublication = Optional.empty(); | ||||||
private final FsService fsService; | ||||||
|
||||||
/** | ||||||
* @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}. | ||||||
|
@@ -158,7 +161,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe | |||||
NamedWriteableRegistry namedWriteableRegistry, AllocationService allocationService, MasterService masterService, | ||||||
Supplier<CoordinationState.PersistedState> persistedStateSupplier, SeedHostsProvider seedHostsProvider, | ||||||
ClusterApplier clusterApplier, Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators, Random random, | ||||||
RerouteService rerouteService, ElectionStrategy electionStrategy) { | ||||||
RerouteService rerouteService, ElectionStrategy electionStrategy, FsService fsService, Client nodeClient) { | ||||||
this.settings = settings; | ||||||
this.transportService = transportService; | ||||||
this.masterService = masterService; | ||||||
|
@@ -168,7 +171,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe | |||||
this.electionStrategy = electionStrategy; | ||||||
this.joinHelper = new JoinHelper(settings, allocationService, masterService, transportService, | ||||||
this::getCurrentTerm, this::getStateForMasterService, this::handleJoinRequest, this::joinLeaderInTerm, this.onJoinValidators, | ||||||
rerouteService); | ||||||
rerouteService, fsService); | ||||||
this.persistedStateSupplier = persistedStateSupplier; | ||||||
this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings); | ||||||
this.lastKnownLeader = Optional.empty(); | ||||||
|
@@ -178,14 +181,16 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe | |||||
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings); | ||||||
this.random = random; | ||||||
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool()); | ||||||
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy); | ||||||
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy, | ||||||
fsService); | ||||||
configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider); | ||||||
this.peerFinder = new CoordinatorPeerFinder(settings, transportService, | ||||||
new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver); | ||||||
this.publicationHandler = new PublicationTransportHandler(transportService, namedWriteableRegistry, | ||||||
this::handlePublishRequest, this::handleApplyCommit); | ||||||
this.leaderChecker = new LeaderChecker(settings, transportService, this::onLeaderFailure); | ||||||
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode); | ||||||
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode, | ||||||
nodeClient); | ||||||
this.nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor(allocationService, logger); | ||||||
this.clusterApplier = clusterApplier; | ||||||
masterService.setClusterStateSupplier(this::getStateForMasterService); | ||||||
|
@@ -196,6 +201,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe | |||||
transportService::getLocalNode); | ||||||
this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(settings, this::getClusterFormationState, | ||||||
transportService.getThreadPool(), joinHelper::logLastFailedJoinAttempt); | ||||||
this.fsService = fsService; | ||||||
} | ||||||
|
||||||
private ClusterFormationState getClusterFormationState() { | ||||||
|
@@ -1173,6 +1179,12 @@ public void run() { | |||||
return; | ||||||
} | ||||||
|
||||||
if(fsService.stats().getTotal().isWritable() == Boolean.FALSE){ | ||||||
logger.warn("skip prevoting as local node is not writable: {}", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A warning here isn't helpful, we should be logging the failure elsewhere so this will simply result in confusion.
Suggested change
Also, we have this generic |
||||||
lastAcceptedState.coordinationMetaData()); | ||||||
return; | ||||||
} | ||||||
|
||||||
if (prevotingRound != null) { | ||||||
prevotingRound.close(); | ||||||
} | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -42,6 +42,7 @@ | |||||
import org.elasticsearch.common.settings.Settings; | ||||||
import org.elasticsearch.common.unit.TimeValue; | ||||||
import org.elasticsearch.discovery.DiscoveryModule; | ||||||
import org.elasticsearch.monitor.fs.FsService; | ||||||
import org.elasticsearch.threadpool.ThreadPool; | ||||||
import org.elasticsearch.threadpool.ThreadPool.Names; | ||||||
import org.elasticsearch.transport.TransportChannel; | ||||||
|
@@ -88,6 +89,8 @@ public class JoinHelper { | |||||
|
||||||
@Nullable // if using single-node discovery | ||||||
private final TimeValue joinTimeout; | ||||||
private final FsService fsService; | ||||||
|
||||||
|
||||||
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>()); | ||||||
|
||||||
|
@@ -96,9 +99,10 @@ public class JoinHelper { | |||||
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService, | ||||||
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier, | ||||||
BiConsumer<JoinRequest, JoinCallback> joinHandler, Function<StartJoinRequest, Join> joinLeaderInTerm, | ||||||
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService) { | ||||||
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService, FsService fsService) { | ||||||
this.masterService = masterService; | ||||||
this.transportService = transportService; | ||||||
this.fsService = fsService; | ||||||
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings); | ||||||
this.joinTaskExecutor = new JoinTaskExecutor(allocationService, logger, rerouteService) { | ||||||
|
||||||
|
@@ -232,6 +236,10 @@ void logLastFailedJoinAttempt() { | |||||
|
||||||
public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin) { | ||||||
assert destination.isMasterNode() : "trying to join master-ineligible " + destination; | ||||||
if (fsService.stats().getTotal().isWritable() == Boolean.FALSE) { | ||||||
logger.warn("All paths are not writable. Blocking join request"); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A warning here isn't helpful, we should be logging the failure elsewhere so this will simply result in confusion.
Suggested change
Also, we have this generic |
||||||
return; | ||||||
} | ||||||
final JoinRequest joinRequest = new JoinRequest(transportService.getLocalNode(), term, optionalJoin); | ||||||
final Tuple<DiscoveryNode, JoinRequest> dedupKey = Tuple.tuple(destination, joinRequest); | ||||||
if (pendingOutgoingJoins.add(dedupKey)) { | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.cluster.coordination; | ||
|
||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats; | ||
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsRequest; | ||
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse; | ||
import org.elasticsearch.client.Client; | ||
import org.elasticsearch.cluster.block.ClusterBlockException; | ||
import org.elasticsearch.cluster.node.DiscoveryNode; | ||
import org.elasticsearch.common.TriConsumer; | ||
import org.elasticsearch.common.settings.Setting; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
import org.elasticsearch.monitor.fs.FsHealthService; | ||
import org.elasticsearch.threadpool.ThreadPool; | ||
import org.elasticsearch.transport.ReceiveTimeoutTransportException; | ||
import org.elasticsearch.transport.TransportService; | ||
|
||
import java.util.Set; | ||
import java.util.function.Supplier; | ||
|
||
public class NodeFsHealthChecker { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't seem necessary, it's enough for followers to reject the today's health checks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. That simplifies a great deal |
||
|
||
private static final Logger logger = LogManager.getLogger(FollowersChecker.class); | ||
|
||
public static final Setting<TimeValue> FS_HEALTH_CHECK_INTERVAL_SETTING = | ||
Setting.timeSetting("cluster.fault_detection.fs_health_check.interval", | ||
TimeValue.timeValueMillis(5000), TimeValue.timeValueMillis(100), Setting.Property.NodeScope); | ||
|
||
private final TimeValue fsHealthCheckInterval; | ||
private final boolean fsHealthCheckEnabled; | ||
private final TriConsumer<DiscoveryNode, String, Supplier<Boolean>> failFollower; | ||
private final TransportService transportService; | ||
private final Supplier<Set<DiscoveryNode>> followerNodesSupplier; | ||
private final Client nodeClient; | ||
|
||
public NodeFsHealthChecker(Settings settings, TransportService transportService, Client nodeClient, TriConsumer<DiscoveryNode, | ||
String, Supplier<Boolean>> failFollower, Supplier<Set<DiscoveryNode>> followerNodesSupplier){ | ||
fsHealthCheckEnabled = FsHealthService.ENABLED_SETTING.get(settings); | ||
fsHealthCheckInterval = FS_HEALTH_CHECK_INTERVAL_SETTING.get(settings); | ||
this.followerNodesSupplier = followerNodesSupplier; | ||
this.failFollower = failFollower; | ||
this.transportService = transportService; | ||
this.nodeClient = nodeClient; | ||
} | ||
|
||
|
||
void start() { | ||
handleWakeUp(); | ||
} | ||
|
||
|
||
private void scheduleNextWakeUp() { | ||
if (followerNodesSupplier.get().isEmpty() == false) { | ||
transportService.getThreadPool().schedule(new Runnable() { | ||
@Override | ||
public void run() { | ||
handleWakeUp(); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return NodeFsHealthChecker.this + "::handleWakeUp"; | ||
} | ||
}, fsHealthCheckInterval, ThreadPool.Names.SAME); | ||
} | ||
} | ||
|
||
|
||
private void handleWakeUp() { | ||
if (fsHealthCheckEnabled && followerNodesSupplier.get().isEmpty() == false) { | ||
NodesStatsRequest nodesStatsRequest = new NodesStatsRequest().clear().fs(true).timeout(fsHealthCheckInterval); | ||
NodesStatsResponse nodesStatsResponse = fetchNodeStats(nodesStatsRequest); | ||
if(nodesStatsResponse == null){ | ||
return; | ||
} | ||
for (NodeStats nodeStats : nodesStatsResponse.getNodes()) { | ||
if (nodeStats.getFs() == null) { | ||
logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().getName()); | ||
} else { | ||
if (nodeStats.getFs().getTotal().isWritable() == Boolean.FALSE) { | ||
failFollower.apply(nodeStats.getNode(), "read-only-file-system", () -> | ||
followerNodesSupplier.get().contains(nodeStats.getNode())); | ||
} | ||
} | ||
} | ||
} | ||
scheduleNextWakeUp(); | ||
} | ||
|
||
private NodesStatsResponse fetchNodeStats(NodesStatsRequest nodeStatsRequest) { | ||
NodesStatsResponse nodesStatsResponse = null; | ||
try { | ||
nodesStatsResponse = nodeClient.admin().cluster().nodesStats(nodeStatsRequest).actionGet(); | ||
} catch (Exception e){ | ||
if (e instanceof ReceiveTimeoutTransportException) { | ||
logger.error("NodeStatsRequest timed out for FollowerChecker", e); | ||
} else { | ||
if (e instanceof ClusterBlockException) { | ||
if (logger.isTraceEnabled()) { | ||
logger.trace("Failed to execute NodeStatsRequest for FollowerChecker", e); | ||
} | ||
} else { | ||
logger.warn("Failed to execute NodeStatsRequest for FollowerChecker", e); | ||
} | ||
} | ||
} | ||
return nodesStatsResponse; | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have left out spaces assuming
checkStyles
would catch. But unfortunate. I'll fix white spacing