Fix testUnHealthyLeaderRemoved (#67332)
Today this test will sometimes mark three nodes of a six-node cluster as
unhealthy, which can prevent the expected failover from succeeding. This
commit ensures that the healthy nodes always outnumber the unhealthy
ones.

Closes #67329
DaveCTurner committed Jan 12, 2021
1 parent 84b6ab0 commit 86a8bd6
Showing 1 changed file with 23 additions and 23 deletions.
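
The fix is a counting argument. Previously the test started with between one and three nodes (all of which are later marked unhealthy) and then added exactly three healthy nodes, so a three-node start produced a six-node cluster split three-to-three, leaving no healthy majority to elect a replacement leader. The fixed test instead draws the number of new nodes from between(initialClusterSize + 1, 4), so the healthy nodes always strictly outnumber the unhealthy ones. A minimal standalone sketch of that arithmetic (the between helper and class name here are illustrative stand-ins, not the test framework's actual implementation):

import java.util.concurrent.ThreadLocalRandom;

public class HealthyMajoritySketch {
    // Stand-in for the test framework's between(min, max): uniform int in [min, max].
    static int between(int min, int max) {
        return ThreadLocalRandom.current().nextInt(min, max + 1);
    }

    public static void main(String[] args) {
        // Initial cluster nodes: these are the ones later marked unhealthy.
        final int unhealthy = between(1, 3);

        // Old test: always exactly three new healthy nodes -> can tie 3 vs 3.
        final int healthyOld = 3;
        // Fixed test: always at least one more healthy node than unhealthy ones.
        final int healthyNew = between(unhealthy + 1, 4);

        System.out.printf("old: %d unhealthy vs %d healthy -> healthy majority: %b%n",
                unhealthy, healthyOld, healthyOld > unhealthy);
        System.out.printf("new: %d unhealthy vs %d healthy -> healthy majority: %b%n",
                unhealthy, healthyNew, healthyNew > unhealthy);
    }
}

Running the sketch with the old bound prints "healthy majority: false" whenever the initial size is three, which is exactly the flaky case this commit eliminates.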
@@ -45,6 +45,7 @@
 import org.elasticsearch.discovery.DiscoveryModule;
 import org.elasticsearch.discovery.zen.PublishClusterStateStats;
 import org.elasticsearch.gateway.GatewayService;
+import org.elasticsearch.monitor.NodeHealthService;
 import org.elasticsearch.monitor.StatusInfo;
 import org.elasticsearch.test.MockLogAppender;

@@ -63,8 +64,8 @@
 import static java.util.Collections.singleton;
 import static org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.DEFAULT_DELAY_VARIABILITY;
 import static org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.EXTREME_DELAY_VARIABILITY;
-import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
 import static org.elasticsearch.cluster.coordination.Coordinator.Mode.CANDIDATE;
+import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
 import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
 import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
 import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;

@@ -82,7 +83,6 @@
 import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
 import static org.elasticsearch.test.NodeRoles.nonMasterNode;
 import static org.hamcrest.Matchers.allOf;
-import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;

@@ -574,25 +574,25 @@ public void testUnresponsiveLeaderDetectedEventually() {
         }
     }

-    public void testUnHealthyLeaderRemoved() {
-        AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
-        try (Cluster cluster = new Cluster(randomIntBetween(1, 3), true, Settings.EMPTY,
-            () -> nodeHealthServiceStatus.get())) {
+    public void testUnhealthyLeaderIsReplaced() {
+        final AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
+        final int initialClusterSize = between(1, 3);
+        try (Cluster cluster = new Cluster(initialClusterSize, true, Settings.EMPTY, nodeHealthServiceStatus::get)) {
             cluster.runRandomly();
             cluster.stabilise();

             final ClusterNode leader = cluster.getAnyLeader();

-            logger.info("--> adding three new healthy nodes");
-            ClusterNode newNode1 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
-                () -> new StatusInfo(HEALTHY, "healthy-info"));
-            ClusterNode newNode2 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
-                () -> new StatusInfo(HEALTHY, "healthy-info"));
-            ClusterNode newNode3 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
-                () -> new StatusInfo(HEALTHY, "healthy-info"));
-            cluster.clusterNodes.add(newNode1);
-            cluster.clusterNodes.add(newNode2);
-            cluster.clusterNodes.add(newNode3);
+            final int newClusterNodes = between(initialClusterSize + 1, 4);
+            logger.info("--> adding [{}] new healthy nodes", newClusterNodes);
+            final NodeHealthService alwaysHealthy = () -> new StatusInfo(HEALTHY, "healthy-info");
+            final Set<String> newNodeIds = new HashSet<>(newClusterNodes);
+            for (int i = 0; i < newClusterNodes; i++) {
+                final ClusterNode node = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
+                    alwaysHealthy);
+                newNodeIds.add(node.getId());
+                cluster.clusterNodes.add(node);
+            }

             cluster.stabilise(
                 // The first pinging discovers the master
                 defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)

@@ -602,7 +602,7 @@ public void testUnHealthyLeaderRemoved() {
                 // followup reconfiguration
                 + 3 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY);

-            logger.info("--> changing health status of leader {} to unhealthy", leader);
+            logger.info("--> change initial nodes to report as unhealthy");
             nodeHealthServiceStatus.getAndSet(new StatusInfo(UNHEALTHY, "unhealthy-info"));

             cluster.stabilise(
@@ -617,10 +617,10 @@ public void testUnHealthyLeaderRemoved() {

                 // then wait for both of:
                 + Math.max(
-                    // 1. the term bumping publication to time out
-                    defaultMillis(PUBLISH_TIMEOUT_SETTING),
-                    // 2. the new leader to notice that the old leader is unresponsive
-                    (defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))
+                // 1. the term bumping publication to time out
+                defaultMillis(PUBLISH_TIMEOUT_SETTING),
+                // 2. the new leader to notice that the old leader is unresponsive
+                (defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))
                 )

                 // then wait for the new leader to commit a state without the old leader
@@ -629,8 +629,8 @@ public void testUnHealthyLeaderRemoved() {
                 + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
             );

-            assertThat(cluster.getAnyLeader().getId(), anyOf(equalTo(newNode1.getId()), equalTo(newNode2.getId()),
-                equalTo(newNode3.getId())));
+            final String leaderId = cluster.getAnyLeader().getId();
+            assertTrue(leaderId + " should be one of " + newNodeIds, newNodeIds.contains(leaderId));
         }
     }
