Skip to content

Commit

Permalink
Avoiding the use of nodes that are no longer in the cluster when comp…
Browse files Browse the repository at this point in the history
…uting master stability (#98809)
  • Loading branch information
masseyke committed Aug 24, 2023
1 parent 5f2fe57 commit e47138e
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 60 deletions.
7 changes: 7 additions & 0 deletions docs/changelog/98809.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pr: 98809
summary: Avoiding the use of nodes that are no longer in the cluster when computing
master stability
area: Health
type: enhancement
issues:
- 98636
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public TransportAction(TransportService transportService, ActionFilters actionFi

/**
 * Completes the listener with a {@link MasterHistoryAction.Response} built from the node list of this node's local master history, as
 * obtained from {@code masterHistoryService.getLocalMasterHistory()}. Neither the task nor the request is consulted.
 */
@Override
protected void doExecute(Task task, MasterHistoryAction.Request request, ActionListener<Response> listener) {
// NOTE(review): the next two lines look like the before/after sides of a diff (getNodes() replaced by getRawNodes()); presumably only
// the getRawNodes() call exists in the real file, so the listener is completed once -- confirm against the actual source.
listener.onResponse(new MasterHistoryAction.Response(masterHistoryService.getLocalMasterHistory().getNodes()));
listener.onResponse(new MasterHistoryAction.Response(masterHistoryService.getLocalMasterHistory().getRawNodes()));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;
import java.util.stream.Collectors;

/**
* This class represents a node's view of the history of which nodes have been elected master over the last 30 minutes. It is kept in
Expand All @@ -34,6 +36,7 @@ public class MasterHistory implements ClusterStateListener {
* The maximum amount of time that the master history covers.
*/
private final TimeValue maxHistoryAge;
private final ClusterService clusterService;
// Note: While the master can be null, the TimeAndMaster object in this list is never null
private volatile List<TimeAndMaster> masterHistory;
private final LongSupplier currentTimeMillisSupplier;
Expand All @@ -57,6 +60,7 @@ public MasterHistory(ThreadPool threadPool, ClusterService clusterService) {
this.masterHistory = new ArrayList<>();
this.currentTimeMillisSupplier = threadPool::relativeTimeInMillis;
this.maxHistoryAge = MAX_HISTORY_AGE_SETTING.get(clusterService.getSettings());
this.clusterService = clusterService;
clusterService.addListener(this);
}

Expand Down Expand Up @@ -247,13 +251,35 @@ private List<TimeAndMaster> getRecentMasterHistory(List<TimeAndMaster> history)
/**
* This method returns an immutable view of this master history, typically for sending over the wire to another node. The returned List
* is ordered by when the master was seen, with the earliest-seen masters being first. The List can contain null values. Times are
* intentionally not included because they cannot be compared across machines. This list contains nodes even if they are not currently
* in the cluster.
* @return An immutable view of this master history
*/
public List<DiscoveryNode> getNodes() {
public List<DiscoveryNode> getRawNodes() {
// getRecentMasterHistory is defined elsewhere in this class; presumably it trims entries older than maxHistoryAge -- confirm.
List<TimeAndMaster> masterHistoryCopy = getRecentMasterHistory(masterHistory);
// Keep only the master of each entry (which may be null), dropping the timestamps; Stream.toList() yields an unmodifiable list.
return masterHistoryCopy.stream().map(TimeAndMaster::master).toList();
}

/**
 * Returns the master history with any node that is no longer in the cluster removed. This is similar to {@link #getRawNodes()}, except
 * that non-null nodes whose ephemeral IDs are not among the nodes in the current cluster state are filtered out. Such nodes are kept in
 * the underlying master history in case they return to the cluster, but they should not count toward stability calculations. Null
 * entries are always retained, and the relative order of the remaining entries is unchanged.
 * @return An immutable list of the recent masters that are either null or currently part of the cluster
 */
public List<DiscoveryNode> getNodes() {
    List<DiscoveryNode> rawNodes = getRawNodes();
    if (rawNodes.isEmpty()) {
        // Nothing to filter, so avoid reading the cluster state at all.
        return rawNodes;
    }
    Set<String> ephemeralIdsCurrentlyInCluster = clusterService.state()
        .nodes()
        .stream()
        .map(DiscoveryNode::getEphemeralId)
        .collect(Collectors.toSet());
    // Stream.toList() accepts null elements and yields an unmodifiable list, so both return paths of this method now hand back the
    // same kind of list that getRawNodes() produces.
    return rawNodes.stream()
        .filter(node -> node == null || ephemeralIdsCurrentlyInCluster.contains(node.getEphemeralId()))
        .toList();
}

/**
 * A single entry in the master history: a master node (which may be null) together with a start time in milliseconds.
 * NOTE(review): startTimeMillis presumably comes from currentTimeMillisSupplier and so is a relative time, not wall-clock -- confirm.
 */
private record TimeAndMaster(long startTimeMillis, DiscoveryNode master) {}
}

0 comments on commit e47138e

Please sign in to comment.