Skip to content

Commit

Permalink
Reinstate disruption-related stateless masters tests (#95409)
Browse files Browse the repository at this point in the history
Ensures that nodes in the `CoordinatorTests` which are disconnected or
blackholed are also unable to access the shared object store, and
reinstates the tests which verify that we react appropriately to this
kind of disruption.
  • Loading branch information
DaveCTurner committed Apr 20, 2023
1 parent 4d6e451 commit 0b4b741
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,13 @@
import org.elasticsearch.test.junit.annotations.TestLogging;
import org.elasticsearch.threadpool.ThreadPool;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.OptionalLong;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BooleanSupplier;
import java.util.function.Function;
import java.util.function.LongSupplier;
Expand All @@ -48,31 +51,12 @@ public void testLeaderDisconnectionWithDisconnectEventDetectedQuickly() {
// In this test the leader still has access to the register, therefore it is still considered as a leader.
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testLeaderDisconnectionWithoutDisconnectEventDetectedQuickly() {
// In this test the leader still has access to the register, therefore it is still considered as a leader.
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testMasterStatsOnFailedUpdate() {
// In this test the leader still has access to the register, therefore it is still considered as a leader, and it can perform
// updates.
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testUnhealthyLeaderIsReplaced() {
// In this test the leader still has access to the register, therefore it is still considered as a leader.
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testUnresponsiveLeaderDetectedEventually() {
// In this test the leader still has access to the register, therefore it is still considered as a leader.
}

@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testLogsWarningPeriodicallyIfClusterNotFormed() {
Expand All @@ -91,47 +75,12 @@ public void testAckListenerReceivesNacksIfPublicationTimesOut() {
// The leader still has access to the register, therefore it acknowledges the state update
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testAppliesNoMasterBlockWritesByDefault() {
// If the disconnected node is the leader it will continue to have connectivity
// into the register and therefore the no master block won't be applied
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testAppliesNoMasterBlockWritesIfConfigured() {
// If the disconnected node is the leader it will continue to have connectivity
// into the register and therefore the no master block won't be applied
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testAppliesNoMasterBlockAllIfConfigured() {
// If the disconnected node is the leader it will continue to have connectivity
// into the register and therefore the no master block won't be applied
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testAppliesNoMasterBlockMetadataWritesIfConfigured() {
// If the disconnected node is the leader it will continue to have connectivity
// into the register and therefore the no master block won't be applied
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testClusterCannotFormWithFailingJoinValidation() {
// A single node can form a cluster in this case
}

/**
 * Overrides the inherited test with an empty body, so no scenario is run or asserted here.
 * Annotated with {@code @AwaitsFix} referencing ES-5645.
 */
@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testReportsConnectBackProblemsDuringJoining() {
// If the partitioned node is the leader, it still has access
// to the store, therefore the test fails
}

@Override
@AwaitsFix(bugUrl = "ES-5645")
public void testCannotJoinClusterWithDifferentUUID() {
Expand Down Expand Up @@ -165,34 +114,30 @@ public void testJoiningNodeReceivesFullState() {

/**
 * Supplies the coordinator strategy used by this test class.
 *
 * <p>The strategy is created without arguments: the per-node {@code AtomicRegister} needs a disruption supplier
 * that is only available when {@code getCoordinationServices} is called, so it cannot be built here.
 *
 * @return a new {@code AtomicRegisterCoordinatorStrategy}
 */
@Override
protected CoordinatorStrategy getCoordinatorStrategy() {
    return new AtomicRegisterCoordinatorStrategy();
}

class AtomicRegisterCoordinatorStrategy implements CoordinatorStrategy {
private final AtomicRegister atomicRegister;
private final SharedStore sharedStore;

AtomicRegisterCoordinatorStrategy(AtomicRegister atomicRegister, SharedStore sharedStore) {
this.atomicRegister = atomicRegister;
this.sharedStore = sharedStore;
}
private final AtomicLong currentTermRef = new AtomicLong();
private final AtomicReference<Heartbeat> heartBeatRef = new AtomicReference<>();
private final SharedStore sharedStore = new SharedStore();

@Override
public CoordinationServices getCoordinationServices(
ThreadPool threadPool,
Settings settings,
ClusterSettings clusterSettings,
CoordinationState.PersistedState persistedState
CoordinationState.PersistedState persistedState,
BooleanSupplier isDisruptedSupplier
) {
final TimeValue heartbeatFrequency = HEARTBEAT_FREQUENCY.get(settings);
var atomicHeartbeat = new StoreHeartbeatService(
sharedStore,
final var atomicRegister = new AtomicRegister(currentTermRef, isDisruptedSupplier);
final var atomicHeartbeat = new StoreHeartbeatService(
new SharedHeartbeatStore(heartBeatRef, isDisruptedSupplier),
threadPool,
heartbeatFrequency,
TimeValue.timeValueMillis(heartbeatFrequency.millis() * MAX_MISSED_HEARTBEATS.get(settings)),
listener -> listener.onResponse(OptionalLong.of(atomicRegister.readCurrentTerm()))
listener -> ActionListener.completeWith(listener, () -> OptionalLong.of(atomicRegister.readCurrentTerm()))
);
var reconfigurator = new SingleNodeReconfigurator(settings, clusterSettings);
var electionStrategy = new AtomicRegisterElectionStrategy(atomicRegister);
Expand Down Expand Up @@ -324,30 +269,29 @@ public boolean isInvalidReconfiguration(
@Override
public void beforeCommit(long term, long version, ActionListener<Void> listener) {
// TODO: add a test to ensure that this gets called
final var currentTerm = register.readCurrentTerm();
if (currentTerm == term) {
listener.onResponse(null);
} else {
assert term < currentTerm : term + " vs " + currentTerm;
listener.onFailure(
new CoordinationStateRejectedException(
ActionListener.completeWith(listener, () -> {
final var currentTerm = register.readCurrentTerm();
if (currentTerm == term) {
return null;
} else {
assert term < currentTerm : term + " vs " + currentTerm;
throw new CoordinationStateRejectedException(
Strings.format(
"could not commit cluster state version %d in term %d, current term is now %d",
version,
term,
currentTerm
)
)
);
}
);
}
});
}
}

record PersistentClusterState(long term, long version, Metadata state) {}

private static class SharedStore implements HeartbeatStore {
private static class SharedStore {
private final Map<Long, PersistentClusterState> clusterStateByTerm = new HashMap<>();
private Heartbeat heartbeat;

private void writeClusterState(ClusterState clusterState) {
clusterStateByTerm.put(
Expand All @@ -367,32 +311,57 @@ void getClusterStateForTerm(long termGoal, ActionListener<PersistentClusterState
return null;
});
}
}

/**
 * {@link HeartbeatStore} that keeps the latest heartbeat in an {@link AtomicReference} supplied by the caller and
 * fails every operation with an {@link IOException} while {@code isDisruptedSupplier} returns {@code true}.
 *
 * <p>Each call completes its listener exactly once: either with a failure (disrupted) or with a response.
 */
private static class SharedHeartbeatStore implements HeartbeatStore {

    private final AtomicReference<Heartbeat> heartbeatRef;
    private final BooleanSupplier isDisruptedSupplier;

    /**
     * @param heartbeatRef        holder of the most recently written heartbeat
     * @param isDisruptedSupplier returns {@code true} while access to the store should be simulated as failing
     */
    SharedHeartbeatStore(AtomicReference<Heartbeat> heartbeatRef, BooleanSupplier isDisruptedSupplier) {
        this.heartbeatRef = heartbeatRef;
        this.isDisruptedSupplier = isDisruptedSupplier;
    }

    /**
     * Stores {@code newHeartbeat} as the latest heartbeat unless access is disrupted.
     *
     * @param newHeartbeat the heartbeat to store
     * @param listener     completed with {@code null} on success, or failed with an {@link IOException} when disrupted
     */
    @Override
    public void writeHeartbeat(Heartbeat newHeartbeat, ActionListener<Void> listener) {
        if (isDisruptedSupplier.getAsBoolean()) {
            listener.onFailure(new IOException("simulating disrupted access to shared store"));
            // Must stop here: a disrupted node may not update the store, and the listener is already completed.
            return;
        }
        heartbeatRef.set(newHeartbeat);
        listener.onResponse(null);
    }

    /**
     * Reads the most recently stored heartbeat unless access is disrupted.
     *
     * @param listener completed with the current content of the reference (possibly {@code null} if nothing was
     *                 written yet), or failed with an {@link IOException} when disrupted
     */
    @Override
    public void readLatestHeartbeat(ActionListener<Heartbeat> listener) {
        if (isDisruptedSupplier.getAsBoolean()) {
            listener.onFailure(new IOException("simulating disrupted access to shared store"));
            // Must stop here: a disrupted node may not observe the store, and the listener is already completed.
            return;
        }
        listener.onResponse(heartbeatRef.get());
    }
}

private static class AtomicRegister {
private long currentTerm;
private final AtomicLong currentTermRef;
private final BooleanSupplier isDisruptedSupplier;

long readCurrentTerm() {
return currentTerm;
AtomicRegister(AtomicLong currentTermRef, BooleanSupplier isDisruptedSupplier) {
this.currentTermRef = currentTermRef;
this.isDisruptedSupplier = isDisruptedSupplier;
}

long readCurrentTerm() throws IOException {
if (isDisruptedSupplier.getAsBoolean()) {
throw new IOException("simulating disrupted access to shared store");
}
return currentTermRef.get();
}

long compareAndExchange(long expected, long updated) {
final var witness = currentTerm;
if (currentTerm == expected) {
currentTerm = updated;
long compareAndExchange(long expected, long updated) throws IOException {
if (isDisruptedSupplier.getAsBoolean()) {
throw new IOException("simulating disrupted access to shared store");
}
return witness;
return currentTermRef.compareAndExchange(expected, updated);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,8 @@ public RecyclerBytesStreamOutput newNetworkBytesStream() {
threadPool,
settings,
clusterSettings,
persistedState
persistedState,
() -> disconnectedNodes.contains(localNode.getId()) || blackholedNodes.contains(localNode.getId())
);
coordinator = new Coordinator(
"test_node",
Expand Down Expand Up @@ -1480,7 +1481,8 @@ CoordinationServices getCoordinationServices(
ThreadPool threadPool,
Settings settings,
ClusterSettings clusterSettings,
CoordinationState.PersistedState persistedState
CoordinationState.PersistedState persistedState,
BooleanSupplier isDisruptedSupplier
);

CoordinationState.PersistedState createFreshPersistedState(
Expand Down Expand Up @@ -1529,7 +1531,8 @@ public CoordinationServices getCoordinationServices(
ThreadPool threadPool,
Settings settings,
ClusterSettings clusterSettings,
CoordinationState.PersistedState persistedState
CoordinationState.PersistedState persistedState,
BooleanSupplier isDisruptedSupplier
) {
return new CoordinationServices() {
@Override
Expand Down

0 comments on commit 0b4b741

Please sign in to comment.