Skip to content

Commit

Permalink
FailedShardEntry task listener (#82795)
Browse files Browse the repository at this point in the history
Today shard-failed tasks executed by the master have a separate
ClusterStateTaskListener to feed back the result to the requester.
It'd be preferable to use the task itself as the listener.
  • Loading branch information
idegtiarenko committed Jan 20, 2022
1 parent b3d15c5 commit 4348a94
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 197 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -292,79 +292,22 @@ private static class ShardFailedTransportHandler implements TransportRequestHand

@Override
public void messageReceived(FailedShardEntry request, TransportChannel channel, Task task) throws Exception {
logger.debug(() -> new ParameterizedMessage("{} received shard failed for {}", request.shardId, request), request.failure);
logger.debug(
() -> new ParameterizedMessage("{} received shard failed for [{}]", request.getShardId(), request),
request.failure
);
var update = new FailedShardUpdateTask(request, new ChannelActionListener<>(channel, TASK_SOURCE, request));
clusterService.submitStateUpdateTask(
TASK_SOURCE,
request,
update,
ClusterStateTaskConfig.build(Priority.HIGH),
shardFailedClusterStateTaskExecutor,
new ClusterStateTaskListener() {
@Override
public void onFailure(Exception e) {
final MessageSupplier msg = () -> new ParameterizedMessage(
"{} unexpected failure while failing shard [{}]",
request.shardId,
request
);
if (e instanceof FailedToCommitClusterStateException) {
logger.debug(msg, e);
} else {
logger.error(msg, e);
}
try {
channel.sendResponse(e);
} catch (Exception channelException) {
channelException.addSuppressed(e);
logger.warn(
() -> new ParameterizedMessage(
"{} failed to send failure [{}] while failing shard [{}]",
request.shardId,
e,
request
),
channelException
);
}
}

@Override
public void onNoLongerMaster() {
logger.debug("{} no longer master while failing shard [{}]", request.shardId, request);
try {
channel.sendResponse(new NotMasterException(TASK_SOURCE));
} catch (Exception channelException) {
logger.warn(
() -> new ParameterizedMessage(
"{} failed to send no longer master while failing shard [{}]",
request.shardId,
request
),
channelException
);
}
}

@Override
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
try {
channel.sendResponse(TransportResponse.Empty.INSTANCE);
} catch (Exception channelException) {
logger.warn(
() -> new ParameterizedMessage(
"{} failed to send response while failing shard [{}]",
request.shardId,
request
),
channelException
);
}
}
}
update
);
}
}

public static class ShardFailedClusterStateTaskExecutor implements ClusterStateTaskExecutor<FailedShardEntry> {
public static class ShardFailedClusterStateTaskExecutor implements ClusterStateTaskExecutor<FailedShardUpdateTask> {
private final AllocationService allocationService;
private final RerouteService rerouteService;

Expand All @@ -374,17 +317,24 @@ public ShardFailedClusterStateTaskExecutor(AllocationService allocationService,
}

@Override
public ClusterTasksResult<FailedShardEntry> execute(ClusterState currentState, List<FailedShardEntry> tasks) throws Exception {
ClusterTasksResult.Builder<FailedShardEntry> batchResultBuilder = ClusterTasksResult.builder();
List<FailedShardEntry> tasksToBeApplied = new ArrayList<>();
public ClusterTasksResult<FailedShardUpdateTask> execute(ClusterState currentState, List<FailedShardUpdateTask> tasks)
throws Exception {
ClusterTasksResult.Builder<FailedShardUpdateTask> batchResultBuilder = ClusterTasksResult.builder();
List<FailedShardUpdateTask> tasksToBeApplied = new ArrayList<>();
List<FailedShard> failedShardsToBeApplied = new ArrayList<>();
List<StaleShard> staleShardsToBeApplied = new ArrayList<>();

for (FailedShardEntry task : tasks) {
IndexMetadata indexMetadata = currentState.metadata().index(task.shardId.getIndex());
for (FailedShardUpdateTask task : tasks) {
FailedShardEntry entry = task.getEntry();
IndexMetadata indexMetadata = currentState.metadata().index(entry.getShardId().getIndex());
if (indexMetadata == null) {
// tasks that correspond to non-existent indices are marked as successful
logger.debug("{} ignoring shard failed task [{}] (unknown index {})", task.shardId, task, task.shardId.getIndex());
logger.debug(
"{} ignoring shard failed task [{}] (unknown index {})",
entry.getShardId(),
entry,
entry.getShardId().getIndex()
);
batchResultBuilder.success(task);
} else {
// The primary term is 0 if the shard failed itself. It is > 0 if a write was done on a primary but was failed to be
Expand All @@ -395,29 +345,29 @@ public ClusterTasksResult<FailedShardEntry> execute(ClusterState currentState, L
// We check here that the primary to which the write happened was not already failed in an earlier cluster state update.
// This prevents situations where a new primary has already been selected and replication failures from an old stale
// primary unnecessarily fail currently active shards.
if (task.primaryTerm > 0) {
long currentPrimaryTerm = indexMetadata.primaryTerm(task.shardId.id());
if (currentPrimaryTerm != task.primaryTerm) {
assert currentPrimaryTerm > task.primaryTerm
if (entry.primaryTerm > 0) {
long currentPrimaryTerm = indexMetadata.primaryTerm(entry.getShardId().id());
if (currentPrimaryTerm != entry.primaryTerm) {
assert currentPrimaryTerm > entry.primaryTerm
: "received a primary term with a higher term than in the "
+ "current cluster state (received ["
+ task.primaryTerm
+ entry.primaryTerm
+ "] but current is ["
+ currentPrimaryTerm
+ "])";
logger.debug(
"{} failing shard failed task [{}] (primary term {} does not match current term {})",
task.shardId,
task,
task.primaryTerm,
indexMetadata.primaryTerm(task.shardId.id())
entry.getShardId(),
entry,
entry.primaryTerm,
indexMetadata.primaryTerm(entry.getShardId().id())
);
batchResultBuilder.failure(
task,
new NoLongerPrimaryShardException(
task.shardId,
entry.getShardId(),
"primary term ["
+ task.primaryTerm
+ entry.primaryTerm
+ "] did not match current primary term ["
+ currentPrimaryTerm
+ "]"
Expand All @@ -427,26 +377,31 @@ public ClusterTasksResult<FailedShardEntry> execute(ClusterState currentState, L
}
}

ShardRouting matched = currentState.getRoutingTable().getByAllocationId(task.shardId, task.allocationId);
ShardRouting matched = currentState.getRoutingTable().getByAllocationId(entry.getShardId(), entry.getAllocationId());
if (matched == null) {
Set<String> inSyncAllocationIds = indexMetadata.inSyncAllocationIds(task.shardId.id());
Set<String> inSyncAllocationIds = indexMetadata.inSyncAllocationIds(entry.getShardId().id());
// mark shard copies without routing entries that are in in-sync allocations set only as stale if the reason why
// they were failed is because a write made it into the primary but not to this copy (which corresponds to
// the check "primaryTerm > 0").
if (task.primaryTerm > 0 && inSyncAllocationIds.contains(task.allocationId)) {
logger.debug("{} marking shard {} as stale (shard failed task: [{}])", task.shardId, task.allocationId, task);
if (entry.primaryTerm > 0 && inSyncAllocationIds.contains(entry.getAllocationId())) {
logger.debug(
"{} marking shard {} as stale (shard failed task: [{}])",
entry.getShardId(),
entry.getAllocationId(),
entry
);
tasksToBeApplied.add(task);
staleShardsToBeApplied.add(new StaleShard(task.shardId, task.allocationId));
staleShardsToBeApplied.add(new StaleShard(entry.getShardId(), entry.getAllocationId()));
} else {
// tasks that correspond to non-existent shards are marked as successful
logger.debug("{} ignoring shard failed task [{}] (shard does not exist anymore)", task.shardId, task);
logger.debug("{} ignoring shard failed task [{}] (shard does not exist anymore)", entry.getShardId(), entry);
batchResultBuilder.success(task);
}
} else {
// failing a shard also possibly marks it as stale (see IndexMetadataUpdater)
logger.debug("{} failing shard {} (shard failed task: [{}])", task.shardId, matched, task);
logger.debug("{} failing shard {} (shard failed task: [{}])", entry.getShardId(), matched, task);
tasksToBeApplied.add(task);
failedShardsToBeApplied.add(new FailedShard(matched, task.message, task.failure, task.markAsStale));
failedShardsToBeApplied.add(new FailedShard(matched, entry.message, entry.failure, entry.markAsStale));
}
}
}
Expand Down Expand Up @@ -578,6 +533,43 @@ public int hashCode() {
}
}

public static class FailedShardUpdateTask implements ClusterStateTaskListener {

private final FailedShardEntry entry;
private final ActionListener<TransportResponse.Empty> listener;

public FailedShardUpdateTask(FailedShardEntry entry, ActionListener<TransportResponse.Empty> listener) {
this.entry = entry;
this.listener = listener;
}

public FailedShardEntry getEntry() {
return entry;
}

@Override
public void onFailure(Exception e) {
if (e instanceof NotMasterException) {
logger.debug(() -> new ParameterizedMessage("{} no longer master while failing shard [{}]", entry.shardId, entry));
} else if (e instanceof FailedToCommitClusterStateException) {
logger.debug(() -> new ParameterizedMessage("{} unexpected failure while failing shard [{}]", entry.shardId, entry), e);
} else {
logger.error(() -> new ParameterizedMessage("{} unexpected failure while failing shard [{}]", entry.shardId, entry), e);
}
listener.onFailure(e);
}

@Override
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
listener.onResponse(TransportResponse.Empty.INSTANCE);
}

@Override
public String toString() {
return "FailedShardUpdateTask{entry=" + entry + ", listener=" + listener + "}";
}
}

public void shardStarted(
final ShardRouting shardRouting,
final long primaryTerm,
Expand Down

0 comments on commit 4348a94

Please sign in to comment.