diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java index c2cd403836593..e81d8d73af9a2 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java @@ -43,9 +43,16 @@ import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING; import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY; +/** + * Handles periodic debug logging of information regarding why the cluster has failed to form. + * Periodic logging begins once {@link #start()} is called, and ceases on {@link #stop()}. + */ public class ClusterFormationFailureHelper { private static final Logger logger = LogManager.getLogger(ClusterFormationFailureHelper.class); + /** + * This time period controls how often warning log messages will be written if this node fails to join or form a cluster. + */ public static final Setting DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING = Setting.timeSetting( "discovery.cluster_formation_warning_timeout", TimeValue.timeValueMillis(10000), @@ -61,6 +68,16 @@ public class ClusterFormationFailureHelper { @Nullable // if no warning is scheduled private volatile WarningScheduler warningScheduler; + /** + * Works with the {@link JoinHelper} to log the latest node-join attempt failure and cluster state debug information. Must call + * {@link ClusterFormationState#start()} to begin. + * + * @param settings provides the period in which to log cluster formation errors. + * @param clusterFormationStateSupplier information about the current believed cluster state (See {@link ClusterFormationState}) + * @param threadPool the thread pool on which to run debug logging + * @param logLastFailedJoinAttempt invokes an instance of the JoinHelper to log the last encountered join failure + * (See {@link JoinHelper#logLastFailedJoinAttempt()}) + */ public ClusterFormationFailureHelper( Settings settings, Supplier clusterFormationStateSupplier, @@ -78,6 +95,10 @@ public boolean isRunning() { return warningScheduler != null; } + /** + * Schedules a warning debug message to be logged in 'clusterFormationWarningTimeout' time, and periodically thereafter, until + * {@link ClusterFormationState#stop()} has been called. + */ public void start() { assert warningScheduler == null; warningScheduler = new WarningScheduler(); @@ -129,7 +150,7 @@ public String toString() { } /** - * If this node believes that cluster formation has failed, this record provides information that can be used to determine why that is. + * This record provides node state information that can be used to determine why cluster formation has failed. */ public record ClusterFormationState( List initialMasterNodesSetting, diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java index b960bb02ceb7f..059400ad81cfb 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java @@ -194,13 +194,23 @@ private void unregisterAndReleaseConnection(DiscoveryNode destination, Releasabl Releasables.close(connectionReference); } - // package-private for testing + /** + * Saves information about a join failure. The failure information may be logged later via either {@link FailedJoinAttempt#logNow} + * or {@link FailedJoinAttempt#lastFailedJoinAttempt}. + * + * Package-private for testing. + */ static class FailedJoinAttempt { private final DiscoveryNode destination; private final JoinRequest joinRequest; private final ElasticsearchException exception; private final long timestamp; + /** + * @param destination the master node targeted by the join request. + * @param joinRequest the join request that was sent to the perceived master node. + * @param exception the error response received in reply to the join request attempt. + */ FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, ElasticsearchException exception) { this.destination = destination; this.joinRequest = joinRequest; @@ -208,10 +218,18 @@ static class FailedJoinAttempt { this.timestamp = System.nanoTime(); } + /** + * Logs the failed join attempt exception. + * {@link FailedJoinAttempt#getLogLevel(ElasticsearchException)} determines at what log-level the log is written. + */ void logNow() { logger.log(getLogLevel(exception), () -> format("failed to join %s with %s", destination, joinRequest), exception); } + /** + * Returns the appropriate log level based on the given exception. Every error is at least DEBUG, but unexpected errors are INFO. + * For example, NotMasterException and CircuitBreakingExceptions are DEBUG logs. + */ static Level getLogLevel(ElasticsearchException e) { Throwable cause = e.unwrapCause(); if (cause instanceof CoordinationStateRejectedException @@ -226,6 +244,10 @@ void logWarnWithTimestamp() { logger.warn( () -> format( "last failed join attempt was %s ago, failed to join %s with %s", + // 'timestamp' is when this error exception was received by the local node. If the time that has passed since the error + // was originally received is quite large, it could indicate that this is a stale error exception from some prior + // out-of-order request response (where a later sent request but earlier received response was successful); or + // alternatively an old error could indicate that this node did not retry the join request for a very long time. TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - timestamp)), destination, joinRequest @@ -235,6 +257,9 @@ void logWarnWithTimestamp() { } } + /** + * Logs a warning message if {@link #lastFailedJoinAttempt} has been set with a failure. + */ void logLastFailedJoinAttempt() { FailedJoinAttempt attempt = lastFailedJoinAttempt.get(); if (attempt != null) { @@ -247,7 +272,7 @@ public void sendJoinRequest(DiscoveryNode destination, long term, Optional assert destination.isMasterNode() : "trying to join master-ineligible " + destination; final StatusInfo statusInfo = nodeHealthService.getHealth(); if (statusInfo.getStatus() == UNHEALTHY) { - logger.debug("dropping join request to [{}]: [{}]", destination, statusInfo.getInfo()); + logger.debug("dropping join request to [{}], unhealthy status: [{}]", destination, statusInfo.getInfo()); return; } final JoinRequest joinRequest = new JoinRequest( diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java b/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java index 2c024063e2399..9223e02fc946c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java @@ -26,6 +26,7 @@ import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.version.CompatibilityVersions; import org.elasticsearch.common.Priority; +import org.elasticsearch.common.Strings; import org.elasticsearch.features.FeatureService; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; @@ -123,7 +124,14 @@ public ClusterState execute(BatchExecutionContext batchExecutionContex newState = ClusterState.builder(initialState); } else { logger.trace("processing node joins, but we are not the master. current master: {}", currentNodes.getMasterNode()); - throw new NotMasterException("Node [" + currentNodes.getLocalNode() + "] not master for join request"); + throw new NotMasterException( + Strings.format( + "Node [%s] not master for join request. Current known master [%s], current term [%d]", + currentNodes.getLocalNode(), + currentNodes.getMasterNode(), + term + ) + ); } DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(newState.nodes());