Skip to content

Commit

Permalink
clustermesh: add etcd cluster ID to remote cluster status information
Browse files Browse the repository at this point in the history
174e721 ("ClusterMesh: validate etcd cluster ID") introduced the
support for running multiple replicas of the clustermesh-apiserver, by
validating the etcd cluster ID and restarting the connection if a change
is detected. This prevents accepting responses from a different replica
(e.g., following a deployment rollout), with the potential consequence
of missing events or retaining invalid data.

Let's additionally output the etcd cluster ID as part of the remote
clusters status, to make it easier to figure out which replica
each agent is connected to. The same ID is also output as part of
clustermesh-apiserver etcd logs, as well as by the troubleshoot
commands which are being introduced.

Signed-off-by: Marco Iorio <marco.iorio@isovalent.com>
  • Loading branch information
giorio94 authored and ldelossa committed May 7, 2024
1 parent a66a0f1 commit eb2a6a2
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion pkg/clustermesh/common/remote_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ type remoteCluster struct {
// mutex protects the following variables
// - backend
// - config
// - etcdClusterID
// - failures
// - lastFailure
mutex lock.RWMutex
Expand All @@ -90,6 +91,11 @@ type remoteCluster struct {
// config contains the information about the cluster config for status reporting
config *models.RemoteClusterConfig

// etcdClusterID contains the information about the etcd cluster ID for status
// reporting. It is used to distinguish which instance of the clustermesh-apiserver
// we are connected to when running in HA mode.
etcdClusterID string

// failures is the number of observed failures
failures int

Expand Down Expand Up @@ -119,6 +125,7 @@ func (rc *remoteCluster) releaseOldConnection() {
backend := rc.backend
rc.backend = nil
rc.config = nil
rc.etcdClusterID = ""
rc.mutex.Unlock()

// Release resources asynchronously in the background. Many of these
Expand Down Expand Up @@ -165,8 +172,11 @@ func (rc *remoteCluster) restartRemoteConnection() {
return err
}

etcdClusterID := fmt.Sprintf("%x", clusterLock.etcdClusterID.Load())

rc.mutex.Lock()
rc.backend = backend
rc.etcdClusterID = etcdClusterID
rc.mutex.Unlock()

ctx, cancel := context.WithCancel(ctx)
Expand All @@ -177,7 +187,7 @@ func (rc *remoteCluster) restartRemoteConnection() {
rc.wg.Done()
}()

rc.logger.WithField(logfields.EtcdClusterID, clusterLock.etcdClusterID.Load()).Info("Connection to remote cluster established")
rc.logger.WithField(logfields.EtcdClusterID, etcdClusterID).Info("Connection to remote cluster established")

config, err := rc.getClusterConfig(ctx, backend, rc.ClusterConfigRequired())
if err == nil && config == nil {
Expand Down Expand Up @@ -424,6 +434,10 @@ func (rc *remoteCluster) status() *models.RemoteCluster {
if backendError != nil {
backendStatus = backendError.Error()
}

if rc.etcdClusterID != "" {
backendStatus += ", ID: " + rc.etcdClusterID
}
}

status := &models.RemoteCluster{
Expand Down

0 comments on commit eb2a6a2

Please sign in to comment.