[Metricbeat][Autodiscover Kubernetes] Fix multiple instances reporting same metrics #38471

Merged
merged 23 commits on Apr 8, 2024
Commits
0f1019e
Fix event id
constanca-m Mar 20, 2024
d72bb25
Update changelog
constanca-m Mar 20, 2024
1a5222f
Update libbeat/autodiscover/providers/kubernetes/kubernetes.go
constanca-m Mar 20, 2024
f839eed
Update libbeat/autodiscover/providers/kubernetes/kubernetes.go
constanca-m Mar 20, 2024
1656831
add space to log line
constanca-m Mar 20, 2024
4525d78
change log.debug order
constanca-m Mar 20, 2024
9bd259e
Merge branch 'main' into leader-election-issue
constanca-m Mar 20, 2024
d5b5872
- run leader elector until context is cancelled
constanca-m Mar 22, 2024
8f86db2
Merge remote-tracking branch 'origin/leader-election-issue' into lead…
constanca-m Mar 22, 2024
7dcc5d9
fix lint errors
constanca-m Mar 22, 2024
d1cd700
mage check
constanca-m Mar 22, 2024
e621934
use assert instead of require
constanca-m Mar 22, 2024
0f52db0
Merge branch 'main' into leader-election-issue
constanca-m Mar 23, 2024
a45812f
Update changelog
constanca-m Mar 28, 2024
2de31ca
Update changelog
constanca-m Mar 28, 2024
75b7776
Add test comments
constanca-m Mar 28, 2024
f7c3ddc
Update docs
constanca-m Apr 2, 2024
346737b
Merge branch 'main' into leader-election-issue
constanca-m Apr 2, 2024
9af1aa5
Merge branch 'main' into leader-election-issue
constanca-m Apr 3, 2024
e675cae
Merge branch 'main' into leader-election-issue
constanca-m Apr 3, 2024
b8eff25
Merge branch 'main' into leader-election-issue
constanca-m Apr 4, 2024
55052d4
Merge branch 'main' into leader-election-issue
constanca-m Apr 5, 2024
1af4973
Merge branch 'main' into leader-election-issue
constanca-m Apr 8, 2024
1 change: 1 addition & 0 deletions CHANGELOG-developer.next.asciidoc
@@ -69,6 +69,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only.

==== Bugfixes

- Fix multiple metricbeat instances reporting the same metrics when using autodiscover with the kubernetes provider, and ensure the leader elector is always running in autodiscover mode. {pull}38471[38471]
- Fix how Prometheus histograms are calculated when percentiles are provided.{pull}36537[36537]
- Stop using `mage:import` in community beats. This was ignoring the vendorized beats directory for some mage targets, using the code available in GOPATH, this causes inconsistencies and compilation problems if the version of the code in the GOPATH is different to the vendored one. Use of `mage:import` will continue to be unsupported in custom beats till beats is migrated to go modules, or mage supports vendored dependencies. {issue}13998[13998] {pull}14162[14162]
- Metricbeat module builders call host parser only once when instantiating light modules. {pull}20149[20149]
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
@@ -230,6 +230,7 @@ Setting environmental variable ELASTIC_NETINFO:false in Elastic Agent pod will d

*Metricbeat*

- Add new fields to configure the lease duration, retry period and renew deadline when using leader election with kubernetes autodiscover. {pull}38471[38471]
- Add per-thread metrics to system_summary {pull}33614[33614]
- Add GCP CloudSQL metadata {pull}33066[33066]
- Add GCP Carbon Footprint metricbeat data {pull}34820[34820]
10 changes: 9 additions & 1 deletion libbeat/autodiscover/providers/kubernetes/config.go
@@ -44,9 +44,14 @@ type Config struct {
// Scope can be either node or cluster.
Scope string `config:"scope"`
Resource string `config:"resource"`

// Unique identifies if this provider enables its templates only when it is elected as leader in a k8s cluster
Unique bool `config:"unique"`
LeaderLease string `config:"leader_lease"`
// Parameters to configure the leader election process
LeaseDuration time.Duration `config:"leader_leaseduration"`
RenewDeadline time.Duration `config:"leader_renewdeadline"`
RetryPeriod time.Duration `config:"leader_retryperiod"`

Prefix string `config:"prefix"`
Hints *config.C `config:"hints"`
@@ -57,7 +62,7 @@ type Config struct {
AddResourceMetadata *metadata.AddResourceMetadataConfig `config:"add_resource_metadata"`
}

// Public variable, so specific beats (as Filebeat) can set a different cleanup timeout if they need it.
// DefaultCleanupTimeout Public variable, so specific beats (as Filebeat) can set a different cleanup timeout if they need it.
var DefaultCleanupTimeout time.Duration = 0

func defaultConfig() *Config {
@@ -68,6 +73,9 @@ func defaultConfig() *Config {
Prefix: "co.elastic",
Unique: false,
AddResourceMetadata: metadata.GetDefaultResourceMetadataConfig(),
LeaseDuration: 15 * time.Second,
RenewDeadline: 10 * time.Second,
RetryPeriod: 2 * time.Second,
}
}

52 changes: 50 additions & 2 deletions libbeat/autodiscover/providers/kubernetes/config_test.go
@@ -31,7 +31,8 @@ import (
)

func TestConfigWithCustomBuilders(t *testing.T) {
autodiscover.Registry.AddBuilder("mock", newMockBuilder)
err := autodiscover.Registry.AddBuilder("mock", newMockBuilder)
assert.NoError(t, err)

cfg := mapstr.M{
"hints.enabled": false,
@@ -44,13 +45,15 @@ func TestConfigWithCustomBuilders(t *testing.T) {

config := conf.MustNewConfigFrom(&cfg)
c := defaultConfig()
err := config.Unpack(&c)
err = config.Unpack(&c)
assert.NoError(t, err)

cfg1 := mapstr.M{
"hints.enabled": false,
}
config, err = conf.NewConfigFrom(&cfg1)
assert.NoError(t, err)

c = defaultConfig()
err = config.Unpack(&c)
assert.Error(t, err)
@@ -72,6 +75,51 @@ func TestConfigWithIncorrectScope(t *testing.T) {
assert.Equal(t, "cluster", c.Scope)
}

func TestConfigLeaseFields(t *testing.T) {
cfg := mapstr.M{
"scope": "cluster",
"unique": "true",
}

tests := []struct {
LeaseDuration string
RenewDeadline string
RetryPeriod string
message string
}{
{
LeaseDuration: "20seconds",
RenewDeadline: "15s",
RetryPeriod: "2s",
message: "incorrect lease duration, should be set to default",
},
{
LeaseDuration: "20s",
RenewDeadline: "15minutes",
RetryPeriod: "2s",
message: "incorrect renew deadline, should be set to default",
},
{
LeaseDuration: "20s",
RenewDeadline: "15s",
RetryPeriod: "2hrs",
message: "incorrect retry period, should be set to default",
},
}

for _, test := range tests {
cfg["leader_leaseduration"] = test.LeaseDuration
cfg["leader_renewdeadline"] = test.RenewDeadline
cfg["leader_retryperiod"] = test.RetryPeriod

config := conf.MustNewConfigFrom(&cfg)

c := defaultConfig()
err := config.Unpack(&c)
assert.Errorf(t, err, test.message)
}
}

type mockBuilder struct {
}

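A minimal happy-path sketch to complement the error cases in the new test above (not part of this change; it assumes the helpers already imported in config_test.go plus the time package, and the test name is hypothetical): valid Go duration strings unpack into the new fields, and fields that are omitted keep the defaults from defaultConfig.

func TestConfigLeaseFieldsValidSketch(t *testing.T) {
	// Valid Go duration strings are expected to unpack without error.
	config := conf.MustNewConfigFrom(mapstr.M{
		"scope":                "cluster",
		"unique":               "true",
		"leader_leaseduration": "20s",
		"leader_renewdeadline": "15s",
		"leader_retryperiod":   "2s",
	})

	c := defaultConfig()
	err := config.Unpack(&c)
	assert.NoError(t, err)
	assert.Equal(t, 20*time.Second, c.LeaseDuration)
	assert.Equal(t, 15*time.Second, c.RenewDeadline)
	assert.Equal(t, 2*time.Second, c.RetryPeriod)

	// Fields that are not set should keep the defaults from defaultConfig.
	config = conf.MustNewConfigFrom(mapstr.M{"scope": "cluster", "unique": "true"})
	c = defaultConfig()
	assert.NoError(t, config.Unpack(&c))
	assert.Equal(t, 15*time.Second, c.LeaseDuration)
	assert.Equal(t, 10*time.Second, c.RenewDeadline)
	assert.Equal(t, 2*time.Second, c.RetryPeriod)
}
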
38 changes: 26 additions & 12 deletions libbeat/autodiscover/providers/kubernetes/kubernetes.go
@@ -279,7 +279,9 @@ func NewLeaderElectionManager(
Name: cfg.LeaderLease,
Namespace: ns,
}
metaUID := lease.GetObjectMeta().GetUID()

var eventID string
Member commented:

Is there any possibility of data races with this? Presumably OnStoppedLeading and OnStartedLeading should never be called concurrently...

constanca-m (Contributor Author) replied:

Yes, you are correct. The code for running the leader election is in these lines, and startLeading runs as a goroutine. It also says:

// LeaderCallbacks are callbacks that are triggered during certain
// lifecycle events of the LeaderElector. These are invoked asynchronously.

So I will check for race conditions and try to prevent them from happening.

constanca-m (Contributor Author):

I tried to write a unit test to check for the race condition:

  1. I could never trigger it, which I find strange. Could it be that it never happens?
     Edit: Yes, it never happens. See comment [Metricbeat][Autodiscover Kubernetes] Fix multiple instances reporting same metrics #38471 (comment).
  2. To cause a lease renewal, I would have to add a timer to ensure a new instance has the lease. I believe a good time to wait could be (leaseDuration + retryPeriod) * 2, but this means our test would wait up to 34s for one iteration. I was testing with multiple iterations, so it could take 2 minutes. Is it a good idea to commit such a long test, since it would be triggered in other PRs? What do you think?

Member replied:

I agree, we should avoid such long-running tests. We could perhaps make the lease duration configurable. Another option would be to pass in a fake clock, like https://github.com/jonboulle/clockwork, and then we can control it in the test.

constanca-m (Contributor Author):

I added a unit test. It still takes a few seconds to complete, at most 30s. I could not reduce the lease duration fields any further, because doing so caused unexpected lease renewals (I am guessing due to network issues).

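As an aside on the test-length discussion above, here is a rough sketch of how short, configurable durations keep a leadership-handover test in the single-digit-second range (this is not the test added in this PR; it assumes client-go's fake clientset, which serves coordination.k8s.io Lease objects in memory, and all names are hypothetical):

package kubernetes_test

import (
	"context"
	"testing"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

func TestLeadershipHandoverSketch(t *testing.T) {
	client := fake.NewSimpleClientset()
	started := make(chan string, 2)

	newElector := func(id string) *leaderelection.LeaderElector {
		le, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
			Lock: &resourcelock.LeaseLock{
				LeaseMeta:  metav1.ObjectMeta{Name: "beats-cluster-leader", Namespace: "default"},
				Client:     client.CoordinationV1(),
				LockConfig: resourcelock.ResourceLockConfig{Identity: id},
			},
			ReleaseOnCancel: true,
			// Short durations (well below the 15s/10s/2s defaults) keep the test fast.
			LeaseDuration: 3 * time.Second,
			RenewDeadline: 2 * time.Second,
			RetryPeriod:   500 * time.Millisecond,
			Callbacks: leaderelection.LeaderCallbacks{
				OnStartedLeading: func(ctx context.Context) { started <- id },
				OnStoppedLeading: func() {},
			},
		})
		if err != nil {
			t.Fatal(err)
		}
		return le
	}

	ctx1, cancel1 := context.WithCancel(context.Background())
	go newElector("beat-1").Run(ctx1)
	if got := <-started; got != "beat-1" {
		t.Fatalf("expected beat-1 to lead first, got %s", got)
	}

	ctx2, cancel2 := context.WithCancel(context.Background())
	defer cancel2()
	go newElector("beat-2").Run(ctx2)

	// Cancelling the first elector releases the lease (ReleaseOnCancel), so the
	// second candidate should pick it up within roughly LeaseDuration + RetryPeriod.
	cancel1()
	if got := <-started; got != "beat-2" {
		t.Fatalf("expected beat-2 to take over, got %s", got)
	}
}
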
leaseId := lease.Name + "-" + lease.Namespace
lem.leaderElection = leaderelection.LeaderElectionConfig{
Lock: &resourcelock.LeaseLock{
LeaseMeta: lease,
@@ -289,18 +291,17 @@
},
},
ReleaseOnCancel: true,
LeaseDuration: 15 * time.Second,
RenewDeadline: 10 * time.Second,
RetryPeriod: 2 * time.Second,
LeaseDuration: cfg.LeaseDuration,
RenewDeadline: cfg.RenewDeadline,
RetryPeriod: cfg.RetryPeriod,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(ctx context.Context) {
logger.Debugf("leader election lock GAINED, id %v", id)
eventID := fmt.Sprintf("%v-%v", metaUID, time.Now().UnixNano())
eventID = fmt.Sprintf("%v-%v", leaseId, time.Now().UnixNano())
Member commented:

Why do we include the timestamp in the event ID? I'm wondering if we could drop it and just use the leaseId? That way we don't need to store/reference eventID, and that would eliminate the question of data races entirely.

constanca-m (Contributor Author) replied:

I think it is risky and could cause more problems. Example:

  1. metricbeat-1 is the leader.
  2. Some kind of lease renewal happens twice.
  3. metricbeat-1 is the new leader again, but:
    1. startLeading from this new leader election is called.
    2. stopLeading is called afterwards, triggered by the loss of the lease from step 2. In that case, since both use the same eventId, we end up entirely without a leader because we deleted the configs.

If we use an event ID with a timestamp, we make sure this never happens.

constanca-m (Contributor Author):

Actually... we use Run for leader election, and it says this:

// Run starts the leader election loop. Run will not return
// before leader election loop is stopped by ctx or it has
// stopped holding the leader lease

This means we will never have the same leader twice, because electors stop running and are never reelected!

So we have a problem. Example:

  1. We have two nodes, node-1 and node-2.
  2. node-1 is the first leader.
  3. node-1 loses the lock, so it stops running.
  4. node-2 gets elected.
  5. Some lease renewal fails with a timeout (for example, the rolebinding gets deleted). node-2 loses the lease and stops running.
  6. Who is going to be the leader now? There are no more instances running to report the metrics...

I tried this with a unit test, renewing around 20 times, and I could see the leader election had stopped, as in the example above.

So I believe the implementation as of now (the official one, not the one from this branch) has two problems:

  1. A metricbeat instance never stops reporting metrics, even after losing the lease. This causes duplicated documents.
    • This PR could fix it, but that would expose the next problem.
  2. A metricbeat instance can never be reelected as the leader.
    • This doesn't necessarily cause a problem in the current implementation, since our previous leader instances never stop.

I think we need to consider other alternatives to leader election, or find a way to make it run again, because otherwise we will be forcing users to delete pods so they can start again.

constanca-m (Contributor Author):

To make it run indefinitely, we would have to put the code in this function in a for loop:

func (p *leaderElectionManager) Start() {
	ctx, cancel := context.WithCancel(context.TODO())
	p.cancelLeaderElection = cancel
	p.startLeaderElector(ctx, p.leaderElection)
}

Maybe not ideal... Would it just be easier to use a lease and discard the leader election altogether?

constanca-m (Contributor Author):

I see two possible solutions:

  1. Make leader election run again once it stops, so we know the metricbeat instance is always a candidate to be a leader.
    • Would this be a good idea? Maybe it is too resource-consuming to run this all the time, as the instance is constantly trying to acquire the lock.
  2. Use a watcher that keeps track of the lease. This way, once the lease changes holder, we can start reporting metrics with this metricbeat instance.
    • Currently we don't have a watcher that tracks a single resource. This has been discussed before as a solution for the kubernetes secrets provider as well.
    • If we did this, we would be following the same logic for two providers, leader election and secrets.

axw (Member) replied on Mar 22, 2024:

"This means we will never have the same leader twice! Because they stop running and they never are reelected!"

Yikes... another good find.

I'm not familiar enough with Kubernetes leadership election or resource watchers, so it's probably best to discuss with others - but my instinct is to go with this one:

"Make leader election run again once it stops, so we know the metricbeat instance is always a candidate to be a leader."

I'm not sure if we can do better by using watchers, but this option means a relatively small change, so it seems less risky to me. I don't think this would consume any more resources than having multiple Metricbeat instances that each attempt to acquire a lease?

In terms of code, I think we would need to change the go le.Run(ctx) to something like this:

go func() {
    for {
        le.Run(ctx)
        select {
        case <-ctx.Done():
            return
        default:
            // Run returned because the lease was lost,
            // try to reacquire in case the new lease holder
            // loses the lease.
        }
    }
}()

logger.Debugf("leader election lock GAINED, holder: %v, eventID: %v", id, eventID)
startLeading(uuid.String(), eventID)
},
OnStoppedLeading: func() {
logger.Debugf("leader election lock LOST, id %v", id)
eventID := fmt.Sprintf("%v-%v", metaUID, time.Now().UnixNano())
logger.Debugf("leader election lock LOST, holder: %v, eventID: %v", id, eventID)
stopLeading(uuid.String(), eventID)
},
},
@@ -329,7 +330,7 @@ func (p *eventerManager) GenerateHints(event bus.Event) bus.Event {
func (p *leaderElectionManager) Start() {
ctx, cancel := context.WithCancel(context.TODO())
p.cancelLeaderElection = cancel
p.startLeaderElector(ctx, p.leaderElection)
p.startLeaderElectorIndefinitely(ctx, p.leaderElection)
}

// Stop signals the stop channel to force the leader election loop routine to stop.
Expand All @@ -344,14 +345,27 @@ func (p *leaderElectionManager) GenerateHints(event bus.Event) bus.Event {
return event
}

// startLeaderElector starts a Leader Elector in the background with the provided config
func (p *leaderElectionManager) startLeaderElector(ctx context.Context, lec leaderelection.LeaderElectionConfig) {
// startLeaderElectorIndefinitely starts a Leader Elector in the background with the provided config.
// If this instance gets the lease lock and later loses it, we run the leader elector again.
func (p *leaderElectionManager) startLeaderElectorIndefinitely(ctx context.Context, lec leaderelection.LeaderElectionConfig) {
le, err := leaderelection.NewLeaderElector(lec)
if err != nil {
p.logger.Errorf("error while creating Leader Elector: %w", err)
}
p.logger.Debugf("Starting Leader Elector")
go le.Run(ctx)

go func() {
for {
le.Run(ctx)
select {
case <-ctx.Done():
return
default:
// Run returned because the lease was lost. Run the leader elector again, so this instance
// is still a candidate to get the lease.
}
}
}()
}

func ShouldPut(event mapstr.M, field string, value interface{}, logger *logp.Logger) {
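Taken together, the change boils down to: lease timings come from configuration, a fresh event ID is generated per leadership term, and the elector is restarted whenever Run returns because the lease was lost. A condensed, self-contained sketch of that overall pattern (hypothetical names, namespace and identity; it assumes an in-cluster kubeconfig and client-go's leaderelection package rather than the libbeat provider wiring):

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

func main() {
	restCfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}
	client := kubernetes.NewForConfigOrDie(restCfg)

	id := "metricbeat-node-1" // in the provider this comes from the pod/node identity
	lease := metav1.ObjectMeta{Name: "metricbeat-cluster-leader", Namespace: "kube-system"}

	le, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
		Lock: &resourcelock.LeaseLock{
			LeaseMeta:  lease,
			Client:     client.CoordinationV1(),
			LockConfig: resourcelock.ResourceLockConfig{Identity: id},
		},
		ReleaseOnCancel: true,
		// These correspond to the new leader_leaseduration, leader_renewdeadline
		// and leader_retryperiod settings; the values below are the PR's defaults.
		LeaseDuration: 15 * time.Second,
		RenewDeadline: 10 * time.Second,
		RetryPeriod:   2 * time.Second,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(ctx context.Context) {
				// A new event ID per leadership term (the PR derives it from the lease
				// name/namespace plus a timestamp), so a late stop event from an earlier
				// term cannot tear down the configs published by the current one.
				eventID := fmt.Sprintf("%v-%v-%v", lease.Name, lease.Namespace, time.Now().UnixNano())
				fmt.Println("started leading, eventID:", eventID)
			},
			OnStoppedLeading: func() {
				fmt.Println("stopped leading")
			},
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// The core of the fix: Run returns once the lease is lost, so loop until the
	// context is cancelled to keep this instance a leadership candidate forever.
	for {
		le.Run(ctx)
		select {
		case <-ctx.Done():
			return
		default:
		}
	}
}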