Skip to content

Commit

Permalink
policystatemetrics: timeout for ListTracingPolicies
Browse files Browse the repository at this point in the history
This patch adds a timeout for ListTracingPolicies. The sensor manager
may be stuck or misbehaving, and without a timeout the call could block
metrics collection. This patch (combined with the previous one) ensures
that metrics collection continues after a timeout.

Tested manually using:

```diff
diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go
index 227306b65..fd581392b 100644
--- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go
+++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go
@@ -9,6 +9,7 @@ import (
 	"io"
 	"strings"
 	"testing"
+	"time"

 	"github.com/cilium/tetragon/pkg/observer"
 	tus "github.com/cilium/tetragon/pkg/testutils/sensors"
@@ -57,3 +58,22 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d
 	err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0))
 	assert.NoError(t, err)
 }
+
+func TestTimeout(t *testing.T) {
+	reg := prometheus.NewRegistry()
+
+	manager := tus.GetTestSensorManager(context.TODO(), t).Manager
+	observer.SetSensorManager(manager)
+	t.Cleanup(observer.ResetSensorManager)
+
+	collector := newPolicyStateCollector()
+	reg.Register(collector)
+
+	go func() {
+		err := manager.SleepForTesting(context.TODO(), t, 1*time.Second)
+		assert.NoError(t, err)
+	}()
+
+	err := testutil.CollectAndCompare(collector, strings.NewReader(""))
+	assert.NoError(t, err)
+}
diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go
index eaf908340..291a58c8f 100644
--- a/pkg/sensors/manager.go
+++ b/pkg/sensors/manager.go
@@ -8,6 +8,8 @@ import (
 	"errors"
 	"fmt"
 	"strings"
+	"testing"
+	"time"

 	"github.com/cilium/tetragon/api/v1/tetragon"
 	"github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1"
@@ -96,6 +98,13 @@ func startSensorManager(
 				logger.GetLogger().Debugf("stopping sensor controller...")
 				done = true
 				err = nil
+
+			// NB(kkourt): for testing
+			case *sensorManagerSleep:
+				time.Sleep(op.d)
+				err = nil
+
 			default:
 				err = fmt.Errorf("unknown sensorOp: %v", op)
 			}
@@ -421,6 +430,13 @@ type sensorCtlStop struct {
 	retChan chan error
 }

+// sensorManagerSleep just sleeps. Intended only for testing.
+type sensorManagerSleep struct {
+	ctx     context.Context
+	retChan chan error
+	d       time.Duration
+}
+
 type LoadArg struct{}
 type UnloadArg = LoadArg

@@ -436,5 +452,18 @@ func (s *sensorEnable) sensorOpDone(e error)         { s.retChan <- e }
 func (s *sensorDisable) sensorOpDone(e error)        { s.retChan <- e }
 func (s *sensorList) sensorOpDone(e error)           { s.retChan <- e }
 func (s *sensorCtlStop) sensorOpDone(e error)        { s.retChan <- e }
+func (s *sensorManagerSleep) sensorOpDone(e error)   { s.retChan <- e }

 type sensorCtlHandle = chan<- sensorOp
+
+func (h *Manager) SleepForTesting(ctx context.Context, t *testing.T, d time.Duration) error {
+	retc := make(chan error)
+	op := &sensorManagerSleep{
+		ctx:     ctx,
+		retChan: retc,
+		d:       d,
+	}
+
+	h.sensorCtl <- op
+	return <-retc
+}
```

Signed-off-by: Kornilios Kourtis <kornilios@isovalent.com>
  • Loading branch information
kkourt committed Apr 2, 2024
1 parent 33b4014 commit 5ce0f71
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pkg/metrics/policystatemetrics/policystatemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package policystatemetrics
import (
"context"
"strings"
"time"

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/logger"
Expand Down Expand Up @@ -49,7 +50,10 @@ func (c *policyStateCollector) Collect(ch chan<- prometheus.Metric) {
logger.GetLogger().Debug("failed retrieving the sensor manager: manager is nil")
return
}
list, err := sm.ListTracingPolicies(context.Background())

ctx, cancel := context.WithTimeout(context.TODO(), 900*time.Millisecond)
defer cancel()
list, err := sm.ListTracingPolicies(ctx)
if err != nil {
logger.GetLogger().WithError(err).Warn("error listing tracing policies to collect policies state")
return
Expand Down

0 comments on commit 5ce0f71

Please sign in to comment.