This patch adds a timeout for ListTracingPolicies. The sensor manager
may be stuck or misbehaving; this patch (combined with the previous
one) ensures that metrics collection continues once the timeout
expires.
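Tested manually using the following patch, which makes the sensor manager sleep while the policy state metrics are collected: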
```diff
diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go
index 227306b65..fd581392b 100644
--- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go
+++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go
@@ -9,6 +9,7 @@ import (
"io"
"strings"
"testing"
+ "time"
"github.com/cilium/tetragon/pkg/observer"
tus "github.com/cilium/tetragon/pkg/testutils/sensors"
@@ -57,3 +58,22 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d
err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0))
assert.NoError(t, err)
}
+
+func TestTimeout(t *testing.T) {
+ reg := prometheus.NewRegistry()
+
+ manager := tus.GetTestSensorManager(context.TODO(), t).Manager
+ observer.SetSensorManager(manager)
+ t.Cleanup(observer.ResetSensorManager)
+
+ collector := newPolicyStateCollector()
+ reg.Register(collector)
+
+ go func() {
+ err := manager.SleepForTesting(context.TODO(), t, 1*time.Second)
+ assert.NoError(t, err)
+ }()
+
+ err := testutil.CollectAndCompare(collector, strings.NewReader(""))
+ assert.NoError(t, err)
+}
diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go
index eaf908340..291a58c8f 100644
--- a/pkg/sensors/manager.go
+++ b/pkg/sensors/manager.go
@@ -8,6 +8,8 @@ import (
"errors"
"fmt"
"strings"
+ "testing"
+ "time"
"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1"
@@ -96,6 +98,13 @@ func startSensorManager(
logger.GetLogger().Debugf("stopping sensor controller...")
done = true
err = nil
+
+ // NB(kkourt): for testing
+ case *sensorManagerSleep:
+ time.Sleep(op.d)
+ err = nil
+
default:
err = fmt.Errorf("unknown sensorOp: %v", op)
}
@@ -421,6 +430,13 @@ type sensorCtlStop struct {
retChan chan error
}
+// sensorManagerSleep just sleeps. Intended only for testing.
+type sensorManagerSleep struct {
+ ctx context.Context
+ retChan chan error
+ d time.Duration
+}
+
type LoadArg struct{}
type UnloadArg = LoadArg
@@ -436,5 +452,18 @@ func (s *sensorEnable) sensorOpDone(e error) { s.retChan <- e }
func (s *sensorDisable) sensorOpDone(e error) { s.retChan <- e }
func (s *sensorList) sensorOpDone(e error) { s.retChan <- e }
func (s *sensorCtlStop) sensorOpDone(e error) { s.retChan <- e }
+func (s *sensorManagerSleep) sensorOpDone(e error) { s.retChan <- e }
type sensorCtlHandle = chan<- sensorOp
+
+func (h *Manager) SleepForTesting(ctx context.Context, t *testing.T, d time.Duration) error {
+ retc := make(chan error)
+ op := &sensorManagerSleep{
+ ctx: ctx,
+ retChan: retc,
+ d: d,
+ }
+
+ h.sensorCtl <- op
+ return <-retc
+}
```
Signed-off-by: Kornilios Kourtis <kornilios@isovalent.com>