Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hubble: Add a metric for lost events #22865

Merged
merged 3 commits into from
Mar 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 20 additions & 0 deletions Documentation/observability/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,26 @@ Exported Metrics

Hubble metrics are exported under the ``hubble_`` Prometheus namespace.

lost events
~~~~~~~~~~~
qmonnet marked this conversation as resolved.
Show resolved Hide resolved

This metric, unlike the others, is not directly tied to network flows. It is enabled if any of the other metrics is enabled.

================================ ======================================== ========== ==================================================
Name Labels Default Description
================================ ======================================== ========== ==================================================
``lost_events_total`` ``source`` Enabled Number of lost events
================================ ======================================== ========== ==================================================

Labels
""""""

``source`` identifies the source of lost events, one of:

- ``perf_event_ring_buffer``
- ``observer_events_queue``
- ``hubble_ring_buffer``


``dns``
~~~~~~~

Expand Down
3 changes: 3 additions & 0 deletions pkg/hubble/container/ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"fmt"
"io"
"strings"
"sync/atomic"
"time"
"unsafe"
Expand All @@ -16,6 +17,7 @@ import (
flowpb "github.com/cilium/cilium/api/v1/flow"
v1 "github.com/cilium/cilium/pkg/hubble/api/v1"
"github.com/cilium/cilium/pkg/hubble/math"
"github.com/cilium/cilium/pkg/hubble/metrics"
"github.com/cilium/cilium/pkg/lock"
)

Expand Down Expand Up @@ -215,6 +217,7 @@ func (r *Ring) OldestWrite() uint64 {
}

func getLostEvent() *v1.Event {
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_HUBBLE_RING_BUFFER.String())).Inc()
now := time.Now().UTC()
return &v1.Event{
Timestamp: &timestamppb.Timestamp{
Expand Down
12 changes: 12 additions & 0 deletions pkg/hubble/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ var (
podDeletionHandler *PodDeletionHandler
)

// Additional metrics - they're not counting flows, so are not served via
// Hubble metrics API, but belong to the same Prometheus namespace.
var (
// labelSource names the label identifying which stage of the pipeline
// lost the events (see the documented values in metrics.rst).
labelSource = "source"
// LostEvents counts events lost across the Hubble pipeline, partitioned
// by the "source" label. It is registered on the metrics registry in
// initMetrics, independently of which flow metrics are enabled.
LostEvents = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: api.DefaultPrometheusNamespace,
Name: "lost_events_total",
Help: "Number of lost events",
}, []string{labelSource})
)

// ProcessFlow processes a flow and updates metrics
func ProcessFlow(ctx context.Context, flow *pb.Flow) error {
if enabledMetrics != nil {
Expand Down Expand Up @@ -101,6 +112,7 @@ func initMetrics(address string, enabled api.Map, grpcMetrics *grpc_prometheus.S
enabledMetrics = e

registry.MustRegister(grpcMetrics)
registry.MustRegister(LostEvents)

errChan := make(chan error, 1)

Expand Down
16 changes: 10 additions & 6 deletions pkg/hubble/monitor/consumer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
package monitor

import (
"strings"
"time"

"github.com/google/uuid"
"github.com/sirupsen/logrus"

flowpb "github.com/cilium/cilium/api/v1/flow"
"github.com/cilium/cilium/pkg/hubble/metrics"
observerTypes "github.com/cilium/cilium/pkg/hubble/observer/types"
"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/logging"
Expand Down Expand Up @@ -66,7 +69,7 @@ func (c *consumer) sendNumLostEvents() {
// We can now safely reset the counter, as at this point we have
// successfully notified the observer about the amount of events
// that were lost since the previous LostEvent message
c.observer.GetLogger().Infof("hubble events queue is processing messages again: %d messages were lost", c.numEventsLost)
c.observer.GetLogger().Debugf("hubble events queue received a LostEvent message: %d messages were lost", c.numEventsLost)
c.numEventsLost = 0
default:
// We do not need to bump the numEventsLost counter here, as we will
Expand All @@ -85,20 +88,20 @@ func (c *consumer) sendEvent(event *observerTypes.MonitorEvent) {
select {
case c.observer.GetEventsChannel() <- event:
default:
c.logStartedDropping()
c.countDroppedEvent()
}
}

// logStartedDropping logs that the events channel is full
// and starts couting exactly how many messages it has
// lost until the consumer can recover.
func (c *consumer) logStartedDropping() {
// countDroppedEvent records that an event could not be enqueued on the
// observer's events channel: it logs a warning on the first drop of an
// episode (subject to the rate limiter), bumps the per-consumer lost
// counter, and increments the lost-events Prometheus metric with the
// observer_events_queue source label.
func (c *consumer) countDroppedEvent() {
c.lostLock.Lock()
defer c.lostLock.Unlock()
// Warn only when transitioning from "not dropping" to "dropping"
// (numEventsLost == 0) and the limiter allows it, to avoid log spam.
if c.numEventsLost == 0 && c.logLimiter.Allow() {
c.observer.GetLogger().Warning("hubble events queue is full: dropping messages; consider increasing the queue size (hubble-event-queue-size) or provisioning more CPU")
}
c.numEventsLost++
// The enum name (e.g. "OBSERVER_EVENTS_QUEUE") is lower-cased to match
// the documented label value "observer_events_queue".
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_OBSERVER_EVENTS_QUEUE.String())).Inc()
}

// NotifyAgentEvent implements monitorConsumer.MonitorConsumer
Expand Down Expand Up @@ -139,4 +142,5 @@ func (c *consumer) NotifyPerfEventLost(numLostEvents uint64, cpu int) {
CPU: cpu,
},
})
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_PERF_EVENT_RING_BUFFER.String())).Add(float64(numLostEvents))
}