Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hubble: Add a metric for lost events #22865

Merged
merged 3 commits into from
Mar 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 20 additions & 0 deletions Documentation/observability/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,26 @@ Exported Metrics

Hubble metrics are exported under the ``hubble_`` Prometheus namespace.

lost events
~~~~~~~~~~~
qmonnet marked this conversation as resolved.
Show resolved Hide resolved

This metric, unlike the others, is not directly tied to network flows. It is enabled if any of the other metrics is enabled.

================================ ======================================== ========== ==================================================
Name Labels Default Description
================================ ======================================== ========== ==================================================
``lost_events_total`` ``source`` Enabled Number of lost events
================================ ======================================== ========== ==================================================

Labels
""""""

``source`` identifies the source of lost events, one of:

- ``perf_event_ring_buffer``
- ``observer_events_queue``
- ``hubble_ring_buffer``


``dns``
~~~~~~~

Expand Down
3 changes: 3 additions & 0 deletions pkg/hubble/container/ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"fmt"
"io"
"strings"
"sync/atomic"
"time"
"unsafe"
Expand All @@ -16,6 +17,7 @@ import (
flowpb "github.com/cilium/cilium/api/v1/flow"
v1 "github.com/cilium/cilium/pkg/hubble/api/v1"
"github.com/cilium/cilium/pkg/hubble/math"
"github.com/cilium/cilium/pkg/hubble/metrics"
"github.com/cilium/cilium/pkg/lock"
)

Expand Down Expand Up @@ -215,6 +217,7 @@ func (r *Ring) OldestWrite() uint64 {
}

func getLostEvent() *v1.Event {
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_HUBBLE_RING_BUFFER.String())).Inc()
now := time.Now().UTC()
return &v1.Event{
Timestamp: &timestamppb.Timestamp{
Expand Down
12 changes: 12 additions & 0 deletions pkg/hubble/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ var (
podDeletionHandler *PodDeletionHandler
)

// Additional metrics - they're not counting flows, so are not served via
// Hubble metrics API, but belong to the same Prometheus namespace.
var (
// labelSource names the label identifying which stage of the pipeline
// lost the events (see the documented values in metrics.rst).
labelSource = "source"
// LostEvents counts events lost across the Hubble pipeline, partitioned
// by the "source" label. It is registered on the metrics registry in
// initMetrics, independently of which flow metrics are enabled.
LostEvents = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: api.DefaultPrometheusNamespace,
Name: "lost_events_total",
Help: "Number of lost events",
}, []string{labelSource})
)

// ProcessFlow processes a flow and updates metrics
func ProcessFlow(ctx context.Context, flow *pb.Flow) error {
if enabledMetrics != nil {
Expand Down Expand Up @@ -101,6 +112,7 @@ func initMetrics(address string, enabled api.Map, grpcMetrics *grpc_prometheus.S
enabledMetrics = e

registry.MustRegister(grpcMetrics)
registry.MustRegister(LostEvents)

errChan := make(chan error, 1)

Expand Down
16 changes: 10 additions & 6 deletions pkg/hubble/monitor/consumer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
package monitor

import (
"strings"
"time"

"github.com/google/uuid"
"github.com/sirupsen/logrus"

flowpb "github.com/cilium/cilium/api/v1/flow"
"github.com/cilium/cilium/pkg/hubble/metrics"
observerTypes "github.com/cilium/cilium/pkg/hubble/observer/types"
"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/logging"
Expand Down Expand Up @@ -66,7 +69,7 @@ func (c *consumer) sendNumLostEvents() {
// We can now safely reset the counter, as at this point we have
// successfully notified the observer about the amount of events
// that were lost since the previous LostEvent message
c.observer.GetLogger().Infof("hubble events queue is processing messages again: %d messages were lost", c.numEventsLost)
c.observer.GetLogger().Debugf("hubble events queue received a LostEvent message: %d messages were lost", c.numEventsLost)
c.numEventsLost = 0
default:
// We do not need to bump the numEventsLost counter here, as we will
Expand All @@ -85,20 +88,20 @@ func (c *consumer) sendEvent(event *observerTypes.MonitorEvent) {
select {
case c.observer.GetEventsChannel() <- event:
default:
c.logStartedDropping()
c.countDroppedEvent()
}
}

// logStartedDropping logs that the events channel is full
// and starts couting exactly how many messages it has
// lost until the consumer can recover.
func (c *consumer) logStartedDropping() {
// countDroppedEvent records that an event could not be enqueued on the
// observer's events channel: it logs a warning on the first drop of an
// episode (subject to the rate limiter), bumps the per-consumer lost
// counter, and increments the lost-events Prometheus metric with the
// observer_events_queue source label.
func (c *consumer) countDroppedEvent() {
c.lostLock.Lock()
defer c.lostLock.Unlock()
// Warn only when transitioning from "not dropping" to "dropping"
// (numEventsLost == 0) and the limiter allows it, to avoid log spam.
if c.numEventsLost == 0 && c.logLimiter.Allow() {
c.observer.GetLogger().Warning("hubble events queue is full: dropping messages; consider increasing the queue size (hubble-event-queue-size) or provisioning more CPU")
}
c.numEventsLost++
// The enum name (e.g. "OBSERVER_EVENTS_QUEUE") is lower-cased to match
// the documented label value "observer_events_queue".
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_OBSERVER_EVENTS_QUEUE.String())).Inc()
}

// NotifyAgentEvent implements monitorConsumer.MonitorConsumer
Expand Down Expand Up @@ -139,4 +142,5 @@ func (c *consumer) NotifyPerfEventLost(numLostEvents uint64, cpu int) {
CPU: cpu,
},
})
metrics.LostEvents.WithLabelValues(strings.ToLower(flowpb.LostEventSource_PERF_EVENT_RING_BUFFER.String())).Add(float64(numLostEvents))
}