Skip to content

Commit

Permalink
Modify operator metric CES errors sync to count all CES sync events
Browse files Browse the repository at this point in the history
Change the operator metric ces_sync_errors_total to be more useful by counting all CES sync events, both successes and failures. The metric is then better suited to serve as an SLI for the Cilium Endpoint Batching feature, because it can show the percentage of successful CES syncs.

Signed-off-by: Dorde Lapcevic <dordel@google.com>
  • Loading branch information
dlapcevic authored and aanm committed Feb 3, 2023
1 parent bc2af2a commit 37e01a5
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 0 deletions.
10 changes: 10 additions & 0 deletions Documentation/operations/upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,16 @@ Annotations:
1.14 Upgrade Notes
------------------

Added Metrics
~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_total``

Deprecated Metrics
~~~~~~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_errors_total`` is deprecated. Please use ``cilium_operator_ces_sync_total`` instead.

Helm Options
~~~~~~~~~~~~

Expand Down
12 changes: 12 additions & 0 deletions operator/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,12 @@ var (
// This metric is used to collect number of CEP changes happening at various buckets.
CiliumEndpointsChangeCount *prometheus.HistogramVec

// CiliumEndpointSliceSyncTotal indicates the total number of completed CES syncs with k8s-apiserver by success/fail outcome.
CiliumEndpointSliceSyncTotal *prometheus.CounterVec

// CiliumEndpointSliceSyncErrors is used to track the total number of errors that occurred while syncing CES with k8s-apiserver.
// This metric is deprecated in Cilium 1.14 and will be removed in 1.15.
// It is replaced by the CiliumEndpointSliceSyncTotal metric.
CiliumEndpointSliceSyncErrors prometheus.Counter

// CiliumEndpointSliceQueueDelay measures the time spent by CES's in the workqueue. This measures time difference between
Expand Down Expand Up @@ -184,6 +189,13 @@ func registerMetrics() []prometheus.Collector {
})
collectors = append(collectors, CiliumEndpointSliceSyncErrors)

CiliumEndpointSliceSyncTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "ces_sync_total",
Help: "The number of completed CES syncs by outcome",
}, []string{"outcome"})
collectors = append(collectors, CiliumEndpointSliceSyncTotal)

CiliumEndpointSliceQueueDelay = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: Namespace,
Name: "ces_queueing_delay_seconds",
Expand Down
8 changes: 8 additions & 0 deletions operator/pkg/ciliumendpointslice/endpointslice.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,14 @@ func (c *CiliumEndpointSliceController) processNextWorkItem() bool {
defer c.queue.Done(cKey)

err := c.syncCES(cKey.(string))
if operatorOption.Config.EnableMetrics {
if err != nil {
metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
} else {
metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
}
}

c.handleErr(err, cKey)

return true
Expand Down

0 comments on commit 37e01a5

Please sign in to comment.