Skip to content

Commit

Permalink
Expand agent metric Policy Import Errors to count all policy changes
Browse files Browse the repository at this point in the history
Change agent metric name from policy_import_errors_total to policy_change_total. Now it counts all policy changes (Add, Update, Delete) based on outcome ("success" or "fail"). The metric can be used to show the percentage of success/failed network policy changes.

Signed-off-by: Dorde Lapcevic <dordel@google.com>
  • Loading branch information
dlapcevic committed Feb 3, 2023
1 parent 37e01a5 commit 6a079ca
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 21 deletions.
1 change: 1 addition & 0 deletions Documentation/observability/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ Name Labels
``policy_regeneration_time_stats_seconds`` ``scope`` Enabled Policy regeneration time stats labeled by the scope
``policy_max_revision`` Enabled Highest policy revision number in the agent
``policy_import_errors_total`` Enabled Number of times a policy import has failed
``policy_change_total`` Enabled Number of policy changes by outcome
``policy_endpoint_enforcement_status`` Enabled Number of endpoints labeled by policy enforcement status
========================================== ================================================== ========== ========================================================

Expand Down
4 changes: 4 additions & 0 deletions Documentation/operations/upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,15 @@ Added Metrics
~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_total``
* ``cilium_policy_change_total``

Deprecated Metrics
~~~~~~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_errors_total`` is deprecated. Please use ``cilium_operator_ces_sync_total`` instead.
* ``cilium_policy_import_errors_total`` is deprecated. Please use
``cilium_policy_change_total``, which counts all policy changes (Add, Update, Delete)
based on outcome ("success" or "failure").

Helm Options
~~~~~~~~~~~~
Expand Down
10 changes: 7 additions & 3 deletions daemon/cmd/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,22 +688,26 @@ func (h *putPolicy) Handle(params PutPolicyParams) middleware.Responder {

var rules policyAPI.Rules
if err := json.Unmarshal([]byte(params.Policy), &rules); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return NewPutPolicyInvalidPolicy()
}

for _, r := range rules {
if err := r.Sanitize(); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return api.Error(PutPolicyFailureCode, err)
}
}

rev, err := d.PolicyAdd(rules, &policy.AddOptions{Source: metrics.LabelEventSourceAPI})
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return api.Error(PutPolicyFailureCode, err)
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

policy := &models.Policy{
Revision: int64(rev),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5586,7 +5586,7 @@
"aliasColors": {
"max": "#f2c96d",
"policy errors": "#bf1b00",
"policy import errors": "#bf1b00"
"policy change errors": "#bf1b00"
},
"bars": false,
"dashLength": 10,
Expand Down Expand Up @@ -5646,7 +5646,7 @@
"lines": false
},
{
"alias": "policy import errors",
"alias": "policy change errors",
"yaxis": 2
}
],
Expand Down Expand Up @@ -5676,10 +5676,10 @@
"refId": "C"
},
{
"expr": "sum(cilium_policy_import_errors_total{k8s_app=\"cilium\", pod=~\"$pod\"}) by (pod)",
"expr": "sum(cilium_policy_change_total{k8s_app=\"cilium\", pod=~\"$pod\"}, outcome=\"fail\") by (pod)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "policy import errors",
"legendFormat": "policy change errors",
"refId": "D"
}
],
Expand Down
8 changes: 4 additions & 4 deletions examples/kubernetes/addons/prometheus/monitoring-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5938,7 +5938,7 @@ data:
"aliasColors": {
"max": "#f2c96d",
"policy errors": "#bf1b00",
"policy import errors": "#bf1b00"
"policy change errors": "#bf1b00"
},
"bars": false,
"dashLength": 10,
Expand Down Expand Up @@ -5998,7 +5998,7 @@ data:
"lines": false
},
{
"alias": "policy import errors",
"alias": "policy change errors",
"yaxis": 2
}
],
Expand Down Expand Up @@ -6028,10 +6028,10 @@ data:
"refId": "C"
},
{
"expr": "sum(cilium_policy_import_errors_total{k8s_app=\"cilium\", pod=~\"$pod\"}) by (pod)",
"expr": "sum(cilium_policy_change_total{k8s_app=\"cilium\", pod=~\"$pod\"}, outcome=\"fail\") by (pod)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "policy import errors",
"legendFormat": "policy change errors",
"refId": "D"
}
],
Expand Down
24 changes: 20 additions & 4 deletions pkg/k8s/watchers/cilium_network_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
cnpCpy := cnp.DeepCopy()

err := k.addCiliumNetworkPolicyV2(cs, cnpCpy, initialRecvTime)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricCreate, err == nil)
}
},
Expand All @@ -132,6 +134,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
newCNPCpy := newCNP.DeepCopy()

err := k.updateCiliumNetworkPolicyV2(cs, oldCNPCpy, newCNPCpy, initialRecvTime)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricUpdate, err == nil)
}
}
Expand All @@ -145,6 +149,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
}
valid = true
err := k.deleteCiliumNetworkPolicyV2(cnp)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricDelete, err == nil)
},
},
Expand Down Expand Up @@ -182,7 +188,7 @@ func (k *K8sWatcher) addCiliumNetworkPolicyV2(ciliumNPClient clientset.Interface
}

if policyImportErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
scopedLog.WithError(policyImportErr).Warn("Unable to add CiliumNetworkPolicy")
} else {
scopedLog.Info("Imported CiliumNetworkPolicy")
Expand Down Expand Up @@ -251,12 +257,12 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2(ciliumNPClient clientset.Interf
// update to the new policy will be skipped.
switch {
case ns != "" && !errors.Is(err, cilium_v2.ErrEmptyCNP):
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(oldRuleCpy)).
Warn("Error parsing old CiliumNetworkPolicy rule")
return err
case ns == "" && !errors.Is(err, cilium_v2.ErrEmptyCCNP):
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(oldRuleCpy)).
Warn("Error parsing old CiliumClusterwideNetworkPolicy rule")
return err
Expand All @@ -265,7 +271,7 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2(ciliumNPClient clientset.Interf

_, err = newRuleCpy.Parse()
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(newRuleCpy)).
Warn("Error parsing new CiliumNetworkPolicy rule")
return err
Expand Down Expand Up @@ -343,3 +349,13 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2AnnotationsOnly(ciliumNPClient c
})

}

// reportCNPChangeMetrics generates metrics for changes (Add, Update, Delete) to
// Cilium Network Policies depending on the operation's success.
func reportCNPChangeMetrics(err error) {
if err != nil {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
} else {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
}
}
9 changes: 7 additions & 2 deletions pkg/k8s/watchers/network_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ func (k *K8sWatcher) addK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPolic
scopedLog := log.WithField(logfields.K8sAPIVersion, k8sNP.TypeMeta.APIVersion)
rules, err := k8s.ParseNetworkPolicy(k8sNP)
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).WithFields(logrus.Fields{
logfields.CiliumNetworkPolicy: logfields.Repr(k8sNP),
}).Error("Error while parsing k8s kubernetes NetworkPolicy")
Expand All @@ -88,13 +89,15 @@ func (k *K8sWatcher) addK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPolic

opts := policy.AddOptions{Replace: true, Source: metrics.LabelEventSourceK8s}
if _, err := k.policyManager.PolicyAdd(rules, &opts); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).WithFields(logrus.Fields{
logfields.CiliumNetworkPolicy: logfields.Repr(rules),
}).Error("Unable to add NetworkPolicy rules to policy repository")
return err
}

metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
scopedLog.Info("NetworkPolicy successfully added")
return nil
}
Expand Down Expand Up @@ -125,10 +128,12 @@ func (k *K8sWatcher) deleteK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPo
logfields.Labels: logfields.Repr(labels),
})
if _, err := k.policyManager.PolicyDelete(labels); err != nil {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).Error("Error while deleting k8s NetworkPolicy")
return err
}

metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
scopedLog.Info("NetworkPolicy successfully removed")
return nil
}
20 changes: 19 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,9 +274,15 @@ var (
// PolicyRevision is the current policy revision number for this agent
PolicyRevision = NoOpGauge

// PolicyImportErrorsTotal is a count of failed policy imports
// PolicyImportErrorsTotal is a count of failed policy imports.
// This metric was deprecated in Cilium 1.14 and is to be removed in 1.15.
// It is replaced by PolicyChangeTotal metric.
PolicyImportErrorsTotal = NoOpCounter

// PolicyChangeTotal is a count of policy changes by outcome ("success" or
// "failure")
PolicyChangeTotal = NoOpCounterVec

// PolicyEndpointStatus is the number of endpoints with policy labeled by enforcement type
PolicyEndpointStatus = NoOpGaugeVec

Expand Down Expand Up @@ -540,6 +546,7 @@ type Configuration struct {
PolicyRegenerationTimeStatsEnabled bool
PolicyRevisionEnabled bool
PolicyImportErrorsEnabled bool
PolicyChangeTotalEnabled bool
PolicyEndpointStatusEnabled bool
PolicyImplementationDelayEnabled bool
IdentityCountEnabled bool
Expand Down Expand Up @@ -616,6 +623,7 @@ func DefaultMetrics() map[string]struct{} {
Namespace + "_policy_regeneration_time_stats_seconds": {},
Namespace + "_policy_max_revision": {},
Namespace + "_policy_import_errors_total": {},
Namespace + "_policy_change_total": {},
Namespace + "_policy_endpoint_enforcement_status": {},
Namespace + "_policy_implementation_delay": {},
Namespace + "_identity": {},
Expand Down Expand Up @@ -789,6 +797,16 @@ func CreateConfiguration(metricsEnabled []string) (Configuration, []prometheus.C
collectors = append(collectors, PolicyImportErrorsTotal)
c.PolicyImportErrorsEnabled = true

case Namespace + "_policy_change_total":
PolicyChangeTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "policy_change_total",
Help: "Number of policy changes by outcome",
}, []string{"outcome"})

collectors = append(collectors, PolicyChangeTotal)
c.PolicyChangeTotalEnabled = true

case Namespace + "_policy_endpoint_enforcement_status":
PolicyEndpointStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: Namespace,
Expand Down
4 changes: 3 additions & 1 deletion pkg/policy/api/selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,13 @@ func (n EndpointSelector) GetMatch(key string) ([]string, bool) {
func labelSelectorToRequirements(labelSelector *slim_metav1.LabelSelector) *k8sLbls.Requirements {
selector, err := slim_metav1.LabelSelectorAsSelector(labelSelector)
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
log.WithError(err).WithField(logfields.EndpointLabelSelector,
logfields.Repr(labelSelector)).Error("unable to construct selector in label selector")
return nil
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

requirements, selectable := selector.Requirements()
if !selectable {
Expand Down
7 changes: 5 additions & 2 deletions pkg/policy/groups/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ func addDerivativePolicy(ctx context.Context, clientset client.Clientset, cnp *c
}

if derivativeErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(derivativeErr).Error("Cannot create derivative rule. Installing deny-all rule.")
statusErr := updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), derivativeErr, clusterScoped)
if statusErr != nil {
Expand All @@ -237,11 +238,13 @@ func addDerivativePolicy(ctx context.Context, clientset client.Clientset, cnp *c
if err != nil {
statusErr := updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), err, clusterScoped)
if statusErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).Error("Cannot update status for derivative policy")
}
return statusErr
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

err = updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), nil, clusterScoped)
if err != nil {
Expand Down

0 comments on commit 6a079ca

Please sign in to comment.