Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand agent metric Policy Import Errors to count all policy changes #23349

Merged
merged 1 commit into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Documentation/observability/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ Name Labels
``policy_regeneration_time_stats_seconds`` ``scope`` Enabled Policy regeneration time stats labeled by the scope
``policy_max_revision`` Enabled Highest policy revision number in the agent
``policy_import_errors_total`` Enabled Number of times a policy import has failed
``policy_change_total`` Enabled Number of policy changes by outcome
``policy_endpoint_enforcement_status`` Enabled Number of endpoints labeled by policy enforcement status
========================================== ================================================== ========== ========================================================

Expand Down
4 changes: 4 additions & 0 deletions Documentation/operations/upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,15 @@ Added Metrics
~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_total``
* ``cilium_policy_change_total``

Deprecated Metrics
~~~~~~~~~~~~~~~~~~

* ``cilium_operator_ces_sync_errors_total`` is deprecated. Please use ``cilium_operator_ces_sync_total`` instead.
* ``cilium_policy_import_errors_total`` is deprecated. Please use
dlapcevic marked this conversation as resolved.
Show resolved Hide resolved
``cilium_policy_change_total``, which counts all policy changes (Add, Update, Delete)
based on outcome ("success" or "failure").

Helm Options
~~~~~~~~~~~~
Expand Down
10 changes: 7 additions & 3 deletions daemon/cmd/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,22 +688,26 @@ func (h *putPolicy) Handle(params PutPolicyParams) middleware.Responder {

var rules policyAPI.Rules
if err := json.Unmarshal([]byte(params.Policy), &rules); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return NewPutPolicyInvalidPolicy()
}

for _, r := range rules {
if err := r.Sanitize(); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return api.Error(PutPolicyFailureCode, err)
}
}

rev, err := d.PolicyAdd(rules, &policy.AddOptions{Source: metrics.LabelEventSourceAPI})
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
return api.Error(PutPolicyFailureCode, err)
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

policy := &models.Policy{
Revision: int64(rev),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5586,7 +5586,7 @@
"aliasColors": {
"max": "#f2c96d",
"policy errors": "#bf1b00",
"policy import errors": "#bf1b00"
"policy change errors": "#bf1b00"
},
"bars": false,
"dashLength": 10,
Expand Down Expand Up @@ -5646,7 +5646,7 @@
"lines": false
},
{
"alias": "policy import errors",
"alias": "policy change errors",
"yaxis": 2
}
],
Expand Down Expand Up @@ -5676,10 +5676,10 @@
"refId": "C"
},
{
"expr": "sum(cilium_policy_import_errors_total{k8s_app=\"cilium\", pod=~\"$pod\"}) by (pod)",
"expr": "sum(cilium_policy_change_total{k8s_app=\"cilium\", pod=~\"$pod\"}, outcome=\"fail\") by (pod)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "policy import errors",
"legendFormat": "policy change errors",
"refId": "D"
}
],
Expand Down
8 changes: 4 additions & 4 deletions examples/kubernetes/addons/prometheus/monitoring-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5938,7 +5938,7 @@ data:
"aliasColors": {
"max": "#f2c96d",
"policy errors": "#bf1b00",
"policy import errors": "#bf1b00"
"policy change errors": "#bf1b00"
},
"bars": false,
"dashLength": 10,
Expand Down Expand Up @@ -5998,7 +5998,7 @@ data:
"lines": false
},
{
"alias": "policy import errors",
"alias": "policy change errors",
"yaxis": 2
}
],
Expand Down Expand Up @@ -6028,10 +6028,10 @@ data:
"refId": "C"
},
{
"expr": "sum(cilium_policy_import_errors_total{k8s_app=\"cilium\", pod=~\"$pod\"}) by (pod)",
"expr": "sum(cilium_policy_change_total{k8s_app=\"cilium\", pod=~\"$pod\"}, outcome=\"fail\") by (pod)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "policy import errors",
"legendFormat": "policy change errors",
"refId": "D"
}
],
Expand Down
24 changes: 20 additions & 4 deletions pkg/k8s/watchers/cilium_network_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
cnpCpy := cnp.DeepCopy()

err := k.addCiliumNetworkPolicyV2(cs, cnpCpy, initialRecvTime)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricCreate, err == nil)
}
},
Expand All @@ -132,6 +134,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
newCNPCpy := newCNP.DeepCopy()

err := k.updateCiliumNetworkPolicyV2(cs, oldCNPCpy, newCNPCpy, initialRecvTime)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricUpdate, err == nil)
}
}
Expand All @@ -145,6 +149,8 @@ func (k *K8sWatcher) ciliumNetworkPoliciesInit(cs client.Clientset) {
}
valid = true
err := k.deleteCiliumNetworkPolicyV2(cnp)
reportCNPChangeMetrics(err)

k.K8sEventProcessed(resources.MetricCNP, resources.MetricDelete, err == nil)
},
},
Expand Down Expand Up @@ -182,7 +188,7 @@ func (k *K8sWatcher) addCiliumNetworkPolicyV2(ciliumNPClient clientset.Interface
}

if policyImportErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
scopedLog.WithError(policyImportErr).Warn("Unable to add CiliumNetworkPolicy")
} else {
scopedLog.Info("Imported CiliumNetworkPolicy")
Expand Down Expand Up @@ -251,12 +257,12 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2(ciliumNPClient clientset.Interf
// update to the new policy will be skipped.
switch {
case ns != "" && !errors.Is(err, cilium_v2.ErrEmptyCNP):
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(oldRuleCpy)).
Warn("Error parsing old CiliumNetworkPolicy rule")
return err
case ns == "" && !errors.Is(err, cilium_v2.ErrEmptyCCNP):
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(oldRuleCpy)).
Warn("Error parsing old CiliumClusterwideNetworkPolicy rule")
return err
Expand All @@ -265,7 +271,7 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2(ciliumNPClient clientset.Interf

_, err = newRuleCpy.Parse()
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
log.WithError(err).WithField(logfields.Object, logfields.Repr(newRuleCpy)).
Warn("Error parsing new CiliumNetworkPolicy rule")
return err
Expand Down Expand Up @@ -343,3 +349,13 @@ func (k *K8sWatcher) updateCiliumNetworkPolicyV2AnnotationsOnly(ciliumNPClient c
})

}

// reportCNPChangeMetrics generates metrics for changes (Add, Update, Delete) to
// Cilium Network Policies depending on the operation's success.
func reportCNPChangeMetrics(err error) {
if err != nil {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
} else {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
}
}
9 changes: 7 additions & 2 deletions pkg/k8s/watchers/network_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ func (k *K8sWatcher) addK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPolic
scopedLog := log.WithField(logfields.K8sAPIVersion, k8sNP.TypeMeta.APIVersion)
rules, err := k8s.ParseNetworkPolicy(k8sNP)
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).WithFields(logrus.Fields{
logfields.CiliumNetworkPolicy: logfields.Repr(k8sNP),
}).Error("Error while parsing k8s kubernetes NetworkPolicy")
Expand All @@ -88,13 +89,15 @@ func (k *K8sWatcher) addK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPolic

opts := policy.AddOptions{Replace: true, Source: metrics.LabelEventSourceK8s}
if _, err := k.policyManager.PolicyAdd(rules, &opts); err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).WithFields(logrus.Fields{
logfields.CiliumNetworkPolicy: logfields.Repr(rules),
}).Error("Unable to add NetworkPolicy rules to policy repository")
return err
}

metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
scopedLog.Info("NetworkPolicy successfully added")
return nil
}
Expand Down Expand Up @@ -125,10 +128,12 @@ func (k *K8sWatcher) deleteK8sNetworkPolicyV1(k8sNP *slim_networkingv1.NetworkPo
logfields.Labels: logfields.Repr(labels),
})
if _, err := k.policyManager.PolicyDelete(labels); err != nil {
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).Error("Error while deleting k8s NetworkPolicy")
return err
}

metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()
scopedLog.Info("NetworkPolicy successfully removed")
return nil
}
20 changes: 19 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,9 +274,15 @@ var (
// PolicyRevision is the current policy revision number for this agent
PolicyRevision = NoOpGauge

// PolicyImportErrorsTotal is a count of failed policy imports
// PolicyImportErrorsTotal is a count of failed policy imports.
// This metric was deprecated in Cilium 1.14 and is to be removed in 1.15.
// It is replaced by PolicyChangeTotal metric.
PolicyImportErrorsTotal = NoOpCounter

// PolicyChangeTotal is a count of policy changes by outcome ("success" or
// "failure")
PolicyChangeTotal = NoOpCounterVec

// PolicyEndpointStatus is the number of endpoints with policy labeled by enforcement type
PolicyEndpointStatus = NoOpGaugeVec

Expand Down Expand Up @@ -540,6 +546,7 @@ type Configuration struct {
PolicyRegenerationTimeStatsEnabled bool
PolicyRevisionEnabled bool
PolicyImportErrorsEnabled bool
PolicyChangeTotalEnabled bool
PolicyEndpointStatusEnabled bool
PolicyImplementationDelayEnabled bool
IdentityCountEnabled bool
Expand Down Expand Up @@ -616,6 +623,7 @@ func DefaultMetrics() map[string]struct{} {
Namespace + "_policy_regeneration_time_stats_seconds": {},
Namespace + "_policy_max_revision": {},
Namespace + "_policy_import_errors_total": {},
Namespace + "_policy_change_total": {},
Namespace + "_policy_endpoint_enforcement_status": {},
Namespace + "_policy_implementation_delay": {},
Namespace + "_identity": {},
Expand Down Expand Up @@ -789,6 +797,16 @@ func CreateConfiguration(metricsEnabled []string) (Configuration, []prometheus.C
collectors = append(collectors, PolicyImportErrorsTotal)
c.PolicyImportErrorsEnabled = true

case Namespace + "_policy_change_total":
PolicyChangeTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: Namespace,
Name: "policy_change_total",
Help: "Number of policy changes by outcome",
}, []string{"outcome"})

collectors = append(collectors, PolicyChangeTotal)
c.PolicyChangeTotalEnabled = true

case Namespace + "_policy_endpoint_enforcement_status":
PolicyEndpointStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: Namespace,
Expand Down
4 changes: 3 additions & 1 deletion pkg/policy/api/selector.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,13 @@ func (n EndpointSelector) GetMatch(key string) ([]string, bool) {
func labelSelectorToRequirements(labelSelector *slim_metav1.LabelSelector) *k8sLbls.Requirements {
selector, err := slim_metav1.LabelSelectorAsSelector(labelSelector)
if err != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
log.WithError(err).WithField(logfields.EndpointLabelSelector,
logfields.Repr(labelSelector)).Error("unable to construct selector in label selector")
return nil
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

requirements, selectable := selector.Requirements()
if !selectable {
Expand Down
7 changes: 5 additions & 2 deletions pkg/policy/groups/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ func addDerivativePolicy(ctx context.Context, clientset client.Clientset, cnp *c
}

if derivativeErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(derivativeErr).Error("Cannot create derivative rule. Installing deny-all rule.")
statusErr := updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), derivativeErr, clusterScoped)
if statusErr != nil {
Expand All @@ -237,11 +238,13 @@ func addDerivativePolicy(ctx context.Context, clientset client.Clientset, cnp *c
if err != nil {
statusErr := updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), err, clusterScoped)
if statusErr != nil {
metrics.PolicyImportErrorsTotal.Inc()
metrics.PolicyImportErrorsTotal.Inc() // Deprecated in Cilium 1.14, to be removed in 1.15.
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeFail).Inc()
scopedLog.WithError(err).Error("Cannot update status for derivative policy")
}
return statusErr
}
metrics.PolicyChangeTotal.WithLabelValues(metrics.LabelValueOutcomeSuccess).Inc()

err = updateDerivativeStatus(clientset, cnp, derivativePolicy.GetName(), nil, clusterScoped)
if err != nil {
Expand Down