Skip to content

Commit

Permalink
Remediate webhooks affecting lease resource in kube-system namespace …
Browse files Browse the repository at this point in the history
…with timeoutSeconds more than 4 (#7902)

* Consider webhook affecting leases in kube-system namespace problemetic even if FailurePolicy is Ignore and timieoutSeconds is 10

* Remediate webhooks affecting leases in kube-system namespace if timieoutSeconds is greater than 3 seconds

* Address Review

* Enahnce Docs
  • Loading branch information
acumino committed Jun 1, 2023
1 parent 25aec5e commit 5eea9f2
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 12 deletions.
2 changes: 2 additions & 0 deletions docs/usage/shoot_status.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ However, some other resources (some of them cluster-scoped) might still trigger

If one of the above resources triggers the remediator, the preferred solution is to remove that particular resource from your webhook's `rules`. You can also use the `objectSelector` to reduce the scope of webhook's `rules`. However, in special cases where a webhook is absolutely needed for the workload, it is possible to add the `remediation.webhook.shoot.gardener.cloud/exclude=true` label to your webhook so that the remediator ignores it. This label **should not be used to silence an alert**, but rather to confirm that a webhook won't cause problems. Note that all of this is no perfect solution and just done on a best effort basis, and only the owner of the webhook can know whether it indeed is problematic and configured correctly.

In a special case, if a webhook has a rule for `CREATE/UPDATE` lease resources in `kube-system` namespace, its `timeoutSeconds` is updated to 3 seconds. This is required to ensure the proper functioning of the leader election of essential control plane controllers.

You can also find more help from the [Kubernetes documentation](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#best-practices-and-warnings)

**`MaintenancePreconditionsSatisfied`**:
Expand Down
8 changes: 8 additions & 0 deletions pkg/operation/botanist/matchers/matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ var (
v1beta1constants.ShootNoCleanup: "true",
managedresources.LabelKeyOrigin: managedresources.LabelValueGardener,
}

// WebhookConstraintMatchersForLeases contains a list of lease API resources that can break
// leader election of essential control plane controllers.
WebhookConstraintMatchersForLeases = []WebhookConstraintMatcher{
{GVR: coordinationv1.SchemeGroupVersion.WithResource("leases"), NamespaceLabels: kubeSystemNamespaceLabels},
{GVR: coordinationv1beta1.SchemeGroupVersion.WithResource("leases"), NamespaceLabels: kubeSystemNamespaceLabels},
}

// WebhookConstraintMatchers contains a list of all api resources which can break
// the waking up of a cluster.
WebhookConstraintMatchers = []WebhookConstraintMatcher{
Expand Down
27 changes: 23 additions & 4 deletions pkg/operation/care/constraints.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,17 @@ import (
secretsmanager "github.com/gardener/gardener/pkg/utils/secrets/manager"
)

// WebhookMaximumTimeoutSecondsNotProblematic is the maximum timeout in seconds a webhooks on critical resources can
// have in order to not be considered as a problematic webhook by the constraints checks. Any webhook on critical
// resources with a larger timeout is considered to be problematic.
const WebhookMaximumTimeoutSecondsNotProblematic = 15
const (
// WebhookMaximumTimeoutSecondsNotProblematic is the maximum timeout in seconds a webhooks on critical resources can
// have in order to not be considered as a problematic webhook by the constraints checks. Any webhook on critical
// resources with a larger timeout is considered to be problematic.
WebhookMaximumTimeoutSecondsNotProblematic = 15
// WebhookMaximumTimeoutSecondsNotProblematicForLeases is the maximum timeout in seconds a webhooks on lease resources in
// kube-system namespace can have in order to not be considered as a problematic webhook by the constraints checks.
// Any webhook on lease resources in kube-system namespace with a larger timeout can break leader election of essential
// control plane controllers.
WebhookMaximumTimeoutSecondsNotProblematicForLeases = 3
)

func shootHibernatedConstraints(clock clock.Clock, conditions ...gardencorev1beta1.Condition) []gardencorev1beta1.Condition {
hibernationConditions := make([]gardencorev1beta1.Condition, 0, len(conditions))
Expand Down Expand Up @@ -337,6 +344,18 @@ func IsProblematicWebhook(
rules []admissionregistrationv1.RuleWithOperations,
timeoutSeconds *int32,
) bool {
// this is a special case, webhook affecting lease in kube-system namespace can block hibernation
// even if FailurePolicy is set to `Ignore` and timeoutSeconds is 10 seconds.
if timeoutSeconds == nil || (timeoutSeconds != nil && *timeoutSeconds > WebhookMaximumTimeoutSecondsNotProblematicForLeases) {
for _, rule := range rules {
for _, matcher := range matchers.WebhookConstraintMatchersForLeases {
if matcher.Match(rule, objSelector, nsSelector) {
return true
}
}
}
}

if failurePolicy != nil && *failurePolicy != admissionregistrationv1.Fail {
// in admissionregistration.k8s.io/v1 FailurePolicy is defaulted to `Fail`
// see https://github.com/kubernetes/api/blob/release-1.16/admissionregistration/v1/types.go#L195
Expand Down
9 changes: 8 additions & 1 deletion pkg/operation/care/constraints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ var _ = Describe("Constraints", func() {
}

commonTests = func(gvr schema.GroupVersionResource, problematic, notProblematic []TableEntry) {
if gvr.Resource == "leases" || (gvr.Group == "*" && gvr.Resource == "*") {
problematic = append(problematic,
Entry("failurePolicy 'Ignore' and timeoutSeconds ok", webhookTestCase{failurePolicy: &failurePolicyIgnore, timeoutSeconds: &timeoutSecondsNotProblematic}))
} else {
notProblematic = append(notProblematic,
Entry("failurePolicy 'Ignore' and timeoutSeconds ok", webhookTestCase{failurePolicy: &failurePolicyIgnore, timeoutSeconds: &timeoutSecondsNotProblematic}))
}

DescribeTable(fmt.Sprintf("problematic webhook for %s", gvr.String()),
func(testCase webhookTestCase) {
testCase.gvr = gvr
Expand Down Expand Up @@ -184,7 +192,6 @@ var _ = Describe("Constraints", func() {
testCase.gvr = gvr
Expect(IsProblematicWebhook(testCase.build())).To(BeFalse(), "expected webhook not to be problematic")
},
Entry("failurePolicy 'Ignore' and timeoutSeconds ok", webhookTestCase{failurePolicy: &failurePolicyIgnore, timeoutSeconds: &timeoutSecondsNotProblematic}),
Entry("operationType 'DELETE'", webhookTestCase{operationType: &operationDelete}),
notProblematic,
)
Expand Down
38 changes: 36 additions & 2 deletions pkg/operation/care/webhook_remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,17 @@ func (r *WebhookRemediation) Remediate(ctx context.Context) error {
mustPatch bool
patch = client.StrategicMergeFrom(webhookConfig.DeepCopy())
remediations []string
matchers []webhookmatchers.WebhookConstraintMatcher
)

for i, w := range webhookConfig.Webhooks {
remediate := newRemediator(r.log, "ValidatingWebhookConfiguration", webhookConfig.Name, w.Name, &remediations)

if mustRemediateTimeoutSecondsIfLeaseResource(w.Rules, w.ObjectSelector, w.NamespaceSelector, w.TimeoutSeconds) {
mustPatch = true
webhookConfig.Webhooks[i].TimeoutSeconds = remediate.timeoutSecondsToThree()
}

if mustRemediateTimeoutSeconds(w.TimeoutSeconds) {
mustPatch = true
webhookConfig.Webhooks[i].TimeoutSeconds = remediate.timeoutSeconds()
Expand All @@ -96,7 +102,7 @@ func (r *WebhookRemediation) Remediate(ctx context.Context) error {
continue
}

matchers := getMatchingRules(w.Rules, w.ObjectSelector, w.NamespaceSelector)
matchers = getMatchingRules(w.Rules, w.ObjectSelector, w.NamespaceSelector)

if mustRemediateFailurePolicy(matchers) {
mustPatch = true
Expand Down Expand Up @@ -127,11 +133,17 @@ func (r *WebhookRemediation) Remediate(ctx context.Context) error {
mustPatch bool
patch = client.StrategicMergeFrom(webhookConfig.DeepCopy())
remediations []string
matchers []webhookmatchers.WebhookConstraintMatcher
)

for i, w := range webhookConfig.Webhooks {
remediate := newRemediator(r.log, "MutatingWebhookConfiguration", webhookConfig.Name, w.Name, &remediations)

if mustRemediateTimeoutSecondsIfLeaseResource(w.Rules, w.ObjectSelector, w.NamespaceSelector, w.TimeoutSeconds) {
mustPatch = true
webhookConfig.Webhooks[i].TimeoutSeconds = remediate.timeoutSecondsToThree()
}

if mustRemediateTimeoutSeconds(w.TimeoutSeconds) {
mustPatch = true
webhookConfig.Webhooks[i].TimeoutSeconds = remediate.timeoutSeconds()
Expand All @@ -141,7 +153,7 @@ func (r *WebhookRemediation) Remediate(ctx context.Context) error {
continue
}

matchers := getMatchingRules(w.Rules, w.ObjectSelector, w.NamespaceSelector)
matchers = getMatchingRules(w.Rules, w.ObjectSelector, w.NamespaceSelector)

if mustRemediateFailurePolicy(matchers) {
mustPatch = true
Expand Down Expand Up @@ -179,6 +191,22 @@ func getMatchingRules(
return matchers
}

func mustRemediateTimeoutSecondsIfLeaseResource(rules []admissionregistrationv1.RuleWithOperations,
objLabelSelector *metav1.LabelSelector,
namespaceLabelSelector *metav1.LabelSelector,
timeoutSeconds *int32) bool {
for _, rule := range rules {
for _, matcher := range webhookmatchers.WebhookConstraintMatchersForLeases {
if matcher.Match(rule, objLabelSelector, namespaceLabelSelector) &&
(timeoutSeconds == nil || *timeoutSeconds > WebhookMaximumTimeoutSecondsNotProblematicForLeases) {
return true
}
}
}

return false
}

func mustRemediateTimeoutSeconds(timeoutSeconds *int32) bool {
return timeoutSeconds == nil || *timeoutSeconds > WebhookMaximumTimeoutSecondsNotProblematic
}
Expand Down Expand Up @@ -224,6 +252,12 @@ func (r *remediator) timeoutSeconds() *int32 {
return pointer.Int32(timeoutSeconds)
}

func (r *remediator) timeoutSecondsToThree() *int32 {
var timeoutSeconds int32 = WebhookMaximumTimeoutSecondsNotProblematicForLeases
r.reportf("timeoutSeconds", "set to %d", timeoutSeconds)
return pointer.Int32(timeoutSeconds)
}

func (r *remediator) selectors(matchers []webhookmatchers.WebhookConstraintMatcher) (objectSelector, namespaceSelector []metav1.LabelSelectorRequirement) {
for _, matcher := range matchers {
for k, v := range matcher.ObjectLabels {
Expand Down
48 changes: 43 additions & 5 deletions pkg/operation/care/webhook_remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,18 +329,56 @@ var _ = Describe("WebhookRemediation", func() {
})

It("timeoutSeconds when failurePolicy=Ignore", func() {
mutatingWebhookConfiguration.Webhooks = []admissionregistrationv1.MutatingWebhook{{
Name: "some-webhook.example.com",
TimeoutSeconds: pointer.Int32(30),
FailurePolicy: &ignore,
}}
mutatingWebhookConfiguration.Webhooks = []admissionregistrationv1.MutatingWebhook{
{
Name: "some-webhook1.example.com",
TimeoutSeconds: pointer.Int32(30),
FailurePolicy: &ignore,
},
{
Name: "some-webhook2.example.com",
TimeoutSeconds: pointer.Int32(10),
FailurePolicy: &ignore,
Rules: []admissionregistrationv1.RuleWithOperations{
{
Rule: admissionregistrationv1.Rule{
APIGroups: []string{"coordination.k8s.io"},
APIVersions: []string{"v1", "v1beta1"},
Resources: []string{"leases"},
},
Operations: []admissionregistrationv1.OperationType{
"UPDATE",
},
},
},
},
{
Name: "some-webhook3.example.com",
TimeoutSeconds: pointer.Int32(1),
FailurePolicy: &ignore,
Rules: []admissionregistrationv1.RuleWithOperations{
{
Rule: admissionregistrationv1.Rule{
APIGroups: []string{"coordination.k8s.io"},
APIVersions: []string{"v1", "v1beta1"},
Resources: []string{"leases"},
},
Operations: []admissionregistrationv1.OperationType{
"UPDATE",
},
},
},
},
}
Expect(fakeClient.Create(ctx, mutatingWebhookConfiguration)).To(Succeed())

Expect(remediator.Remediate(ctx)).To(Succeed())

Expect(fakeClient.Get(ctx, client.ObjectKeyFromObject(mutatingWebhookConfiguration), mutatingWebhookConfiguration)).To(Succeed())
Expect(mutatingWebhookConfiguration.Annotations).To(HaveKey("gardener.cloud/warning"))
Expect(mutatingWebhookConfiguration.Webhooks[0].TimeoutSeconds).To(Equal(pointer.Int32(15)))
Expect(mutatingWebhookConfiguration.Webhooks[1].TimeoutSeconds).To(Equal(pointer.Int32(3)))
Expect(mutatingWebhookConfiguration.Webhooks[2].TimeoutSeconds).To(Equal(pointer.Int32(1)))
})

It("failurePolicy", func() {
Expand Down

0 comments on commit 5eea9f2

Please sign in to comment.