Skip to content

Commit

Permalink
Be more conservative with errors of 3rd party components
Browse files Browse the repository at this point in the history
```improvement operator
The handling when evaluating errors from third-party controllers/components is more conservative now. It will wait until a certain grace period is exceed before actually revealing and reporting the error to the `Shoot` resource.
```

Co-Authored-By: Tim Usner <tim.usner@sap.com>
  • Loading branch information
2 people authored and ialidzhikov committed May 27, 2020
1 parent 1f87b44 commit 66c5c28
Show file tree
Hide file tree
Showing 21 changed files with 131 additions and 52 deletions.
11 changes: 6 additions & 5 deletions pkg/apis/core/v1beta1/helper/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,24 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
)

type errorWithCodes struct {
// ErrorWithCodes contains error codes and an error message.
type ErrorWithCodes struct {
message string
codes []gardencorev1beta1.ErrorCode
}

// NewErrorWithCodes creates a new error that additionally exposes the given codes via the Coder interface.
func NewErrorWithCodes(message string, codes ...gardencorev1beta1.ErrorCode) error {
return &errorWithCodes{message, codes}
return &ErrorWithCodes{message, codes}
}

// Codes returns all error codes.
func (e *errorWithCodes) Codes() []gardencorev1beta1.ErrorCode {
func (e *ErrorWithCodes) Codes() []gardencorev1beta1.ErrorCode {
return e.codes
}

// Error returns the error message.
func (e *errorWithCodes) Error() string {
func (e *ErrorWithCodes) Error() string {
return e.message
}

Expand Down Expand Up @@ -72,7 +73,7 @@ func DetermineError(err error, message string) error {
if codes == nil {
return errors.New(errMsg)
}
return &errorWithCodes{errMsg, codes}
return &ErrorWithCodes{errMsg, codes}
}

// DetermineErrorCodes determines error codes based on the given error.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ import (
)

const (
defaultTimeout = 30 * time.Second
defaultInterval = 5 * time.Second
defaultTimeout = 30 * time.Second
defaultInterval = 5 * time.Second
defaultSevereThreshold = 15 * time.Second
)

// Actuator acts upon BackupBucket resources.
Expand Down Expand Up @@ -197,6 +198,7 @@ func (a *actuator) waitUntilBackupBucketExtensionReconciled(ctx context.Context)
"",
a.backupBucket.Name,
defaultInterval,
defaultSevereThreshold,
defaultTimeout,
func(obj runtime.Object) error {
backupBucket, ok := obj.(*extensionsv1alpha1.BackupBucket)
Expand Down
7 changes: 5 additions & 2 deletions pkg/gardenlet/controller/backupentry/backup_entry_actuator.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ import (
)

const (
defaultTimeout = 30 * time.Second
defaultInterval = 5 * time.Second
defaultTimeout = 30 * time.Second
defaultSevereThreshold = 15 * time.Second
defaultInterval = 5 * time.Second
)

// Actuator acts upon BackupEntry resources.
Expand Down Expand Up @@ -147,6 +148,7 @@ func (a *actuator) waitUntilCoreBackupBucketReconciled(ctx context.Context, back
"",
a.backupEntry.Spec.BucketName,
defaultInterval,
defaultSevereThreshold,
defaultTimeout,
func(obj runtime.Object) error {
bb, ok := obj.(*gardencorev1beta1.BackupBucket)
Expand Down Expand Up @@ -244,6 +246,7 @@ func (a *actuator) waitUntilBackupEntryExtensionReconciled(ctx context.Context)
a.backupEntry.Namespace,
a.backupEntry.Name,
defaultInterval,
defaultSevereThreshold,
defaultTimeout,
nil,
)
Expand Down
8 changes: 8 additions & 0 deletions pkg/operation/botanist/botanist.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"sort"
"strings"
"time"

gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
"github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
Expand All @@ -33,6 +34,13 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
// DefaultInterval is the default interval for retry operations.
DefaultInterval = 5 * time.Second
// DefaultSevereThreshold is the default threshold until an error reported by another component is treated as 'severe'.
DefaultSevereThreshold = 30 * time.Second
)

// New takes an operation object <o> and creates a new Botanist object. It checks whether the given Shoot DNS
// domain is covered by a default domain, and if so, it sets the <DefaultDomainSecret> attribute on the Botanist
// object.
Expand Down
4 changes: 0 additions & 4 deletions pkg/operation/botanist/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package botanist

import (
"context"
"time"

gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
Expand Down Expand Up @@ -46,9 +45,6 @@ import (
)

const (
// DefaultInterval is the default interval for retry operations.
DefaultInterval = 5 * time.Second

// Provider is the kubernetes provider label.
Provider = "provider"
// KubernetesProvider is the 'kubernetes' value of the Provider label.
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/containerruntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ func (b *Botanist) WaitUntilContainerRuntimeResourcesReady(ctx context.Context)
b.Shoot.SeedNamespace,
getContainerRuntimeKey(containerRuntime.Type, worker.Name),
DefaultInterval,
DefaultSevereThreshold,
shoot.ExtensionDefaultTimeout,
nil,
)
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/controlplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ func (b *Botanist) waitUntilControlPlaneReady(ctx context.Context, name string)
b.Shoot.SeedNamespace,
name,
DefaultInterval,
DefaultSevereThreshold,
ControlPlaneDefaultTimeout,
func(o runtime.Object) error {
obj, ok := o.(extensionsv1alpha1.Object)
Expand Down
12 changes: 10 additions & 2 deletions pkg/operation/botanist/dns/dnsentry.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,16 @@ func (d *dnsEntry) Wait(ctx context.Context) error {
var (
status string
message string

retryCountUntilSevere int
interval = 5 * time.Second
severeThreshold = 15 * time.Second
timeout = 2 * time.Minute
)

if err := d.waiter.UntilTimeout(ctx, 5*time.Second, 2*time.Minute, func(ctx context.Context) (done bool, err error) {
if err := d.waiter.UntilTimeout(ctx, interval, timeout, func(ctx context.Context) (done bool, err error) {
retryCountUntilSevere++

entry := &dnsv1alpha1.DNSEntry{}
if err := d.client.Get(
ctx,
Expand All @@ -114,8 +121,9 @@ func (d *dnsEntry) Wait(ctx context.Context) error {

d.logger.Infof("Waiting for %q DNS record to be ready... (status=%s, message=%s)", d.values.Name, status, message)
if status == dnsv1alpha1.STATE_ERROR || status == dnsv1alpha1.STATE_INVALID {
return retry.SevereError(entryErr)
return retry.MinorOrSevereError(retryCountUntilSevere, int(severeThreshold.Nanoseconds()/interval.Nanoseconds()), entryErr)
}

return retry.MinorError(entryErr)
}); err != nil {
return gardencorev1beta1helper.DetermineError(err, fmt.Sprintf("Failed to create %q DNS record: %q (status=%s, message=%s)", d.values.Name, err.Error(), status, message))
Expand Down
10 changes: 8 additions & 2 deletions pkg/operation/botanist/dns/dnsprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,14 @@ func (d *dnsProvider) Wait(ctx context.Context) error {
var (
status string
message string

retryCountUntilSevere int
interval = 5 * time.Second
severeThreshold = 15 * time.Second
timeout = 2 * time.Minute
)

if err := d.waiter.UntilTimeout(ctx, 5*time.Second, 2*time.Minute, func(ctx context.Context) (done bool, err error) {
if err := d.waiter.UntilTimeout(ctx, interval, timeout, func(ctx context.Context) (done bool, err error) {
provider := &dnsv1alpha1.DNSProvider{}
if err := d.client.Get(
ctx,
Expand All @@ -121,8 +126,9 @@ func (d *dnsProvider) Wait(ctx context.Context) error {

d.logger.Infof("Waiting for %q DNS provider to be ready... (status=%s, message=%s)", d.values.Name, status, message)
if status == dnsv1alpha1.STATE_ERROR || status == dnsv1alpha1.STATE_INVALID {
return retry.SevereError(providerErr)
return retry.MinorOrSevereError(retryCountUntilSevere, int(severeThreshold.Nanoseconds()/interval.Nanoseconds()), providerErr)
}

return retry.MinorError(providerErr)
}); err != nil {
return gardencorev1beta1helper.DetermineError(err, fmt.Sprintf("Failed to create DNS provider for %q DNS record: %q (status=%s, message=%s)", d.values.Name, err.Error(), status, message))
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ func (b *Botanist) WaitUntilExtensionResourcesReady(ctx context.Context) error {
extension.Namespace,
extension.Name,
DefaultInterval,
DefaultSevereThreshold,
extension.Timeout,
nil,
)
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/infrastructure.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func (b *Botanist) WaitUntilInfrastructureReady(ctx context.Context) error {
b.Shoot.SeedNamespace,
b.Shoot.Info.Name,
DefaultInterval,
DefaultSevereThreshold,
InfrastructureDefaultTimeout,
func(obj runtime.Object) error {
infrastructure, ok := obj.(*extensionsv1alpha1.Infrastructure)
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ func (b *Botanist) WaitUntilNetworkIsReady(ctx context.Context) error {
b.Shoot.SeedNamespace,
b.Shoot.Info.Name,
DefaultInterval,
DefaultSevereThreshold,
NetworkDefaultTimeout,
nil,
)
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/operatingsystemconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ func (b *Botanist) applyAndWaitForShootOperatingSystemConfig(ctx context.Context
b.Shoot.SeedNamespace,
name,
DefaultInterval,
15*time.Second,
30*time.Second,
func(obj runtime.Object) error {
o, ok := obj.(*extensionsv1alpha1.OperatingSystemConfig)
Expand Down
13 changes: 11 additions & 2 deletions pkg/operation/botanist/waiter.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,16 @@ func (b *Botanist) WaitUntilNginxIngressServiceIsReady(ctx context.Context) erro

// WaitUntilEtcdReady waits until the etcd statefulsets indicate readiness in their statuses.
func (b *Botanist) WaitUntilEtcdReady(ctx context.Context) error {
return retry.UntilTimeout(ctx, 5*time.Second, 300*time.Second, func(ctx context.Context) (done bool, err error) {
var (
retryCountUntilSevere int
interval = 5 * time.Second
severeThreshold = 30 * time.Second
timeout = 5 * time.Minute
)

return retry.UntilTimeout(ctx, interval, timeout, func(ctx context.Context) (done bool, err error) {
retryCountUntilSevere++

etcdList := &druidv1alpha1.EtcdList{}
if err := b.K8sSeedClient.Client().List(ctx, etcdList,
client.InNamespace(b.Shoot.SeedNamespace),
Expand All @@ -97,7 +106,7 @@ func (b *Botanist) WaitUntilEtcdReady(ctx context.Context) error {
for _, etcd := range etcdList.Items {
switch {
case etcd.Status.LastError != nil:
return retry.SevereError(fmt.Errorf("%s reconciliation errored: %s", etcd.Name, *etcd.Status.LastError))
return retry.MinorOrSevereError(retryCountUntilSevere, int(severeThreshold.Nanoseconds()/interval.Nanoseconds()), fmt.Errorf("%s reconciliation errored: %s", etcd.Name, *etcd.Status.LastError))
case etcd.DeletionTimestamp != nil:
lastErrors = multierror.Append(lastErrors, fmt.Errorf("%s unexpectedly has a deletion timestamp", etcd.Name))
case etcd.Status.ObservedGeneration == nil || etcd.Generation != *etcd.Status.ObservedGeneration:
Expand Down
1 change: 1 addition & 0 deletions pkg/operation/botanist/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func (b *Botanist) WaitUntilWorkerReady(ctx context.Context) error {
b.Shoot.SeedNamespace,
b.Shoot.Info.Name,
DefaultInterval,
DefaultSevereThreshold,
WorkerDefaultTimeout,
func(obj runtime.Object) error {
worker, ok := obj.(*extensionsv1alpha1.Worker)
Expand Down
21 changes: 17 additions & 4 deletions pkg/operation/common/extensions.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func WaitUntilExtensionCRReady(
namespace string,
name string,
interval time.Duration,
severeThreshold time.Duration,
timeout time.Duration,
postReadyFunc func(runtime.Object) error,
) error {
Expand All @@ -57,6 +58,7 @@ func WaitUntilExtensionCRReady(
namespace,
name,
interval,
severeThreshold,
timeout,
postReadyFunc,
)
Expand All @@ -68,26 +70,37 @@ func WaitUntilObjectReadyWithHealthFunction(
ctx context.Context,
c client.Client,
logger *logrus.Entry,
healthFunc func(obj runtime.Object) (bool, error),
healthFunc func(obj runtime.Object) error,
newObjFunc func() runtime.Object,
kind string,
namespace string,
name string,
interval time.Duration,
severeThreshold time.Duration,
timeout time.Duration,
postReadyFunc func(runtime.Object) error,
) error {
var lastObservedError error
var (
errorWithCode *gardencorev1beta1helper.ErrorWithCodes
lastObservedError error
retryCountUntilSevere int
)

if err := retry.UntilTimeout(ctx, interval, timeout, func(ctx context.Context) (bool, error) {
retryCountUntilSevere++

obj := newObjFunc()
if err := c.Get(ctx, client.ObjectKey{Name: name, Namespace: namespace}, obj); err != nil {
return retry.SevereError(err)
}

if retry, err := healthFunc(obj); err != nil {
if err := healthFunc(obj); err != nil {
lastObservedError = err
logger.WithError(err).Errorf("%s did not get ready yet", extensionKey(kind, namespace, name))
return retry, err
if errors.As(err, &errorWithCode) {
return retry.MinorOrSevereError(retryCountUntilSevere, int(severeThreshold.Nanoseconds()/interval.Nanoseconds()), err)
}
return retry.MinorError(err)
}

if postReadyFunc != nil {
Expand Down
23 changes: 11 additions & 12 deletions pkg/utils/kubernetes/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
gardencorev1beta1helper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
extensionsv1alpha1 "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1"
"github.com/gardener/gardener/pkg/utils"
"github.com/gardener/gardener/pkg/utils/retry"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
Expand Down Expand Up @@ -265,21 +264,21 @@ func CheckSeed(seed *gardencorev1beta1.Seed, identity *gardencorev1beta1.Gardene
}

// CheckExtensionObject checks if an extension Object is healthy or not.
func CheckExtensionObject(o runtime.Object) (bool, error) {
func CheckExtensionObject(o runtime.Object) error {
obj, ok := o.(extensionsv1alpha1.Object)
if !ok {
return retry.SevereError(fmt.Errorf("expected extensionsv1alpha1.Object but got %T", o))
return fmt.Errorf("expected extensionsv1alpha1.Object but got %T", o)
}

status := obj.GetExtensionStatus()
return checkExtensionObject(obj.GetGeneration(), status.GetObservedGeneration(), obj.GetAnnotations(), status.GetLastError(), status.GetLastOperation())
}

// CheckBackupBucket checks if an backup bucket Object is healthy or not.
func CheckBackupBucket(bb runtime.Object) (bool, error) {
func CheckBackupBucket(bb runtime.Object) error {
obj, ok := bb.(*gardencorev1beta1.BackupBucket)
if !ok {
return retry.SevereError(fmt.Errorf("expected gardencorev1beta1.BackupBucket but got %T", bb))
return fmt.Errorf("expected gardencorev1beta1.BackupBucket but got %T", bb)
}
return checkExtensionObject(obj.Generation, obj.Status.ObservedGeneration, obj.Annotations, obj.Status.LastError, obj.Status.LastOperation)
}
Expand All @@ -290,28 +289,28 @@ func CheckBackupBucket(bb runtime.Object) (bool, error) {
// * No gardener.cloud/operation is set
// * No lastError is in the status
// * A last operation is state succeeded is present
func checkExtensionObject(generation int64, observedGeneration int64, annotations map[string]string, lastError *gardencorev1beta1.LastError, lastOperation *gardencorev1beta1.LastOperation) (bool, error) {
func checkExtensionObject(generation int64, observedGeneration int64, annotations map[string]string, lastError *gardencorev1beta1.LastError, lastOperation *gardencorev1beta1.LastOperation) error {
if lastError != nil {
return retry.SevereError(gardencorev1beta1helper.NewErrorWithCodes(fmt.Sprintf("extension encountered error during reconciliation: %s", lastError.Description), lastError.Codes...))
return gardencorev1beta1helper.NewErrorWithCodes(fmt.Sprintf("extension encountered error during reconciliation: %s", lastError.Description), lastError.Codes...)
}

if observedGeneration != generation {
return retry.MinorError(fmt.Errorf("observed generation outdated (%d/%d)", observedGeneration, generation))
return fmt.Errorf("observed generation outdated (%d/%d)", observedGeneration, generation)
}

if op, ok := annotations[v1beta1constants.GardenerOperation]; ok {
return retry.MinorError(fmt.Errorf("gardener operation %q is not yet picked up by controller", op))
return fmt.Errorf("gardener operation %q is not yet picked up by controller", op)
}

if lastOperation == nil {
return retry.MinorError(fmt.Errorf("extension did not record a last operation yet"))
return fmt.Errorf("extension did not record a last operation yet")
}

if lastOperation.State != gardencorev1beta1.LastOperationStateSucceeded {
return retry.MinorError(fmt.Errorf("extension state is not succeeded but %v", lastOperation.State))
return fmt.Errorf("extension state is not succeeded but %v", lastOperation.State)
}

return retry.Ok()
return nil
}

// Now determines the current time.
Expand Down

0 comments on commit 66c5c28

Please sign in to comment.