Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance backup ready conditions #619

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install: manifests
kubectl apply -f config/crd/bases

# Deploy controller in the configured Kubernetes cluster in ~/.kube/config
.PHONY: deploy
.PHONY: deploy-via-kustomize
shreyas-s-rao marked this conversation as resolved.
Show resolved Hide resolved
deploy-via-kustomize: manifests $(KUSTOMIZE)
kubectl apply -f config/crd/bases
kustomize build config/default | kubectl apply -f -
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/onsi/ginkgo/v2 v2.6.1
github.com/onsi/gomega v1.24.2
github.com/prometheus/client_golang v1.14.0
github.com/robfig/cron/v3 v3.0.1
go.uber.org/zap v1.24.0
golang.org/x/exp v0.0.0-20230213192124-5e25df0256eb
gopkg.in/yaml.v2 v2.4.0
Expand Down Expand Up @@ -92,7 +93,6 @@ require (
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/spf13/afero v1.8.2 // indirect
Expand Down
2 changes: 1 addition & 1 deletion pkg/health/condition/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func (b *defaultBuilder) Build(replicas int32) []druidv1alpha1.Condition {
if condition.Status == "" {
condition.Status = druidv1alpha1.ConditionUnknown
}
condition.Reason = ConditionNotChecked
condition.Reason = NotChecked
shreyas-s-rao marked this conversation as resolved.
Show resolved Hide resolved
condition.Message = "etcd cluster has been scaled down"
} else {
condition.Status = res.Status()
Expand Down
102 changes: 71 additions & 31 deletions pkg/health/condition/check_backup_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/pkg/utils"
coordinationv1 "k8s.io/api/coordination/v1"

"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,12 +38,12 @@ const (
BackupFailed string = "BackupFailed"
// Unknown is a constant that means that the etcd backup status is currently not known
Unknown string = "Unknown"
// ConditionNotChecked is a constant that means that the etcd backup status has not been updated or rechecked
ConditionNotChecked string = "ConditionNotChecked"
// NotChecked is a constant that means that the etcd backup status has not been updated or rechecked
NotChecked string = "ConditionNotChecked"
shreyas-s-rao marked this conversation as resolved.
Show resolved Hide resolved
)

func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result {
//Default case
// Default case
result := &result{
conType: druidv1alpha1.ConditionTypeBackupReady,
status: druidv1alpha1.ConditionUnknown,
Expand All @@ -56,53 +57,92 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
return nil
}

//Fetch snapshot leases
// Fetch snapshot leases
var (
fullSnapErr, incrSnapErr error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
err error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
)
fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
incrSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)

//Set status to Unknown if errors in fetching snapshot leases or lease never renewed
if fullSnapErr != nil || incrSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) {
err = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
if err != nil {
result.message = fmt.Sprintf("Unable to fetch full snap lease. %s", err.Error())
return result
}

err = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)
if err != nil {
result.message = fmt.Sprintf("Unable to fetch delta snap lease. %s", err.Error())
return result
}

deltaLeaseRenewTime := deltaSnapLease.Spec.RenewTime
fullLeaseRenewTime := fullSnapLease.Spec.RenewTime
fullLeaseCreateTime := &fullSnapLease.ObjectMeta.CreationTimestamp

if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
if fullLeaseRenewTime == nil && deltaLeaseRenewTime == nil {
// Both snapshot leases are not yet renewed
result.message = "Snapshotter has not started yet"
return result

} else if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
// Most probable during reconcile of existing clusters if fresh leases are created
// Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than 24h.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < 24*time.Hour {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue

fullSnapshotDuration, err := utils.ComputeScheduleDuration(*etcd.Spec.Backup.FullSnapshotSchedule)
if err != nil {
return result
}

// Treat backup as succeeded if delta snap lease renewal happens in the required time window
// and full snap lease is not older than the computed full snapshot duration.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
if time.Since(fullLeaseCreateTime.Time) < fullSnapshotDuration {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
} else {
result.status = druidv1alpha1.ConditionFalse
result.reason = BackupFailed
result.message = "Full snapshot backup failed. Full snapshot lease created long ago, but not renewed"
return result
}
}

} else if fullLeaseRenewTime != nil && deltaLeaseRenewTime == nil {
// Most probable during a startup scenario for new clusters
// Special case: return Unknown condition for some time to allow delta backups to start up
// Even though full snapshot may have succeeded by the required time, we must still wait
// for delta snapshotting to begin to consider the backups as healthy, to maintain the given RPO
if time.Since(fullLeaseRenewTime.Time) < 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
result.message = "Waiting for periodic delta snapshotting to begin"
return result
} else {
result.status = druidv1alpha1.ConditionFalse
result.reason = BackupFailed
result.message = "Delta snapshot backup failed. Delta snapshot lease created long ago, but not renewed"
return result
}
} else if deltaLeaseRenewTime == nil && fullLeaseRenewTime != nil {
//Most probable during a startup scenario for new clusters
//Special case. Return Unknown condition for some time to allow delta backups to start up
if time.Since(fullLeaseRenewTime.Time) > 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
result.message = "Periodic delta snapshots not started yet"

} else {
// Both snap leases are renewed, ie, maintained. Both are expected to be renewed periodically
fullSnapshotDuration, err := utils.ComputeScheduleDuration(*etcd.Spec.Backup.FullSnapshotSchedule)
if err != nil {
return result
}
} else if deltaLeaseRenewTime != nil && fullLeaseRenewTime != nil {
//Both snap leases are maintained. Both are expected to be renewed periodically
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < 24*time.Hour {

// Mark backup as succeeded only if at least one of the last two delta snapshots has been taken successfully
// and if the last full snapshot has been taken according to the given full snapshot schedule
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < fullSnapshotDuration {
shreyas-s-rao marked this conversation as resolved.
Show resolved Hide resolved
result.reason = BackupSucceeded
result.message = "Snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
}
}

//Cases where snapshot leases are not updated for a long time
//If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy

// Cases where snapshot leases are not updated for a long time
// If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy
if etcd.Status.Conditions != nil {
var prevBackupReadyStatus druidv1alpha1.Condition
for _, prevBackupReadyStatus = range etcd.Status.Conditions {
Expand All @@ -122,16 +162,16 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
}
}

//Transition to "Unknown" state is we cannot prove a "True" state
// Transition to "Unknown" state is we cannot prove a "True" state
return result
}

func getDeltaSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-delta-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-delta-snap", etcd.ObjectMeta.Name)
}

func getFullSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-full-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-full-snap", etcd.ObjectMeta.Name)
}

// BackupReadyCheck returns a check for the "BackupReady" condition.
Expand Down
104 changes: 96 additions & 8 deletions pkg/health/condition/check_backup_ready_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"

coordinationv1 "k8s.io/api/coordination/v1"
Expand Down Expand Up @@ -56,6 +57,7 @@ var _ = Describe("BackupReadyCheck", func() {
Spec: druidv1alpha1.EtcdSpec{
Replicas: 1,
Backup: druidv1alpha1.BackupSpec{
FullSnapshotSchedule: pointer.String("0 0 * * *"), // at 00:00 every day
DeltaSnapshotPeriod: &v1.Duration{
Duration: deltaSnapshotDuration,
},
Expand Down Expand Up @@ -90,7 +92,7 @@ var _ = Describe("BackupReadyCheck", func() {
})

Context("With no snapshot leases present", func() {
It("Should return Unknown rediness", func() {
It("Should return Unknown readiness", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease, _ ...client.GetOption) error {
return &noLeaseError
Expand All @@ -108,24 +110,35 @@ var _ = Describe("BackupReadyCheck", func() {
})

Context("With both snapshot leases present", func() {
It("Should set status to BackupSucceeded if both leases are recently renewed", func() {
It("Should set status to Unknown if both leases are recently created but not renewed", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
return nil
},
).AnyTimes()

etcd.Status.Conditions = []druidv1alpha1.Condition{
{
Type: druidv1alpha1.ConditionTypeBackupReady,
Status: druidv1alpha1.ConditionTrue,
Message: "True",
},
}

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Snapshotter has not started yet"))
})

It("Should set status to BackupSucceeded if delta snap lease is recently created and empty full snap lease has been created in the last 24h", func() {
It("Should set status to BackupSucceeded if delta snap lease is renewed recently, and empty full snap lease has been created in the last 24h", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
Expand All @@ -149,13 +162,41 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Message()).To(Equal("Delta snapshot backup succeeded"))
})

It("Should set status to Unknown if delta snap lease is renewed recently, and no full snap lease has been created in the last 24h", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
le.ObjectMeta.CreationTimestamp = v1.NewTime(time.Now().Add(-25 * time.Hour))
return nil
},
).AnyTimes()
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Full snapshot backup failed. Full snapshot lease created long ago, but not renewed"))
})

It("Should set status to Unknown if empty delta snap lease is present but full snap lease is renewed recently", func() {
It("Should set status to Unknown if empty delta snap lease is present but full snap lease has been renewed recently", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-5 * deltaSnapshotDuration)}
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-4 * deltaSnapshotDuration)}
return nil
},
).AnyTimes()
Expand All @@ -175,7 +216,52 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Periodic delta snapshots not started yet"))
Expect(result.Message()).To(Equal("Waiting for periodic delta snapshotting to begin"))
})

It("Should set status to Unknown if empty delta snap lease is present but full snap lease has not been renewed recently", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-6 * deltaSnapshotDuration)}
return nil
},
).AnyTimes()
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Delta snapshot backup failed. Delta snapshot lease created long ago, but not renewed"))
})

It("Should set status to BackupSucceeded if both leases are recently renewed", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Message()).To(Equal("Snapshot backup succeeded"))
})

It("Should set status to Unknown if both leases are stale", func() {
Expand Down Expand Up @@ -204,6 +290,7 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Cannot determine etcd backup status"))
})

It("Should set status to BackupFailed if both leases are stale and current condition is Unknown", func() {
Expand Down Expand Up @@ -232,6 +319,7 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Stale snapshot leases. Not renewed in a long time"))
})
})

Expand Down
Loading