Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Leases for etcd member health checks #214

Merged
merged 8 commits into from
Aug 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions api/v1alpha1/etcd_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,38 +279,38 @@ type Condition struct {
type EtcdMemberConditionStatus string

const (
// EtcdMemeberStatusReady means a etcd member is ready.
EtcdMemeberStatusReady EtcdMemberConditionStatus = "Ready"
// EtcdMemeberStatusNotReady means a etcd member is not ready.
EtcdMemeberStatusNotReady EtcdMemberConditionStatus = "NotReady"
// EtcdMemeberStatusUnknown means the status of an etcd member is unkown.
EtcdMemeberStatusUnknown EtcdMemberConditionStatus = "Unknown"
// EtcdMemberStatusReady means a etcd member is ready.
EtcdMemberStatusReady EtcdMemberConditionStatus = "Ready"
// EtcdMemberStatusNotReady means a etcd member is not ready.
EtcdMemberStatusNotReady EtcdMemberConditionStatus = "NotReady"
// EtcdMemberStatusUnknown means the status of an etcd member is unkown.
EtcdMemberStatusUnknown EtcdMemberConditionStatus = "Unknown"
)

// EtcdRole is the role of an etcd cluster member.
type EtcdRole string

const (
// EtcdRoleLeader describes the etcd role `Leader`.
EtcdRoleLeader EtcdRole = "Leader"
// EtcdRoleMember describes the etcd role `Member`.
EtcdRoleMember EtcdRole = "Member"
// EtcdRoleLearner describes the etcd role `Learner`.
EtcdRoleLearner EtcdRole = "Learner"
)

// EtcdMemberStatus holds information about a etcd cluster membership.
type EtcdMemberStatus struct {
// Name is the name of the etcd member. It is the name of the backing `Pod`.
Name string `json:"name"`
// ID is the ID of the etcd member.
ID string `json:"id"`
// Role is the role in the etcd cluster, either `Member` or `Learner`.
Role EtcdRole `json:"role"`
// +optional
ID *string `json:"id,omitempty"`
// Role is the role in the etcd cluster, either `Leader` or `Member`.
// +optional
Role *EtcdRole `json:"role,omitempty"`
// Status of the condition, one of True, False, Unknown.
Status EtcdMemberConditionStatus `json:"status"`
// The reason for the condition's last transition.
Reason string `json:"reason"`
// LastUpdateTime is the last time this condition was updated.
LastUpdateTime metav1.Time `json:"lastUpdateTime"`
// LastTransitionTime is the last time the condition's status changed.
LastTransitionTime metav1.Time `json:"lastTransitionTime"`
}
Expand Down
11 changes: 10 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

272 changes: 80 additions & 192 deletions config/crd/bases/druid.gardener.cloud_etcds.yaml

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions controllers/config/custodian.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ type EtcdCustodianController struct {
}

type EtcdMemberConfig struct {
// EtcdMemberUnknownThreshold is the duration after which a etcd member's state is considered `Unknown`.
EtcdMemberUnknownThreshold time.Duration
// EtcdMemberUnknownThreshold is the duration after which a etcd member's state is considered `NotReady`.
EtcdMemberNotReadyThreshold time.Duration
}
1 change: 0 additions & 1 deletion controllers/controllers_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ var _ = BeforeSuite(func(done Done) {

custodian := NewEtcdCustodian(mgr, controllersconfig.EtcdCustodianController{
EtcdMember: controllersconfig.EtcdMemberConfig{
EtcdMemberUnknownThreshold: 1 * time.Minute,
EtcdMemberNotReadyThreshold: 1 * time.Minute,
},
})
Expand Down
2 changes: 1 addition & 1 deletion controllers/etcd_custodian_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func (ec *EtcdCustodian) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
}

statusCheck := status.NewChecker(ec.Client, ec.config)
if err := statusCheck.Check(ctx, etcd); err != nil {
if err := statusCheck.Check(ctx, logger, etcd); err != nil {
logger.Error(err, "Error executing status checks")
return ctrl.Result{}, err
}
Expand Down
15 changes: 11 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"github.com/gardener/etcd-druid/controllers"
controllersconfig "github.com/gardener/etcd-druid/controllers/config"

coordinationv1 "k8s.io/api/coordination/v1"
coordinationv1beta1 "k8s.io/api/coordination/v1beta1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
schemev1 "k8s.io/client-go/kubernetes/scheme"
Expand Down Expand Up @@ -55,9 +57,9 @@ func main() {
etcdWorkers int
custodianWorkers int
custodianSyncPeriod time.Duration
disableLeaseCache bool
ignoreOperationAnnotation bool

etcdMemberUnknownThreshold time.Duration
etcdMemberNotReadyThreshold time.Duration

// TODO: migrate default to `leases` in one of the next releases
Expand All @@ -75,8 +77,8 @@ func main() {
"Defaults to 'druid-leader-election'.")
flag.StringVar(&leaderElectionResourceLock, "leader-election-resource-lock", defaultLeaderElectionResourceLock, "Which resource type to use for leader election. "+
"Supported options are 'endpoints', 'configmaps', 'leases', 'endpointsleases' and 'configmapsleases'.")
flag.BoolVar(&disableLeaseCache, "disable-lease-cache", false, "Disable cache for lease.coordination.k8s.io resources.")
flag.BoolVar(&ignoreOperationAnnotation, "ignore-operation-annotation", true, "Ignore the operation annotation or not.")
flag.DurationVar(&etcdMemberUnknownThreshold, "etcd-member-unknown-threshold", 60*time.Second, "Threshold after which an etcd member status is considered unknown if no heartbeat happened.")
flag.DurationVar(&etcdMemberNotReadyThreshold, "etcd-member-notready-threshold", 5*time.Minute, "Threshold after which an etcd member is considered not ready if the status was unknown before.")

flag.Parse()
Expand All @@ -85,8 +87,14 @@ func main() {

ctx := ctrl.SetupSignalHandler()

// TODO this can be removed once we have an improved informer, see https://github.com/gardener/etcd-druid/issues/215
uncachedObjects := controllers.UncachedObjectList
if disableLeaseCache {
uncachedObjects = append(uncachedObjects, &coordinationv1.Lease{}, &coordinationv1beta1.Lease{})
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
ClientDisableCacheFor: controllers.UncachedObjectList,
ClientDisableCacheFor: uncachedObjects,
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
Expand All @@ -111,7 +119,6 @@ func main() {

custodian := controllers.NewEtcdCustodian(mgr, controllersconfig.EtcdCustodianController{
EtcdMember: controllersconfig.EtcdMemberConfig{
EtcdMemberUnknownThreshold: etcdMemberUnknownThreshold,
EtcdMemberNotReadyThreshold: etcdMemberNotReadyThreshold,
},
SyncPeriod: custodianSyncPeriod,
Expand Down
2 changes: 1 addition & 1 deletion pkg/health/condition/check_all_members.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func (a *allMembersReady) Check(status druidv1alpha1.EtcdStatus) Result {
}

for _, member := range status.Members {
if member.Status != druidv1alpha1.EtcdMemeberStatusReady {
if member.Status != druidv1alpha1.EtcdMemberStatusReady {
result.status = druidv1alpha1.ConditionFalse
result.reason = "NotAllMembersReady"
result.message = "At least one member is not ready"
Expand Down
4 changes: 2 additions & 2 deletions pkg/health/condition/check_all_members_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ var _ = Describe("AllMembersReadyCheck", func() {

BeforeEach(func() {
readyMember = druidv1alpha1.EtcdMemberStatus{
Status: druidv1alpha1.EtcdMemeberStatusReady,
Status: druidv1alpha1.EtcdMemberStatusReady,
}
notReadyMember = druidv1alpha1.EtcdMemberStatus{
Status: druidv1alpha1.EtcdMemeberStatusNotReady,
Status: druidv1alpha1.EtcdMemberStatusNotReady,
}
})

Expand Down
15 changes: 13 additions & 2 deletions pkg/health/condition/check_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,27 @@ func (r *readyCheck) Check(status druidv1alpha1.EtcdStatus) Result {
}
}

// TODO: remove this case as soon as leases are completely supported by etcd-backup-restore
if len(status.Members) == 0 {
return &result{
conType: druidv1alpha1.ConditionTypeReady,
status: druidv1alpha1.ConditionUnknown,
reason: "NoMembersInStatus",
message: "Cannot determine readiness since status has no members",
}
}

var (
size = utils.Max(int(*status.ClusterSize), len(status.Members))
quorum = size/2 + 1
readyMembers = 0
)

for _, member := range status.Members {
if member.Status == druidv1alpha1.EtcdMemeberStatusReady {
readyMembers++
if member.Status == druidv1alpha1.EtcdMemberStatusNotReady {
continue
}
readyMembers++
}

if readyMembers < quorum {
Expand Down
26 changes: 23 additions & 3 deletions pkg/health/condition/check_ready_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@ import (

var _ = Describe("ReadyCheck", func() {
Describe("#Check", func() {
var readyMember, notReadyMember druidv1alpha1.EtcdMemberStatus
var readyMember, notReadyMember, unknownMember druidv1alpha1.EtcdMemberStatus

BeforeEach(func() {
readyMember = druidv1alpha1.EtcdMemberStatus{
Status: druidv1alpha1.EtcdMemeberStatusReady,
Status: druidv1alpha1.EtcdMemberStatusReady,
}
notReadyMember = druidv1alpha1.EtcdMemberStatus{
Status: druidv1alpha1.EtcdMemeberStatusNotReady,
Status: druidv1alpha1.EtcdMemberStatusNotReady,
}
unknownMember = druidv1alpha1.EtcdMemberStatus{
Status: druidv1alpha1.EtcdMemberStatusUnknown,
}
})

Expand All @@ -54,6 +57,23 @@ var _ = Describe("ReadyCheck", func() {
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
})

It("should return that the cluster has a quorum (members are partly unknown)", func() {
status := druidv1alpha1.EtcdStatus{
ClusterSize: pointer.Int32Ptr(3),
Members: []druidv1alpha1.EtcdMemberStatus{
readyMember,
unknownMember,
unknownMember,
},
}
check := ReadyCheck()

result := check.Check(status)

Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
})

It("should return that the cluster has a quorum (one member not ready)", func() {
status := druidv1alpha1.EtcdStatus{
ClusterSize: pointer.Int32Ptr(3),
Expand Down
36 changes: 17 additions & 19 deletions pkg/health/etcdmember/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func NewBuilder() Builder {
// WithOldMember sets the old etcd member statuses. It can be used to provide default values.
func (b *defaultBuilder) WithOldMembers(members []druidv1alpha1.EtcdMemberStatus) Builder {
for _, member := range members {
b.old[member.ID] = member
b.old[member.Name] = member
}

return b
Expand All @@ -62,7 +62,7 @@ func (b *defaultBuilder) WithResults(results []Result) Builder {
if res == nil {
continue
}
b.results[res.ID()] = res
b.results[res.Name()] = res
}

return b
Expand All @@ -79,33 +79,31 @@ func (b *defaultBuilder) WithNowFunc(now func() metav1.Time) Builder {
// It merges the existing members with the results added to the builder.
// If OldCondition is provided:
// - Any changes to status set the `LastTransitionTime`
// - `LastUpdateTime` is always set.
func (b *defaultBuilder) Build() []druidv1alpha1.EtcdMemberStatus {
var (
now = b.nowFunc()

members []druidv1alpha1.EtcdMemberStatus
)

for id, res := range b.results {
member, ok := b.old[id]
if !ok {
// Continue if we can't find an existing member because druid is not supposed to add one.
continue
for name, res := range b.results {
memberStatus := druidv1alpha1.EtcdMemberStatus{
ID: res.ID(),
Name: res.Name(),
Role: res.Role(),
Status: res.Status(),
Reason: res.Reason(),
LastTransitionTime: now,
}

member.Status = res.Status()
member.LastTransitionTime = now
member.LastUpdateTime = now
member.Reason = res.Reason()

members = append(members, member)
delete(b.old, id)
}
// Don't reset LastTransitionTime if status didn't change
if oldMemberStatus, ok := b.old[name]; ok {
if oldMemberStatus.Status == res.Status() {
memberStatus.LastTransitionTime = oldMemberStatus.LastTransitionTime
}
}

for _, member := range b.old {
// Add existing members as they were. This needs to be changed when SSA is used.
members = append(members, member)
members = append(members, memberStatus)
}

return members
Expand Down
Loading