This repository has been archived by the owner on Jun 8, 2022. It is now read-only.

Enhance HealthScope #160

Merged
merged 4 commits into from Aug 27, 2020
Changes from 2 commits
285 changes: 198 additions & 87 deletions pkg/controller/v1alpha2/core/scopes/healthscope/healthscope.go
@@ -19,133 +19,244 @@ package healthscope
import (
"context"
"fmt"
"sync"
"time"

apps "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/crossplane/oam-kubernetes-runtime/apis/core/v1alpha2"
corev1alpha2 "github.com/crossplane/oam-kubernetes-runtime/apis/core/v1alpha2"

runtimev1alpha1 "github.com/crossplane/crossplane-runtime/apis/core/v1alpha1"
"github.com/crossplane/crossplane-runtime/pkg/fieldpath"
"github.com/crossplane/crossplane-runtime/pkg/logging"
"github.com/pkg/errors"
)

const (
errNoWorkload = "could not retrieve workload %q"
errNoWorkloadResources = "could not retrieve resources for workload %q"
errResourceNotFound = "could not retrieve resource %q %q %q"
errDeploymentUnavailable = "no ready instance found in %q %q %q"
errFmtUnsupportWorkload = "APIVersion %v Kind %v workload is not supportted by HealthScope"
errHealthCheck = "error occurs in health check"
errUnhealthyChildResource = "unhealthy child resource exists"
errFmtResourceNotReady = "resource not ready, resource status: %+v"

defaultTimeout = 10 * time.Second

kindContainerizedWorkload = "ContainerizedWorkload"
kindDeployment = "Deployment"
kindService = "Service"
kindStatefulSet = "StatefulSet"
kindDaemonSet = "DaemonSet"
)

// UpdateHealthStatus updates the status of the healthscope based on workload resources.
func UpdateHealthStatus(ctx context.Context, log logging.Logger, client client.Client, healthScope *v1alpha2.HealthScope) error {
timeout := defaultTimeout
if healthScope.Spec.ProbeTimeout != nil {
timeout = time.Duration(*healthScope.Spec.ProbeTimeout) * time.Second
}
ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

resourceRefs := []runtimev1alpha1.TypedReference{}
for _, workloadRef := range healthScope.Spec.WorkloadReferences {
// Get workload object.
workloadObject := unstructured.Unstructured{}
workloadObject.SetAPIVersion(workloadRef.APIVersion)
workloadObject.SetKind(workloadRef.Kind)
workloadObjectRef := types.NamespacedName{Namespace: healthScope.GetNamespace(), Name: workloadRef.Name}
if err := client.Get(ctxWithTimeout, workloadObjectRef, &workloadObject); err != nil {
return errors.Wrapf(err, errNoWorkload, workloadRef.Name)
}
var (
// for general check on worload's replicas
generalReplicaCheckFiled = []string{
Collaborator
not all resources have these two fields
"status.availableReplicas", // e.g. kruise.CloneSet
"status.readyReplicas",
}
)

// TODO(artursouza): not every workload has child resources, need to handle those scenarios too.
// TODO(artursouza): change this to use an utility method instead.
if value, err := fieldpath.Pave(workloadObject.UnstructuredContent()).GetValue("status.resources"); err == nil {
refs := value.([]interface{})
for _, item := range refs {
ref := item.(map[string]interface{})
resourceRef := runtimev1alpha1.TypedReference{
APIVersion: fmt.Sprintf("%v", ref["apiVersion"]),
Kind: fmt.Sprintf("%v", ref["kind"]),
Name: fmt.Sprintf("%v", ref["name"]),
}
// HealthCondition holds health status of any resource
type HealthCondition struct {
// Target represents resource being diagnosed
Target runtimev1alpha1.TypedReference `json:"target"`

IsHealthy bool `json:"isHealthy"`

// Diagnosis contains diagnosis info as well as error info
Diagnosis string `json:"diagnosis,omitempty"`

// SubConditions represents health status of its child resources, if exist
SubConditions []*HealthCondition `json:"subConditions,omitempty"`
}

// A WorloadHealthChecker checks health status of specified resource
// and saves status into an HealthCondition object.
type WorloadHealthChecker interface {
Check(context.Context, client.Client, runtimev1alpha1.TypedReference, string) *HealthCondition
}

// WorkloadHealthCheckFn checks health status of specified resource
// and saves status into an HealthCondition object.
type WorkloadHealthCheckFn func(context.Context, client.Client, runtimev1alpha1.TypedReference, string) *HealthCondition

resourceRefs = append(resourceRefs, resourceRef)
// Check the health status of specified resource
func (fn WorkloadHealthCheckFn) Check(ctx context.Context, c client.Client, tr runtimev1alpha1.TypedReference, ns string) *HealthCondition {
return fn(ctx, c, tr, ns)
}
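
WorkloadHealthCheckFn mirrors the http.HandlerFunc adapter pattern: any function with the matching signature can serve as a WorloadHealthChecker. A minimal illustrative sketch, not part of this diff:

// Hypothetical usage: the function-to-interface adapter lets an existing
// check function be passed wherever a WorloadHealthChecker is expected.
var deploymentChecker WorloadHealthChecker = WorkloadHealthCheckFn(CheckDeploymentHealth)

// deploymentChecker.Check(ctx, c, ref, namespace) then simply calls
// CheckDeploymentHealth(ctx, c, ref, namespace).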

// CheckContainerziedWorkloadHealth check health status of ContainerizedWorkload
func CheckContainerziedWorkloadHealth(ctx context.Context, c client.Client, ref runtimev1alpha1.TypedReference, namespace string) *HealthCondition {
if ref.GroupVersionKind() != corev1alpha2.SchemeGroupVersion.WithKind(kindContainerizedWorkload) {
return nil
}
r := &HealthCondition{
IsHealthy: false,
Target: ref,
}
cwObj := corev1alpha2.ContainerizedWorkload{}
cwObj.SetGroupVersionKind(corev1alpha2.SchemeGroupVersion.WithKind(kindContainerizedWorkload))
if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: ref.Name}, &cwObj); err != nil {
r.Diagnosis = errors.Wrap(err, errHealthCheck).Error()
return r
}
r.Target.UID = cwObj.GetUID()

r.SubConditions = []*HealthCondition{}
childRefs := cwObj.Status.Resources

for _, childRef := range childRefs {
switch childRef.Kind {
case kindDeployment:
// reuse Deployment health checker
childCondition := CheckDeploymentHealth(ctx, c, childRef, namespace)
r.SubConditions = append(r.SubConditions, childCondition)
default:
childCondition := &HealthCondition{
Target: childRef,
IsHealthy: true,
}
o := unstructured.Unstructured{}
o.SetAPIVersion(childRef.APIVersion)
o.SetKind(childRef.Kind)
if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: childRef.Name}, &o); err != nil {
// for unspecified resource
// if cannot get it, then check fails
childCondition.IsHealthy = false
childCondition.Diagnosis = errors.Wrap(err, errHealthCheck).Error()
}
} else {
return errors.Wrapf(err, errNoWorkloadResources, workloadRef.Name)
r.SubConditions = append(r.SubConditions, childCondition)
}
}

statusc := resourcesHealthStatus(ctxWithTimeout, log, client, healthScope.Namespace, resourceRefs)
status := true
for r := range statusc {
status = status && r
r.IsHealthy = true
for _, sc := range r.SubConditions {
if !sc.IsHealthy {
r.IsHealthy = false
r.Diagnosis = errUnhealthyChildResource
break
}
}
return r
}

health := "unhealthy"
if status {
health = "healthy"
// CheckDeploymentHealth checks health status of Deployment
func CheckDeploymentHealth(ctx context.Context, client client.Client, ref runtimev1alpha1.TypedReference, namespace string) *HealthCondition {
if ref.GroupVersionKind() != apps.SchemeGroupVersion.WithKind(kindDeployment) {
return nil
}
r := &HealthCondition{
IsHealthy: false,
Target: ref,
}
deployment := apps.Deployment{}
deployment.SetGroupVersionKind(apps.SchemeGroupVersion.WithKind(kindDeployment))
deploymentRef := types.NamespacedName{Namespace: namespace, Name: ref.Name}
if err := client.Get(ctx, deploymentRef, &deployment); err != nil {
r.Diagnosis = errors.Wrap(err, errHealthCheck).Error()
return r
}
r.Target.UID = deployment.GetUID()

healthScope.Status.Health = health
return nil
if deployment.Status.ReadyReplicas == 0 {
Collaborator
I may be missing something here, why is that "ReadyReplicas != 0" means healthy?

Contributor Author
Actually I just stay in line with current logic on health checking. And a more rigorous check is x.Status.ReadyReplicas == x.Spec.Replicas

Member
I think we could fix it in the following PRs
r.Diagnosis = fmt.Sprintf(errFmtResourceNotReady, deployment.Status)
return r
}
r.IsHealthy = true
return r
}
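
The stricter readiness condition suggested in the review above could look roughly like the sketch below. It is not part of this PR and assumes the usual Kubernetes convention that spec.replicas defaults to 1 when unset:

// Hypothetical stricter check per the review discussion: a Deployment is
// considered healthy only when all desired replicas are ready.
func deploymentFullyReady(d *apps.Deployment) bool {
	desired := int32(1) // spec.replicas defaults to 1 when nil
	if d.Spec.Replicas != nil {
		desired = *d.Spec.Replicas
	}
	return d.Status.ReadyReplicas == desired
}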

func resourcesHealthStatus(ctx context.Context, log logging.Logger, client client.Client, namespace string, refs []runtimev1alpha1.TypedReference) <-chan bool {
status := make(chan bool, len(refs))
var wg sync.WaitGroup
wg.Add(len(refs))
for _, ref := range refs {
go func(resourceRef runtimev1alpha1.TypedReference) {
defer wg.Done()
err := resourceHealthStatus(ctx, client, namespace, resourceRef)
status <- (err == nil)
if err != nil {
log.Debug("Unhealthy resource", "resource", resourceRef.Name, "error", err)
}
}(ref)
// CheckStatefulsetHealth checks health status of StatefulSet
func CheckStatefulsetHealth(ctx context.Context, client client.Client, ref runtimev1alpha1.TypedReference, namespace string) *HealthCondition {
if ref.GroupVersionKind() != apps.SchemeGroupVersion.WithKind(kindStatefulSet) {
return nil
}
r := &HealthCondition{
IsHealthy: false,
Target: ref,
}
statefulset := apps.StatefulSet{}
statefulset.APIVersion = ref.APIVersion
statefulset.Kind = ref.Kind
nk := types.NamespacedName{Namespace: namespace, Name: ref.Name}
if err := client.Get(ctx, nk, &statefulset); err != nil {
r.Diagnosis = errors.Wrap(err, errHealthCheck).Error()
return r
}
go func() {
wg.Wait()
close(status)
}()
r.Target.UID = statefulset.GetUID()

return status
if statefulset.Status.ReadyReplicas == 0 {
Collaborator
same here, why is that "ReadyReplicas != 0" means healthy?

r.Diagnosis = fmt.Sprintf(errFmtResourceNotReady, statefulset.Status)
return r
}
r.IsHealthy = true
return r
}

func resourceHealthStatus(ctx context.Context, client client.Client, namespace string, ref runtimev1alpha1.TypedReference) error {
if ref.GroupVersionKind() == apps.SchemeGroupVersion.WithKind("Deployment") {
return deploymentHealthStatus(ctx, client, namespace, ref)
// CheckDaemonsetHealth checks health status of DaemonSet
func CheckDaemonsetHealth(ctx context.Context, client client.Client, ref runtimev1alpha1.TypedReference, namespace string) *HealthCondition {
if ref.GroupVersionKind() != apps.SchemeGroupVersion.WithKind(kindDaemonSet) {
return nil
}
r := &HealthCondition{
IsHealthy: false,
Target: ref,
}
daemonset := apps.DaemonSet{}
daemonset.APIVersion = ref.APIVersion
daemonset.Kind = ref.Kind
nk := types.NamespacedName{Namespace: namespace, Name: ref.Name}
if err := client.Get(ctx, nk, &daemonset); err != nil {
r.Diagnosis = errors.Wrap(err, errHealthCheck).Error()
return r
}
r.Target.UID = daemonset.GetUID()

// TODO(artursouza): add other health checks.
// Generic health check by validating if the resource exists.
object := unstructured.Unstructured{}
object.SetAPIVersion(ref.APIVersion)
object.SetKind(ref.Kind)
objectRef := types.NamespacedName{Namespace: namespace, Name: ref.Name}
err := client.Get(ctx, objectRef, &object)
return err
if daemonset.Status.NumberUnavailable != 0 {
r.Diagnosis = fmt.Sprintf(errFmtResourceNotReady, daemonset.Status)
return r
}
r.IsHealthy = true
return r
}

func deploymentHealthStatus(ctx context.Context, client client.Client, namespace string, ref runtimev1alpha1.TypedReference) error {
deployment := apps.Deployment{}
deployment.APIVersion = ref.APIVersion
deployment.Kind = ref.Kind
deploymentRef := types.NamespacedName{Namespace: namespace, Name: ref.Name}
if err := client.Get(ctx, deploymentRef, &deployment); err != nil {
return errors.Wrapf(err, errResourceNotFound, ref.APIVersion, ref.Kind, ref.Name)
// GeneralHealthChecker checks a list of workload fields
// if any field check passes, it's a healthy workload
func GeneralHealthChecker(ctx context.Context, client client.Client, ref runtimev1alpha1.TypedReference, namespace string) *HealthCondition {
o := unstructured.Unstructured{}
o.SetAPIVersion(ref.APIVersion)
o.SetKind(ref.Kind)

nk := types.NamespacedName{Namespace: namespace, Name: ref.Name}
if err := client.Get(ctx, nk, &o); err != nil {
return &HealthCondition{
IsHealthy: false,
Target: ref,
Diagnosis: errors.Wrap(err, errHealthCheck).Error(),
}
}
ref.UID = o.GetUID()
pavedV := fieldpath.Pave(o.UnstructuredContent())

if deployment.Status.ReadyReplicas == 0 {
return fmt.Errorf(errDeploymentUnavailable, ref.APIVersion, ref.Kind, ref.Name)
for _, fp := range generalReplicaCheckFiled {
if value, err := pavedV.GetNumber(fp); err == nil {
// just check ready/available replica exists
if value != 0 {
Collaborator
just one replica exists doesn't mean it's healthy...

Contributor Author
Yes, it seems such kind of general checking rule doesn't make sense... Maybe I should remove generalReplicaCheck and leave it to HealthCheckTrait to satisfy diverse health checking needs.

return &HealthCondition{
Target: ref,
IsHealthy: true,
}
}
//TODO(roywang) does every workload have status?
Collaborator
not really

status, _ := pavedV.GetValue("status")
return &HealthCondition{
Target: ref,
IsHealthy: false,
Diagnosis: fmt.Sprintf(errFmtResourceNotReady, status),
}

}
}
// no matched general check filed
return nil
}
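
Because each Check* function returns nil for GroupVersionKinds it does not handle, a caller can try the specific checkers first and fall back to the general one. The dispatch sketch below is only illustrative; the function name and ordering are assumptions, not code from this PR:

// Hypothetical dispatch: specific checkers first, the general checker last.
func checkWorkloadHealth(ctx context.Context, c client.Client, ref runtimev1alpha1.TypedReference, ns string) *HealthCondition {
	checkers := []WorloadHealthChecker{
		WorkloadHealthCheckFn(CheckContainerziedWorkloadHealth),
		WorkloadHealthCheckFn(CheckDeploymentHealth),
		WorkloadHealthCheckFn(CheckStatefulsetHealth),
		WorkloadHealthCheckFn(CheckDaemonsetHealth),
		WorkloadHealthCheckFn(GeneralHealthChecker),
	}
	for _, checker := range checkers {
		if cond := checker.Check(ctx, c, ref, ns); cond != nil {
			return cond // first checker that recognizes this kind wins
		}
	}
	return nil // no checker produced a condition
}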