diff --git a/pkg/library/status/check.go b/pkg/library/status/check.go
new file mode 100644
index 000000000..d36dc9d1f
--- /dev/null
+++ b/pkg/library/status/check.go
@@ -0,0 +1,167 @@
+//
+// Copyright (c) 2019-2022 Red Hat, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package status
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/devfile/devworkspace-operator/pkg/common"
+	"github.com/devfile/devworkspace-operator/pkg/config"
+	"github.com/devfile/devworkspace-operator/pkg/infrastructure"
+	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+var containerFailureStateReasons = []string{
+	"CrashLoopBackOff",
+	"ImagePullBackOff",
+	"CreateContainerError",
+	"RunContainerError",
+}
+
+// unrecoverablePodEventReasons contains Kubernetes events that should fail workspace startup
+// if they occur related to a workspace pod. Events are stored as a map with event names as keys
+// and values representing the threshold of how many times we can see an event before it is considered
+// unrecoverable.
+var unrecoverablePodEventReasons = map[string]int32{
+	"FailedPostStartHook":   1,
+	"FailedMount":           3,
+	"FailedScheduling":      1,
+	"FailedCreate":          1,
+	"ReplicaSetCreateError": 1,
+}
+
+var unrecoverableDeploymentConditionReasons = []string{
+	"FailedCreate",
+}
+
+func CheckDeploymentStatus(deployment *appsv1.Deployment) (ready bool) {
+	return deployment.Status.ReadyReplicas > 0
+}
+
+func CheckDeploymentConditions(deployment *appsv1.Deployment) (healthy bool, errorMsg string) {
+	conditions := deployment.Status.Conditions
+	for _, condition := range conditions {
+		for _, unrecoverableReason := range unrecoverableDeploymentConditionReasons {
+			if condition.Reason == unrecoverableReason {
+				return false, fmt.Sprintf("Detected unrecoverable deployment condition: %s %s", condition.Reason, condition.Message)
+			}
+		}
+	}
+	return true, ""
+}
+
+// CheckPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
+// if it has a container with one of the containerFailureStateReasons states, or if an unrecoverable event (with reason
+// matching unrecoverablePodEventReasons) has the pod as the involved object.
+// Returns an optional message describing any detected unrecoverable state,
+// and an error if the check itself fails.
+func CheckPodsState(workspaceID string, namespace string, labelSelector k8sclient.MatchingLabels,
+	clusterAPI sync.ClusterAPI) (stateMsg string, checkFailure error) {
+	podList := &corev1.PodList{}
+	if err := clusterAPI.Client.List(context.TODO(), podList, k8sclient.InNamespace(namespace), labelSelector); err != nil {
+		return "", err
+	}
+
+	for _, pod := range podList.Items {
+		for _, containerStatus := range pod.Status.ContainerStatuses {
+			ok, reason := CheckContainerStatusForFailure(&containerStatus)
+			if !ok {
+				return fmt.Sprintf("Container %s has state %s", containerStatus.Name, reason), nil
+			}
+		}
+		for _, initContainerStatus := range pod.Status.InitContainerStatuses {
+			ok, reason := CheckContainerStatusForFailure(&initContainerStatus)
+			if !ok {
+				return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, reason), nil
+			}
+		}
+		if msg, err := CheckPodEvents(&pod, workspaceID, clusterAPI); err != nil || msg != "" {
+			return msg, err
+		}
+	}
+	return "", nil
+}
+
+func CheckPodEvents(pod *corev1.Pod, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
+	evs := &corev1.EventList{}
+	selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
+	if err != nil {
+		return "", fmt.Errorf("failed to parse field selector: %w", err)
+	}
+	if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
+		return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
+	}
+	for _, ev := range evs.Items {
+		if ev.InvolvedObject.Kind != "Pod" {
+			continue
+		}
+
+		// On OpenShift, it's possible to see "FailedMount" events when using a routingClass that depends on the service-ca
+		// operator. To avoid this, we always ignore FailedMount events if the message refers to the DWO-provisioned volume
+		if infrastructure.IsOpenShift() &&
+			ev.Reason == "FailedMount" &&
+			strings.Contains(ev.Message, common.ServingCertVolumeName(common.ServiceName(workspaceID))) {
+			continue
+		}
+
+		if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
+			if !checkIfUnrecoverableEventIgnored(ev.Reason) && ev.Count >= maxCount {
+				var msg string
+				if ev.Count > 1 {
+					msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s.", ev.Reason, ev.Count, ev.Message)
+				} else {
+					msg = fmt.Sprintf("Detected unrecoverable event %s: %s.", ev.Reason, ev.Message)
+				}
+				return msg, nil
+			}
+		}
+	}
+	return "", nil
+}
+
+func CheckContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool, reason string) {
+	if containerStatus.State.Waiting != nil {
+		for _, failureReason := range containerFailureStateReasons {
+			if containerStatus.State.Waiting.Reason == failureReason {
+				return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason), containerStatus.State.Waiting.Reason
+			}
+		}
+	}
+
+	if containerStatus.State.Terminated != nil {
+		for _, failureReason := range containerFailureStateReasons {
+			if containerStatus.State.Terminated.Reason == failureReason {
+				return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason), containerStatus.State.Terminated.Reason
+			}
+		}
+	}
+	return true, ""
+}
+
+func checkIfUnrecoverableEventIgnored(reason string) (ignored bool) {
+	for _, ignoredReason := range config.Workspace.IgnoredUnrecoverableEvents {
+		if ignoredReason == reason {
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/provision/storage/cleanup.go b/pkg/provision/storage/cleanup.go
index 6aa9f0463..ac81e22be 100644
--- a/pkg/provision/storage/cleanup.go
+++ b/pkg/provision/storage/cleanup.go
@@ -21,6 +21,7 @@ import (
 	"time"
 
 	dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
+	"github.com/devfile/devworkspace-operator/pkg/library/status"
 	nsconfig "github.com/devfile/devworkspace-operator/pkg/provision/config"
 	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
 	batchv1 "k8s.io/api/batch/v1"
@@ -29,6 +30,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 
 	"github.com/devfile/devworkspace-operator/internal/images"
@@ -91,6 +93,21 @@ func runCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.ClusterA
 			}
 		}
 	}
+
+	msg, err := status.CheckPodsState(workspace.Status.DevWorkspaceId, clusterJob.Namespace, k8sclient.MatchingLabels{"job-name": common.PVCCleanupJobName(workspace.Status.DevWorkspaceId)}, clusterAPI)
+	if err != nil {
+		return &ProvisioningError{
+			Err: err,
+		}
+	}
+
+	if msg != "" {
+		errMsg := fmt.Sprintf("DevWorkspace common PVC cleanup job failed: see logs for job %q for details. Additional information: %s", clusterJob.Name, msg)
+		return &ProvisioningError{
+			Message: errMsg,
+		}
+	}
+
 	// Requeue at least each 10 seconds to check if PVC is not removed by someone else
 	return &NotReadyError{
 		Message: "Cleanup job is not in completed state",
@@ -110,7 +127,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
 	}
 
 	jobLabels := map[string]string{
-		constants.DevWorkspaceIDLabel: workspaceId,
+		constants.DevWorkspaceIDLabel:      workspaceId,
+		constants.DevWorkspaceNameLabel:    workspace.Name,
+		constants.DevWorkspaceCreatorLabel: workspace.Labels[constants.DevWorkspaceCreatorLabel],
 	}
 	if restrictedAccess, needsRestrictedAccess := workspace.Annotations[constants.DevWorkspaceRestrictedAccessAnnotation]; needsRestrictedAccess {
 		jobLabels[constants.DevWorkspaceRestrictedAccessAnnotation] = restrictedAccess
@@ -126,6 +145,9 @@ func getSpecCommonPVCCleanupJob(workspace *dw.DevWorkspace, clusterAPI sync.Clus
 		Completions:  &cleanupJobCompletions,
 		BackoffLimit: &cleanupJobBackoffLimit,
 		Template: corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: jobLabels,
+			},
 			Spec: corev1.PodSpec{
 				RestartPolicy:   "Never",
 				SecurityContext: wsprovision.GetDevWorkspaceSecurityContext(),
diff --git a/pkg/provision/workspace/deployment.go b/pkg/provision/workspace/deployment.go
index c5779e3ca..bb3a35a79 100644
--- a/pkg/provision/workspace/deployment.go
+++ b/pkg/provision/workspace/deployment.go
@@ -19,11 +19,10 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"strings"
 
+	"github.com/devfile/devworkspace-operator/pkg/library/status"
 	nsconfig "github.com/devfile/devworkspace-operator/pkg/provision/config"
 	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
-	"k8s.io/apimachinery/pkg/fields"
 
 	dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
 	"github.com/devfile/devworkspace-operator/apis/controller/v1alpha1"
@@ -44,29 +43,6 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 )
 
-var containerFailureStateReasons = []string{
-	"CrashLoopBackOff",
-	"ImagePullBackOff",
-	"CreateContainerError",
-	"RunContainerError",
-}
-
-// unrecoverablePodEventReasons contains Kubernetes events that should fail workspace startup
-// if they occur related to a workspace pod. Events are stored as a map with event names as keys
-// and values representing the threshold of how many times we can see an event before it is considered
-// unrecoverable.
-var unrecoverablePodEventReasons = map[string]int32{
-	"FailedPostStartHook":   1,
-	"FailedMount":           3,
-	"FailedScheduling":      1,
-	"FailedCreate":          1,
-	"ReplicaSetCreateError": 1,
-}
-
-var unrecoverableDeploymentConditionReasons = []string{
-	"FailedCreate",
-}
-
 type DeploymentProvisioningStatus struct {
 	ProvisioningStatus
 }
@@ -121,7 +97,7 @@
 	}
 
 	clusterDeployment := clusterObj.(*appsv1.Deployment)
-	deploymentReady := checkDeploymentStatus(clusterDeployment)
+	deploymentReady := status.CheckDeploymentStatus(clusterDeployment)
 	if deploymentReady {
 		return DeploymentProvisioningStatus{
 			ProvisioningStatus: ProvisioningStatus{
@@ -130,7 +106,7 @@
 		}
 	}
 
-	deploymentHealthy, deploymentErrMsg := checkDeploymentConditions(clusterDeployment)
+	deploymentHealthy, deploymentErrMsg := status.CheckDeploymentConditions(clusterDeployment)
 	if !deploymentHealthy {
 		return DeploymentProvisioningStatus{
 			ProvisioningStatus: ProvisioningStatus{
@@ -140,7 +116,9 @@
 		}
 	}
 
-	failureMsg, checkErr := checkPodsState(workspace, clusterAPI)
+	failureMsg, checkErr := status.CheckPodsState(workspace.Status.DevWorkspaceId, workspace.Namespace, k8sclient.MatchingLabels{
+		constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
+	}, clusterAPI)
 	if checkErr != nil {
 		return DeploymentProvisioningStatus{
 			ProvisioningStatus: ProvisioningStatus{
@@ -201,22 +179,6 @@ func GetDevWorkspaceSecurityContext() *corev1.PodSecurityContext {
 	return config.Workspace.PodSecurityContext
 }
 
-func checkDeploymentStatus(deployment *appsv1.Deployment) (ready bool) {
-	return deployment.Status.ReadyReplicas > 0
-}
-
-func checkDeploymentConditions(deployment *appsv1.Deployment) (healthy bool, errorMsg string) {
-	conditions := deployment.Status.Conditions
-	for _, condition := range conditions {
-		for _, unrecoverableReason := range unrecoverableDeploymentConditionReasons {
-			if condition.Reason == unrecoverableReason {
-				return false, fmt.Sprintf("Detected unrecoverable deployment condition: %s %s", condition.Reason, condition.Message)
-			}
-		}
-	}
-	return true, ""
-}
-
 func getSpecDeployment(
 	workspace *dw.DevWorkspace,
 	podAdditionsList []v1alpha1.PodAdditions,
@@ -337,46 +299,6 @@ func getSpecDeployment(
 	return deployment, nil
 }
 
-func getPods(workspace *dw.DevWorkspace, client runtimeClient.Client) (*corev1.PodList, error) {
-	pods := &corev1.PodList{}
-	if err := client.List(context.TODO(), pods, k8sclient.InNamespace(workspace.Namespace), k8sclient.MatchingLabels{
-		constants.DevWorkspaceIDLabel: workspace.Status.DevWorkspaceId,
-	}); err != nil {
-		return nil, err
-	}
-	return pods, nil
-}
-
-// checkPodsState checks if workspace-related pods are in an unrecoverable state. A pod is considered to be unrecoverable
-// if it has a container with one of the containerStateFailureReasons states, or if an unrecoverable event (with reason
-// matching unrecoverablePodEventReasons) has the pod as the involved object.
-// Returns optional message with detected unrecoverable state details
-// error if any happens during check
-func checkPodsState(workspace *dw.DevWorkspace,
-	clusterAPI sync.ClusterAPI) (stateMsg string, checkFailure error) {
-	podList, err := getPods(workspace, clusterAPI.Client)
-	if err != nil {
-		return "", err
-	}
-
-	for _, pod := range podList.Items {
-		for _, containerStatus := range pod.Status.ContainerStatuses {
-			if !checkContainerStatusForFailure(&containerStatus) {
-				return fmt.Sprintf("Container %s has state %s", containerStatus.Name, containerStatus.State.Waiting.Reason), nil
-			}
-		}
-		for _, initContainerStatus := range pod.Status.InitContainerStatuses {
-			if !checkContainerStatusForFailure(&initContainerStatus) {
-				return fmt.Sprintf("Init Container %s has state %s", initContainerStatus.Name, initContainerStatus.State.Waiting.Reason), nil
-			}
-		}
-		if msg, err := checkPodEvents(&pod, workspace.Status.DevWorkspaceId, clusterAPI); err != nil || msg != "" {
-			return msg, err
-		}
-	}
-	return "", nil
-}
-
 func mergePodAdditions(toMerge []v1alpha1.PodAdditions) (*v1alpha1.PodAdditions, error) {
 	podAdditions := &v1alpha1.PodAdditions{}
 
@@ -476,60 +398,3 @@ func getAdditionalAnnotations(workspace *dw.DevWorkspace) (map[string]string, er
 
 	return annotations, nil
 }
-
-func checkPodEvents(pod *corev1.Pod, workspaceID string, clusterAPI sync.ClusterAPI) (msg string, err error) {
-	evs := &corev1.EventList{}
-	selector, err := fields.ParseSelector(fmt.Sprintf("involvedObject.name=%s", pod.Name))
-	if err != nil {
-		return "", fmt.Errorf("failed to parse field selector: %s", err)
-	}
-	if err := clusterAPI.Client.List(clusterAPI.Ctx, evs, k8sclient.InNamespace(pod.Namespace), k8sclient.MatchingFieldsSelector{Selector: selector}); err != nil {
-		return "", fmt.Errorf("failed to list events in namespace %s: %w", pod.Namespace, err)
-	}
-	for _, ev := range evs.Items {
-		if ev.InvolvedObject.Kind != "Pod" {
-			continue
-		}
-
-		// On OpenShift, it's possible see "FailedMount" events when using a routingClass that depends on the service-ca
-		// operator. To avoid this, we always ignore FailedMount events if the message refers to the DWO-provisioned volume
-		if infrastructure.IsOpenShift() &&
-			ev.Reason == "FailedMount" &&
-			strings.Contains(ev.Message, common.ServingCertVolumeName(common.ServiceName(workspaceID))) {
-			continue
-		}
-
-		if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
-			if !checkIfUnrecoverableEventIgnored(ev.Reason) && ev.Count >= maxCount {
-				var msg string
-				if ev.Count > 1 {
-					msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s", ev.Reason, ev.Count, ev.Message)
-				} else {
-					msg = fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, ev.Message)
-				}
-				return msg, nil
-			}
-		}
-	}
-	return "", nil
-}
-
-func checkContainerStatusForFailure(containerStatus *corev1.ContainerStatus) (ok bool) {
-	if containerStatus.State.Waiting != nil {
-		for _, failureReason := range containerFailureStateReasons {
-			if containerStatus.State.Waiting.Reason == failureReason {
-				return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason)
-			}
-		}
-	}
-	return true
-}
-
-func checkIfUnrecoverableEventIgnored(reason string) (ignored bool) {
-	for _, ignoredReason := range config.Workspace.IgnoredUnrecoverableEvents {
-		if ignoredReason == reason {
-			return true
-		}
-	}
-	return false
-}
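
Reviewer note: below is a minimal, hypothetical sketch (not part of the patch) of how the extracted status helpers are consumed, mirroring the two call sites above. SyncDeploymentToCluster selects pods by constants.DevWorkspaceIDLabel, while runCommonPVCCleanupJob selects by the "job-name" label that Kubernetes automatically applies to a Job's pods; the jobLabels now added to the cleanup pod template additionally keep those pods attributable to their DevWorkspace. The reconcileWorkspacePods wrapper and its error wording are assumptions for illustration.

package example

import (
	"fmt"

	"github.com/devfile/devworkspace-operator/pkg/constants"
	"github.com/devfile/devworkspace-operator/pkg/library/status"
	"github.com/devfile/devworkspace-operator/pkg/provision/sync"
	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// reconcileWorkspacePods is a hypothetical caller: a non-nil error means the
// check itself failed (listing pods or events) and the reconcile should be
// retried, while a non-empty message means an unrecoverable state was detected
// and workspace startup should be failed.
func reconcileWorkspacePods(workspaceID, namespace string, clusterAPI sync.ClusterAPI) error {
	labels := k8sclient.MatchingLabels{constants.DevWorkspaceIDLabel: workspaceID}
	msg, err := status.CheckPodsState(workspaceID, namespace, labels, clusterAPI)
	if err != nil {
		return err
	}
	if msg != "" {
		return fmt.Errorf("workspace %s cannot start: %s", workspaceID, msg)
	}
	return nil
}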
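Reviewer note: the unrecoverable-event decision in CheckPodEvents combines a per-reason threshold from unrecoverablePodEventReasons (for example, FailedMount only becomes fatal on its third occurrence) with the admin-configurable ignore list that checkIfUnrecoverableEventIgnored reads from config.Workspace.IgnoredUnrecoverableEvents. A self-contained restatement of that rule, with plain arguments standing in for the package-level map and the operator config:

package example

// shouldFailStartup restates the rule applied in CheckPodEvents: an event is
// fatal once its reason has a configured threshold, the observed count has
// reached that threshold, and the reason is not on the ignore list.
// thresholds stands in for unrecoverablePodEventReasons and ignoredEvents for
// config.Workspace.IgnoredUnrecoverableEvents.
func shouldFailStartup(reason string, count int32, thresholds map[string]int32, ignoredEvents []string) bool {
	threshold, unrecoverable := thresholds[reason]
	if !unrecoverable || count < threshold {
		return false
	}
	for _, ignored := range ignoredEvents {
		if ignored == reason {
			return false
		}
	}
	return true
}

With the default thresholds, shouldFailStartup("FailedMount", 2, thresholds, nil) is false while shouldFailStartup("FailedMount", 3, thresholds, nil) is true, which gives transient volume-mount races (such as the OpenShift service-ca case special-cased earlier) a chance to resolve before startup is failed.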