Merge pull request #17863 from vrothberg/v4.4.1-rhel-backport-fix-17777

openshift-merge-robot · web-flow · commit bf400bd36f70 · 2023-03-21T07:29:21.000-04:00
[v4.4.1-rhel] fix --health-on-failure=restart in transient unit
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
@@ -228,6 +228,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
 }
 
 func (c *Container) shouldRestart() bool {
+	if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
+		isUnhealthy, err := c.isUnhealthy()
+		if err != nil {
+			logrus.Errorf("Checking if container is unhealthy: %v", err)
+		} else if isUnhealthy {
+			return true
+		}
+	}
+
 	// If we did not get a restart policy match, return false
 	// Do the same if we're not a policy that restarts.
 	if !c.state.RestartPolicyMatch ||
@@ -267,6 +276,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
 		return false, err
 	}
 
+	if c.config.HealthCheckConfig != nil {
+		if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
+			return false, err
+		}
+	}
+
 	// Is the container running again?
 	// If so, we don't have to do anything
 	if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
@@ -1431,6 +1446,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
 		if err := c.stop(timeout); err != nil {
 			return err
 		}
+
 		if c.config.HealthCheckConfig != nil {
 			if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
 				logrus.Error(err.Error())
diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go
@@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error {
 		}
 
 	case define.HealthCheckOnFailureActionRestart:
-		if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
-			return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
+		// We let the cleanup process handle the restart.  Otherwise
+		// the container would be restarted in the context of a
+		// transient systemd unit which may cause undesired side
+		// effects.
+		if err := c.Stop(); err != nil {
+			return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
 		}
 
 	case define.HealthCheckOnFailureActionStop:
@@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error {
 	return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
 }
 
+// isUnhealthy returns if the current health check status in unhealthy.
+func (c *Container) isUnhealthy() (bool, error) {
+	if !c.HasHealthCheck() {
+		return false, nil
+	}
+	healthCheck, err := c.getHealthCheckLog()
+	if err != nil {
+		return false, err
+	}
+	return healthCheck.Status == define.HealthCheckUnhealthy, nil
+}
+
 // UpdateHealthCheckLog parses the health check results and writes the log
 func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
 	c.lock.Lock()
diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats
@@ -139,20 +139,23 @@ Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
         run_podman 1 healthcheck run $ctr
         is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
 
-        run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
-	if [[ $policy == "restart" ]];then
-	    # Container has been restarted and health check works again
-            is "$output" "running $policy" "container has been restarted"
-            run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
-            is "$output" "0" "Failing streak of restarted container should be 0 again"
-            run_podman healthcheck run $ctr
+        if [[ $policy == "restart" ]];then
+           # Make sure the container transitions back to running
+           run_podman wait --condition=running $ctr
+           run_podman inspect $ctr --format "{{.RestartCount}}"
+           assert "${#lines[@]}" != 0 "Container has been restarted at least once"
+           run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
+           is "$output" "0" "Failing streak of restarted container should be 0 again"
+           run_podman healthcheck run $ctr
         elif [[ $policy == "none" ]];then
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
             # Container is still running and health check still broken
             is "$output" "running $policy" "container continued running"
             run_podman 1 healthcheck run $ctr
             is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
-	else
-	    # kill and stop yield the container into a non-running state
+        else
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
+            # kill and stop yield the container into a non-running state
             is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
             assert "$output" != "running $policy"
             # also make sure that it's not stuck in the stopping state
@@ -164,4 +167,39 @@ Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
     done
 }
 
+@test "podman healthcheck --health-on-failure with interval" {
+    ctr="healthcheck_c"
+
+    for policy in stop kill restart ;do
+        t0=$(date --iso-8601=seconds)
+        run_podman run -d --name $ctr      \
+               --health-cmd /bin/false     \
+               --health-retries=1          \
+               --health-on-failure=$policy \
+               --health-interval=1s        \
+               $IMAGE top
+
+        if [[ $policy == "restart" ]];then
+            # Sleeping for 2 seconds makes the test much faster than using
+            # podman-wait which would compete with the container getting
+            # restarted.
+            sleep 2
+            # Make sure the container transitions back to running
+            run_podman wait --condition=running $ctr
+            run_podman inspect $ctr --format "{{.RestartCount}}"
+            assert "${#lines[@]}" != 0 "Container has been restarted at least once"
+        else
+            # kill and stop yield the container into a non-running state
+            run_podman wait $ctr
+            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
+            is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
+            assert "$output" != "running $policy"
+            # also make sure that it's not stuck in the stopping state
+            assert "$output" != "stopping $policy"
+        fi
+
+        run_podman rm -f -t0 $ctr
+    done
+}
+
 # vim: filetype=sh