Skip to content

Commit bf400bd

Browse files
Merge pull request #17863 from vrothberg/v4.4.1-rhel-backport-fix-17777
[v4.4.1-rhel] fix --health-on-failure=restart in transient unit
2 parents ffc2614 + a2e80c5 commit bf400bd

File tree

3 files changed

+81
-11
lines changed

3 files changed

+81
-11
lines changed

libpod/container_internal.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
228228
}
229229

230230
func (c *Container) shouldRestart() bool {
231+
if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
232+
isUnhealthy, err := c.isUnhealthy()
233+
if err != nil {
234+
logrus.Errorf("Checking if container is unhealthy: %v", err)
235+
} else if isUnhealthy {
236+
return true
237+
}
238+
}
239+
231240
// If we did not get a restart policy match, return false
232241
// Do the same if we're not a policy that restarts.
233242
if !c.state.RestartPolicyMatch ||
@@ -267,6 +276,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
267276
return false, err
268277
}
269278

279+
if c.config.HealthCheckConfig != nil {
280+
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
281+
return false, err
282+
}
283+
}
284+
270285
// Is the container running again?
271286
// If so, we don't have to do anything
272287
if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
@@ -1431,6 +1446,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
14311446
if err := c.stop(timeout); err != nil {
14321447
return err
14331448
}
1449+
14341450
if c.config.HealthCheckConfig != nil {
14351451
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
14361452
logrus.Error(err.Error())

libpod/healthcheck.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,12 @@ func (c *Container) processHealthCheckStatus(status string) error {
184184
}
185185

186186
case define.HealthCheckOnFailureActionRestart:
187-
if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
188-
return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
187+
// We let the cleanup process handle the restart. Otherwise
188+
// the container would be restarted in the context of a
189+
// transient systemd unit which may cause undesired side
190+
// effects.
191+
if err := c.Stop(); err != nil {
192+
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
189193
}
190194

191195
case define.HealthCheckOnFailureActionStop:
@@ -346,6 +350,18 @@ func (c *Container) updateHealthStatus(status string) error {
346350
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
347351
}
348352

353+
// isUnhealthy returns if the current health check status in unhealthy.
354+
func (c *Container) isUnhealthy() (bool, error) {
355+
if !c.HasHealthCheck() {
356+
return false, nil
357+
}
358+
healthCheck, err := c.getHealthCheckLog()
359+
if err != nil {
360+
return false, err
361+
}
362+
return healthCheck.Status == define.HealthCheckUnhealthy, nil
363+
}
364+
349365
// UpdateHealthCheckLog parses the health check results and writes the log
350366
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
351367
c.lock.Lock()

test/system/220-healthcheck.bats

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -139,20 +139,23 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
139139
run_podman 1 healthcheck run $ctr
140140
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
141141

142-
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
143-
if [[ $policy == "restart" ]];then
144-
# Container has been restarted and health check works again
145-
is "$output" "running $policy" "container has been restarted"
146-
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
147-
is "$output" "0" "Failing streak of restarted container should be 0 again"
148-
run_podman healthcheck run $ctr
142+
if [[ $policy == "restart" ]];then
143+
# Make sure the container transitions back to running
144+
run_podman wait --condition=running $ctr
145+
run_podman inspect $ctr --format "{{.RestartCount}}"
146+
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
147+
run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
148+
is "$output" "0" "Failing streak of restarted container should be 0 again"
149+
run_podman healthcheck run $ctr
149150
elif [[ $policy == "none" ]];then
151+
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
150152
# Container is still running and health check still broken
151153
is "$output" "running $policy" "container continued running"
152154
run_podman 1 healthcheck run $ctr
153155
is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
154-
else
155-
# kill and stop yield the container into a non-running state
156+
else
157+
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
158+
# kill and stop yield the container into a non-running state
156159
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
157160
assert "$output" != "running $policy"
158161
# also make sure that it's not stuck in the stopping state
@@ -164,4 +167,39 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
164167
done
165168
}
166169

170+
@test "podman healthcheck --health-on-failure with interval" {
171+
ctr="healthcheck_c"
172+
173+
for policy in stop kill restart ;do
174+
t0=$(date --iso-8601=seconds)
175+
run_podman run -d --name $ctr \
176+
--health-cmd /bin/false \
177+
--health-retries=1 \
178+
--health-on-failure=$policy \
179+
--health-interval=1s \
180+
$IMAGE top
181+
182+
if [[ $policy == "restart" ]];then
183+
# Sleeping for 2 seconds makes the test much faster than using
184+
# podman-wait which would compete with the container getting
185+
# restarted.
186+
sleep 2
187+
# Make sure the container transitions back to running
188+
run_podman wait --condition=running $ctr
189+
run_podman inspect $ctr --format "{{.RestartCount}}"
190+
assert "${#lines[@]}" != 0 "Container has been restarted at least once"
191+
else
192+
# kill and stop yield the container into a non-running state
193+
run_podman wait $ctr
194+
run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
195+
is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
196+
assert "$output" != "running $policy"
197+
# also make sure that it's not stuck in the stopping state
198+
assert "$output" != "stopping $policy"
199+
fi
200+
201+
run_podman rm -f -t0 $ctr
202+
done
203+
}
204+
167205
# vim: filetype=sh

0 commit comments

Comments
 (0)