Skip to content

Commit

Permalink
Merge pull request #5742 from grondo/issue#5734
Browse files Browse the repository at this point in the history
do not drain ranks when job is canceled during prolog
  • Loading branch information
mergify[bot] committed Feb 15, 2024
2 parents 35c2c10 + 9d20065 commit 04729cc
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
8 changes: 7 additions & 1 deletion src/cmd/flux-perilog-run.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,14 @@ async def run_per_rank(name, jobid, args):
if proc.canceled:
timeout_ids.set(rank)
rc = 128 + signal.SIGTERM
elif rc != 0:
elif rc > 0 and rc <= 128:
# process failed with non-zero exit code. Add this rank to
# the failed set which will be drained.
fail_ids.set(rank)
else:
# process was signaled (returncode < 0) or shell reported it
# was signaled (128+n). Do nothing in this case.
pass
if rc > returncode:
returncode = rc

Expand Down
14 changes: 14 additions & 0 deletions t/t2274-manager-perilog.t
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,20 @@ test_expect_success 'perilog: epilog can be specified without a prolog' '
flux job wait-event -t 15 $jobid epilog-start &&
flux job wait-event -t 15 $jobid epilog-finish
'
# Regression test: canceling a job while its prolog is still running must
# not leave any ranks drained.
#
# Steps: write a perilog config whose prolog command is
# `flux perilog-run prolog -vesleep,30` (presumably a 30 second sleep per
# rank, long enough to cancel mid-prolog -- confirm against perilog-run
# options), reload the config and the perilog.so jobtap plugin, submit a
# job, wait for prolog-start, cancel the job, wait for prolog-finish, then
# check that drained_ranks (a helper defined elsewhere in this test script)
# prints nothing. The bare `flux resource drain` call only displays the
# current drain state in the test output.
test_expect_success 'perilog: canceled prolog does not drain ranks' '
cat <<-EOF >config/perilog.toml &&
[job-manager.prolog]
command = [ "flux", "perilog-run", "prolog", "-vesleep,30" ]
EOF
flux config reload &&
flux jobtap load --remove=*.so perilog.so &&
jobid=$(flux submit hostname) &&
flux job wait-event -t 15 $jobid prolog-start &&
flux cancel $jobid &&
flux job wait-event -vt 15 $jobid prolog-finish &&
flux resource drain &&
test "$(drained_ranks)" = ""
'
# Note: run this job before taking rank 3 offline below
test_expect_success 'perilog: run job across all 4 ranks' '
jobid=$(flux submit --wait-event=clean -N4 -n4 true)
Expand Down

0 comments on commit 04729cc

Please sign in to comment.