Skip to content

Commit

Permalink
Merge pull request #5814 from grondo/lost-leader-shell
Browse files Browse the repository at this point in the history
job-exec: raise fatal job exception if rank 0 job shell is lost due to signal
  • Loading branch information
mergify[bot] committed Mar 21, 2024
2 parents 9c32493 + de8f216 commit 1e3635a
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 15 deletions.
11 changes: 11 additions & 0 deletions src/modules/job-exec/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,17 @@ static void exit_cb (struct bulk_exec *exec,
shell_rank,
flux_get_hostbyrank (job->h, rank),
strsignal (signo));
else {
/* Job can't continue without the leader shell, which has
* terminated unexpectedly. Cancel the job now to avoid
* a potential hang.
*/
jobinfo_fatal_error (job,
0,
"shell rank 0 (on %s): %s",
flux_get_hostbyrank (job->h, rank),
strsignal (signo));
}
}
rank = idset_next (ranks, rank);
}
Expand Down
37 changes: 22 additions & 15 deletions t/issues/t2492-shell-lost.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ try:
# Kill job shell on broker rank 3
broker_rank = h.get_rank()
if broker_rank == 3:
if broker_rank == int(sys.argv[1]):
# kill job shell
os.kill(os.getppid(), 9)
sys.exit(0)
Expand All @@ -47,20 +47,27 @@ except KeyboardInterrupt:
sys.exit(0)
EOF
)
# Exercise the lost-shell scenario once per listed shell rank.
#
# For each rank:
#   1. submit a 4-node job (one task per node) running critical.py, passing
#      the rank as its only argument (the helper kills its parent job shell
#      when its broker rank equals that argument),
#   2. wait for the matching "shell rank N (on HOST): Killed" message in
#      the job's output eventlog,
#   3. send SIGINT to the job, and
#   4. require that the job exits with 137 (128 + 9).
#
# Uses log, die and CRITICAL_RANKS, all defined earlier in this script.
for rank in 3 1; do
    log ""
    log "Testing handling of lost shell rank $rank:"

    id=$(flux submit -N4 --tasks-per-node=1 \
        --input=/dev/null \
        -o exit-timeout=none \
        --add-file=critical.py="${CRITICAL_RANKS}" \
        flux python {{tmpdir}}/critical.py "$rank") \
        || die "Failed to submit job for shell rank $rank"

    log "Submitted job $id. Waiting for shell rank $rank to be lost"

    value="shell rank $rank (on $(hostname -s)): Killed"
    flux job wait-event -Wt 15 -Hvp output -m message="$value" "$id" log \
        || die "Did not see lost shell message for shell rank $rank"

    log "Sending SIGINT to $id. Job should now exit"
    flux job kill --signal=2 "$id"

    # A non-zero status is expected from attach; capture it with '||' so
    # the status is recorded whether or not errexit is in effect.
    rc=0
    flux job attach -vEX "$id" || rc=$?
    log "Job exited with rc=$rc (expecting 137 (128+9))"
    test "$rc" -eq 137 || die "Unexpected job exit code $rc"
done

0 comments on commit 1e3635a

Please sign in to comment.