Skip to content

Commit

Permalink
Merge pull request #5886 from grondo/issue#5861
Browse files Browse the repository at this point in the history
properly report signaled tasks in `flux job wait` and `flux job attach`
  • Loading branch information
mergify[bot] committed Apr 16, 2024
2 parents 64ae355 + 60f4fcb commit ad01b85
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 28 deletions.
14 changes: 4 additions & 10 deletions src/cmd/job/attach.c
Original file line number Diff line number Diff line change
Expand Up @@ -1067,18 +1067,12 @@ void attach_event_continuation (flux_future_t *f, void *arg)
}
else {
if (streq (name, "finish")) {
flux_error_t error;
if (json_unpack (context, "{s:i}", "status", &status) < 0)
log_err_exit ("error decoding finish context");
if (WIFSIGNALED (status)) {
ctx->exit_code = WTERMSIG (status) + 128;
log_msg ("task(s) %s", strsignal (WTERMSIG (status)));
}
else if (WIFEXITED (status)) {
ctx->exit_code = WEXITSTATUS (status);
if (ctx->exit_code != 0)
log_msg ("task(s) exited with exit code %d",
ctx->exit_code);
}
ctx->exit_code = flux_job_waitstatus_to_exitcode (status, &error);
if (ctx->exit_code != 0)
log_msg ("%s", error.text);
}
}

Expand Down
34 changes: 34 additions & 0 deletions src/common/libjob/job.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,40 @@ int flux_job_timeleft (flux_t *h, flux_error_t *errp, double *timeleft)
return rc;
}

/* Translate the wait(2)-style status carried by a job `finish` event into
 * an exit code, and describe any failure in errp.
 *
 * waitstatus - status value taken from the `finish` event context
 * errp       - receives a human readable message; cleared when the
 *              resulting exit code is 0
 *
 * Returns:
 *   128 + signo   when waitstatus satisfies WIFSIGNALED(). This case
 *                 means the job shell itself was signaled, not the tasks,
 *                 so the message is prefixed with "job shell".
 *   exit status   when waitstatus satisfies WIFEXITED(). A status above
 *                 128 is reported as task(s) terminated by signal
 *                 (status - 128), since the job shell returns 128+signo
 *                 in that situation.
 *   -1            with errno set to EINVAL for any other waitstatus.
 */
int flux_job_waitstatus_to_exitcode (int waitstatus, flux_error_t *errp)
{
    int exitcode;

    if (WIFSIGNALED (waitstatus)) {
        int signum = WTERMSIG (waitstatus);

        /* Same 128+signo convention as used for signaled tasks */
        exitcode = signum + 128;
        errprintf (errp, "job shell %s", strsignal (signum));
        return exitcode;
    }

    if (!WIFEXITED (waitstatus)) {
        /* Neither signaled nor exited: nothing sensible to report */
        errprintf (errp, "unexpected wait(2) status %d", waitstatus);
        errno = EINVAL;
        return -1;
    }

    exitcode = WEXITSTATUS (waitstatus);
    if (exitcode <= 0) {
        /* Success: make sure no stale text is left in errp */
        err_init (errp);
    }
    else if (exitcode <= 128) {
        errprintf (errp, "task(s) exited with exit code %d", exitcode);
    }
    else {
        /* exit code > 128: tasks were likely terminated by a signal */
        errprintf (errp, "task(s) %s", strsignal (exitcode - 128));
    }
    return exitcode;
}

/*
* vi:tabstop=4 shiftwidth=4 expandtab
*/
7 changes: 7 additions & 0 deletions src/common/libjob/job.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,13 @@ char *flux_unwrap_string (const char *in,
uint32_t *userid,
flux_error_t *error);


/* Convert the waitstatus from a job `finish` event to an exit code.
 * If the job exited with nonzero status, then place an appropriate error
 * message in errp->text.
 *
 * Returns the exit status if waitstatus indicates a normal exit
 * (errp->text is cleared when that status is 0), or 128 + signal number
 * if waitstatus indicates termination by a signal.
 * If waitstatus is neither of these, returns -1 with errno set to EINVAL
 * and a description of the unexpected status in errp->text.
 */
int flux_job_waitstatus_to_exitcode (int waitstatus, flux_error_t *errp);

#ifdef __cplusplus
}
#endif
Expand Down
31 changes: 31 additions & 0 deletions src/common/libjob/test/job.c
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,35 @@ static void check_job_timeleft (void)
"flux_job_timeleft (h, error, NULL) returns EINVAL");
}

static void check_waitstatus_to_exitcode (void)
{
flux_error_t error;
ok (flux_job_waitstatus_to_exitcode (-1, &error) < 0 && errno == EINVAL,
"flux_job_waitstatus_to_exitcode (-1) returns EINVAL");
is (error.text, "unexpected wait(2) status -1",
"error.text is %s", error.text);
ok (flux_job_waitstatus_to_exitcode (0, &error) == 0,
"flux_job_waitstatus_to_exitcode (0) returns 0");
is (error.text, "",
"error.text is cleared");
ok (flux_job_waitstatus_to_exitcode (9, &error) == 128+9,
"flux_job_waitstatus_to_exitcode (9) == %d", 128+9);
is (error.text, "job shell Killed",
"error.text is %s", error.text);
ok (flux_job_waitstatus_to_exitcode (1<<8, &error) == 1,
"flux_job_waitstatus_to_exitcode (1<<8) = 1");
is (error.text, "task(s) exited with exit code 1",
"error.text is %s", error.text);
ok (flux_job_waitstatus_to_exitcode ((128+15)<< 8, &error) == 128+15,
"flux_job_waitstatus_to_exitcode ((128+15)<<8) = 128+15");
is (error.text, "task(s) Terminated",
"error.text is %s", error.text);
ok (flux_job_waitstatus_to_exitcode ((128+11)<<8, &error) == 128+11,
"flux_job_waitstatus_to_exitcode ((128+11)<<8) = 128+11");
is (error.text, "task(s) Segmentation fault",
"error.text is %s", error.text);
}

int main (int argc, char *argv[])
{
plan (NO_PLAN);
Expand All @@ -495,6 +524,8 @@ int main (int argc, char *argv[])

check_job_timeleft ();

check_waitstatus_to_exitcode ();

done_testing ();
return 0;
}
Expand Down
21 changes: 3 additions & 18 deletions src/modules/job-manager/wait.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,27 +107,12 @@ static int decode_job_result (struct job *job,
*/
else if (streq (name, "finish")) {
int status;

if (json_unpack (context, "{s:i}", "status", &status) < 0)
return -1;
if (WIFSIGNALED (status)) {
errprintf (errp,
"task(s) %s",
strsignal (WTERMSIG (status)));
*success = false;
}
else if (WIFEXITED (status)) {
errprintf (errp,
"task(s) exited with exit code %d",
WEXITSTATUS (status));
*success = WEXITSTATUS (status) == 0 ? true : false;
}
else {
errprintf (errp,
"unexpected wait(2) status %d",
status);
if (flux_job_waitstatus_to_exitcode (status, errp) != 0)
*success = false;
}
else
*success = true;
}
else
return -1;
Expand Down
25 changes: 25 additions & 0 deletions t/t2500-job-attach.t
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,31 @@ test_expect_success 'attach: submit a job and cancel it' '
test_expect_success 'attach: exit code reflects cancellation' '
! flux job attach $(cat jobid2)
'
# A job whose task exits with status 42 must make `flux job attach` fail
# and print the exit code on stderr.
# The job command is double-quoted on purpose: the test body is itself a
# single-quoted string, so a nested single quote would end the body early
# and hand test_expect_success the wrong arguments.
test_expect_success 'attach: reports task exit code with nonzero exit' '
id=$(flux submit sh -c "exit 42") &&
test_must_fail flux job attach $id 2>exited.err &&
test_debug "cat exited.err" &&
grep "exited with exit code 42" exited.err
'
# Submit a `sleep 30` job, send it signal 9 with `flux job kill`, then
# check that `flux job attach` fails (or is itself terminated) and that
# its stderr, saved in killed.err, contains "Killed".
# NOTE(review): --wait-event=exec.shell.start presumably delays the kill
# until the job shell has started; confirm against flux-submit(1).
test_expect_success 'attach: reports Killed when job tasks are killed' '
id=$(flux submit --wait-event=exec.shell.start sleep 30) &&
flux job kill -s 9 $id &&
test_must_fail_or_be_terminated flux job attach $id 2>killed.err &&
test_debug "cat killed.err" &&
grep Killed killed.err
'
# Submit a `sleep 30` job, send it signal 15 with `flux job kill`, then
# check that `flux job attach` fails (or is itself terminated) and that
# its stderr, saved in terminated.err, contains "Terminated".
test_expect_success 'attach: reports Terminated when tasks are terminated' '
id=$(flux submit --wait-event=exec.shell.start sleep 30) &&
flux job kill -s 15 $id &&
test_must_fail_or_be_terminated flux job attach $id 2>terminated.err &&
test_debug "cat terminated.err" &&
grep Terminated terminated.err
'
# The job's task sends signal 9 to its own parent process ($PPID is
# escaped so it expands inside the job, not in this test script).
# NOTE(review): the parent is presumably the job shell, per the test
# title; confirm.
# `flux job attach` must then fail (or be terminated) and its stderr,
# saved in shell-killed.out, must contain "job shell Killed".
test_expect_success 'attach: reports job shell Killed if job shell is killed' '
id=$(flux submit --wait-event=exec.shell.start sh -c "kill -9 \$PPID") &&
test_must_fail_or_be_terminated flux job attach $id 2>shell-killed.out &&
grep "job shell Killed" shell-killed.out
'

# Usage run_attach seq
# Run a 30s job, then attach to it in the background
Expand Down

0 comments on commit ad01b85

Please sign in to comment.