Skip to content

Commit

Permalink
job-manager: make some replay errors non-fatal
Browse files Browse the repository at this point in the history
Problem: if a few jobs get messed up in the KVS due to an
improper shutdown, recovery is a tedious process involving
starting flux in --recovery mode, fixing one job, and starting
again.

When a job cannot be replayed from the KVS and the reason is
that the directory is incomplete, log the failure at LOG_ERR
level but let replay continue and ultimately the flux restart
be successful.

If a job has more serious problems like incorrect content in
the eventlog, treat that as a fatal error as before.  This
avoids breaking the 'valid' tests that check backwards
compatibility with older kvs dumps, which might use an older
eventlog format.

Update t2219-job-manage-restart.t to expect warnings rather
than failure when such jobs are encountered during replay.

Fixes #5147
  • Loading branch information
garlick committed May 4, 2023
1 parent 6db9928 commit 702474e
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 4 deletions.
34 changes: 30 additions & 4 deletions src/modules/job-manager/restart.c
Expand Up @@ -46,7 +46,10 @@ int restart_count_char (const char *s, char c)
return count;
}

static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
static struct job *lookup_job (flux_t *h,
flux_jobid_t id,
flux_error_t *error,
bool *fatal)
{
flux_future_t *f1 = NULL;
flux_future_t *f2 = NULL;
Expand All @@ -63,24 +66,33 @@ static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
"cannot send lookup requests for job %ju: %s",
(uintmax_t)id,
strerror (errno));
*fatal = true;
goto done;
}
if (flux_kvs_lookup_get (f1, &eventlog) < 0) {
errprintf (error, "lookup %s: %s", k1, strerror (errno));
*fatal = false;
goto done;
}
if (flux_kvs_lookup_get (f2, &jobspec) < 0) {
errprintf (error, "lookup %s: %s", k2, strerror (errno));
*fatal = false;
goto done;
}
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e)))
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e))) {
errprintf (error, "replay %s: %s", k1, e.text);
*fatal = true;
}
done:
flux_future_destroy (f1);
flux_future_destroy (f2);
return job;
}

/* Create a 'struct job' from the KVS, using synchronous KVS RPCs.
* Return 1 on success, 0 on non-fatal error, or -1 on a fatal error,
* where a fatal error will prevent flux from starting.
*/
static int depthfirst_map_one (flux_t *h,
const char *key,
int dirskip,
Expand All @@ -101,8 +113,22 @@ static int depthfirst_map_one (flux_t *h,
errprintf (error, "could not decode %s to job ID", key + dirskip + 1);
return -1;
}
if (!(job = lookup_job (h, id, error)))
return -1;

flux_error_t lookup_error;
bool fatal;
if (!(job = lookup_job (h, id, &lookup_error, &fatal))) {
if (fatal) {
errprintf (error, "%s", lookup_error.text);
return -1;
}
flux_log (h,
LOG_ERR,
"job %ju not replayed: %s",
(uintmax_t)id,
lookup_error.text);
return 0;
}

if (cb (job, arg, error) < 0)
goto done;
rc = 1;
Expand Down
15 changes: 15 additions & 0 deletions t/t2219-job-manager-restart.t
Expand Up @@ -19,6 +19,16 @@ restart_flux() {
flux module stats job-manager
}

# Returns 0 if dump file is replayed successfully AND one or more
# "not replayed" warnings were logged
restart_with_job_warning() {
local out=$(basename $1).dmesg
flux start -o,-Scontent.restore=$1 /bin/true 2>$out
result=$?
cat $out
test $result -eq 0 && grep -q "not replayed:" $out
}

test_expect_success 'verify that job manager can restart with current dump' '
restart_flux dump.tar >stats.out
'
Expand Down Expand Up @@ -193,6 +203,11 @@ for dump in ${DUMPS}/valid/*.tar.bz2; do
test_expect_success 'successfully started from '$testname "restart_flux $dump"
done

for dump in ${DUMPS}/warn/*.tar.bz2; do
testname=$(basename $dump)
test_expect_success 'successfully started with warning from '$testname "restart_with_job_warning $dump"
done

for dump in ${DUMPS}/invalid/*.tar.bz2; do
testname=$(basename $dump)
test_expect_success 'failed on '$testname "test_must_fail restart_flux $dump"
Expand Down

0 comments on commit 702474e

Please sign in to comment.