Skip to content

Commit

Permalink
Merge pull request #5910 from grondo/prolog-and-offline
Browse files Browse the repository at this point in the history
job-manager: improve handling of offline ranks in job prolog
  • Loading branch information
mergify[bot] committed Apr 23, 2024
2 parents 63953d3 + a557e58 commit d8b86fd
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 4 deletions.
34 changes: 33 additions & 1 deletion src/cmd/flux-perilog-run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pathlib import Path

import flux
from flux.future import Future
from flux.idset import IDset
from flux.job import JobID
from flux.resource import ResourceSet
Expand Down Expand Up @@ -112,6 +113,19 @@ async def run_with_timeout(cmd, label, timeout=1800.0):
return p


def job_raise(handle, jobid, message):
    """Raise a "prolog" exception of severity 1 on jobid with note message.

    The request is wrapped in a Future and waited on synchronously.
    An OSError is logged as an error and is not propagated to the caller.
    """
    try:
        future = Future(handle.flux_job_raise(jobid, "prolog", 1, message))
        future.get()
    except OSError as exc:
        LOGGER.error(f"Failed to raise exception: {message}: {exc}")


def plural(sequence):
    """Return "s" if sequence holds more than one item, otherwise ""."""
    return "s" if len(sequence) > 1 else ""


async def run_per_rank(name, jobid, args):
"""Run args.exec_per_rank on every rank of jobid
Expand Down Expand Up @@ -141,7 +155,25 @@ async def run_per_rank(name, jobid, args):
offline = offline_ranks(handle) & ranks
if offline:
returncode = 1
LOGGER.info("%s: %s: ranks %s offline. Skipping.", jobid, name, offline)
LOGGER.warning(
"%s: %s: rank%s %s offline. Skipping.",
jobid,
name,
plural(offline),
offline,
)
# If this is a prolog, the job will get a fatal exception after the
# prolog finishes, but the exception will be generic such as:
#
# "prolog exited with exit code=1".
#
# We can't raise a fatal exception here, because that would terminate
# this script and other prologs would not finish. So, raise a
# nonfatal exception to give a hint to the user and admins why the
# prolog failed:
#
if name == "prolog":
job_raise(handle, jobid, f"rank{plural(offline)} {offline} offline")
ranks.subtract(offline)
if args.drain_offline:
drain(handle, offline, f"offline for {jobid} {name}")
Expand Down
11 changes: 11 additions & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,7 @@ check_LTLIBRARIES = \
job-manager/plugins/resource-update-expiration.la \
job-manager/plugins/update-test.la \
job-manager/plugins/project-bank-validate.la \
job-manager/plugins/offline.la \
stats/stats-basic.la \
stats/stats-immediate.la

Expand Down Expand Up @@ -1054,6 +1055,16 @@ job_manager_plugins_project_bank_validate_la_LIBADD = \
$(top_builddir)/src/common/libflux-internal.la \
$(top_builddir)/src/common/libflux-core.la

# Test jobtap plugin built from job-manager/plugins/offline.c.
# All per-target variables must use the canonical name of
# job-manager/plugins/offline.la (job_manager_plugins_offline_la).
job_manager_plugins_offline_la_SOURCES = \
	job-manager/plugins/offline.c
job_manager_plugins_offline_la_CPPFLAGS = \
	$(test_cppflags)
job_manager_plugins_offline_la_LDFLAGS = \
	$(fluxplugin_ldflags) -module -rpath /nowhere
job_manager_plugins_offline_la_LIBADD = \
	$(top_builddir)/src/common/libflux-internal.la \
	$(top_builddir)/src/common/libflux-core.la

hwloc_hwloc_convert_SOURCES = hwloc/hwloc-convert.c
hwloc_hwloc_convert_CPPFLAGS = $(HWLOC_CFLAGS) $(test_cppflags)
hwloc_hwloc_convert_LDADD = $(HWLOC_LIBS) \
Expand Down
51 changes: 51 additions & 0 deletions t/job-manager/plugins/offline.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/************************************************************\
* Copyright 2024 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

/* offline.c - disconnect a broker rank when a job enters RUN state
 */

#include <flux/jobtap.h>

/* Rank to disconnect when a job enters RUN state.
 * Hard-coded to 3; nothing in this file changes it.
 */
static int rank = 3;

/* Handler for "job.state.run": as soon as a job enters RUN state, send an
 * "overlay.disconnect-subtree" request for the configured rank.
 *
 * The request is addressed to rank 0, which assumes that rank 0 is the
 * parent of the rank being disconnected.
 *
 * A failure to send the request or to get its response is logged only;
 * the handler always returns 0.
 */
static int run_cb (flux_plugin_t *p,
                   const char *topic,
                   flux_plugin_arg_t *args,
                   void *arg)
{
    flux_t *h = flux_jobtap_get_flux (p);
    flux_future_t *f = flux_rpc_pack (h,
                                      "overlay.disconnect-subtree",
                                      0,
                                      0,
                                      "{s:i}",
                                      "rank", rank);

    if (f == NULL || flux_rpc_get (f, NULL) < 0) {
        flux_log_error (h, "failed to disconnect rank %d", rank);
    }

    flux_future_destroy (f);
    return 0;
}

/* Plugin entry point: register run_cb as the handler for the
 * "job.state.run" topic and return the result of the registration.
 */
int flux_plugin_init (flux_plugin_t *p)
{
    int rc = flux_plugin_add_handler (p, "job.state.run", run_cb, NULL);

    return rc;
}

// vi:ts=4 sw=4 expandtab
35 changes: 32 additions & 3 deletions t/t2274-manager-perilog.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ command = [
]
EOF

test_under_flux 4 full -o,--config-path=$(pwd)/config --test-exit-mode=leader
test_under_flux 4 full \
-o,--config-path=$(pwd)/config,-Stbon.topo=kary:4 \
--test-exit-mode=leader

OFFLINE_PLUGIN=${FLUX_BUILD_DIR}/t/job-manager/plugins/.libs/offline.so
startctl="flux python ${SHARNESS_TEST_SRCDIR}/scripts/startctl.py"

flux setattr log-stderr-level 1
Expand Down Expand Up @@ -338,9 +341,35 @@ test_expect_success 'perilog: run job across all 4 ranks' '
'
# Note: rank 3 is taken offline after this point for testing handling
# of offline ranks.
test_expect_success 'perilog: disconnect rank 3 for offline tests' '
#
# The rank is taken offline via the test jobtap plugin offline.so, which
# explicitly sets rank 3 offline in RUN state before allowing the job to
# proceed. This is required to simulate a rank going offline between
# the scheduler assigning resources and the prolog starting.
#
# Write a prolog configuration that runs "flux perilog-run prolog -e true",
# then reload the configuration.
test_expect_success 'perilog: create config to run flux-perilog-run' '
cat <<-EOF >config/perilog.toml &&
[job-manager.prolog]
command = [ "flux", "perilog-run", "prolog", "-e", "true" ]
EOF
flux config reload
'
# Load the offline test plugin first, then perilog.so; both loads must
# succeed.
test_expect_success 'perilog: load offline.so before perilog.so' '
flux jobtap load $OFFLINE_PLUGIN &&
flux jobtap load perilog.so
'
# Submit a job across 4 nodes and verify that:
#  - an exception event matching severity=1 is posted,
#  - the job reaches the clean event,
#  - attaching to the job fails.
# The offline.so plugin is removed afterwards.
test_expect_success 'perilog: prolog with offline ranks raises sev 1 exception' '
id=$(flux submit -N4 -n4 true) &&
flux job wait-event -vt 15 -m severity=1 $id exception &&
flux job wait-event -t 15 $id clean &&
test_must_fail flux job attach $id &&
flux jobtap remove offline.so
'
# Check that the flux dmesg output contains the "rank 3 offline" text
# expected from the prolog's offline-rank message.
test_expect_success 'perilog: offline ranks are logged by prolog' '
flux dmesg -HL | grep "rank 3 offline"
'
test_expect_success 'perilog: check that rank 3 is now offline' '
flux resource status &&
flux overlay disconnect 3 &&
test_must_fail $startctl wait 3 &&
test "$(flux resource status -s offline -no {ranks})" = "3"
'
Expand Down

0 comments on commit d8b86fd

Please sign in to comment.