Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add --taskmap=hostfile:FILE support #5844

Merged
merged 5 commits into from
Mar 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/man1/common/job-other-run.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
the job shell, so this option is not useful unless the total number
of nodes and tasks per node are known at job submission time.

hostfile:FILE
Assign tasks in order to hosts as they appear in FILE. FILE should
have one or more lines each of which contains a host name or RFC
29 Hostlist string. Each host assigned to the job must appear in
the hostfile and be assigned the same number of tasks as the default
taskmap from the shell. If there are fewer hosts in the hostfile than
tasks in the job, then the list of hosts will be reused.

However, shell plugins may provide other task mapping schemes, so
check the current job shell configuration for a full list of supported
taskmap schemes.
Expand Down
14 changes: 7 additions & 7 deletions doc/man1/flux-shell.rst
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,13 @@ topics:
**taskmap.SCHEME**
Called when a taskmap scheme *SCHEME* is requested via the taskmap
shell option or corresponding :option:`flux submit --taskmap` option.
Plugins that want to offer a different taskmap scheme than the defaults of
``block``, ``cyclic``, and ``manual`` can register a ``taskmap.*`` plugin
callback and then users can request this mapping with the appropriate
:option:`flux submit --taskmap=name`` option. The default block taskmap is
passed to the plugin as "taskmap" in the plugin input arguments, and the
plugin should return the new taskmap as a string in the output args. This
callback is called before ``shell.init``.
Plugins that want to offer a different taskmap scheme than the defaults
of ``block``, ``cyclic``, ``hostfile``, and ``manual`` can register a
``taskmap.*`` plugin callback and then users can request this mapping
with the appropriate :option:`flux submit --taskmap=name` option.
The default block taskmap is passed to the plugin as "taskmap" in the
plugin input arguments, and the plugin should return the new taskmap as a
string in the output args. This callback is called before ``shell.init``.

**shell.connect**
Called just after the shell connects to the local Flux broker. (Only
Expand Down
3 changes: 2 additions & 1 deletion src/shell/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ flux_shell_SOURCES = \
doom.c \
exception.c \
rlimit.c \
cyclic.c \
taskmap/cyclic.c \
taskmap/hostfile.c \
signal.c \
files.c \
oom.c \
Expand Down
2 changes: 2 additions & 0 deletions src/shell/builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ extern struct shell_builtin builtin_doom;
extern struct shell_builtin builtin_exception;
extern struct shell_builtin builtin_rlimit;
extern struct shell_builtin builtin_cyclic;
extern struct shell_builtin builtin_hostfile;
extern struct shell_builtin builtin_signal;
extern struct shell_builtin builtin_oom;
extern struct shell_builtin builtin_hwloc;
Expand All @@ -74,6 +75,7 @@ static struct shell_builtin * builtins [] = {
&builtin_exception,
&builtin_rlimit,
&builtin_cyclic,
&builtin_hostfile,
&builtin_signal,
&builtin_oom,
&builtin_hwloc,
Expand Down
1 change: 1 addition & 0 deletions src/shell/output.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,7 @@ void shell_output_destroy (struct shell_output *out)
zhash_destroy (&out->fds);
}
eventlogger_destroy (out->ev);
idset_destroy (out->active_shells);
free (out);
errno = saved_errno;
}
Expand Down
File renamed without changes.
204 changes: 204 additions & 0 deletions src/shell/taskmap/hostfile.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
/************************************************************\
* Copyright 2024 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

/* shell taskmap.hostfile plugin
*
* Read a list of hosts from a file, and assign tasks to hosts in order
* they are listed.
*/
#define FLUX_SHELL_PLUGIN_NAME "taskmap.hostfile"

#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdio.h>
#include <jansson.h>

#include <flux/core.h>
#include <flux/taskmap.h>
#include <flux/hostlist.h>

#include "src/common/libutil/errprintf.h"

#include "builtins.h"


/* Create a taskmap that represents 'ntasks' tasks mapped across a set
 * of hosts in 'nodelist', ordered by hostlist 'hl'.
 *
 * Tasks are assigned one at a time to the hosts of 'hl' in the order
 * they appear. If the end of 'hl' is reached before all tasks have been
 * assigned, iteration restarts at the first host. The index of each
 * host in 'nodelist', as returned by hostlist_find(), is used as the
 * node id given to taskmap_append().
 *
 * Returns the taskmap encoded with TASKMAP_ENCODE_WRAPPED as a string
 * that the caller must free(), or NULL on failure. Every failure path
 * writes a message to 'errp' so callers can always report it.
 */
char *taskmap_hostlist (int ntasks,
                        struct hostlist *nodelist,
                        struct hostlist *hl,
                        flux_error_t *errp)
{
    struct taskmap *map = NULL;
    char *result = NULL;
    const char *host = NULL;

    if (!(map = taskmap_create ())) {
        errprintf (errp, "failed to create taskmap: %s", strerror (errno));
        goto error;
    }

    /* Loop through hostlist hl until all tasks have been assigned to hosts
     */
    while (ntasks > 0) {
        int rank;
        /* host is NULL on the first iteration and again whenever
         * hostlist_next() ran off the end of hl: (re)start from the top.
         * If there is still no host, hl is empty and there is nothing
         * to map tasks onto.
         */
        if (host == NULL && !(host = hostlist_first (hl))) {
            errprintf (errp, "hostlist is empty");
            goto error;
        }
        if ((rank = hostlist_find (nodelist, host)) < 0) {
            errprintf (errp, "host %s not found in job nodelist", host);
            goto error;
        }
        if (taskmap_append (map, rank, 1, 1) < 0) {
            errprintf (errp,
                       "failed to append task to taskmap: %s",
                       strerror (errno));
            goto error;
        }
        host = hostlist_next (hl);
        ntasks--;
    }
    if (!(result = taskmap_encode (map, TASKMAP_ENCODE_WRAPPED)))
        errprintf (errp, "failed to encode taskmap: %s", strerror (errno));
error:
    taskmap_destroy (map);
    return result;
}

/* Read the hosts listed in the file at 'path' into a new hostlist.
 *
 * Each non-empty line is handed to hostlist_append() after its trailing
 * newline (if any) has been removed. Empty lines are skipped.
 *
 * Returns the new hostlist, to be released with hostlist_destroy(), or
 * NULL if the file cannot be opened, the hostlist cannot be created, or
 * a line is rejected by hostlist_append(). All failures are logged.
 */
static struct hostlist *hostlist_from_file (const char *path)
{
    ssize_t n;
    size_t size = 0;
    struct hostlist *hl = NULL;
    FILE *fp = NULL;
    char *line = NULL;

    if (!(fp = fopen (path, "r"))) {
        shell_log_errno ("failed to open hostfile: %s", path);
        goto out;
    }
    if (!(hl = hostlist_create ())) {
        shell_log_errno ("failed to create hostlist");
        goto out;
    }
    while ((n = getline (&line, &size, fp)) != -1) {
        /* Use the byte count from getline() rather than strlen(): a line
         * that begins with a NUL byte has strlen() == 0, and indexing
         * line[len-1] would then read before the start of the buffer.
         */
        if (n > 0 && line[n - 1] == '\n')
            line[n - 1] = '\0';
        if (line[0] == '\0')
            continue;
        if (hostlist_append (hl, line) < 0) {
            /* Do not hand back a partial hostlist: a bad line means the
             * hostfile as a whole is unusable.
             */
            shell_log_errno ("hostlist_append: %s", line);
            hostlist_destroy (hl);
            hl = NULL;
            goto out;
        }
    }
out:
    if (fp)
        fclose (fp);
    free (line);
    return hl;
}

/* Build a hostlist from the "nodelist" array found under
 * R.execution in the shell info object.
 *
 * Returns a new hostlist on success. Returns NULL if the nodelist
 * cannot be unpacked, the hostlist cannot be created, an array entry
 * is not a string, or an entry cannot be appended.
 */
static struct hostlist *hostlist_from_R (flux_shell_t *shell)
{
    struct hostlist *hosts;
    json_t *nodes;
    json_t *entry;
    size_t idx;

    if (flux_shell_info_unpack (shell,
                                "{s:{s:{s:o}}}",
                                "R",
                                 "execution",
                                  "nodelist", &nodes) < 0) {
        shell_log_errno ("unable to get job nodelist");
        return NULL;
    }

    hosts = hostlist_create ();
    if (hosts == NULL) {
        shell_log_errno ("hostlist_create");
        return NULL;
    }

    json_array_foreach (nodes, idx, entry) {
        const char *name = json_string_value (entry);

        /* Non-string entry: give up without logging */
        if (name == NULL) {
            hostlist_destroy (hosts);
            return NULL;
        }
        if (hostlist_append (hosts, name) < 0) {
            shell_log_errno ("hostlist_append %s", name);
            hostlist_destroy (hosts);
            return NULL;
        }
    }
    return hosts;
}

/* Handler for the "taskmap.hostfile" plugin topic.
 *
 * Reads the hostfile path from the optional "value" input argument,
 * builds one hostlist from that file and one from the job's R, maps the
 * job's tasks across the hostfile hosts with taskmap_hostlist(), and
 * returns the encoded result as "taskmap" in the plugin output args.
 *
 * Returns 0 on success, -1 on any failure (after logging the reason).
 */
static int map_hostfile (flux_plugin_t *p,
                         const char *topic,
                         flux_plugin_arg_t *args,
                         void *data)
{
    flux_shell_t *shell;
    int rc = -1;
    const char *value = NULL;
    struct hostlist *hl = NULL;
    struct hostlist *nodelist = NULL;
    char *map = NULL;
    int ntasks;
    /* Zeroed so error.text is a valid (empty) string even if a callee
     * fails without filling it in.
     */
    flux_error_t error = { 0 };

    if (!(shell = flux_plugin_get_shell (p)))
        return -1;

    if (flux_plugin_arg_unpack (args,
                                FLUX_PLUGIN_ARG_IN,
                                "{s?s}",
                                "value", &value) < 0) {
        shell_log_error ("unpack: %s", flux_plugin_arg_strerror (args));
        return -1;
    }

    /* "value" is optional in the unpack format, so it is still NULL
     * when no FILE was supplied. Reject that here instead of passing
     * NULL to fopen() and to "%s" format arguments below.
     */
    if (!value) {
        shell_log_error ("hostfile taskmap requires a file argument"
                         " (hostfile:FILE)");
        return -1;
    }

    if (!(hl = hostlist_from_file (value))
        || !(nodelist = hostlist_from_R (shell))) {
        shell_log_error ("failed to get hostlists from file and R");
        goto out;
    }
    if ((ntasks = taskmap_total_ntasks (flux_shell_get_taskmap (shell))) < 0) {
        shell_log_error ("failed to get ntasks from current shell taskmap");
        goto out;
    }

    if (!(map = taskmap_hostlist (ntasks, nodelist, hl, &error))) {
        shell_log_error ("failed to map tasks with hostfile:%s: %s",
                         value,
                         error.text);
        goto out;
    }
    if (flux_plugin_arg_pack (args,
                              FLUX_PLUGIN_ARG_OUT,
                              "{s:s}",
                              "taskmap", map) < 0) {
        shell_log_error ("failed to set new taskmap in plugin output args");
        goto out;
    }
    rc = 0;
out:
    free (map);
    hostlist_destroy (hl);
    hostlist_destroy (nodelist);
    return rc;
}

/* Register map_hostfile() as the handler for the "taskmap.hostfile"
 * topic and return the result of flux_plugin_add_handler().
 */
static int plugin_init (flux_plugin_t *p)
{
    const char *handler_topic = "taskmap.hostfile";

    return flux_plugin_add_handler (p, handler_topic, map_hostfile, NULL);
}

/* Builtin descriptor for the hostfile taskmap plugin. It is declared
 * extern and listed in the builtins[] table in src/shell/builtins.c;
 * plugin_init() registers the "taskmap.hostfile" handler.
 */
struct shell_builtin builtin_hostfile = {
.name = FLUX_SHELL_PLUGIN_NAME,
.plugin_init = plugin_init,
};

/*
* vi:tabstop=4 shiftwidth=4 expandtab
*/
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ TESTSCRIPTS = \
t2614-job-shell-doom.t \
t2615-job-shell-rlimit.t \
t2616-job-shell-taskmap.t \
t2616-job-shell-taskmap-hostfile.t \
t2617-job-shell-stage-in.t \
t2618-job-shell-signal.t \
t2619-job-shell-hwloc.t \
Expand Down
107 changes: 107 additions & 0 deletions t/t2616-job-shell-taskmap-hostfile.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/sh
#
test_description='Test hostfile taskmap plugin support'

. `dirname $0`/sharness.sh

# Use "system" personality to get fake hostnames for hostfile use
test_under_flux 4 system

# Verify that the task placement a job actually produced matches the
# taskmap recorded for it.
#
# $1 - job id. The job's output is expected to be lines of the form
#      `$FLUX_TASK_RANK: $(flux getattr rank)` (as printed by map.sh).
#
# Writes <jobid>.output (sorted job output) and <jobid>.expected
# (multiline taskmap) in the current directory and compares them.
test_check_taskmap() {
	local jobid=$1
	flux job attach $jobid | sort -n >$jobid.output &&
	flux job taskmap --to=multiline $jobid >$jobid.expected &&
	test_cmp $jobid.expected $jobid.output
}

# Create map.sh, the job script used by the mapping tests below. Each
# task prints "<FLUX_TASK_RANK>: <broker rank>", the format that
# test_check_taskmap compares against. The \$ escapes keep the
# variable and command substitution from being expanded while the
# heredoc is written, so they are evaluated when the job runs.
test_expect_success 'create script for testing task mapping' '
cat <<-EOF >map.sh &&
#!/bin/sh
echo \$FLUX_TASK_RANK: \$(flux getattr rank)
EOF
chmod +x map.sh
'
# Basic case: 4 tasks on 4 nodes with the hosts listed in reverse order.
# Each block of the expected map is [nodeid, nnodes, ppn, repeat] (Flux
# RFC 34 taskmap encoding), so tasks 0..3 land on nodes 3,2,1,0.
# NOTE(review): assumes the "system" personality names broker ranks
# 0..3 fake0..fake3 -- confirm against test_under_flux.
test_expect_success 'taskmap=hostfile works' '
cat <<-EOF >h1 &&
fake3
fake2
fake1
fake0
EOF
expected="[[3,1,1,1],[2,1,1,1],[1,1,1,1],[0,1,1,1]]" &&
id=$(flux submit --taskmap=hostfile:h1 -N4 -n4 ./map.sh) &&
flux job attach -vEX $id &&
flux job wait-event -p exec -f json $id shell.start &&
flux job wait-event -p exec -f json $id shell.start \
| jq -e ".context.taskmap.map == $expected" &&
test_check_taskmap $id
'
# 8 tasks on 4 nodes, each host listed twice in a row. Consecutive
# tasks on the same node are expected to coalesce into one block with
# ppn=2, e.g. [3,1,2,1] is two tasks on node 3.
test_expect_success 'taskmap=hostfile works with multiple tasks per node' '
cat <<-EOF >h2 &&
fake3
fake3
fake2
fake2
fake1
fake1
fake0
fake0
EOF
expected="[[3,1,2,1],[2,1,2,1],[1,1,2,1],[0,1,2,1]]" &&
id=$(flux submit --taskmap=hostfile:h2 -N4 --tasks-per-node=2 ./map.sh) &&
flux job attach -vEX $id &&
flux job wait-event -p exec -f json $id shell.start &&
flux job wait-event -p exec -f json $id shell.start \
| jq -e ".context.taskmap.map == $expected" &&
test_check_taskmap $id
'
# 8 tasks but only 4 hosts in the hostfile: the host list is reused
# from the top once exhausted, so tasks 4..7 repeat the node order
# 3,2,1,0 of tasks 0..3.
test_expect_success 'taskmap=hostfile reuses hosts in short hostlist' '
cat <<-EOF >h3 &&
fake3
fake2
fake1
fake0
EOF
expected="[[3,1,1,1],[2,1,1,1],[1,1,1,1],[0,1,1,1],[3,1,1,1],[2,1,1,1],[1,1,1,1],[0,1,1,1]]" &&
id=$(flux submit --taskmap=hostfile:h3 -N4 --tasks-per-node=2 ./map.sh) &&
flux job attach -vEX $id &&
flux job wait-event -p exec -f json $id shell.start &&
flux job wait-event -p exec -f json $id shell.start \
| jq -e ".context.taskmap.map == $expected" &&
test_check_taskmap $id
'
# Hostfile lines may be hostlist strings rather than single hostnames.
# The two lines expand to fake1,fake2,fake3,fake0; with 8 tasks the
# list is reused, giving node order 1,2,3,0,1,2,3,0. That sequence is
# expected to encode as [1,3,1,1] (nodes 1-3), [0,4,1,1] (nodes 0-3)
# and [0,1,1,1] (node 0).
test_expect_success 'taskmap=hostfile works with hostlists' '
cat <<-EOF >h4 &&
fake[1,2]
fake[3,0]
EOF
expected="[[1,3,1,1],[0,4,1,1],[0,1,1,1]]" &&
id=$(flux submit --taskmap=hostfile:h4 -N4 --tasks-per-node=2 ./map.sh) &&
flux job attach -vEX $id &&
flux job wait-event -p exec -f json $id shell.start &&
flux job wait-event -p exec -f json $id shell.start \
| jq -e ".context.taskmap.map == $expected" &&
test_check_taskmap $id
'
# Malformed hostlist string (missing closing bracket): the job must
# fail rather than run with a bogus mapping.
test_expect_success 'taskmap=hostfile fails with invalid hostlist' '
echo "fake[0-3">h5 &&
test_must_fail_or_be_terminated \
flux run --taskmap=hostfile:h5 -N4 hostname
'
# Well-formed hostlist whose hosts (foo0..foo3) are not the fake* hosts
# used elsewhere in this script, so none of them is part of the job:
# the job must fail.
test_expect_success 'taskmap=hostfile fails with incorrect hosts' '
echo "foo[0-3]">h6 &&
test_must_fail_or_be_terminated \
flux run --taskmap=hostfile:h6 -N4 hostname
'
# Hostfile names only three of the four job hosts (fake0 twice, fake3
# never), so the resulting map does not give every node its share of
# tasks and the job must fail. The hosts must be real job hosts (fake*)
# here: with unknown hosts the job would instead fail on "host not
# found", which is already covered by the "incorrect hosts" test.
test_expect_success 'taskmap=hostfile fails when not all hosts present' '
	echo "fake[0,0,1,2]">h7 &&
	test_must_fail_or_be_terminated \
		flux run --taskmap=hostfile:h7 -N4 hostname
'
# Hostfile path that was never created by this script: the job must
# fail.
test_expect_success 'taskmap=hostfile fails with invalid filename' '
test_must_fail_or_be_terminated \
flux run --taskmap=hostfile:badfile -N4 hostname
'
test_done