add flux.taskmap to PMI kvs for better cyclic task distribution scalability #4798

Merged
merged 6 commits on Dec 7, 2022
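For context, an illustrative comparison (a sketch based on the PR title, not text from the PR description): under a cyclic task distribution, the classic PMI_process_mapping vector encoding has no repeat count, so it needs one (base,nnodes,ppn) block per placement round and grows linearly with the number of tasks per node. For 2 nodes with 4 tasks each, placed cyclically:

    PMI_process_mapping = (vector,(0,2,1),(0,2,1),(0,2,1),(0,2,1))

The RFC 34 taskmap introduced here carries a repeat count in each block (read here as [start-node, nnodes, ppn, repeats]), so the same layout stays constant-size regardless of the number of rounds:

    flux.taskmap = {"version":1,"map":[[0,2,1,4]]}

At 128 tasks per node the vector form overflows the de-facto 1024-byte PMI value limit discussed below; the taskmap does not, which is the scalability improvement the title refers to.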
14 changes: 12 additions & 2 deletions src/broker/boot_pmi.c
@@ -94,12 +94,22 @@ static int set_broker_mapping_attr (struct pmi_handle *pmi,
     if (pmi_params->size == 1)
         val = strdup ("{\"version\":1,\"map\":[[0,1,1,1]]}");
     else {
+        /* First attempt to get flux.taskmap, falling back to
+         * PMI_process_mapping if this key is not available.
+         * This should be replaced when #4800 is fixed.
+         */
         if (broker_pmi_kvs_get (pmi,
                                 pmi_params->kvsname,
-                                "PMI_process_mapping",
+                                "flux.taskmap",
                                 buf,
                                 sizeof (buf),
-                                -1) == PMI_SUCCESS) {
+                                -1) == PMI_SUCCESS
+            || broker_pmi_kvs_get (pmi,
+                                   pmi_params->kvsname,
+                                   "PMI_process_mapping",
+                                   buf,
+                                   sizeof (buf),
+                                   -1) == PMI_SUCCESS) {
             val = pmi_mapping_to_taskmap (buf);
         }
     }
5 changes: 5 additions & 0 deletions src/common/libpmi/pmi2.h
@@ -33,6 +33,11 @@ extern "C" {
 #define PMI2_ERR_OTHER 14

 #define PMI2_MAX_KEYLEN 64
+
+/* PMI2_MAX_VALLEN of 1024 is a de-facto standard and should not be
+ * increased. Experimentally, increasing it to 2048 was shown to cause
+ * crashes with the mpich and mvapich2 MPI implementations.
+ */
 #define PMI2_MAX_VALLEN 1024
 #define PMI2_MAX_ATTRVALUE 1024
 #define PMI2_ID_NULL -1
21 changes: 17 additions & 4 deletions src/common/libpmi/simple_client.c
@@ -270,7 +270,6 @@ int pmi_simple_client_kvs_get (struct pmi_simple_client *pmi,
  */
 static struct taskmap *fetch_taskmap (struct pmi_simple_client *pmi)
 {
-    const char *key = "PMI_process_mapping";
     struct taskmap *map = NULL;
     int result;
     char *nom;
@@ -291,9 +290,23 @@ static struct taskmap *fetch_taskmap (struct pmi_simple_client *pmi)
     result = pmi_simple_client_kvs_get_my_name (pmi, nom, pmi->kvsname_max);
     if (result != PMI_SUCCESS)
         goto done;
-    result = pmi_simple_client_kvs_get (pmi, nom, key, val, pmi->vallen_max);
-    if (result != PMI_SUCCESS)
-        goto done;
+    /* First try flux.taskmap, falling back to PMI_process_mapping if it
+     * does not exist (e.g. if the process manager is not Flux).
+     */
+    result = pmi_simple_client_kvs_get (pmi,
+                                        nom,
+                                        "flux.taskmap",
+                                        val,
+                                        pmi->vallen_max);
+    if (result != PMI_SUCCESS) {
+        result = pmi_simple_client_kvs_get (pmi,
+                                            nom,
+                                            "PMI_process_mapping",
+                                            val,
+                                            pmi->vallen_max);
+        if (result != PMI_SUCCESS)
+            goto done;
+    }
     if (!(map = taskmap_decode (val, NULL))) {
         result = PMI_FAIL;
         goto done;
9 changes: 9 additions & 0 deletions src/common/libpmi/simple_server.h
@@ -14,6 +14,15 @@
 struct pmi_simple_server;

 #define SIMPLE_KVS_KEY_MAX 64
+
+/* Maximum size of a PMI KVS value. One might be tempted to increase
+ * this number to hold larger values, for example to hold an encoded
+ * PMI_process_mapping with a large count of tasks per node. However,
+ * experimentally, mpich and mvapich2 do not handle a larger max value
+ * correctly, and in many cases this causes a segfault in MPI. Therefore,
+ * it is suggested to leave SIMPLE_KVS_VAL_MAX at the de-facto standard
+ * of 1024 for now.
+ */
 #define SIMPLE_KVS_VAL_MAX 1024
 #define SIMPLE_KVS_NAME_MAX 64

2 changes: 1 addition & 1 deletion src/common/libpmi/test/pmi_info.c
@@ -109,7 +109,7 @@ int main(int argc, char *argv[])
     int clen;
     int *clique;
     char *s;
-    char buf[256];
+    char buf[4096];

     e = PMI_Get_clique_size (&clen);
     if (e != PMI_SUCCESS)
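The buffer grows from 256 to 4096 bytes, presumably so the pmi_info test client can hold clique lists and taskmap values at or beyond the 1024-byte PMI value limit without truncation (an inference from the surrounding changes, not stated in the PR).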
19 changes: 18 additions & 1 deletion src/shell/pmi/pmi.c
@@ -437,6 +437,22 @@ static int set_flux_instance_level (struct shell_pmi *pmi)
     return rc;
Review comment (Member):
In commit message for "...PMI key to RFC 34 taskmap":

s/with/will/
s/cuases/causes/

 }

+static int set_flux_taskmap (struct shell_pmi *pmi)
+{
+    struct taskmap *map = pmi->shell->info->taskmap;
+    char *val = NULL;
+    int rc = -1;
+
+    if (!(val = taskmap_encode (map, TASKMAP_ENCODE_WRAPPED))
+        || strlen (val) > SIMPLE_KVS_VAL_MAX)
+        goto out;
+    put_dict (pmi->locals, "flux.taskmap", val);
+    rc = 0;
+out:
+    free (val);
+    return rc;
+}
+
 static void pmi_destroy (struct shell_pmi *pmi)
 {
     if (pmi) {
@@ -538,7 +554,8 @@ static struct shell_pmi *pmi_create (flux_shell_t *shell)
     if (!nomap && init_clique (pmi) < 0)
         goto error;
     if (!shell->standalone) {
-        if (set_flux_instance_level (pmi) < 0)
+        if (set_flux_instance_level (pmi) < 0
+            || (!nomap && set_flux_taskmap (pmi) < 0))
             goto error;
     }
     return pmi;
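To make the SIMPLE_KVS_VAL_MAX guard in set_flux_taskmap concrete: for the 2-node, 128-tasks-per-node cyclic case exercised in the test below, the wrapped RFC 34 encoding would be on the order of (illustrative, not captured from a run):

    flux.taskmap = {"version":1,"map":[[0,2,1,128]]}

That is roughly 35 bytes, comfortably under the 1024-byte cap, whereas the equivalent PMI_process_mapping would need 128 separate vector blocks.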
12 changes: 12 additions & 0 deletions t/t3002-pmi.t
@@ -94,4 +94,16 @@ test_expect_success 'PMI2 application abort is handled properly' '
 	${pmi2_info} --abort 0
 '
+
+# Ensure that pmi_info can get clique ranks with a large enough
+# number of tasks per node that PMI_process_mapping likely overflows
+# the max value length for the PMI-1 KVS. This ensures that the PMI
+# client picked up flux.taskmap instead:
+
+test_expect_success 'PMI1 can calculate clique ranks with 128 tasks per node' '
+	flux mini run -t 1m -N2 --taskmap=cyclic --tasks-per-node=128 \
+		${pmi_info} --clique >tpn.128.out &&
+	test_debug "cat tpn.128.out" &&
+	grep "0: clique=0,2,4,6,8,10,12,14,16,18,20" tpn.128.out
+'
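A rough check of the overflow claim above (illustrative arithmetic, not from the PR): each cyclic round contributes a vector block like "(0,2,1)," of 8 characters, so 128 rounds cost about 8 + 128 × 8 = 1032 characters including the "(vector," prefix, just past the 1024-byte PMI-1 value limit, while the wrapped taskmap stays a few dozen bytes.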

test_done