From 7e19c3740a68fc34316fae73efe3b7722f415727 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 10:44:20 -0700 Subject: [PATCH 1/6] libpmi: add pmi_process_mapping_encode() Add a helper for encoding the PMI_process_mapping value from an array of 'struct pmi_map_block's. Link clique.c in with both libpmi_server.so and libpmi_client.so instead of just the client, so this server-side helper can be used by server implementations. --- src/common/libpmi/Makefile.am | 6 +++--- src/common/libpmi/clique.c | 38 +++++++++++++++++++++++++++++++++++ src/common/libpmi/clique.h | 8 ++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/common/libpmi/Makefile.am b/src/common/libpmi/Makefile.am index 0d6a03994903..7668f10b4bab 100644 --- a/src/common/libpmi/Makefile.am +++ b/src/common/libpmi/Makefile.am @@ -19,7 +19,9 @@ libpmi_common_sources = \ pmi_strerror.c \ pmi_strerror.h \ keyval.c \ - keyval.h + keyval.h \ + clique.c \ + clique.h libpmi_client_la_SOURCES = \ simple_client.c \ @@ -27,8 +29,6 @@ libpmi_client_la_SOURCES = \ pmi.c \ dgetline.c \ dgetline.h \ - clique.c \ - clique.h \ $(libpmi_common_sources) libpmi_server_la_SOURCES = \ diff --git a/src/common/libpmi/clique.c b/src/common/libpmi/clique.c index 7894e1cd4d81..3ef432a095b3 100644 --- a/src/common/libpmi/clique.c +++ b/src/common/libpmi/clique.c @@ -11,6 +11,7 @@ #if HAVE_CONFIG_H #include "config.h" #endif +#include #include #include #include @@ -20,6 +21,43 @@ #include "pmi.h" #include "clique.h" +static int catprintf (char **buf, int *bufsz, const char *fmt, ...) 
+{ + va_list ap; + int n; + + va_start (ap, fmt); + n = vsnprintf (*buf, *bufsz, fmt, ap); + va_end (ap); + if (n >= *bufsz) + return -1; + *bufsz -= n; + *buf += n; + return 0; +} + +int pmi_process_mapping_encode (struct pmi_map_block *blocks, + int nblocks, + char *buf, + int bufsz) +{ + int i; + + if (catprintf (&buf, &bufsz, "(vector,") < 0) + return -1; + for (i = 0; i < nblocks; i++) { + if (catprintf (&buf, &bufsz, "%s(%d,%d,%d)", + i > 0 ? "," : "", + blocks[i].nodeid, + blocks[i].nodes, + blocks[i].procs) < 0) + return -1; + } + if (catprintf (&buf, &bufsz, ")") < 0) + return -1; + return 0; +} + static int parse_block (const char *s, struct pmi_map_block *block) { char *endptr; diff --git a/src/common/libpmi/clique.h b/src/common/libpmi/clique.h index adbfae4841b4..2c32ba2bfe3c 100644 --- a/src/common/libpmi/clique.h +++ b/src/common/libpmi/clique.h @@ -41,6 +41,14 @@ struct pmi_map_block { int pmi_process_mapping_parse (const char *s, struct pmi_map_block **blocks, int *nblocks); +/* Generate PMI_process_mapping value string from array of pmi_map_blocks, + * and place it in 'buf'. Result will be null terminated. + */ +int pmi_process_mapping_encode (struct pmi_map_block *blocks, + int nblocks, + char *buf, + int bufsz); + /* Determine the nodeid that will start 'rank', and return it in 'nodeid'. */ From b6ec83e042d5dfdb0d30bf5a4771133b5a554610 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 11:19:53 -0700 Subject: [PATCH 2/6] flux-shell: populate PMI_process_mapping in PMI KVS Problem: PMI_process_mapping is unavailable for clique computation by MPI tasks. Add logic to generate this key, and add it to the PMI KVS hash on each shell. If the key cannot be set because the value exceeds the static value length, emit a log message on stderr but don't treat it as an error since the client can run without clique information. 
Fixes #2277 --- src/shell/pmi.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/shell/pmi.c b/src/shell/pmi.c index f49f89d702bd..e565b7ca78bd 100644 --- a/src/shell/pmi.c +++ b/src/shell/pmi.c @@ -68,6 +68,7 @@ #include #include "src/common/libpmi/simple_server.h" +#include "src/common/libpmi/clique.h" #include "src/common/libutil/log.h" #include "task.h" @@ -279,6 +280,57 @@ void shell_pmi_task_ready (struct shell_task *task, void *arg) } } +/* Generate 'PMI_process_mapping' key (see RFC 13) for MPI clique computation. + * + * Create an array of pmi_map_block structures, sized for worst case mapping + * (no compression possible). Walk through the rcalc info for each shell rank. + * If shell's mapping looks identical to previous one, increment block->nodes; + * otherwise consume another array slot. Finally, encode to string, put it + * in the local KVS hash, and free array. + */ +int init_clique (struct shell_pmi *pmi) +{ + struct pmi_map_block *blocks; + int nblocks; + int i; + char val[SIMPLE_KVS_VAL_MAX]; + + if (!(blocks = calloc (pmi->shell->info->shell_size, sizeof (*blocks)))) + return -1; + nblocks = 0; + + for (i = 0; i < pmi->shell->info->shell_size; i++) { + struct rcalc_rankinfo ri; + + if (rcalc_get_nth (pmi->shell->info->rcalc, i, &ri) < 0) + goto error; + if (nblocks == 0 || blocks[nblocks - 1].procs != ri.ntasks) { + blocks[nblocks].nodeid = i; + blocks[nblocks].procs = ri.ntasks; + blocks[nblocks].nodes = 1; + nblocks++; + } + else + blocks[nblocks - 1].nodes++; + } + /* If value exceeds SIMPLE_KVS_VAL_MAX, skip setting the key + * without generating an error. The client side will not treat + * a missing key as an error. It should be unusual though so log it. 
+ */ + if (pmi_process_mapping_encode (blocks, nblocks, val, sizeof (val)) < 0) { + log_err ("pmi_process_mapping_encode"); + goto out; + } + zhashx_update (pmi->kvs, "PMI_process_mapping", val); +out: + free (blocks); + return 0; +error: + free (blocks); + errno = EINVAL; + return -1; +} + void shell_pmi_destroy (struct shell_pmi *pmi) { if (pmi) { @@ -339,6 +391,8 @@ struct shell_pmi *shell_pmi_create (flux_shell_t *shell) } zhashx_set_destructor (pmi->kvs, kvs_value_destructor); zhashx_set_duplicator (pmi->kvs, kvs_value_duplicator); + if (init_clique (pmi) < 0) + goto error; return pmi; error: shell_pmi_destroy (pmi); From 41d1e83615f3d0784863188ef46f769989c98e91 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 11:40:27 -0700 Subject: [PATCH 3/6] testsuite: verify standalone shell has one clique --- t/t2601-job-shell-standalone.t | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/t/t2601-job-shell-standalone.t b/t/t2601-job-shell-standalone.t index 5dfa0afbaf72..fccb4aaddade 100755 --- a/t/t2601-job-shell-standalone.t +++ b/t/t2601-job-shell-standalone.t @@ -135,6 +135,13 @@ test_expect_success 'flux-shell: shell PMI works' ' ${FLUX_SHELL} -v -s -r 0 -j j8pmi -R R8 51 \ >pmi_info.out 2>pmi_info.err ' +test_expect_success 'flux-shell: shell PMI exports clique info' ' + flux jobspec srun -N1 -n8 ${PMI_INFO} -c >j8pmi_clique && + ${FLUX_SHELL} -v -s -r 0 -j j8pmi_clique -R R8 51 \ + >pmi_clique.out 2>pmi_clique.err && + COUNT=$(grep "clique=0,1,2,3,4,5,6,7" pmi_clique.out | wc -l) && + test ${COUNT} -eq 8 +' test_expect_success 'flux-shell: shell PMI KVS works' ' flux jobspec srun -N1 -n8 ${KVSTEST} >j8kvs && ${FLUX_SHELL} -v -s -r 0 -j j8kvs -R R8 52 \ From 251a02c1ff46344f44ee68cfb7452c71b38f3eb7 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 11:59:37 -0700 Subject: [PATCH 4/6] testsuite: verify PMI cliques with various ppn --- t/t2602-job-shell.t | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 
insertions(+) diff --git a/t/t2602-job-shell.t b/t/t2602-job-shell.t index 35e70026249d..3c7a67c2e223 100755 --- a/t/t2602-job-shell.t +++ b/t/t2602-job-shell.t @@ -70,6 +70,43 @@ test_expect_success 'job-shell: PMI works' ' flux job attach $id >pmi_info.out 2>pmi_info.err && grep size=4 pmi_info.out ' +test_expect_success 'pmi-shell: PMI cliques are correct for 1 ppn' ' + id=$(flux jobspec srun -N4 -n4 ${PMI_INFO} -c | flux job submit) && + flux job attach $id >pmi_clique1.raw 2>pmi_clique1.err && + sort -snk1 <pmi_clique1.raw >pmi_clique1.out && + sort >pmi_clique1.exp <<-EOT && + 0: clique=0 + 1: clique=1 + 2: clique=2 + 3: clique=3 + EOT + test_cmp pmi_clique1.exp pmi_clique1.out +' +test_expect_success 'pmi-shell: PMI cliques are correct for 2 ppn' ' + id=$(flux jobspec srun -N2 -n4 ${PMI_INFO} -c | flux job submit) && + flux job attach $id >pmi_clique2.raw 2>pmi_clique2.err && + sort -snk1 <pmi_clique2.raw >pmi_clique2.out && + sort >pmi_clique2.exp <<-EOT && + 0: clique=0,1 + 1: clique=0,1 + 2: clique=2,3 + 3: clique=2,3 + EOT + test_cmp pmi_clique2.exp pmi_clique2.out +' +test_expect_success 'pmi-shell: PMI cliques are correct for irregular ppn' ' + id=$(flux jobspec srun -N4 -n5 ${PMI_INFO} -c | flux job submit) && + flux job attach $id >pmi_cliquex.raw 2>pmi_cliquex.err && + sort -snk1 <pmi_cliquex.raw >pmi_cliquex.out && + sort >pmi_cliquex.exp <<-EOT && + 0: clique=0,1 + 1: clique=0,1 + 2: clique=2 + 3: clique=3 + 4: clique=4 + EOT + test_cmp pmi_cliquex.exp pmi_cliquex.out +' test_expect_success 'job-shell: PMI KVS works' ' id=$(flux jobspec srun -N4 ${KVSTEST} | flux job submit) && flux job attach $id >kvstest.out 2>kvstest.err && From fdea25c5b6d76e3835d8ef31c09332f20806d018 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 12:04:52 -0700 Subject: [PATCH 5/6] flux-shell: [cleanup] reword PMI inline doc Problem: this description of callbacks from the PMI protocol engine was unclear. It sounds like the whole thing is blocked while one callback is handled. 
Reword to emphasize that only the task that caused the callback is effectively blocked while awaiting a response. Other tasks may continue to work in the protocol engine, and the shell's reactor is never stalled. --- src/shell/pmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/shell/pmi.c b/src/shell/pmi.c index e565b7ca78bd..6670583a6c93 100644 --- a/src/shell/pmi.c +++ b/src/shell/pmi.c @@ -27,10 +27,10 @@ * * Other requests have callbacks from the engine to provide data, * which is fed back to the engine, which then calls shell_pmi_response_send(). - * These are kvs_get, kvs_put, and barrier. Although the protocol engine + * These are kvs_get, kvs_put, and barrier. Although the task * is effectively blocked while these callbacks are handled, they are - * implemented with asynchronous continuation callbacks so that the shell's - * reactor remains live while they are waiting for an answer. + * implemented with asynchronous continuation callbacks so that other tasks + * and the shell's reactor remain live while the task awaits an answer. * * The PMI KVS supports a put / barrier / get pattern. The barrier * distributes KVS data that was "put" so that it is available to "get". From b3852e96e65a8e8320ba296fbc71e8dcd478bd53 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 30 Jul 2019 16:07:16 -0700 Subject: [PATCH 6/6] flux-shell: don't commit PMI_process_mapping PMI_process_mapping is added to the local PMI hash at initialization, and should have the same value on every shell. But because it's in the hash, it's considered "dirty data" to be committed to the shared KVS during a PMI_Barrier. Explicitly exclude this key from the KVS transaction committed during the PMI_Barrier, and thus avoid overhead that has no effect. 
--- src/shell/pmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/shell/pmi.c b/src/shell/pmi.c index 6670583a6c93..129fb2b2be2f 100644 --- a/src/shell/pmi.c +++ b/src/shell/pmi.c @@ -205,6 +205,10 @@ static int shell_pmi_barrier_enter (void *arg) val = zhashx_first (pmi->kvs); while (val) { key = zhashx_cursor (pmi->kvs); + if (!strcmp (key, "PMI_process_mapping")) { + val = zhashx_next (pmi->kvs); + continue; + } if (shell_pmi_kvs_key (nkey, sizeof (nkey), pmi->shell->jobid,