Skip to content

Commit

Permalink
Merge pull request #3105 from garlick/pmi_logging
Browse files Browse the repository at this point in the history
broker: restore client-side PMI logging
  • Loading branch information
mergify[bot] committed Aug 2, 2020
2 parents c06e0fa + 000dc78 commit 63b1f16
Showing 1 changed file with 187 additions and 78 deletions.
265 changes: 187 additions & 78 deletions src/broker/pmiutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
#include "pmiutil.h"
#include "liblist.h"

/* Which PMI backend a pmi_handle dispatches to. */
typedef enum {
    PMI_MODE_SINGLETON = 0, /* no PMI backend in use */
    PMI_MODE_DLOPEN = 1,    /* calls go through a struct pmi_dso */
    PMI_MODE_WIRE1 = 2,     /* calls go through the pmi_simple_client */
} pmi_mode_t;

struct pmi_dso {
void *dso;
int (*init) (int *spawned);
Expand All @@ -43,8 +49,33 @@ struct pmi_handle {
struct pmi_dso *dso;
struct pmi_simple_client *cli;
int debug;
pmi_mode_t mode;
int rank;
};

/* Emit one debug line on stderr, tagged with the PMI mode name and rank.
 * Nothing is printed unless pmi->debug is nonzero.  The message is
 * formatted into a 1024-byte stack buffer, so longer output is truncated.
 */
static void vdebugf (struct pmi_handle *pmi, const char *fmt, va_list ap)
{
    const char *mode_name;
    char msg[1024];

    if (!pmi->debug)
        return;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            mode_name = "singleton";
            break;
        case PMI_MODE_WIRE1:
            mode_name = "wire.1";
            break;
        case PMI_MODE_DLOPEN:
            mode_name = "dlopen";
            break;
        default:
            mode_name = "unknown";
            break;
    }
    (void)vsnprintf (msg, sizeof (msg), fmt, ap);
    fprintf (stderr, "pmi-debug-%s[%d]: %s\n", mode_name, pmi->rank, msg);
}

/* printf-style front end for vdebugf(): packages the variadic arguments
 * into a va_list and forwards them unchanged.
 */
static void debugf (struct pmi_handle *pmi, const char *fmt, ...)
{
    va_list args;

    va_start (args, fmt);
    vdebugf (pmi, fmt, args);
    va_end (args);
}

static void broker_pmi_dlclose (struct pmi_dso *dso)
{
if (dso) {
Expand Down Expand Up @@ -77,20 +108,20 @@ static struct pmi_dso *broker_pmi_dlopen (const char *pmi_library, int debug)
if (debug) {
char *errstr = dlerror ();
if (errstr)
log_msg ("%s", errstr);
log_msg ("pmi-debug-dlopen: %s", errstr);
else
log_msg ("dlopen %s failed", name);
log_msg ("pmi-debug-dlopen: dlopen %s failed", name);
}
}
else if (dlsym (dso->dso, "flux_pmi_library")) {
if (debug)
log_msg ("skipping %s", name);
log_msg ("pmi-debug-dlopen: skipping %s", name);
dlclose (dso->dso);
dso->dso = NULL;
}
else {
if (debug)
log_msg ("dlopen %s", name);
log_msg ("pmi-debug-dlopen: library name %s", name);
}
}
liblist_destroy (libs);
Expand All @@ -110,7 +141,8 @@ static struct pmi_dso *broker_pmi_dlopen (const char *pmi_library, int debug)
if (!dso->init || !dso->finalize || !dso->get_size || !dso->get_rank
|| !dso->barrier || !dso->kvs_get_my_name
|| !dso->kvs_put || !dso->kvs_commit || !dso->kvs_get) {
log_msg ("dlsym: %s is missing required symbols", pmi_library);
log_msg ("pmi-debug-dlopen: dlsym: %s is missing required symbols",
pmi_library);
goto error;
}
return dso;
Expand All @@ -123,23 +155,48 @@ static struct pmi_dso *broker_pmi_dlopen (const char *pmi_library, int debug)

/* Commit KVS puts for 'kvsname'.
 * Only the dlopen backend has a commit function to call; singleton and
 * wire.1 modes do nothing and report PMI_SUCCESS.
 * The stale pmi->cli / pmi->dso early returns are dropped so that the
 * mode switch is the single dispatch path and the result is always
 * passed to debugf() before being returned.
 */
int broker_pmi_kvs_commit (struct pmi_handle *pmi, const char *kvsname)
{
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            break;
        case PMI_MODE_WIRE1:
            break;
        case PMI_MODE_DLOPEN:
            ret = pmi->dso->kvs_commit (kvsname);
            break;
    }
    debugf (pmi,
            "kvs_commit (kvsname=%s) = %s",
            kvsname,
            pmi_strerror (ret));
    return ret;
}

/* Store key=value under 'kvsname' using the active PMI backend.
 * Singleton mode has nothing to store to and reports PMI_SUCCESS.
 * The stale pmi->cli / pmi->dso early returns are dropped so that the
 * mode switch is the single dispatch path and the result is always
 * passed to debugf() before being returned.
 */
int broker_pmi_kvs_put (struct pmi_handle *pmi,
                        const char *kvsname,
                        const char *key,
                        const char *value)
{
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            break;
        case PMI_MODE_WIRE1:
            ret = pmi_simple_client_kvs_put (pmi->cli, kvsname, key, value);
            break;
        case PMI_MODE_DLOPEN:
            ret = pmi->dso->kvs_put (kvsname, key, value);
            break;
    }
    debugf (pmi,
            "kvs_put (kvsname=%s key=%s value=%s) = %s",
            kvsname,
            key,
            value,
            pmi_strerror (ret));
    return ret;
}

int broker_pmi_kvs_get (struct pmi_handle *pmi,
Expand All @@ -148,87 +205,134 @@ int broker_pmi_kvs_get (struct pmi_handle *pmi,
char *value,
int len)
{
if (pmi->cli)
return pmi_simple_client_kvs_get (pmi->cli, kvsname, key, value, len);
if (pmi->dso)
return pmi->dso->kvs_get (kvsname, key, value, len);
return PMI_FAIL;
int ret = PMI_FAIL;

switch (pmi->mode) {
case PMI_MODE_SINGLETON:
break;
case PMI_MODE_WIRE1:
ret = pmi_simple_client_kvs_get (pmi->cli, kvsname, key, value, len);
break;
case PMI_MODE_DLOPEN:
ret = pmi->dso->kvs_get (kvsname, key, value, len);
break;
}
debugf (pmi,
"kvs_get (kvsname=%s key=%s value=%s) = %s",
kvsname,
key,
ret == PMI_SUCCESS ? value : "<none>",
pmi_strerror (ret));
return ret;
}

/* Execute a PMI barrier using the active backend.
 * Singleton mode has no backend to call and reports PMI_SUCCESS.
 * The stale pmi->cli / pmi->dso early returns are dropped so that the
 * mode switch is the single dispatch path and the result is always
 * passed to debugf() before being returned.
 */
int broker_pmi_barrier (struct pmi_handle *pmi)
{
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            break;
        case PMI_MODE_WIRE1:
            ret = pmi_simple_client_barrier (pmi->cli);
            break;
        case PMI_MODE_DLOPEN:
            ret = pmi->dso->barrier();
            break;
    }
    debugf (pmi, "barrier = %s", pmi_strerror (ret));
    return ret;
}

/* Fill 'params' with rank, size, and kvsname from the active PMI backend.
 *   singleton: rank 0, size 1, kvsname "singleton"
 *   wire.1:    rank and size are read from the simple client; kvsname is
 *              fetched with pmi_simple_client_kvs_get_my_name()
 *   dlopen:    rank, size, and kvsname are fetched in that order, stopping
 *              at the first failure
 * On success the rank is also saved in pmi->rank, which vdebugf() prints
 * as the tag on later debug lines.  Returns the PMI result code.
 *
 * The stale if/else dispatch, the 'result' variable, and the 'error:'
 * label are dropped: they left the braces unbalanced and made the
 * rank-recording and logging code unreachable.
 */
int broker_pmi_get_params (struct pmi_handle *pmi,
                           struct pmi_params *params)
{
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            params->rank = 0;
            params->size = 1;
            snprintf (params->kvsname, sizeof (params->kvsname), "singleton");
            break;
        case PMI_MODE_WIRE1:
            params->rank = pmi->cli->rank;
            params->size = pmi->cli->size;
            ret = pmi_simple_client_kvs_get_my_name (pmi->cli,
                                                     params->kvsname,
                                                     sizeof (params->kvsname));
            break;
        case PMI_MODE_DLOPEN:
            if ((ret = pmi->dso->get_rank (&params->rank)) != PMI_SUCCESS)
                break;
            if ((ret = pmi->dso->get_size (&params->size)) != PMI_SUCCESS)
                break;
            ret = pmi->dso->kvs_get_my_name (params->kvsname,
                                             sizeof (params->kvsname));
            break;
    }
    if (ret == PMI_SUCCESS)
        pmi->rank = params->rank;
    /* On failure params may be only partly filled in, so log placeholders. */
    debugf (pmi,
            "get_params (rank=%d size=%d kvsname=%s) = %s",
            ret == PMI_SUCCESS ? params->rank : -1,
            ret == PMI_SUCCESS ? params->size : -1,
            ret == PMI_SUCCESS ? params->kvsname: "<none>",
            pmi_strerror (ret));
    return ret;
}

/* Initialize the active PMI backend.
 * Singleton mode has nothing to initialize and reports PMI_SUCCESS.
 * The 'spawned' value written by the dlopen backend's init() is not used.
 * The stale pmi->cli / pmi->dso early returns are dropped so that the
 * mode switch is the single dispatch path and the result is always
 * passed to debugf() before being returned.
 */
int broker_pmi_init (struct pmi_handle *pmi)
{
    int spawned;
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            break;
        case PMI_MODE_WIRE1:
            ret = pmi_simple_client_init (pmi->cli);
            break;
        case PMI_MODE_DLOPEN:
            ret = pmi->dso->init(&spawned);
            break;
    }
    debugf (pmi, "init = %s", pmi_strerror (ret));
    return ret;
}

/* Finalize the active PMI backend.
 * Singleton mode has nothing to finalize and reports PMI_SUCCESS.
 * The stale pmi->cli / pmi->dso early returns are dropped so that the
 * mode switch is the single dispatch path and the result is always
 * passed to debugf().
 * The backend's result is returned to the caller; previously the
 * function ended with an unconditional PMI_SUCCESS, so a failure was
 * logged but never reported.
 */
int broker_pmi_finalize (struct pmi_handle *pmi)
{
    int ret = PMI_SUCCESS;

    switch (pmi->mode) {
        case PMI_MODE_SINGLETON:
            break;
        case PMI_MODE_WIRE1:
            ret = pmi_simple_client_finalize (pmi->cli);
            break;
        case PMI_MODE_DLOPEN:
            ret = pmi->dso->finalize ();
            break;
    }
    debugf (pmi, "finalize = %s", pmi_strerror (ret));
    return ret;
}

void broker_pmi_destroy (struct pmi_handle *pmi)
{
if (pmi) {
int saved_errno = errno;
if (pmi->cli)
pmi_simple_client_destroy (pmi->cli);
else if (pmi->dso)
broker_pmi_dlclose (pmi->dso);
switch (pmi->mode) {
case PMI_MODE_SINGLETON:
break;
case PMI_MODE_WIRE1:
pmi_simple_client_destroy (pmi->cli);
break;
case PMI_MODE_DLOPEN:
broker_pmi_dlclose (pmi->dso);
break;
}
free (pmi);
errno = saved_errno;
}
Expand All @@ -244,24 +348,29 @@ struct pmi_handle *broker_pmi_create (void)
struct pmi_handle *pmi = calloc (1, sizeof (*pmi));
if (!pmi)
return NULL;
pmi->rank = -1;
pmi_debug = getenv ("FLUX_PMI_DEBUG");
if (pmi_debug)
pmi->debug = strtol (pmi_debug, NULL, 10);
pmi->cli = pmi_simple_client_create_fd (getenv ("PMI_FD"),
getenv ("PMI_RANK"),
getenv ("PMI_SIZE"),
NULL);
if ((pmi->cli = pmi_simple_client_create_fd (getenv ("PMI_FD"),
getenv ("PMI_RANK"),
getenv ("PMI_SIZE"),
NULL))) {
pmi->mode = PMI_MODE_WIRE1;
}
/* N.B. SLURM boldly installs its libpmi.so into the system libdir,
* so it will be found here, even if not running in a SLURM job.
* Fortunately it emulates singleton in that case, in lieu of failing.
*/
if (!pmi->cli)
pmi->dso = broker_pmi_dlopen (getenv ("PMI_LIBRARY"), pmi->debug);
/* If neither pmi->cli nor pmi->dso is set, singleton is assumed later.
else if ((pmi->dso = broker_pmi_dlopen (getenv ("PMI_LIBRARY"),
pmi->debug))) {
pmi->mode = PMI_MODE_DLOPEN;
}
/* If neither pmi->cli nor pmi->dso is set, singleton is assumed.
*/
if (pmi->debug)
log_msg ("using %s", pmi->cli ? "PMI-1 wire protocol"
: pmi->dso ? "dlopen" : "singleton");
else {
pmi->mode = PMI_MODE_SINGLETON;
}
return pmi;
}

Expand Down

0 comments on commit 63b1f16

Please sign in to comment.