From 951c000f98a277d94e993ed7f6b92512cdb9806a Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 24 Aug 2018 15:22:59 -0700 Subject: [PATCH 01/20] broker/: Remove unused variable --- src/broker/broker.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/broker/broker.c b/src/broker/broker.c index 79582ce5ce02..5cfe51b3c20d 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -144,7 +144,6 @@ typedef struct { char *init_shell_cmd; size_t init_shell_cmd_len; - struct subprocess *init_shell; } broker_ctx_t; static int broker_event_sendmsg (broker_ctx_t *ctx, const flux_msg_t *msg); From e175d5c4fc6239bb904ed6f56025bcf636e68c56 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Sat, 25 Aug 2018 09:39:38 -0700 Subject: [PATCH 02/20] cmd/flux-start: Fix invalid error message output --- src/cmd/flux-start.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/flux-start.c b/src/cmd/flux-start.c index 1fa257f793e6..b227728f03a6 100644 --- a/src/cmd/flux-start.c +++ b/src/cmd/flux-start.c @@ -334,7 +334,7 @@ void add_args_list (char **argz, size_t *argz_len, optparse_t *opt, const char * optparse_getopt_iterator_reset (opt, name); while ((arg = optparse_getopt_next (opt, name))) if (argz_add (argz, argz_len, arg) != 0) - log_err_exit ("subprocess_argv_append"); + log_err_exit ("argv_add"); } char *create_scratch_dir (const char *session_id) From 6e2887c7b0901a2460719ca1c3ba874eee158871 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 13 Aug 2018 11:00:55 -0700 Subject: [PATCH 03/20] common/subprocess: New subprocess API Add support for a new flux subprocess API that merges both local subprocess support (in the old "libsubprocess") and remote subprocess support and remote subprocess server support. Support a new cmd API to encapsulate the commands that callers wish to launch in subprocesses. 
By Albert Chu and Mark Grondona --- configure.ac | 1 + src/common/Makefile.am | 4 +- src/common/subprocess/Makefile.am | 30 + src/common/subprocess/command.c | 758 ++++++++++++++ src/common/subprocess/command.h | 45 + src/common/subprocess/local.c | 703 +++++++++++++ src/common/subprocess/local.h | 8 + src/common/subprocess/remote.c | 771 ++++++++++++++ src/common/subprocess/remote.h | 12 + src/common/subprocess/server.c | 704 +++++++++++++ src/common/subprocess/server.h | 13 + src/common/subprocess/subprocess.c | 1071 ++++++++++++++++++++ src/common/subprocess/subprocess.h | 356 +++++++ src/common/subprocess/subprocess_private.h | 120 +++ src/common/subprocess/util.c | 94 ++ src/common/subprocess/util.h | 12 + 16 files changed, 4701 insertions(+), 1 deletion(-) create mode 100644 src/common/subprocess/Makefile.am create mode 100644 src/common/subprocess/command.c create mode 100644 src/common/subprocess/command.h create mode 100644 src/common/subprocess/local.c create mode 100644 src/common/subprocess/local.h create mode 100644 src/common/subprocess/remote.c create mode 100644 src/common/subprocess/remote.h create mode 100644 src/common/subprocess/server.c create mode 100644 src/common/subprocess/server.h create mode 100644 src/common/subprocess/subprocess.c create mode 100644 src/common/subprocess/subprocess.h create mode 100644 src/common/subprocess/subprocess_private.h create mode 100644 src/common/subprocess/util.c create mode 100644 src/common/subprocess/util.h diff --git a/configure.ac b/configure.ac index 63252d5206b3..fa40b8219ae2 100644 --- a/configure.ac +++ b/configure.ac @@ -311,6 +311,7 @@ AC_CONFIG_FILES( \ src/common/libjsc/Makefile \ src/common/libjob/Makefile \ src/common/libsubprocess/Makefile \ + src/common/subprocess/Makefile \ src/common/libcompat/Makefile \ src/common/liboptparse/Makefile \ src/common/libidset/Makefile \ diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 5a36b355881e..1495bb18c4c6 100644 --- 
a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -13,7 +13,8 @@ SUBDIRS = libtap \ liboptparse \ libidset \ libtomlc99 \ - libkz + libkz \ + subprocess if ENABLE_JOBSPEC SUBDIRS += libjobspec @@ -52,6 +53,7 @@ libflux_core_la_LIBADD = \ $(builddir)/libkvs/libkvs.la \ $(builddir)/libjsc/libjsc.la \ $(builddir)/libjob/libjob.la \ + $(builddir)/subprocess/libsubprocess.la \ libflux-internal.la libflux_core_la_LDFLAGS = \ -Wl,--version-script=$(srcdir)/libflux-core.map \ diff --git a/src/common/subprocess/Makefile.am b/src/common/subprocess/Makefile.am new file mode 100644 index 000000000000..b69cca70a8cf --- /dev/null +++ b/src/common/subprocess/Makefile.am @@ -0,0 +1,30 @@ +AM_CFLAGS = \ + $(WARNING_CFLAGS) \ + $(CODE_COVERAGE_CFLAGS) + +AM_LDFLAGS = \ + $(CODE_COVERAGE_LDFLAGS) + +AM_CPPFLAGS = \ + -I$(top_srcdir) -I$(top_srcdir)/src/include + + +noinst_LTLIBRARIES = \ + libsubprocess.la + +libsubprocess_la_SOURCES = \ + command.c \ + command.h \ + local.c \ + local.h \ + remote.c \ + remote.h \ + server.c \ + server.h \ + util.c \ + util.h \ + subprocess.c \ + subprocess_private.h + +fluxcoreinclude_HEADERS = \ + subprocess.h diff --git a/src/common/subprocess/command.c b/src/common/subprocess/command.c new file mode 100644 index 000000000000..8508ba2a1fde --- /dev/null +++ b/src/common/subprocess/command.c @@ -0,0 +1,758 @@ +/*****************************************************************************\ + * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include +#include + +#include "command.h" + +struct flux_command { + char *cwd; + + /* Command arguments in argz format */ + size_t argz_len; + char *argz; + + /* Command environment hash */ + size_t envz_len; + char *envz; + + /* Extra key=value options */ + zhash_t *opts; + + /* Extra channels to create in the subprocess (i.e. socketpairs) */ + zlist_t *channels; +}; + +/* + * Static functions: + */ + +/* + * Initialize an argz vector. If av == NULL then the argz vector is + * freed and length (*argz_lenp) reset to 0, otherwise av is passed + * directly to argz_create(3). + */ +static int init_argz (char **argzp, size_t *argz_lenp, char * const av[]) +{ + int e; + if (*argzp != NULL) { + free (*argzp); + *argzp = NULL; + *argz_lenp = 0; + } + if (av && (e = argz_create (av, argzp, argz_lenp)) != 0) { + errno = e; + return -1; + } + return (0); +} + +/* + * Same as init_argz, but pass argument count (ac) and verify that + * the argument vector av has NULL as its final element. 
+ */
+static int init_argz_count (char **argzp, size_t *argz_lenp,
+                            int ac, char * const av[])
+{
+    /* A negative count would index before the start of av[]. */
+    if (av && (ac < 0 || av[ac] != NULL)) {
+        errno = EINVAL;
+        return -1;
+    }
+    return init_argz (argzp, argz_lenp, av);
+}
+
+/*
+ * Append string defined by [fmt, ap] to argz vector in argzp.
+ *
+ * Returns 0 on success and -1 on failure.  When argz_add(3) fails,
+ * errno is set to the error code it returned.
+ */
+static int argz_appendv (char **argzp, size_t *argz_lenp,
+                         const char *fmt, va_list ap)
+{
+    int e;
+    char *s;
+    if (vasprintf (&s, fmt, ap) < 0)
+        return -1;
+    e = argz_add (argzp, argz_lenp, s);
+    /* argz_add() stores its own copy, so the formatted temporary is
+     * released on the failure path as well as on success. */
+    free (s);
+    if (e != 0) {
+        errno = e;
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Return an argv/env array for argz/envz object. Caller must free
+ * the returned array, which is filled via argz_extract(3).
+ *
+ * Returns NULL if the array cannot be allocated.
+ */
+static char **expand_argz (char *argz, size_t argz_len)
+{
+    size_t len;
+    char **argv;
+
+    len = argz_count (argz, argz_len) + 1;
+    if (!(argv = calloc (len + 1, sizeof (char *))))
+        return NULL;
+
+    argz_extract (argz, argz_len, argv);
+
+    return (argv);
+}
+
+/*
+ * Return the "name" portion of an environment entry of the form
+ * "NAME=VALUE", copying the name into the destination buffer `dst`
+ * of size `len`. If the result cannot fit in `dst`, NULL is returned
+ * as opposed to truncation.  NULL is also returned if `entry` or `dst`
+ * is NULL or `len` is 0.
+ *
+ * On success, a pointer to `dst` is returned.
+ */
+static char *env_entry_name (char *entry, char *dst, size_t len)
+{
+    char *p;
+    size_t n;
+    if (!entry || !dst || len == 0)
+        return NULL;
+    /* If there is no '=' in entry, then "name" is the entire entry. */
+    if (!(p = strchr (entry, '=')))
+        p = entry + strlen (entry);
+    n = p - entry;
+
+    /* Refuse to truncate: dst needs n bytes plus the terminating NUL */
+    if (n > len - 1)
+        return NULL;
+
+    memcpy (dst, entry, n);
+    dst[n] = '\0';
+    return dst;
+}
+
+/*
+ * Return a pointer to the "value" portion of an environment entry
+ * of the form "NAME=VALUE". The result should not be modified as
+ * it points to a substring of the original `entry`.
+ *
+ * If `entry` does not contain an '=' character, then it has no value
+ * and NULL is returned. 
+ */
+static const char * env_entry_value (const char *entry)
+{
+    const char *p;
+    if (!entry || !(p = strchr (entry, '=')))
+        return NULL;
+    return p+1;
+}
+
+/*
+ * Convert an argz vector to a new json array of strings.
+ * Returns NULL on failure.
+ */
+static json_t * argz_tojson (const char *argz, size_t argz_len)
+{
+    char *arg = NULL;
+    json_t *o = json_array ();
+
+    if (o == NULL)
+        goto err;
+
+    while ((arg = argz_next (argz, argz_len, arg))) {
+        json_t *val = json_string (arg);
+        /* json_array_append_new() steals the reference to val even
+         * when it fails, so val must not be released again here. */
+        if (!val || json_array_append_new (o, val))
+            goto err;
+    }
+    return o;
+err:
+    json_decref (o);
+    return NULL;
+}
+
+/*
+ * Fill the (empty) argz vector in *argzp from json array of strings `o`.
+ * Returns 0 on success.  On failure returns -1, leaves *argzp empty, and
+ * sets errno to EINVAL if `o` is not an array of strings, or to the error
+ * code returned by argz_add(3).
+ */
+static int argz_fromjson (json_t *o, char **argzp, size_t *argz_lenp)
+{
+    size_t index;
+    json_t *value;
+    int errnum = EINVAL;
+
+    assert (*argzp == NULL && *argz_lenp == 0);
+    if (!json_is_array (o))
+        goto fail;
+
+    json_array_foreach (o, index, value) {
+        int e;
+        if (!json_is_string (value))
+            goto fail;
+        if ((e = argz_add (argzp, argz_lenp, json_string_value (value)))) {
+            errnum = e;
+            goto fail;
+        }
+    }
+    return 0;
+fail:
+    free (*argzp);
+    *argzp = NULL;
+    *argz_lenp = 0;
+    errno = errnum;
+    return -1;
+}
+
+/*
+ * Convert an envz array (argz with NAME=VALUE entries) to a json
+ * dictionary object. 
+ */
+static json_t * envz_tojson (const char *envz, size_t envz_len)
+{
+    char buf [1024];
+    const char *name, *value;
+    char *entry = NULL;
+    json_t *o = json_object ();
+
+    if (o == NULL)
+        goto err;
+
+    while ((entry = argz_next (envz, envz_len, entry))) {
+        json_t *v;
+        /* Entries whose name does not fit in buf, or that have no
+         * '=' and therefore no value, are skipped. */
+        if (!(name = env_entry_name (entry, buf, sizeof (buf))))
+            continue;
+        if (!(value = env_entry_value (entry)))
+            continue;
+        /* json_object_set_new() steals the reference to v even when
+         * it fails, so v must not be released again here. */
+        if (!(v = json_string (value)) || json_object_set_new (o, name, v))
+            goto err;
+    }
+    return o;
+err:
+    json_decref (o);
+    return NULL;
+}
+
+/*
+ * Fill the (empty) envz vector in *envzp from json dictionary `o`.
+ * Returns 0 on success.  On failure returns -1, leaves *envzp empty, and
+ * sets errno to EINVAL if `o` is not a dictionary of strings, or to the
+ * error code returned by envz_add(3).
+ */
+static int envz_fromjson (json_t *o, char **envzp, size_t *envz_lenp)
+{
+    const char *var;
+    json_t *val;
+    int errnum = EINVAL;
+
+    assert (*envzp == NULL && *envz_lenp == 0);
+    if (!json_is_object (o))
+        goto fail;
+
+    json_object_foreach (o, var, val) {
+        int e;
+        if (!json_is_string (val))
+            goto fail;
+        if ((e = envz_add (envzp, envz_lenp, var, json_string_value (val)))) {
+            errnum = e;
+            goto fail;
+        }
+    }
+    return 0;
+fail:
+    free (*envzp);
+    *envzp = NULL;
+    *envz_lenp = 0;
+    errno = errnum;
+    return -1;
+}
+
+/*
+ * Convert a hash with string keys,values to a json object
+ */
+static json_t * zhash_tojson (zhash_t *h)
+{
+    const char *val;
+    json_t *o = json_object ();
+
+    if (o == NULL)
+        goto err;
+
+    val = zhash_first (h);
+    while (val) {
+        json_t *v = json_string (val);
+        /* json_object_set_new() steals the reference to v even when
+         * it fails, so v must not be released again here. */
+        if (!v || json_object_set_new (o, zhash_cursor (h), v))
+            goto err;
+        val = zhash_next (h);
+    }
+    return o;
+err:
+    json_decref (o);
+    return NULL;
+}
+
+/*
+ * New zhash with string keys/vals from json dictionary `o`.
+ * "autofree" will be set on the hash.
+ */
+static zhash_t *zhash_fromjson (json_t *o)
+{
+    const char *key;
+    json_t *val;
+    zhash_t *h = NULL;
+    int errnum = EPROTO;
+
+    if (!json_is_object (o))
+        goto fail;
+
+    if (!(h = zhash_new ())) {
+        errnum = ENOMEM;
+        goto fail;
+    }
+    zhash_autofree (h);
+
+    json_object_foreach (o, key, val) {
+        if (!json_is_string (val))
+            goto fail;
+        if (zhash_insert (h, key, (char *) json_string_value (val)) < 0) {
+            /* Duplicate key. 
This can't happen unless json object is
+             * corrupt, so give up and return error (EPROTO)
+             */
+            goto fail;
+        }
+    }
+    return h;
+fail:
+    if (h)
+        zhash_destroy (&h);
+    errno = errnum;
+    return NULL;
+}
+
+/*
+ * New zlist of strings from json array `o`.
+ * "autofree" will be set on the list.
+ */
+static zlist_t *zlist_fromjson (json_t *o)
+{
+    int errnum = EPROTO;
+    size_t index;
+    json_t *value;
+    zlist_t *l = NULL;
+
+    if (!json_is_array (o))
+        goto fail;
+    if (!(l = zlist_new ())) {
+        errnum = ENOMEM;
+        goto fail;
+    }
+    zlist_autofree (l);
+
+    json_array_foreach (o, index, value) {
+        if (!json_is_string (value))
+            goto fail;
+        if (zlist_append (l, (char *) json_string_value (value)) < 0) {
+            errnum = errno;
+            goto fail;
+        }
+    }
+    return l;
+fail:
+    zlist_destroy (&l);
+    errno = errnum;
+    return NULL;
+}
+
+/*
+ * Convert a list of strings to a new json array.
+ * Returns NULL on failure.
+ */
+static json_t * zlist_tojson (zlist_t *l)
+{
+    char *s = NULL;
+    json_t *o = json_array ();
+
+    if (o == NULL)
+        goto err;
+
+    s = zlist_first (l);
+    while (s) {
+        json_t *val = json_string (s);
+        /* json_array_append_new() steals the reference to val even
+         * when it fails, so val must not be released again here. */
+        if (!val || json_array_append_new (o, val))
+            goto err;
+        s = zlist_next (l);
+    }
+    return o;
+err:
+    json_decref (o);
+    return NULL;
+}
+
+
+/* Return the first item in list `l` equal to string `s`, or NULL if
+ * there is none.  Moves the list cursor.
+ */
+static const char * z_list_find (zlist_t *l, const char *s)
+{
+    const char *v = zlist_first (l);
+    while (v != NULL) {
+        if (strcmp (s, v) == 0)
+            return (v);
+        v = zlist_next (l);
+    }
+    return NULL;
+}
+
+/* Version of zhash_dup() that duplicates both string keys and values.
+ * Returns NULL with errno set to ENOMEM on failure.
+ */
+static zhash_t * z_hash_dup (zhash_t *src)
+{
+    zhash_t *new = NULL;
+    zlist_t *keys = zhash_keys (src);
+    const char *k;
+
+    if (!keys || !(new = zhash_new ()))
+        goto error;
+    zhash_autofree (new);
+
+    k = zlist_first (keys);
+    while (k) {
+        if (zhash_insert (new, k, zhash_lookup (src, k)) < 0)
+            goto error;
+        k = zlist_next (keys);
+    }
+    zlist_destroy (&keys);
+    return (new);
+error:
+    zlist_destroy (&keys);
+    zhash_destroy (&new);
+    errno = ENOMEM;
+    return NULL;
+}
+
+/***************************************************************************/
+/*
+ * flux_cmd_t interface
+ */
+static void flux_cmd_free (flux_cmd_t *cmd)
+{
+    if (cmd) {
+        free (cmd->cwd);
+        free (cmd->argz);
+        free (cmd->envz);
+        if (cmd->opts)
+            zhash_destroy (&cmd->opts);
+        if (cmd->channels)
+            zlist_destroy (&cmd->channels);
+        
free (cmd);
+    }
+}
+
+void flux_cmd_destroy (flux_cmd_t *cmd)
+{
+    flux_cmd_free (cmd);
+}
+
+/*
+ * Create a command from argument vector `argv` (which must have NULL at
+ * index `argc`) and environment `env`.  Either may be NULL.
+ * Returns NULL with errno set on failure.
+ */
+flux_cmd_t *flux_cmd_create (int argc, char *argv[], char **env)
+{
+    int err;
+    flux_cmd_t *cmd = calloc (1, sizeof (*cmd));
+
+    if (cmd == NULL) {
+        errno = ENOMEM;
+        return NULL;
+    }
+    if (argv && init_argz_count (&cmd->argz, &cmd->argz_len, argc, argv) < 0) {
+        err = errno;
+        goto fail;
+    }
+    if (env && init_argz (&cmd->envz, &cmd->envz_len, env) < 0) {
+        err = errno;
+        goto fail;
+    }
+
+    if (!(cmd->opts = zhash_new ())
+        || !(cmd->channels = zlist_new ())) {
+        err = ENOMEM;
+        goto fail;
+    }
+
+    /* Set autofree on both the opts hash and the channels list.
+     *
+     * This means values in the hash and items in the list will automatically
+     * be strdup'd on insertion, and freed on destruction. For zlist
+     * it also makes zlist_dup() duplicate values instead of referencing
+     * the originals.
+     */
+    zhash_autofree (cmd->opts);
+    zlist_autofree (cmd->channels);
+
+    return (cmd);
+fail:
+    flux_cmd_free (cmd);
+    errno = err;
+    return NULL;
+}
+
+/* Return the number of arguments held by `cmd`. */
+int flux_cmd_argc (const flux_cmd_t *cmd)
+{
+    return argz_count (cmd->argz, cmd->argz_len);
+}
+
+/* Return argument `n` (0-origin) of `cmd`, or NULL with errno set to
+ * EINVAL if `n` is out of range.
+ */
+const char *flux_cmd_arg (const flux_cmd_t *cmd, int n)
+{
+    char *arg = NULL;
+    int argc;
+    int i;
+
+    argc = flux_cmd_argc (cmd);
+    if (n < 0 || n >= argc) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    for (i = 0; i <= n; i++)
+        arg = argz_next (cmd->argz, cmd->argz_len, arg);
+
+    return arg;
+}
+
+int flux_cmd_argv_append (flux_cmd_t *cmd, const char *fmt, ...) 
+{
+    int rc = 0;
+    int errnum = 0;
+    va_list ap;
+    va_start (ap, fmt);
+    if ((rc = argz_appendv (&cmd->argz, &cmd->argz_len, fmt, ap)) < 0)
+        errnum = errno;
+    va_end (ap);
+    /* Restore the failure errno in case va_end() disturbed it, but leave
+     * the caller's errno alone on success. */
+    if (rc < 0)
+        errno = errnum;
+    return (rc);
+}
+
+/* Set environment variable `k` to `v` in `cmd`.  If `overwrite` is zero
+ * and `k` is already set, fail with EEXIST.
+ */
+static int flux_cmd_setenv (flux_cmd_t *cmd, const char *k, const char *v,
+                            int overwrite)
+{
+    if (!overwrite && envz_entry (cmd->envz, cmd->envz_len, k)) {
+        errno = EEXIST;
+        return -1;
+    }
+    if (envz_add (&cmd->envz, &cmd->envz_len, k, v) != 0) {
+        errno = ENOMEM;
+        return -1;
+    }
+    return 0;
+}
+
+/* Set environment variable `name` to the printf-style formatted value. */
+int flux_cmd_setenvf (flux_cmd_t *cmd, int overwrite,
+                      const char *name, const char *fmt, ...)
+{
+    va_list ap;
+    char *val;
+    int rc;
+
+    va_start (ap, fmt);
+    rc = vasprintf (&val, fmt, ap);
+    va_end (ap);
+    if (rc < 0)
+        return rc;
+    rc = flux_cmd_setenv (cmd, name, val, overwrite);
+    free (val);
+    return (rc);
+}
+
+void flux_cmd_unsetenv (flux_cmd_t *cmd, const char *name)
+{
+    envz_remove (&cmd->envz, &cmd->envz_len, name);
+}
+
+const char * flux_cmd_getenv (const flux_cmd_t *cmd, const char *name)
+{
+    return (envz_get (cmd->envz, cmd->envz_len, name));
+}
+
+/* Set the working directory of `cmd` to a copy of `path`.
+ * On failure the previous working directory is left unchanged.
+ */
+int flux_cmd_setcwd (flux_cmd_t *cmd, const char *path)
+{
+    char *cwd;
+
+    if (path == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!(cwd = strdup (path))) {
+        errno = ENOMEM;
+        return -1;
+    }
+    free (cmd->cwd);
+    cmd->cwd = cwd;
+    return 0;
+}
+
+const char * flux_cmd_getcwd (const flux_cmd_t *cmd)
+{
+    return cmd->cwd;
+}
+
+/* Add channel `name` to `cmd`.  Fails with EEXIST if the channel has
+ * already been added.
+ */
+int flux_cmd_add_channel (flux_cmd_t *cmd, const char *name)
+{
+    if (name == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (z_list_find (cmd->channels, name)) {
+        errno = EEXIST;
+        return -1;
+    }
+    /* autofree is set on cmd->channels, so name is automatically strdup'd */
+    if (zlist_append (cmd->channels, (char *) name) < 0) {
+        errno = ENOMEM;
+        return -1;
+    }
+    return 0;
+}
+
+/* Set option `var` to `val`.  zhash_insert() does not replace an existing
+ * entry, so setting an option that is already present fails with EEXIST.
+ */
+int flux_cmd_setopt (flux_cmd_t *cmd, const char *var, const char *val)
+{
+    if (!var || !val) {
+        errno = EINVAL;
+        return -1;
+    }
+    /* autofree is set on cmd->opts, so val is automatically strdup'd */
+    if (zhash_insert (cmd->opts, var, (char *) val) < 0) {
+        errno = EEXIST;
+        return -1;
+    }
+    return 0;
+}
+
+const char *flux_cmd_getopt (flux_cmd_t *cmd, const char *var)
+{
+    
return zhash_lookup (cmd->opts, var);
+}
+
+/* Return a deep copy of `src`, or NULL with errno set to ENOMEM. */
+flux_cmd_t * flux_cmd_copy (const flux_cmd_t *src)
+{
+    error_t e = 0;
+    flux_cmd_t *cmd = calloc (1, sizeof (*cmd));
+    if (cmd == NULL)
+        goto err;
+    e = argz_append (&cmd->argz, &cmd->argz_len, src->argz, src->argz_len);
+    if (e != 0)
+        goto err;
+    e = argz_append (&cmd->envz, &cmd->envz_len, src->envz, src->envz_len);
+    if (e != 0)
+        goto err;
+    if (src->cwd && !(cmd->cwd = strdup (src->cwd)))
+        goto err;
+    if (!(cmd->channels = zlist_dup (src->channels))
+        || !(cmd->opts = z_hash_dup (src->opts)))
+        goto err;
+    return (cmd);
+err:
+    flux_cmd_destroy (cmd);
+    errno = ENOMEM;
+    return NULL;
+}
+
+flux_cmd_t * flux_cmd_fromjson (const char *json_str, json_error_t *errp)
+{
+    int errnum;
+    json_t *o = NULL;
+    json_t *jenv = NULL;
+    json_t *jargv = NULL;
+    json_t *jopts = NULL;
+    json_t *jchans = NULL;
+    const char *cwd = NULL;
+    flux_cmd_t *cmd = NULL;
+
+    if (!(o = json_loads (json_str, 0, errp))) {
+        errnum = EPROTO;
+        goto fail;
+    }
+    if (!(cmd = calloc (1, sizeof (*cmd)))) {
+        errnum = ENOMEM;
+        goto fail;
+    }
+    /* "cwd" is optional: flux_cmd_tojson() omits it when no working
+     * directory has been set on the command.
+     */
+    if (json_unpack_ex (o, errp, 0, "{s?:s, s:o, s:o, s:o, s:o}",
+                        "cwd", &cwd,
+                        "cmdline", &jargv,
+                        "env", &jenv,
+                        "opts", &jopts,
+                        "channels", &jchans) < 0) {
+        errnum = EPROTO;
+        goto fail;
+    }
+    if ((cwd && !(cmd->cwd = strdup (cwd)))
+        || (argz_fromjson (jargv, &cmd->argz, &cmd->argz_len) < 0)
+        || (envz_fromjson (jenv, &cmd->envz, &cmd->envz_len) < 0)
+        || !(cmd->opts = zhash_fromjson (jopts))
+        || !(cmd->channels = zlist_fromjson (jchans))) {
+        errnum = errno;
+        goto fail;
+    }
+    /* All sub-objects of `o` inherit reference from root object so
+     * this decref should free jenv, jargv, ... etc. 
+     */
+    json_decref (o);
+    return cmd;
+
+fail:
+    json_decref (o);
+    flux_cmd_destroy (cmd);
+    errno = errnum;
+    return NULL;
+}
+
+/*
+ * Encode `cmd` as a compact JSON string.  "cwd", "cmdline" and "env" are
+ * only emitted when set; "opts" and "channels" are always emitted.
+ * Returns the result of json_dumps(), or NULL on failure.
+ *
+ * Note: json_object_set_new() steals the reference to the value even
+ * when it fails, so the value is never released again on error paths.
+ */
+char * flux_cmd_tojson (const flux_cmd_t *cmd)
+{
+    char *str = NULL;
+    json_t *o = json_object ();
+    json_t *a;
+
+    if (o == NULL)
+        return NULL;
+
+    /* Pack cwd */
+    if (cmd->cwd) {
+        if (!(a = json_string (cmd->cwd))
+            || json_object_set_new (o, "cwd", a) != 0)
+            goto err;
+    }
+
+    /* Pack argv */
+    if (cmd->argz) {
+        if (!(a = argz_tojson (cmd->argz, cmd->argz_len))
+            || json_object_set_new (o, "cmdline", a) != 0)
+            goto err;
+    }
+
+    /* Pack env */
+    if (cmd->envz) {
+        if (!(a = envz_tojson (cmd->envz, cmd->envz_len))
+            || json_object_set_new (o, "env", a) != 0)
+            goto err;
+    }
+
+    /* Pack opts dictionary */
+    if (!(a = zhash_tojson (cmd->opts))
+        || json_object_set_new (o, "opts", a) != 0)
+        goto err;
+
+    /* Pack channels */
+    if (!(a = zlist_tojson (cmd->channels))
+        || json_object_set_new (o, "channels", a) != 0)
+        goto err;
+
+    str = json_dumps (o, JSON_COMPACT);
+    json_decref (o);
+    return str;
+err:
+    json_decref (o);
+    return NULL;
+}
+
+char **flux_cmd_env_expand (flux_cmd_t *cmd)
+{
+    return expand_argz (cmd->envz, cmd->envz_len);
+}
+
+char **flux_cmd_argv_expand (flux_cmd_t *cmd)
+{
+    return expand_argz (cmd->argz, cmd->argz_len);
+}
+
+/* Replace the environment of `cmd` with a copy of `env`.  The existing
+ * environment is only discarded once the new one has been built.
+ */
+int flux_cmd_set_env (flux_cmd_t *cmd, char **env)
+{
+    size_t new_envz_len = 0;
+    char *new_envz = NULL;
+
+    if (init_argz (&new_envz, &new_envz_len, env) < 0)
+        return -1;
+
+    free (cmd->envz);
+    cmd->envz = new_envz;
+    cmd->envz_len = new_envz_len;
+
+    return 0;
+}
+
+zlist_t *flux_cmd_channel_list (flux_cmd_t *cmd)
+{
+    return cmd->channels;
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/src/common/subprocess/command.h b/src/common/subprocess/command.h
new file mode 100644
index 000000000000..5296407f1989 
--- /dev/null
+++ b/src/common/subprocess/command.h
@@ -0,0 +1,45 @@
+#ifndef _SUBPROCESS_CMD_H
+#define _SUBPROCESS_CMD_H
+
+#include
+#include
+
+#include "subprocess.h"
+
+/*
+ * Internal-only flux_cmd_t interfaces
+ */
+
+/*
+ * Return JSON string representation of command object `cmd`, or NULL on error
+ */
+char * flux_cmd_tojson (const flux_cmd_t *cmd);
+
+/*
+ * Return a newly allocated flux_cmd_t from a JSON string representation.
+ * Returns NULL on failure with errno set.
+ * If errp is non-NULL, any jansson decode errors are returned in *errp.
+ */
+flux_cmd_t *flux_cmd_fromjson (const char *json_str, json_error_t *errp);
+
+/*
+ * Return environment for flux_cmd_t as a NULL-terminated string array.
+ */
+char **flux_cmd_env_expand (flux_cmd_t *cmd);
+
+/*
+ * Return argument vector for flux_cmd_t as a NULL-terminated string array.
+ */
+char **flux_cmd_argv_expand (flux_cmd_t *cmd);
+
+/*
+ * Set an entirely new environment from `env`, discarding the internal one.
+ */
+int flux_cmd_set_env (flux_cmd_t *cmd, char **env);
+
+/*
+ * Return the internal list of channels. Must not be destroyed by the caller.
+ */
+zlist_t *flux_cmd_channel_list (flux_cmd_t *cmd);
+
+#endif /* !_SUBPROCESS_CMD_H */
diff --git a/src/common/subprocess/local.c b/src/common/subprocess/local.c
new file mode 100644
index 000000000000..47b629cc49de
--- /dev/null
+++ b/src/common/subprocess/local.c
@@ -0,0 +1,703 @@
+/*****************************************************************************\
+ * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at
+ * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS).
+ * LLNL-CODE-658032 All rights reserved.
+ *
+ * This file is part of the Flux resource manager framework.
+ * For details, see https://github.com/flux-framework. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/fdwalk.h" +#include "src/common/libutil/base64.h" + +#include "subprocess.h" +#include "subprocess_private.h" +#include "command.h" +#include "local.h" +#include "util.h" + +static void local_channel_flush (struct subprocess_channel *c) +{ + /* This is a full channel with read and write, a close on the + * write side needs to "generate" an EOF on the read side + */ + if (!(c->flags & CHANNEL_READ)) + return; + + if (!c->eof_sent_to_caller && c->output_f) { + flux_buffer_t *fb; + int len; + + if (!(fb = flux_buffer_read_watcher_get_buffer (c->buffer_read_w))) { + flux_log_error (c->p->h, "flux_buffer_read_watcher_get_buffer"); + return; + } + + while ((len = flux_buffer_bytes (fb)) > 0) + c->output_f (c->p, c->name); + + /* eof call */ + c->output_f (c->p, c->name); + + c->eof_sent_to_caller = true; + c->p->channels_eof_sent++; + flux_watcher_stop (c->buffer_read_w); + + if (c->p->state == FLUX_SUBPROCESS_EXITED && 
c->eof_sent_to_caller) + subprocess_check_completed (c->p); + } +} + +static void local_in_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + struct subprocess_channel *c = (struct subprocess_channel *)arg; + int err = 0; + + if (flux_buffer_write_watcher_is_closed (w, &err) == 1) { + if (err) + log_msg ("flux_buffer_write_watcher close error: %s", + strerror (err)); + else + c->parent_fd = -1; /* closed by reactor */ + flux_watcher_stop (w); /* c->buffer_write_w */ + local_channel_flush (c); + } + else + flux_log_error (c->p->h, "flux_buffer_write_watcher: stream %s: %d:", + c->name, revents); +} + +static void local_output (struct subprocess_channel *c, + flux_watcher_t *w, int revents, + flux_subprocess_output_f output_cb) +{ + bool eof_set = false; + + if (revents & FLUX_POLLIN) { + flux_buffer_t *fb; + if (!c->eof_sent_to_caller) { + + if (!(fb = flux_buffer_read_watcher_get_buffer (w))) { + flux_log_error (c->p->h, "flux_buffer_read_watcher_get_buffer"); + return; + } + + if (!flux_buffer_bytes (fb)) { + c->eof_sent_to_caller = true; + eof_set = true; + c->p->channels_eof_sent++; + } + } + + output_cb (c->p, c->name); + + if (c->p->state == FLUX_SUBPROCESS_EXITED && !c->eof_sent_to_caller) { + + if (!(fb = flux_buffer_read_watcher_get_buffer (w))) { + flux_log_error (c->p->h, "flux_buffer_read_watcher_get_buffer"); + return; + } + + if (!flux_buffer_bytes (fb)) { + + output_cb (c->p, c->name); + + c->eof_sent_to_caller = true; + eof_set = true; + c->p->channels_eof_sent++; + } + } + } + else + flux_log_error (c->p->h, "flux_buffer_read_watcher on %s: 0x%X:", + c->name, revents); + + if (eof_set) { + flux_watcher_stop (w); + + /* if the read pipe is ended, then we can go ahead and close + * the write side as well. Note that there is no need to + * "flush" the write buffer. If we've received the EOF on the + * read side, no more writes matter. 
+ */ + if (c->flags & CHANNEL_WRITE) { + flux_watcher_stop (c->buffer_write_w); + c->closed = true; + } + } + + if (c->p->state == FLUX_SUBPROCESS_EXITED && c->eof_sent_to_caller) + subprocess_check_completed (c->p); +} + +static void local_out_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + struct subprocess_channel *c = (struct subprocess_channel *)arg; + local_output (c, w, revents, c->p->ops.on_channel_out); +} + +static void local_stdout_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + struct subprocess_channel *c = (struct subprocess_channel *)arg; + local_output (c, w, revents, c->p->ops.on_stdout); +} + +static void local_stderr_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + struct subprocess_channel *c = (struct subprocess_channel *)arg; + local_output (c, w, revents, c->p->ops.on_stderr); +} + +static int channel_local_setup (flux_subprocess_t *p, + flux_subprocess_output_f output_f, + flux_watcher_f in_cb, + flux_watcher_f out_cb, + const char *name, + int channel_flags, + int buffer_size) +{ + struct subprocess_channel *c = NULL; + int fds[2] = { -1, -1 }; + char *e = NULL; + int save_errno; + + if (!(c = channel_create (p, output_f, name, channel_flags))) { + flux_log_error (p->h, "calloc"); + goto error; + } + + if (socketpair (PF_LOCAL, SOCK_STREAM, 0, fds) < 0) { + flux_log_error (p->h, "socketpair"); + goto error; + } + + c->parent_fd = fds[0]; + c->child_fd = fds[1]; + + /* set fds[] to -1, on error is now subprocess_free()'s + * responsibility + */ + fds[0] = -1; + fds[1] = -1; + + if ((channel_flags & CHANNEL_WRITE) && in_cb) { + c->buffer_write_w = flux_buffer_write_watcher_create (p->reactor, + c->parent_fd, + buffer_size, + in_cb, + 0, + c); + if (!c->buffer_write_w) { + flux_log_error (p->h, "flux_buffer_write_watcher_create"); + goto error; + } + } + + if ((channel_flags & CHANNEL_READ) && out_cb) { + c->buffer_read_w = flux_buffer_read_watcher_create (p->reactor, + 
c->parent_fd, + buffer_size, + out_cb, + 0, + c); + if (!c->buffer_read_w) { + flux_log_error (p->h, "flux_buffer_read_watcher_create"); + goto error; + } + + p->channels_eof_expected++; + } + + if (channel_flags & CHANNEL_FD) { + if (asprintf (&e, "%s_FD", name) < 0) { + flux_log_error (p->h, "asprintf"); + goto error; + } + + /* set overwrite flag, if caller recursively launches + * another subprocess */ + if (flux_cmd_setenvf (p->cmd, + 1, + e, + "%d", + c->child_fd) < 0) { + flux_log_error (p->h, "flux_cmd_setenvf"); + goto error; + } + } + + if (zhash_insert (p->channels, name, c) < 0) { + flux_log_error (p->h, "zhash_insert"); + goto error; + } + if (!zhash_freefn (p->channels, name, channel_destroy)) { + flux_log_error (p->h, "zhash_freefn"); + goto error; + } + + /* now error is in subprocess_free()'s responsibility + */ + c = NULL; + + free (e); + return 0; + +error: + save_errno = errno; + close_pair_fds (fds); + channel_destroy (c); + free (e); + errno = save_errno; + return -1; +} + +static int local_setup_stdio (flux_subprocess_t *p) +{ + int buffer_size; + + if (p->flags & FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH) + return 0; + + /* stdio is identical to channels, except they are limited to read + * and/or write, and the buffer's automatically get a NUL char + * appended on reads */ + + if ((buffer_size = cmd_option_bufsize (p, "STDIN")) < 0) + return -1; + + if (channel_local_setup (p, + NULL, + local_in_cb, + NULL, + "STDIN", + CHANNEL_WRITE, + buffer_size) < 0) + return -1; + + if (p->ops.on_stdout) { + if ((buffer_size = cmd_option_bufsize (p, "STDOUT")) < 0) + return -1; + + if (channel_local_setup (p, + p->ops.on_stdout, + NULL, + local_stdout_cb, + "STDOUT", + CHANNEL_READ, + buffer_size) < 0) + return -1; + } + + if (p->ops.on_stderr) { + if ((buffer_size = cmd_option_bufsize (p, "STDERR")) < 0) + return -1; + + if (channel_local_setup (p, + p->ops.on_stderr, + NULL, + local_stderr_cb, + "STDERR", + CHANNEL_READ, + buffer_size) < 0) + return 
-1; + } + + return 0; +} + +static int local_setup_channels (flux_subprocess_t *p) +{ + zlist_t *channels; + const char *name; + int channel_flags = CHANNEL_READ | CHANNEL_WRITE | CHANNEL_FD; + int len; + + if (!(channels = flux_cmd_channel_list (p->cmd))) { + flux_log_error (p->h, "flux_cmd_channel_list"); + return -1; + } + + if (!(len = zlist_size (channels))) + return 0; + + if (!p->ops.on_channel_out) + channel_flags &= ~CHANNEL_READ; + + name = zlist_first (channels); + while (name) { + int buffer_size; + + if ((buffer_size = cmd_option_bufsize (p, name)) < 0) + return -1; + + if (channel_local_setup (p, + p->ops.on_channel_out, + local_in_cb, + p->ops.on_channel_out ? local_out_cb : NULL, + name, + channel_flags, + buffer_size) < 0) + return -1; + name = zlist_next (channels); + } + + return 0; +} + +static int sigmask_unblock_all (void) +{ + sigset_t mask; + sigemptyset (&mask); + return sigprocmask (SIG_SETMASK, &mask, NULL); +} + +static void close_fds (flux_subprocess_t *p, bool parent) +{ + struct subprocess_channel *c; + int f = parent ? 
0 : 1; + + close (p->sync_fds[f]); + p->sync_fds[f] = -1; + + /* note, it is safe to iterate via zhash, child & parent will have + * different copies of zhash */ + c = zhash_first (p->channels); + while (c) { + if (parent && c->parent_fd != -1) { + close (c->parent_fd); + c->parent_fd = -1; + } + else if (!parent && c->child_fd != -1) { + close (c->child_fd); + c->child_fd = -1; + } + c = zhash_next (p->channels); + } +} + +static void close_parent_fds (flux_subprocess_t *p) +{ + close_fds (p, true); +} + +static void close_child_fds (flux_subprocess_t *p) +{ + close_fds (p, false); +} + +static void closefd_child (void *arg, int fd) +{ + flux_subprocess_t *p = arg; + struct subprocess_channel *c; + if (fd < 3 || fd == p->sync_fds[1]) + return; + c = zhash_first (p->channels); + while (c) { + if (c->child_fd == fd) { + int flags = fcntl (fd, F_GETFD, 0); + if (flags >= 0) + (void) fcntl (fd, F_SETFD, flags & ~FD_CLOEXEC); + return; + } + c = zhash_next (p->channels); + } + close (fd); +} + +/* Signal parent that child is ready for exec(2) and wait for parent's + * signal to proceed. This is done by writing 1 byte to child side of + * socketpair, and waiting for parent to write one byte back. 
+ * + */ +static int local_child_ready (flux_subprocess_t *p) +{ + int n; + int fd = p->sync_fds[1]; + char c = 0; + + if (write (fd, &c, sizeof (c)) != 1) { + flux_log_error (p->h, "local_child_ready: write"); + return -1; + } + if ((n = read (fd, &c, sizeof (c))) != 1) { + flux_log_error (p->h, "local_child_ready: read (fd=%d): rc=%d", fd, n); + return -1; + } + return 0; +} + +static void local_child_report_exec_failed_errno (flux_subprocess_t *p, int e) +{ + int fd = p->sync_fds[1]; + if (write (fd, &e, sizeof (e)) != sizeof (e)) + flux_log_error (p->h, "local_child_report_exec_failed_errno"); +} + +static int local_child (flux_subprocess_t *p) +{ + struct subprocess_channel *c; + int errnum; + char **argv; + const char *cwd; + + if (sigmask_unblock_all () < 0) + flux_log_error (p->h, "sigprocmask"); + + close_parent_fds (p); + + if (!(p->flags & FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH)) { + if ((c = zhash_lookup (p->channels, "STDIN"))) { + if (dup2 (c->child_fd, STDIN_FILENO) < 0) { + flux_log_error (p->h, "dup2"); + exit (1); + } + } + + if ((c = zhash_lookup (p->channels, "STDOUT"))) { + if (dup2 (c->child_fd, STDOUT_FILENO) < 0) { + flux_log_error (p->h, "dup2"); + exit (1); + } + } + else + close (STDOUT_FILENO); + + if ((c = zhash_lookup (p->channels, "STDERR"))) { + if (dup2 (c->child_fd, STDERR_FILENO) < 0) { + flux_log_error (p->h, "dup2"); + exit (1); + } + } + else + close (STDERR_FILENO); + } + + // Change working directory + if ((cwd = flux_cmd_getcwd (p->cmd)) && chdir (cwd) < 0) { + flux_log_error (p->h, "Couldn't change dir to %s: going to /tmp instead", cwd); + if (chdir ("/tmp") < 0) + exit (1); + } + + // Send ready to parent + if (local_child_ready (p) < 0) + exit (1); + + // Close fds + if (fdwalk (closefd_child, (void *) p) < 0) { + flux_log_error (p->h, "Failed closing all fds"); + exit (1); + } + + if (p->flags & FLUX_SUBPROCESS_FLAGS_SETPGRP) { + if (setpgrp () < 0) { + flux_log_error (p->h, "setpgrp"); + exit (1); + } + } + + environ 
= flux_cmd_env_expand (p->cmd); + /* NB: argv expansion presumably returns NULL on failure (TODO confirm + * against command.c); never hand that to execvp(3). Fall through to + * the exec-failure report below instead, forcing a non-zero errno + * since the parent treats a reported 0 as a successful exec. + */ + if ((argv = flux_cmd_argv_expand (p->cmd))) + execvp (argv[0], argv); + else if (errno == 0) + errno = ENOMEM; + + errnum = errno; + /* + * NB: close stdout and stderr here to avoid flushing buffers at exit. + * Flushing can cause duplicate output if parent was running in fully + * buffered mode, and there was buffered output. + */ + close (STDOUT_FILENO); + local_child_report_exec_failed_errno (p, errnum); + close (STDERR_FILENO); + /* exit code doesn't matter, can't be returned to user */ + exit (1); +} + +/* Wait for child to indicate it is ready for exec(2) by doing a blocking + * read() of one byte on parent side of sync_fds. + */ +static int subprocess_parent_wait_on_child (flux_subprocess_t *p) +{ + char c; + + if (read (p->sync_fds[0], &c, sizeof (c)) != 1) { + flux_log_error (p->h, "subprocess_parent_wait_on_child: read"); + return -1; + } + return 0; +} + +static void child_watch_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + flux_subprocess_t *p = arg; + int status; + + if ((status = flux_child_watcher_get_rstatus (w)) < 0) { + flux_log_error (p->h, "flux_child_watcher_get_rstatus"); + return; + } + + p->status = status; + + if (WIFEXITED (p->status) || WIFSIGNALED (p->status)) { + + /* remote/server code may have set EXEC_FAILED or + * FAILED on fatal errors. 
+ */ + if (p->state == FLUX_SUBPROCESS_RUNNING) { + p->state = FLUX_SUBPROCESS_EXITED; + state_change_start (p); + } + + /* Child watcher no longer needed, pid now invalid */ + if (p->child_w) + flux_watcher_stop (p->child_w); + } + + if (p->state == FLUX_SUBPROCESS_EXITED) + subprocess_check_completed (p); +} + +static int local_fork (flux_subprocess_t *p) +{ + if ((p->pid = fork ()) < 0) + return -1; + + if (p->pid == 0) + local_child (p); /* No return */ + + close_child_fds (p); + + /* no-op if reactor is !FLUX_REACTOR_SIGCHLD */ + if (!(p->child_w = flux_child_watcher_create (p->reactor, + p->pid, + true, + child_watch_cb, + p))) { + flux_log_error (p->h, "flux_child_watcher_create"); + return -1; + } + + flux_watcher_start (p->child_w); + + if (subprocess_parent_wait_on_child (p) < 0) + return -1; + + p->state = FLUX_SUBPROCESS_STARTED; + + return (0); +} + +/* Signal child to proceed with exec(2) and read any error from exec + * back on sync_fds. Return < 0 on failure to signal, or > 0 errnum if + * an exec error was returned from child. + */ +static int local_release_child (flux_subprocess_t *p) +{ + int fd = p->sync_fds[0]; + char c = 0; + int e = 0; + ssize_t n; + + if (write (fd, &c, sizeof (c)) != 1) + return -1; + if ((n = read (fd, &e, sizeof (e))) < 0) + return -1; + else if (n == sizeof (int)) { + // exec error received + return e; + } + /* else n == 0, child exec'ed and closed sync_fds[1] */ + + /* no longer need this fd */ + close (p->sync_fds[0]); + p->sync_fds[0] = -1; + return 0; +} + +static int local_exec (flux_subprocess_t *p) +{ + if ((p->exec_failed_errno = local_release_child (p)) != 0) { + /* + * Reap child immediately. Expectation from caller is that + * failure to exec will not require subsequent reaping of + * child. 
+ */ + int status; + pid_t pid; + if ((pid = waitpid (p->pid, &status, 0)) <= 0) + return -1; + p->status = status; + + /* spiritually in the FLUX_SUBPROCESS_EXEC_FAILED state at this + * point. + * NOTE(review): exec_failed_errno is whatever + * local_release_child() returned, i.e. -1 rather than an errno + * value when signalling the child failed -- confirm intended. + */ + errno = p->exec_failed_errno; + return -1; + } + p->state = FLUX_SUBPROCESS_RUNNING; + + return 0; +} +/* Start the buffer write and read watchers of every channel in + * p->channels. + */ +static void start_local_watchers (flux_subprocess_t *p) +{ + struct subprocess_channel *c; + + c = zhash_first (p->channels); + while (c) { + flux_watcher_start (c->buffer_write_w); + flux_watcher_start (c->buffer_read_w); + c = zhash_next (p->channels); + } +} +/* Set up a local subprocess: stdio channels, auxiliary channels, fork, + * release of the child into exec, then start of the channel watchers. + * Returns 0 on success, -1 as soon as any step fails. + */ +int subprocess_local_setup (flux_subprocess_t *p) +{ + if (local_setup_stdio (p) < 0) + return -1; + if (local_setup_channels (p) < 0) + return -1; + if (local_fork (p) < 0) + return -1; + if (local_exec (p) < 0) + return -1; + start_local_watchers (p); + return 0; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/local.h b/src/common/subprocess/local.h new file mode 100644 index 000000000000..0d9a231acf9b --- /dev/null +++ b/src/common/subprocess/local.h @@ -0,0 +1,8 @@ +#ifndef _SUBPROCESS_LOCAL_H +#define _SUBPROCESS_LOCAL_H + +#include "subprocess.h" + +int subprocess_local_setup (flux_subprocess_t *p); + +#endif /* !_SUBPROCESS_LOCAL_H */ diff --git a/src/common/subprocess/remote.c b/src/common/subprocess/remote.c new file mode 100644 index 000000000000..ad23268c94c0 --- /dev/null +++ b/src/common/subprocess/remote.c @@ -0,0 +1,771 @@ +/*****************************************************************************\ + * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/fdwalk.h" +#include "src/common/libutil/base64.h" + +#include "subprocess.h" +#include "subprocess_private.h" +#include "command.h" +#include "remote.h" +#include "util.h" + +static void start_channel_watchers (flux_subprocess_t *p) +{ + struct subprocess_channel *c; + c = zhash_first (p->channels); + while (c) { + flux_watcher_start (c->in_prep_w); + flux_watcher_start (c->in_check_w); + flux_watcher_start (c->out_prep_w); + flux_watcher_start (c->out_check_w); + c = zhash_next (p->channels); + } +} + +static void stop_channel_watchers (flux_subprocess_t *p, bool in, bool out) +{ + struct subprocess_channel *c; + c = zhash_first (p->channels); + while (c) { + if (in) { + flux_watcher_stop (c->in_prep_w); + flux_watcher_stop (c->in_idle_w); + flux_watcher_stop (c->in_check_w); + } + if (out) { + flux_watcher_stop (c->out_prep_w); + flux_watcher_stop (c->out_idle_w); + flux_watcher_stop (c->out_check_w); + } + c = zhash_next 
(p->channels); + } +} + +static void stop_io_watchers (flux_subprocess_t *p) +{ + stop_channel_watchers (p, true, true); +} + +static void stop_in_watchers (flux_subprocess_t *p) +{ + stop_channel_watchers (p, true, false); +} + +#if 0 +static void stop_out_watchers (flux_subprocess_t *p) +{ + stop_channel_watchers (p, false, true); +} +#endif + +static void process_new_state (flux_subprocess_t *p, + flux_subprocess_state_t state, + int rank, pid_t pid, int errnum, int status) +{ + if (p->state == FLUX_SUBPROCESS_EXEC_FAILED + || p->state == FLUX_SUBPROCESS_FAILED) + return; + + p->state = state; + + if (p->state == FLUX_SUBPROCESS_RUNNING) { + p->pid = pid; + start_channel_watchers (p); + } + else if (state == FLUX_SUBPROCESS_EXEC_FAILED) { + p->exec_failed_errno = errnum; + stop_io_watchers (p); + } + else if (state == FLUX_SUBPROCESS_EXITED) { + p->status = status; + stop_in_watchers (p); + } + else if (state == FLUX_SUBPROCESS_FAILED) { + p->failed_errno = errnum; + stop_io_watchers (p); + } + + if (p->state != p->state_reported) + state_change_start (p); +} + +static void remote_in_prep_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + struct subprocess_channel *c = arg; + + if (flux_buffer_bytes (c->write_buffer) > 0 + || (c->closed && !c->write_eof_sent) + || (c->p->state == FLUX_SUBPROCESS_EXITED + || c->p->state == FLUX_SUBPROCESS_FAILED)) + flux_watcher_start (c->in_idle_w); +} + +static int remote_write (struct subprocess_channel *c) +{ + flux_future_t *f = NULL; + const void *ptr; + char *s_data = NULL; + int lenp, s_len; + int rv = -1; + + if (!(ptr = flux_buffer_read (c->write_buffer, -1, &lenp))) { + flux_log_error (c->p->h, "flux_buffer_read"); + goto error; + } + + s_len = base64_encode_length (lenp); + + if (!(s_data = calloc (1, s_len))) { + flux_log_error (c->p->h, "calloc"); + goto error; + } + + if (base64_encode_block (s_data, &s_len, ptr, lenp) < 0) { + flux_log_error (c->p->h, "base64_encode_block"); + goto error; 
+ } + + if (!(f = flux_rpc_pack (c->p->h, "cmb.rexec.write", c->p->rank, + FLUX_RPC_NORESPONSE, + "{ s:i s:s s:s s:i }", + "pid", c->p->pid, + "name", c->name, + "data", s_data, + "close", 0))) { + flux_log_error (c->p->h, "flux_rpc_pack"); + /* rv is still -1: leave through the common exit so that s_data + * is freed instead of leaked */ + goto error; + } + + rv = 0; + error: + /* no response */ + flux_future_destroy (f); + free (s_data); + return rv; +} + +/* Send a data-less "cmb.rexec.write" request with close = 1 for channel + * c, addressed to c->p->rank (no response expected). Returns 0, or -1 + * if the request could not be created. + */ +static int remote_close (struct subprocess_channel *c) +{ + flux_future_t *f; + + if (!(f = flux_rpc_pack (c->p->h, "cmb.rexec.write", c->p->rank, + FLUX_RPC_NORESPONSE, + "{ s:i s:s s:i }", + "pid", c->p->pid, + "name", c->name, + "close", 1))) { + flux_log_error (c->p->h, "flux_rpc_pack"); + return -1; + } + + /* no response */ + flux_future_destroy (f); + + /* No need to do a "channel_flush", normal io reactor will handle + * flush of any data in read buffer */ + return 0; +} + +/* Check watcher callback for the write side of a channel: stop the idle + * watcher, forward buffered data with remote_write(), send the close + * request once the buffer is empty and the channel is closed, and stop + * the prep/check watchers after the close was sent or the subprocess + * reached EXITED/FAILED. On failure the subprocess is moved to FAILED, + * a SIGKILL is requested via remote_kill() and p->f is destroyed. + */ +static void remote_in_check_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + struct subprocess_channel *c = arg; + flux_future_t *fkill; + + flux_watcher_stop (c->in_idle_w); + + if (flux_buffer_bytes (c->write_buffer) > 0) { + if (remote_write (c) < 0) { + flux_log_error (c->p->h, "remote_write"); + goto error; + } + } + + if (!flux_buffer_bytes (c->write_buffer) + && c->closed + && !c->write_eof_sent) { + if (remote_close (c) < 0) { + flux_log_error (c->p->h, "remote_close"); + goto error; + } + c->write_eof_sent++; + } + + if (c->write_eof_sent + || c->p->state == FLUX_SUBPROCESS_EXITED + || c->p->state == FLUX_SUBPROCESS_FAILED) { + flux_watcher_stop (c->in_prep_w); + flux_watcher_stop (c->in_check_w); + } + + return; + +error: + process_new_state (c->p, FLUX_SUBPROCESS_FAILED, + c->p->rank, -1, errno, 0); + if (!(fkill = remote_kill (c->p, SIGKILL))) + flux_log_error (c->p->h, "%s: remote_kill", __FUNCTION__); + else + flux_future_destroy (fkill); + flux_future_destroy (c->p->f); + c->p->f = NULL; +} + +static void remote_out_prep_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void 
*arg) +{ + struct subprocess_channel *c = arg; + + /* no need to handle failure states, on fatal error, these + * reactors are closed */ + if (flux_buffer_bytes (c->read_buffer) > 0 + || (c->read_eof_received && !c->eof_sent_to_caller)) + flux_watcher_start (c->out_idle_w); +} + +static void remote_out_check_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + struct subprocess_channel *c = arg; + + flux_watcher_stop (c->out_idle_w); + + if (flux_buffer_bytes (c->read_buffer) > 0) { + c->output_f (c->p, c->name); + } + + if (!flux_buffer_bytes (c->read_buffer) + && c->read_eof_received + && !c->eof_sent_to_caller) { + c->output_f (c->p, c->name); + c->eof_sent_to_caller++; + c->p->channels_eof_sent++; + } + + /* no need to handle failure states, on fatal error, these + * reactors are closed */ + if (c->eof_sent_to_caller) { + flux_watcher_stop (c->out_prep_w); + flux_watcher_stop (c->out_check_w); + + /* close input side as well */ + flux_watcher_stop (c->in_prep_w); + flux_watcher_stop (c->in_idle_w); + flux_watcher_stop (c->in_check_w); + c->closed = true; + } + + if (c->p->state == FLUX_SUBPROCESS_EXITED && c->eof_sent_to_caller) + subprocess_check_completed (c->p); +} + +static int remote_channel_setup (flux_subprocess_t *p, + flux_subprocess_output_f output_f, + const char *name, + int channel_flags, + int buffer_size) +{ + struct subprocess_channel *c = NULL; + char *e = NULL; + int save_errno; + + if (!(c = channel_create (p, output_f, name, channel_flags))) { + flux_log_error (p->h, "calloc"); + goto error; + } + + if (channel_flags & CHANNEL_WRITE) { + if (!(c->write_buffer = flux_buffer_create (buffer_size))) { + flux_log_error (p->h, "flux_buffer_create"); + goto error; + } + + if (!(c->in_prep_w = flux_prepare_watcher_create (p->reactor, + remote_in_prep_cb, + c))) { + flux_log_error (p->h, "flux_prepare_watcher_create"); + goto error; + } + + if (!(c->in_idle_w = flux_idle_watcher_create (p->reactor, + NULL, + c))) { + 
flux_log_error (p->h, "flux_idle_watcher_create"); + goto error; + } + + if (!(c->in_check_w = flux_check_watcher_create (p->reactor, + remote_in_check_cb, + c))) { + flux_log_error (p->h, "flux_check_watcher_create"); + goto error; + } + + /* do not start these watchers till later, cannot send data to + * remote until it has reached running state + */ + } + + if (channel_flags & CHANNEL_READ) { + if (!(c->read_buffer = flux_buffer_create (buffer_size))) { + flux_log_error (p->h, "flux_buffer_create"); + goto error; + } + p->channels_eof_expected++; + + if (!(c->out_prep_w = flux_prepare_watcher_create (p->reactor, + remote_out_prep_cb, + c))) { + flux_log_error (p->h, "flux_prepare_watcher_create"); + goto error; + } + + if (!(c->out_idle_w = flux_idle_watcher_create (p->reactor, + NULL, + c))) { + flux_log_error (p->h, "flux_idle_watcher_create"); + goto error; + } + + if (!(c->out_check_w = flux_check_watcher_create (p->reactor, + remote_out_check_cb, + c))) { + flux_log_error (p->h, "flux_check_watcher_create"); + goto error; + } + + /* don't start these watchers until we've reached the running + * state */ + } + + if (zhash_insert (p->channels, name, c) < 0) { + flux_log_error (p->h, "zhash_insert"); + goto error; + } + if (!zhash_freefn (p->channels, name, channel_destroy)) { + flux_log_error (p->h, "zhash_freefn"); + goto error; + } + + /* now error is in subprocess_free()'s responsibility + */ + c = NULL; + + free (e); + return 0; + + error: + save_errno = errno; + channel_destroy (c); + free (e); + errno = save_errno; + return -1; +} + +static int remote_setup_stdio (flux_subprocess_t *p) +{ + int buffer_size; + + /* stdio is identical to channels, except they are limited to read + * and/or write, and the buffer's automatically get a NUL char + * appended on reads */ + + if ((buffer_size = cmd_option_bufsize (p, "STDIN")) < 0) + return -1; + + if (remote_channel_setup (p, + NULL, + "STDIN", + CHANNEL_WRITE, + buffer_size) < 0) + return -1; + + if 
(p->ops.on_stdout) { + if ((buffer_size = cmd_option_bufsize (p, "STDOUT")) < 0) + return -1; + + if (remote_channel_setup (p, + p->ops.on_stdout, + "STDOUT", + CHANNEL_READ, + buffer_size) < 0) + return -1; + } + + if (p->ops.on_stderr) { + if ((buffer_size = cmd_option_bufsize (p, "STDERR")) < 0) + return -1; + + if (remote_channel_setup (p, + p->ops.on_stderr, + "STDERR", + CHANNEL_READ, + buffer_size) < 0) + return -1; + } + + return 0; +} + +static int remote_setup_channels (flux_subprocess_t *p) +{ + zlist_t *channels; + const char *name; + int channel_flags = CHANNEL_READ | CHANNEL_WRITE | CHANNEL_FD; + int len; + + if (!(channels = flux_cmd_channel_list (p->cmd))) { + flux_log_error (p->h, "flux_cmd_channel_list"); + return -1; + } + + if (!(len = zlist_size (channels))) + return 0; + + if (!p->ops.on_channel_out) + channel_flags &= ~CHANNEL_READ; + + name = zlist_first (channels); + while (name) { + int buffer_size; + + if ((buffer_size = cmd_option_bufsize (p, name)) < 0) + return -1; + + if (remote_channel_setup (p, + p->ops.on_channel_out, + name, + channel_flags, + buffer_size) < 0) + return -1; + name = zlist_next (channels); + } + + return 0; +} + +int subprocess_remote_setup (flux_subprocess_t *p) +{ + if (remote_setup_stdio (p) < 0) + return -1; + if (remote_setup_channels (p) < 0) + return -1; + return 0; +} + +static int remote_state (flux_subprocess_t *p, flux_future_t *f, + int rank) +{ + flux_subprocess_state_t state; + pid_t pid = -1; + int errnum = 0; + int status = 0; + + if (flux_rpc_get_unpack (f, "{ s:i }", "state", &state) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + return -1; + } + + if (state == FLUX_SUBPROCESS_RUNNING) { + if (flux_rpc_get_unpack (f, "{ s:i }", "pid", &pid) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + return -1; + } + } + + if (state == FLUX_SUBPROCESS_EXEC_FAILED + || state == FLUX_SUBPROCESS_FAILED) { + if (flux_rpc_get_unpack (f, "{ s:i }", "errno", 
&errnum) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + return -1; + } + } + + if (state == FLUX_SUBPROCESS_EXITED) { + if (flux_rpc_get_unpack (f, "{ s:i }", "status", &status) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + return -1; + } + } + + process_new_state (p, state, rank, pid, errnum, status); + + return 0; +} + +/* Handle one "output" response: look up the channel named by "stream", + * then either base64-decode "data" into the channel's read buffer or, + * when "eof" is present instead, mark the channel as having received EOF. + * Returns 0 on success, -1 on failure (the caller reports errno). + */ +static int remote_output (flux_subprocess_t *p, flux_future_t *f, + int rank, pid_t pid) +{ + struct subprocess_channel *c; + const char *s_data; + char *data = NULL; + int s_len, len, tmp; + const char *stream; + int eof; + int rv = -1; + int save_errno; + + if (flux_rpc_get_unpack (f, "{ s:s }", "stream", &stream)) { + flux_log_error (p->h, "flux_rpc_get_unpack EPROTO stream"); + goto cleanup; + } + + if (!(c = zhash_lookup (p->channels, stream))) { + /* set errno first so the log line reports EPROTO, not a stale value */ + errno = EPROTO; + flux_log_error (p->h, "invalid channel received: rank = %d, pid = %d, stream = %s", + rank, pid, stream); + goto cleanup; + } + + if (!flux_rpc_get_unpack (f, "{ s:s }", "data", &s_data)) { + + s_len = strlen (s_data); + len = base64_decode_length (s_len); + + if (!(data = calloc (1, len))) { + flux_log_error (p->h, "calloc"); + goto cleanup; + } + + if (base64_decode_block (data, &len, s_data, s_len) < 0) { + flux_log_error (p->h, "base64_decode_block"); + goto cleanup; + } + + if ((tmp = flux_buffer_write (c->read_buffer, data, len)) < 0) { + flux_log_error (p->h, "flux_buffer_write"); + goto cleanup; + } + + /* add list of msgs if there is overflow? 
*/ + + if (tmp != len) { + /* set errno first so the log line reports EOVERFLOW */ + errno = EOVERFLOW; + flux_log_error (p->h, "channel buffer error: rank = %d pid = %d, stream = %s, len = %d", + rank, pid, stream, len); + goto cleanup; + } + } + else if (!flux_rpc_get_unpack (f, "{ s:i }", "eof", &eof)) { + c->read_eof_received = true; + } + + rv = 0; +cleanup: + /* keep errno intact across free() for the caller */ + save_errno = errno; + free (data); + errno = save_errno; + return rv; +} + +static void remote_completion (flux_subprocess_t *p) +{ + p->remote_completed = true; + /* TBON inorder delivery of messages should guarantee we received + * FLUX_SUBPROCESS_EXITED before this. + */ + subprocess_check_completed (p); +} + +/* Continuation for the responses that follow "start": dispatch on "type" + * ("state", "output", "complete"). The future is reset to wait for the + * next response, or destroyed once the stream has completed or failed. + */ +static void remote_exec_cb (flux_future_t *f, void *arg) +{ + flux_subprocess_t *p = arg; + const char *type; + int rank; + pid_t pid; + int save_errno; + + if (flux_rpc_get_unpack (f, "{ s:s s:i }", + "type", &type, + "rank", &rank) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + goto error; + } + + if (!strcmp (type, "state")) { + if (remote_state (p, f, rank) < 0) + goto error; + if (p->state == FLUX_SUBPROCESS_EXEC_FAILED + || p->state == FLUX_SUBPROCESS_FAILED) { + flux_future_destroy (f); + p->f = NULL; + } + else + flux_future_reset (f); + } + else if (!strcmp (type, "output")) { + if (flux_rpc_get_unpack (f, "{ s:i }", "pid", &pid) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + goto error; + } + if (remote_output (p, f, rank, pid) < 0) + goto error; + flux_future_reset (f); + } + else if (!strcmp (type, "complete")) { + remote_completion (p); + flux_future_destroy (f); + p->f = NULL; + } + else { + errno = EPROTO; + flux_log_error (p->h, "%s: EPROTO", __FUNCTION__); + goto error; + } + + return; + +error: + /* capture errno before remote_kill() gets a chance to change it */ + save_errno = errno; + if (p->state == FLUX_SUBPROCESS_RUNNING) { + flux_future_t *fkill; + if (!(fkill = remote_kill (p, SIGKILL))) + flux_log_error (p->h, "%s: remote_kill", __FUNCTION__); + else + flux_future_destroy (fkill); + } + process_new_state (p, FLUX_SUBPROCESS_FAILED, + p->rank, -1, save_errno, 0); + flux_future_destroy (f); + p->f = NULL; +} + +static void 
remote_continuation_cb (flux_future_t *f, void *arg) +{ + flux_subprocess_t *p = arg; + const char *type; + int rank; + int save_errno; + + /* First response of the "cmb.rexec" request: it must be of type + * "start", after which later responses go to remote_exec_cb(). + */ + if (flux_rpc_get_unpack (f, "{ s:s s:i }", + "type", &type, + "rank", &rank) < 0) { + flux_log_error (p->h, "%s: flux_rpc_get_unpack", __FUNCTION__); + goto error; + } + + if (!strcmp (type, "start")) { + flux_future_reset (f); + if (flux_future_then (f, -1., remote_exec_cb, p) < 0) { + flux_log_error (p->h, "flux_future_then"); + goto error; + } + } + else { + /* set errno first so the log line reports EPROTO, not a stale value */ + errno = EPROTO; + flux_log_error (p->h, "%s: EPROTO", __FUNCTION__); + goto error; + } + + return; + +error: + /* error here is fatal, set EXEC_FAILED. + * we can't do anything else b/c we lack a PID or anything similar. + * Capture errno up front so the value recorded as the failure + * reason is also the one restored on return. + */ + save_errno = errno; + process_new_state (p, FLUX_SUBPROCESS_EXEC_FAILED, p->rank, -1, save_errno, 0); + flux_future_destroy (p->f); + p->f = NULL; + errno = save_errno; + return; +} + +int remote_exec (flux_subprocess_t *p) +{ + flux_future_t *f = NULL; + char *cmd_str = NULL; + int save_errno; + + if (!(cmd_str = flux_cmd_tojson (p->cmd))) { + flux_log_error (p->h, "flux_cmd_tojson"); + goto error; + } + + /* completion & state_change cbs always required b/c we use it + * internally in this code. But output callbacks are optional, we + * don't care if user doesn't want it. + */ + if (!(f = flux_rpc_pack (p->h, "cmb.rexec", p->rank, 0, + "{s:s s:i s:i s:i}", + "cmd", cmd_str, + "on_channel_out", p->ops.on_channel_out ? 1 : 0, + "on_stdout", p->ops.on_stdout ? 1 : 0, + "on_stderr", p->ops.on_stderr ? 
1 : 0))) { + flux_log_error (p->h, "flux_rpc"); + goto error; + } + + if (flux_future_then (f, -1., remote_continuation_cb, p) < 0) { + flux_log_error (p->h, "flux_future_then"); + goto error; + } + + p->f = f; + free (cmd_str); + return 0; + + error: + save_errno = errno; + flux_future_destroy (f); + free (cmd_str); + errno = save_errno; + return -1; +} + +flux_future_t *remote_kill (flux_subprocess_t *p, int signum) +{ + flux_future_t *f; + + if (!(f = flux_rpc_pack (p->h, "cmb.rexec.signal", p->rank, 0, + "{s:i s:i}", + "pid", p->pid, + "signum", signum))) { + flux_log_error (p->h, "%s: flux_rpc_pack", __FUNCTION__); + return NULL; + } + return f; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/remote.h b/src/common/subprocess/remote.h new file mode 100644 index 000000000000..9521439a62e0 --- /dev/null +++ b/src/common/subprocess/remote.h @@ -0,0 +1,12 @@ +#ifndef _SUBPROCESS_REMOTE_H +#define _SUBPROCESS_REMOTE_H + +#include "subprocess.h" + +int subprocess_remote_setup (flux_subprocess_t *p); + +int remote_exec (flux_subprocess_t *p); + +flux_future_t *remote_kill (flux_subprocess_t *p, int signum); + +#endif /* !_SUBPROCESS_REMOTE_H */ diff --git a/src/common/subprocess/server.c b/src/common/subprocess/server.c new file mode 100644 index 000000000000..65fa6eab5a56 --- /dev/null +++ b/src/common/subprocess/server.c @@ -0,0 +1,704 @@ +/*****************************************************************************\ + * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/fdwalk.h" +#include "src/common/libutil/base64.h" + +#include "subprocess.h" +#include "subprocess_private.h" +#include "command.h" +#include "remote.h" +#include "server.h" +#include "util.h" + +static int store_pid (flux_subprocess_server_t *s, flux_subprocess_t *p) +{ + pid_t pid = flux_subprocess_pid (p); + char *str = NULL; + int rv = -1; + + if (asprintf (&str, "%d", pid) < 0) { + flux_log_error (s->h, "%s: asprintf", __FUNCTION__); + goto cleanup; + } + + if (zhash_insert (s->subprocesses, str, p) < 0) { + flux_log_error (s->h, "%s: zhash_insert", __FUNCTION__); + goto cleanup; + } + + rv = 0; +cleanup: + free (str); + return rv; +} + +static void remove_pid (flux_subprocess_server_t *s, flux_subprocess_t *p) +{ + pid_t pid = flux_subprocess_pid (p); + char *str = NULL; + + if (asprintf (&str, "%d", pid) < 0) { + flux_log_error (s->h, "%s: asprintf", __FUNCTION__); + goto cleanup; + } + + zhash_delete (s->subprocesses, 
str); + +cleanup: + free (str); +} +/* Return the subprocess stored in s->subprocesses under the decimal + * string form of pid, or NULL if none is found (errno = ENOENT) or the + * key could not be formatted. errno is preserved across free(). + */ +static flux_subprocess_t *lookup_pid (flux_subprocess_server_t *s, pid_t pid) +{ + flux_subprocess_t *p = NULL; + char *str = NULL; + int save_errno; + + if (asprintf (&str, "%d", pid) < 0) + goto cleanup; + + if (!(p = zhash_lookup (s->subprocesses, str))) { + errno = ENOENT; + goto cleanup; + } + +cleanup: + save_errno = errno; + free (str); + errno = save_errno; + return p; +} +/* Drop p from the server's pid table, destroy the message stored in + * the "msg" context and release a reference on p. + */ +static void subprocess_cleanup (flux_subprocess_t *p) +{ + flux_subprocess_server_t *s = flux_subprocess_get_context (p, "server_ctx"); + flux_msg_t *msg = (flux_msg_t *) flux_subprocess_get_context (p, "msg"); + + assert (s && msg); + + remove_pid (s, p); + flux_msg_destroy (msg); + flux_subprocess_unref (p); +} +/* Completion callback: unless the subprocess is in the FAILED state, + * send the "complete" response on the stored request message, then + * clean up. + */ +static void rexec_completion_cb (flux_subprocess_t *p) +{ + flux_subprocess_server_t *s = flux_subprocess_get_context (p, "server_ctx"); + flux_msg_t *msg = (flux_msg_t *) flux_subprocess_get_context (p, "msg"); + + assert (s && msg); + + if (p->state != FLUX_SUBPROCESS_FAILED) { + /* no fallback if this fails */ + if (flux_respond_pack (s->h, msg, "{s:s s:i}", + "type", "complete", + "rank", s->rank) < 0) + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + } + + subprocess_cleanup (p); +} + +static void internal_fatal (flux_subprocess_server_t *s, flux_subprocess_t *p) +{ + if (p->state == FLUX_SUBPROCESS_FAILED) + return; + + /* report of state change handled through typical state change + * callback. Normally cleanup occurs through completion of local + * subprocess. 
+ */ + p->state = FLUX_SUBPROCESS_FAILED; + p->failed_errno = errno; + state_change_start (p); + + /* if we fail here, probably not much can be done */ + if (kill (p->pid, SIGKILL) < 0) { + if (errno != ESRCH) + flux_log_error (s->h, "%s: kill", __FUNCTION__); + } +} + +static void rexec_state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + flux_subprocess_server_t *s = flux_subprocess_get_context (p, "server_ctx"); + flux_msg_t *msg = (flux_msg_t *) flux_subprocess_get_context (p, "msg"); + + assert (s && msg); + + if (state == FLUX_SUBPROCESS_STARTED) { + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i}", + "type", "state", + "rank", s->rank, + "state", state) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + } else if (state == FLUX_SUBPROCESS_RUNNING) { + if (store_pid (s, p) < 0) + goto error; + + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:i}", + "type", "state", + "rank", s->rank, + "pid", flux_subprocess_pid (p), + "state", state) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + } else if (state == FLUX_SUBPROCESS_EXITED) { + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:i}", + "type", "state", + "rank", s->rank, + "state", state, + "status", flux_subprocess_status (p)) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + } else if (state == FLUX_SUBPROCESS_FAILED) { + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:i}", + "type", "state", + "rank", s->rank, + "state", FLUX_SUBPROCESS_FAILED, + "errno", p->failed_errno) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + subprocess_cleanup (p); + } else { + errno = EPROTO; + flux_log_error (s->h, "%s: illegal state", __FUNCTION__); + goto error; + } + + return; + +error: + internal_fatal (s, p); +} + +static int rexec_output_data (flux_subprocess_t *p, const char *stream, + flux_subprocess_server_t *s, flux_msg_t 
*msg, + const char *data, int len) +{ + char *s_data = NULL; + int s_len; + int rv = -1; + + assert (len); + + s_len = base64_encode_length (len); + + if (!(s_data = calloc (1, s_len))) { + flux_log_error (s->h, "%s: calloc", __FUNCTION__); + goto error; + } + + if (base64_encode_block (s_data, &s_len, data, len) < 0) { + flux_log_error (s->h, "%s: base64_encode_block", __FUNCTION__); + goto error; + } + + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:s s:s}", + "type", "output", + "rank", s->rank, + "pid", flux_subprocess_pid (p), + "stream", stream, + "data", s_data) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + + rv = 0; +error: + free (s_data); + return rv; +} + +static int rexec_output_eof (flux_subprocess_t *p, const char *stream, + flux_subprocess_server_t *s, flux_msg_t *msg) +{ + if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:s s:i}", + "type", "output", + "rank", s->rank, + "pid", flux_subprocess_pid (p), + "stream", stream, + "eof", 1) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + return -1; + } + + return 0; +} + +static void rexec_output_cb (flux_subprocess_t *p, const char *stream) +{ + flux_subprocess_server_t *s = flux_subprocess_get_context (p, "server_ctx"); + flux_msg_t *msg = (flux_msg_t *) flux_subprocess_get_context (p, "msg"); + const char *ptr; + int lenp; + + assert (s && msg); + + if (!(ptr = flux_subprocess_read (p, stream, -1, &lenp))) { + flux_log_error (s->h, "%s: flux_subprocess_read", __FUNCTION__); + goto error; + } + + if (lenp) { + if (rexec_output_data (p, stream, s, msg, ptr, lenp) < 0) + goto error; + } + else { + if (rexec_output_eof (p, stream, s, msg) < 0) + goto error; + } + + return; + +error: + internal_fatal (s, p); +} + +static void server_exec_cb (flux_t *h, flux_msg_handler_t *mh, + const flux_msg_t *msg, void *arg) +{ + flux_subprocess_server_t *s = arg; + const char *cmd_str; + flux_cmd_t *cmd = NULL; + flux_msg_t *copy = NULL; + 
flux_subprocess_t *p = NULL; + flux_subprocess_ops_t ops = { + .on_completion = rexec_completion_cb, + .on_state_change = rexec_state_change_cb, + .on_channel_out = rexec_output_cb, + .on_stdout = rexec_output_cb, + .on_stderr = rexec_output_cb, + }; + int on_channel_out, on_stdout, on_stderr; + char **env = NULL; + + if (flux_request_unpack (msg, NULL, "{s:s s:i s:i s:i}", + "cmd", &cmd_str, + "on_channel_out", &on_channel_out, + "on_stdout", &on_stdout, + "on_stderr", &on_stderr)) + goto error; + + if (!on_channel_out) + ops.on_channel_out = NULL; + if (!on_stdout) + ops.on_stdout = NULL; + if (!on_stderr) + ops.on_stderr = NULL; + + if (!(cmd = flux_cmd_fromjson (cmd_str, NULL))) + goto error; + + if (!flux_cmd_argc (cmd)) { + errno = EPROTO; + goto error; + } + + if (!flux_cmd_getcwd (cmd)) { + errno = EPROTO; + goto error; + } + + if (!(env = flux_cmd_env_expand (cmd))) + goto error; + + /* if no environment sent, use local server environment */ + if (env[0] == NULL) { + if (flux_cmd_set_env (cmd, environ) < 0) { + flux_log_error (s->h, "%s: flux_cmd_set_env", __FUNCTION__); + goto error; + } + } + + if (flux_cmd_setenvf (cmd, 1, "FLUX_URI", s->local_uri) < 0) + goto error; + + if (flux_respond_pack (s->h, msg, "{s:s s:i}", + "type", "start", + "rank", s->rank) < 0) { + flux_log_error (s->h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + + /* FLUX_NODEID_ANY maps to -1, use -999 */ + if (!(p = flux_rexec (s->h, -999, FLUX_SUBPROCESS_FLAGS_SETPGRP, cmd, &ops))) { + /* error here, generate FLUX_SUBPROCESS_EXEC_FAILED state */ + if (flux_respond_pack (h, msg, "{s:s s:i s:i s:i}", + "type", "state", + "rank", s->rank, + "state", FLUX_SUBPROCESS_EXEC_FAILED, + "errno", errno) < 0) { + flux_log_error (h, "%s: flux_respond_pack", __FUNCTION__); + goto error; + } + goto cleanup; + } + + if (!(copy = flux_msg_copy (msg, true))) + goto error; + if (flux_subprocess_set_context (p, "msg", (void *) copy) < 0) + goto error; + if (flux_subprocess_set_context (p, 
"server_ctx", s) < 0) + goto error; + + flux_cmd_destroy (cmd); + free (env); + return; + +error: + if (flux_respond (h, msg, errno, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); +cleanup: + flux_cmd_destroy (cmd); + free (env); + flux_msg_destroy (copy); + flux_subprocess_unref (p); +} + +static int write_subprocess (flux_subprocess_server_t *s, flux_subprocess_t *p, + const char *name, const char *s_data) +{ + int save_errno, s_len, len; + char *data = NULL; + int tmp, rv = -1; + + s_len = strlen (s_data); + len = base64_decode_length (s_len); + + if (!(data = calloc (1, len))) { + flux_log_error (s->h, "%s: calloc", __FUNCTION__); + goto cleanup; + } + + if (base64_decode_block (data, &len, s_data, s_len) < 0) { + flux_log_error (s->h, "%s: base64_decode_block", __FUNCTION__); + goto cleanup; + } + + if ((tmp = flux_subprocess_write (p, name, data, len)) < 0) { + flux_log_error (s->h, "%s: flux_subprocess_write", __FUNCTION__); + goto cleanup; + } + + /* add list of msgs if there is overflow? 
*/ + + if (tmp != len) { + flux_log_error (s->h, "channel buffer error: rank = %d pid = %d, stream = %s, len = %d", + s->rank, flux_subprocess_pid (p), name, len); + errno = EOVERFLOW; + goto cleanup; + } + + rv = 0; +cleanup: + save_errno = errno; + free (data); + errno = save_errno; + return rv; +} + +static int close_subprocess (flux_subprocess_server_t *s, flux_subprocess_t *p, + const char *name) +{ + if (flux_subprocess_close (p, name) < 0) { + flux_log_error (s->h, "%s: flux_subprocess_close", __FUNCTION__); + return -1; + } + + return 0; +} + +static void server_write_cb (flux_t *h, flux_msg_handler_t *mh, + const flux_msg_t *msg, void *arg) +{ + flux_subprocess_t *p; + flux_subprocess_server_t *s = arg; + const char *name; + pid_t pid; + int close_flag; + + if (flux_request_unpack (msg, NULL, "{ s:i s:s s:i }", + "pid", &pid, + "name", &name, + "close", &close_flag) < 0) { + /* can't handle error, no pid to sent errno back to, so just + * return */ + flux_log_error (s->h, "%s: flux_request_unpack", __FUNCTION__); + return; + } + + if (!(p = lookup_pid (s, pid))) { + /* can't handle error, no pid to send errno back to, so just + * return + * + * It's common on EOF to be sent and server has already + * removed process from hash. Don't output error in that + * case. + */ + if (!(errno == ENOENT && close_flag)) + flux_log_error (s->h, "%s: lookup_pid", __FUNCTION__); + return; + } + + /* Chance subprocess exited/killed/etc. since user write request + * was sent. 
+ */ + if (p->state != FLUX_SUBPROCESS_RUNNING) + return; + + if (close_flag) { + if (close_subprocess (s, p, name) < 0) + goto error; + } + else { + const char *data; + + if (flux_request_unpack (msg, NULL, "{ s:s }", + "data", &data) < 0) { + flux_log_error (s->h, "%s: flux_request_unpack", __FUNCTION__); + errno = EPROTO; + goto error; + } + + if (write_subprocess (s, p, name, data) < 0) + goto error; + } + + return; + +error: + internal_fatal (s, p); +} + +static void server_signal_cb (flux_t *h, flux_msg_handler_t *mh, + const flux_msg_t *msg, void *arg) +{ + flux_subprocess_server_t *s = arg; + pid_t pid; + int signum; + + errno = 0; + + if (flux_request_unpack (msg, NULL, "{ s:i s:i }", + "pid", &pid, + "signum", &signum) < 0) { + flux_log_error (s->h, "%s: flux_request_unpack", __FUNCTION__); + errno = EPROTO; + goto error; + } + + if (!lookup_pid (s, pid)) + goto error; + + if (kill (pid, signum) < 0) + goto error; + +error: + if (flux_respond (h, msg, errno, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); +} + +char *subprocess_sender (flux_subprocess_t *p) +{ + flux_msg_t *msg; + char *sender; + + msg = flux_subprocess_get_context (p, "msg"); + if (!msg || flux_msg_get_route_first (msg, &sender) < 0) + return NULL; + + return sender; +} + +static json_t *process_info (flux_subprocess_t *p) +{ + flux_cmd_t *cmd; + char *cmd_str = NULL; + char *sender = NULL; + json_t *info = NULL; + + if (!(cmd = flux_subprocess_get_cmd (p))) + goto cleanup; + + if (!(cmd_str = flux_cmd_tojson (cmd))) + goto cleanup; + + if (!(sender = subprocess_sender (p))) { + errno = ENOENT; + goto cleanup; + } + + /* very limited returned, just for testing */ + if (!(info = json_pack ("{s:i s:s}", + "pid", flux_subprocess_pid (p), + "sender", sender))) { + errno = ENOMEM; + goto cleanup; + } + +cleanup: + free (sender); + free (cmd_str); + return info; +} + +static void server_processes_cb (flux_t *h, flux_msg_handler_t *mh, + const flux_msg_t *msg, void *arg) +{ + 
flux_subprocess_server_t *s = arg; + flux_subprocess_t *p; + json_t *procs = NULL; + + if (!(procs = json_array ())) { + errno = ENOMEM; + goto error; + } + + p = zhash_first (s->subprocesses); + while (p) { + json_t *o = NULL; + if (!(o = process_info (p)) + || json_array_append_new (procs, o) < 0) { + /* o is NULL here, or was already released by json_array_append_new() */ + errno = ENOMEM; + goto error; + } + p = zhash_next (s->subprocesses); + } + + if (flux_respond_pack (h, msg, "{s:i s:o}", "rank", s->rank, + "procs", procs) < 0) + flux_log_error (h, "%s: flux_respond_pack", __FUNCTION__); + return; + +error: + if (flux_respond (h, msg, errno, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); + json_decref (procs); +} + +int server_start (flux_subprocess_server_t *s, const char *prefix) +{ + /* rexec.processes is primarily for testing */ + struct flux_msg_handler_spec htab[] = { + { FLUX_MSGTYPE_REQUEST, "rexec", server_exec_cb, 0 }, + { FLUX_MSGTYPE_REQUEST, "rexec.write", server_write_cb, 0 }, + { FLUX_MSGTYPE_REQUEST, "rexec.signal", server_signal_cb, 0 }, + { FLUX_MSGTYPE_REQUEST, "rexec.processes", server_processes_cb, 0 }, + FLUX_MSGHANDLER_TABLE_END, + }; + char *topic_globs[4] = {NULL, NULL, NULL, NULL}; + int rv = -1; + + assert (prefix); + + if (asprintf (&topic_globs[0], "%s.rexec", prefix) < 0) + goto cleanup; + if (asprintf (&topic_globs[1], "%s.rexec.write", prefix) < 0) + goto cleanup; + if (asprintf (&topic_globs[2], "%s.rexec.signal", prefix) < 0) + goto cleanup; + if (asprintf (&topic_globs[3], "%s.rexec.processes", prefix) < 0) + goto cleanup; + + htab[0].topic_glob = (const char *)topic_globs[0]; + htab[1].topic_glob = (const char *)topic_globs[1]; + htab[2].topic_glob = (const char *)topic_globs[2]; + htab[3].topic_glob = (const char *)topic_globs[3]; + + if (flux_msg_handler_addvec (s->h, htab, s, &s->handlers) < 0) + goto cleanup; + + rv = 0; +cleanup: + free (topic_globs[0]); + free (topic_globs[1]); + free (topic_globs[2]); + free (topic_globs[3]); + return rv; +} + +void
server_stop (flux_subprocess_server_t *s) +{ + flux_msg_handler_delvec (s->handlers); +} + +void terminate_uuid (flux_subprocess_t *p, const char *id) +{ + char *sender; + + if (!(sender = subprocess_sender (p))) + return; + + if (!strcmp (id, sender)) { + flux_future_t *f; + if (!(f = flux_subprocess_kill (p, SIGKILL))) { + flux_subprocess_server_t *s; + s = flux_subprocess_get_context (p, "server_ctx"); + flux_log_error (s->h, "%s: flux_subprocess_kill", __FUNCTION__); + goto done; + } + flux_future_destroy (f); + } +done: /* sender is freed on every path once it has been obtained */ + free (sender); +} + +int server_terminate_by_uuid (flux_subprocess_server_t *s, + const char *id) +{ + flux_subprocess_t *p; + + p = zhash_first (s->subprocesses); + while (p) { + terminate_uuid (p, id); + p = zhash_next (s->subprocesses); + } + + return 0; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/server.h b/src/common/subprocess/server.h new file mode 100644 index 000000000000..71cdcdaa11b1 --- /dev/null +++ b/src/common/subprocess/server.h @@ -0,0 +1,13 @@ +#ifndef _SUBPROCESS_SERVER_H +#define _SUBPROCESS_SERVER_H + +#include "subprocess.h" + +int server_start (flux_subprocess_server_t *s, const char *prefix); + +void server_stop (flux_subprocess_server_t *s); + +int server_terminate_by_uuid (flux_subprocess_server_t *s, + const char *id); + +#endif /* !_SUBPROCESS_SERVER_H */ diff --git a/src/common/subprocess/subprocess.c b/src/common/subprocess/subprocess.c new file mode 100644 index 000000000000..722ed94c27bc --- /dev/null +++ b/src/common/subprocess/subprocess.c @@ -0,0 +1,1071 @@ +/*****************************************************************************\ + * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/fdwalk.h" +#include "src/common/libutil/base64.h" + +#include "subprocess.h" +#include "subprocess_private.h" +#include "command.h" +#include "local.h" +#include "remote.h" +#include "server.h" +#include "util.h" + +/* + * Primary Structures + */ + +void channel_destroy (void *arg) +{ + struct subprocess_channel *c = arg; + if (c && c->magic == CHANNEL_MAGIC) { + if (c->name) + free (c->name); + + if (c->parent_fd != -1) + close (c->parent_fd); + if (c->child_fd != -1) + close (c->child_fd); + flux_watcher_destroy (c->buffer_write_w); + flux_watcher_destroy (c->buffer_read_w); + + flux_buffer_destroy (c->write_buffer); + flux_buffer_destroy (c->read_buffer); + flux_watcher_destroy (c->in_prep_w); + flux_watcher_destroy (c->in_idle_w); + flux_watcher_destroy (c->in_check_w); + flux_watcher_destroy (c->out_prep_w); + flux_watcher_destroy (c->out_idle_w); + flux_watcher_destroy (c->out_check_w); + + c->magic = ~CHANNEL_MAGIC; + free 
(c); + } +} + +struct subprocess_channel *channel_create (flux_subprocess_t *p, + flux_subprocess_output_f output_f, + const char *name, + int flags) +{ + struct subprocess_channel *c = calloc (1, sizeof (*c)); + int save_errno; + + if (!c) + return NULL; + + c->magic = CHANNEL_MAGIC; + + c->p = p; + c->output_f = output_f; + if (!(c->name = strdup (name))) + goto error; + c->flags = flags; + + c->eof_sent_to_caller = false; + c->closed = false; + + c->parent_fd = -1; + c->child_fd = -1; + c->buffer_write_w = NULL; + c->buffer_read_w = NULL; + + c->write_buffer = NULL; + c->read_buffer = NULL; + c->write_eof_sent = false; + c->read_eof_received = false; + c->in_prep_w = NULL; + c->in_idle_w = NULL; + c->in_check_w = NULL; + c->out_prep_w = NULL; + c->out_idle_w = NULL; + c->out_check_w = NULL; + + return c; + +error: + save_errno = errno; + channel_destroy (c); + errno = save_errno; + return NULL; +} + +static void subprocess_free (flux_subprocess_t *p) +{ + if (p && p->magic == SUBPROCESS_MAGIC) { + flux_cmd_destroy (p->cmd); + + if (p->aux) + zhash_destroy (&p->aux); + if (p->channels) + zhash_destroy (&p->channels); + + flux_watcher_destroy (p->child_w); + + close_pair_fds (p->sync_fds); + + flux_watcher_destroy (p->state_prep_w); + flux_watcher_destroy (p->state_idle_w); + flux_watcher_destroy (p->state_check_w); + + flux_watcher_destroy (p->completed_prep_w); + flux_watcher_destroy (p->completed_idle_w); + flux_watcher_destroy (p->completed_check_w); + + if (p->f) + flux_future_destroy (p->f); + + p->magic = ~SUBPROCESS_MAGIC; + free (p); + } +} + +static flux_subprocess_t * subprocess_create (int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops, + flux_t *h, + flux_reactor_t *r, + int rank, + bool local) +{ + flux_subprocess_t *p = calloc (1, sizeof (*p)); + int save_errno; + + if (!p) + return NULL; + + p->magic = SUBPROCESS_MAGIC; + + /* init fds, so on error we don't accidentally close stdin + * (i.e. 
fd == 0) + */ + init_pair_fds (p->sync_fds); + + /* set CLOEXEC on sync_fds, so on exec(), child sync_fd is closed + * and seen by parent */ + if (socketpair (PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, p->sync_fds) < 0) + goto error; + + if (!(p->aux = zhash_new ()) + || !(p->channels = zhash_new ())) + goto error; + + p->state = FLUX_SUBPROCESS_INIT; + p->state_reported = p->state; + + if (!(p->cmd = flux_cmd_copy (cmd))) + goto error; + + if (ops) + p->ops = *ops; + + p->h = h; + p->reactor = r; + p->rank = rank; + p->flags = flags; + p->kill_signum = 0; + + p->local = local; + + p->refcount = 1; + return (p); + +error: + save_errno = errno; + subprocess_free (p); + errno = save_errno; + return NULL; +} + +static void subprocess_server_destroy (void *arg) +{ + flux_subprocess_server_t *s = arg; + if (s && s->magic == SUBPROCESS_SERVER_MAGIC) { + /* s->handlers handle in server_stop, this is for destroying + * things only + */ + zhash_destroy (&s->subprocesses); + free (s->local_uri); + s->magic = ~SUBPROCESS_SERVER_MAGIC; + free (s); + } +} + +static flux_subprocess_server_t *subprocess_server_create (flux_t *h, + const char *local_uri, + int rank) +{ + flux_subprocess_server_t *s = calloc (1, sizeof (*s)); + int save_errno; + + if (!s) + return NULL; + + s->magic = SUBPROCESS_SERVER_MAGIC; + s->h = h; + if (!(s->r = flux_get_reactor (h))) + goto error; + if (!(s->subprocesses = zhash_new ())) + goto error; + if (!(s->local_uri = strdup (local_uri))) + goto error; + s->rank = rank; + + return s; + +error: + save_errno = errno; + subprocess_server_destroy (s); + errno = save_errno; + return NULL; +} + +/* + * Accessors + */ + +int subprocess_status (flux_subprocess_t *p) +{ + assert (p); + return p->status; +} + +/* + * General support: + */ + +flux_subprocess_server_t *flux_subprocess_server_start (flux_t *h, + const char *prefix, + const char *local_uri, + uint32_t rank) +{ + flux_subprocess_server_t *s = NULL; + int save_errno; + + if (!h || !prefix || 
!local_uri) { + errno = EINVAL; + goto error; + } + + if (!(s = subprocess_server_create (h, local_uri, rank))) + goto error; + + if (server_start (s, prefix) < 0) + goto error; + + return s; + +error: + save_errno = errno; + subprocess_server_destroy (s); + errno = save_errno; + return NULL; +} + +void flux_subprocess_server_stop (flux_subprocess_server_t *s) +{ + if (s && s->magic == SUBPROCESS_SERVER_MAGIC) { + server_stop (s); + subprocess_server_destroy (s); + } +} + +int flux_subprocess_server_terminate_by_uuid (flux_subprocess_server_t *s, + const char *id) +{ + if (!s || s->magic != SUBPROCESS_SERVER_MAGIC) { + errno = EINVAL; + return -1; + } + + return server_terminate_by_uuid (s, id); +} + +void flux_subprocess_output (flux_subprocess_t *p, const char *stream) +{ + /* everything except stderr goes to stdout; a NULL stream is read as "STDOUT" */ + FILE *fstream = (stream && !strcasecmp (stream, "STDERR")) ? stderr : stdout; + const char *ptr; + int lenp; + + if (!(ptr = flux_subprocess_read_line (p, stream, &lenp))) { + log_err ("flux_subprocess_output: read_line"); + return; + } + + /* if process exited, read remaining stuff or EOF, otherwise + * wait for future newline */ + if (!lenp + && flux_subprocess_state (p) == FLUX_SUBPROCESS_EXITED) { + if (!(ptr = flux_subprocess_read (p, stream, -1, &lenp))) { + log_err ("flux_subprocess_output: read"); + return; + } + } + + if (lenp) + fwrite (ptr, lenp, 1, fstream); +} + +/* + * Process handling: + */ + +void subprocess_check_completed (flux_subprocess_t *p) +{ + assert (p->state == FLUX_SUBPROCESS_EXITED); + + /* we're also waiting for the "complete" to come from the remote end */ + if (!p->local && !p->remote_completed) + return; + + if (p->completed) + return; + + if (p->channels_eof_sent == p->channels_eof_expected) { + p->completed = true; + flux_watcher_start (p->completed_prep_w); + flux_watcher_start (p->completed_check_w); + } +} + +void state_change_start (flux_subprocess_t *p) +{ + if (p->ops.on_state_change) { + flux_watcher_start
(p->state_prep_w); + flux_watcher_start (p->state_check_w); + } +} + +static void state_change_prep_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + flux_subprocess_t *p = arg; + + if (p->state_reported != p->state) + flux_watcher_start (p->state_idle_w); +} + +static flux_subprocess_state_t state_change_next (flux_subprocess_t *p) +{ + assert (p->state != FLUX_SUBPROCESS_FAILED); + + switch (p->state_reported) { + case FLUX_SUBPROCESS_INIT: + /* next state to report must be STARTED */ + return FLUX_SUBPROCESS_STARTED; + case FLUX_SUBPROCESS_STARTED: + /* next state must be RUNNING or EXEC_FAILED */ + if (p->state == FLUX_SUBPROCESS_EXEC_FAILED) + return FLUX_SUBPROCESS_EXEC_FAILED; + else /* p->state == FLUX_SUBPROCESS_RUNNING + || p->state == FLUX_SUBPROCESS_EXITED */ + return FLUX_SUBPROCESS_RUNNING; + case FLUX_SUBPROCESS_RUNNING: + /* next state is EXITED */ + return FLUX_SUBPROCESS_EXITED; + case FLUX_SUBPROCESS_EXEC_FAILED: + case FLUX_SUBPROCESS_EXITED: + case FLUX_SUBPROCESS_FAILED: + break; + } + /* shouldn't be possible to reach here; still return a defined value when NDEBUG removes the assert */ + assert (0); + return FLUX_SUBPROCESS_FAILED; +} + +static void state_change_check_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + flux_subprocess_t *p = arg; + flux_subprocess_state_t next_state = FLUX_SUBPROCESS_INIT; + + flux_watcher_stop (p->state_idle_w); + + /* always a chance caller may destroy subprocess in callback */ + flux_subprocess_ref (p); + + if (p->state_reported != p->state) { + /* this is the ubiquitous fail state for internal failures, + * any state can jump to this state. Even if some state changes + * occurred in between, we'll jump to this state.
+ */ + if (p->state == FLUX_SUBPROCESS_FAILED) + next_state = FLUX_SUBPROCESS_FAILED; + else + next_state = state_change_next (p); + + (*p->ops.on_state_change) (p, next_state); + p->state_reported = next_state; + } + + /* once we hit one of these states, no more state changes */ + if (p->state_reported == FLUX_SUBPROCESS_EXEC_FAILED + || p->state_reported == FLUX_SUBPROCESS_EXITED + || p->state_reported == FLUX_SUBPROCESS_FAILED) { + flux_watcher_stop (p->state_prep_w); + flux_watcher_stop (p->state_check_w); + } + else if (p->state == p->state_reported) { + flux_watcher_stop (p->state_prep_w); + flux_watcher_stop (p->state_check_w); + } + + if (p->state_reported == FLUX_SUBPROCESS_EXITED) + subprocess_check_completed (p); + + flux_subprocess_unref (p); +} + +static int subprocess_setup_state_change (flux_subprocess_t *p) +{ + if (p->ops.on_state_change) { + p->state_prep_w = flux_prepare_watcher_create (p->reactor, + state_change_prep_cb, + p); + if (!p->state_prep_w) { + log_err ("flux_prepare_watcher_create"); + return -1; + } + + p->state_idle_w = flux_idle_watcher_create (p->reactor, + NULL, + p); + if (!p->state_idle_w) { + log_err ("flux_idle_watcher_create"); + return -1; + } + + p->state_check_w = flux_check_watcher_create (p->reactor, + state_change_check_cb, + p); + if (!p->state_check_w) { + log_err ("flux_check_watcher_create"); + return -1; + } + } + return 0; +} + +static void completed_prep_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + flux_subprocess_t *p = arg; + + assert (p->completed); + + flux_watcher_start (p->completed_idle_w); +} + +static void completed_check_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, + void *arg) +{ + flux_subprocess_t *p = arg; + + assert (p->completed); + + flux_watcher_stop (p->completed_idle_w); + + /* always a chance caller may destroy subprocess in callback */ + flux_subprocess_ref (p); + + /* There is a small "racy" component, where the state we're at may + * not yet 
align with the state that has been reported to the + * user. We would like to report state EXITED to the user before + * calling the completion callback. + * + * If no state change callback was specified, we must have reached + * state FLUX_SUBPROCESS_EXITED to have reached this point. + */ + if (!p->ops.on_state_change + || p->state_reported == FLUX_SUBPROCESS_EXITED) { + if (p->ops.on_completion) + (*p->ops.on_completion) (p); + + flux_watcher_stop (p->completed_prep_w); + flux_watcher_stop (p->completed_check_w); + } + + flux_subprocess_unref (p); +} + +static int subprocess_setup_completed (flux_subprocess_t *p) +{ + if (p->ops.on_completion) { + p->completed_prep_w = flux_prepare_watcher_create (p->reactor, + completed_prep_cb, + p); + if (!p->completed_prep_w) { + log_err ("flux_prepare_watcher_create"); + return -1; + } + + p->completed_idle_w = flux_idle_watcher_create (p->reactor, + NULL, + p); + if (!p->completed_idle_w) { + log_err ("flux_idle_watcher_create"); + return -1; + } + + p->completed_check_w = flux_check_watcher_create (p->reactor, + completed_check_cb, + p); + if (!p->completed_check_w) { + log_err ("flux_check_watcher_create"); + return -1; + } + + /* start when process completed */ + } + return 0; +} + +static flux_subprocess_t * flux_exec_wrap (flux_t *h, flux_reactor_t *r, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops) +{ + flux_subprocess_t *p = NULL; + int valid_flags = (FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH + | FLUX_SUBPROCESS_FLAGS_SETPGRP); + int save_errno; + + if (!r || !cmd) { + errno = EINVAL; + return NULL; + } + + if (flags & ~valid_flags) { + errno = EINVAL; + return NULL; + } + + if (!(p = subprocess_create (flags, cmd, ops, NULL, r, -1, true))) + goto error; + + if (subprocess_local_setup (p) < 0) + goto error; + + if (subprocess_setup_state_change (p) < 0) + goto error; + + state_change_start (p); + + if (subprocess_setup_completed (p) < 0) + goto error; + + return p; + +error: + save_errno = errno; + 
flux_subprocess_unref (p); + errno = save_errno; + return NULL; +} + +flux_subprocess_t * flux_exec (flux_reactor_t *r, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops) +{ + return flux_exec_wrap (NULL, r, flags, cmd, ops); +} + +flux_subprocess_t * rexec_local (flux_t *h, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops) +{ + flux_reactor_t *r; + + if (!(r = flux_get_reactor (h))) + return NULL; + + return flux_exec_wrap (h, r, flags, cmd, ops); +} + +flux_subprocess_t *flux_rexec (flux_t *h, int rank, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops) +{ + flux_subprocess_t *p = NULL; + flux_reactor_t *r; + int save_errno; + + if (!h || !cmd) { + errno = EINVAL; + return NULL; + } + + if (rank < 0 && rank != FLUX_NODEID_ANY) + return rexec_local (h, flags, cmd, ops); + + /* no flags supported yet */ + if (flags) { + errno = EINVAL; + return NULL; + } + + /* user required to set some args */ + if (!flux_cmd_argc (cmd)) { + errno = EINVAL; + goto error; + } + + /* user required to set cwd */ + if (!flux_cmd_getcwd (cmd)) { + errno = EINVAL; + goto error; + } + + if (!(r = flux_get_reactor (h))) + goto error; + + if (!(p = subprocess_create (flags, cmd, ops, h, r, rank, false))) + goto error; + + if (subprocess_remote_setup (p) < 0) + goto error; + + if (subprocess_setup_state_change (p) < 0) + goto error; + + if (subprocess_setup_completed (p) < 0) + goto error; + + if (remote_exec (p) < 0) + goto error; + + return p; + +error: + save_errno = errno; + flux_subprocess_unref (p); + errno = save_errno; + return NULL; +} + +int flux_subprocess_write (flux_subprocess_t *p, const char *stream, + const char *buf, size_t len) +{ + struct subprocess_channel *c; + flux_buffer_t *fb; + int ret; + + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + + if (!buf || !len) { + errno = EINVAL; + return -1; + } + + if (!stream) + stream = "STDIN"; + + c = zhash_lookup (p->channels, stream); + if (!c || 
!(c->flags & CHANNEL_WRITE)) { + errno = EINVAL; + return -1; + } + + if (c->closed) { + errno = EPIPE; + return -1; + } + + if (p->local) { + if (p->state != FLUX_SUBPROCESS_STARTED + && p->state != FLUX_SUBPROCESS_RUNNING) { + errno = EPIPE; + return -1; + } + if (!(fb = flux_buffer_write_watcher_get_buffer (c->buffer_write_w))) { + log_err ("flux_buffer_write_watcher_get_buffer"); + return -1; + } + + if ((ret = flux_buffer_write (fb, buf, len)) < 0) { + log_err ("flux_buffer_write"); + return -1; + } + } + else { + if (p->state != FLUX_SUBPROCESS_INIT + && p->state != FLUX_SUBPROCESS_STARTED + && p->state != FLUX_SUBPROCESS_RUNNING) { + errno = EPIPE; + return -1; + } + if ((ret = flux_buffer_write (c->write_buffer, buf, len)) < 0) { + log_err ("flux_buffer_write"); + return -1; + } + } + + return ret; +} + +int flux_subprocess_close (flux_subprocess_t *p, const char *stream) +{ + struct subprocess_channel *c; + + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + + if (!stream) + stream = "STDIN"; + + c = zhash_lookup (p->channels, stream); + if (!c || !(c->flags & CHANNEL_WRITE)) { + errno = EINVAL; + return -1; + } + + if (c->closed) + return 0; + + if (p->local) { + if (p->state == FLUX_SUBPROCESS_STARTED + || p->state == FLUX_SUBPROCESS_RUNNING) { + if (flux_buffer_write_watcher_close (c->buffer_write_w) < 0) { + log_err ("flux_buffer_write_watcher_close"); + return -1; + } + } + /* else p->state == FLUX_SUBPROCESS_EXEC_FAILED + || p->state == FLUX_SUBPROCESS_EXITED + || p->state == FLUX_SUBPROCESS_FAILED + */ + c->closed = true; + } + else { + /* doesn't matter about state, b/c reactors will send closed. + * If those reactors are already turned off, it's b/c + * subprocess failed/exited. 
+ */ + c->closed = true; + } + + return 0; +} + +static const char *subprocess_read (flux_subprocess_t *p, + const char *stream, + int len, int *lenp, + bool read_line) +{ + struct subprocess_channel *c; + flux_buffer_t *fb; + const char *ptr; + + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return NULL; + } + + if (!read_line && len == 0) { + errno = EINVAL; + return NULL; + } + + if (!stream) + stream = "STDOUT"; + + c = zhash_lookup (p->channels, stream); + if (!c || !(c->flags & CHANNEL_READ)) { + errno = EINVAL; + return NULL; + } + + if (p->local) { + if (!(fb = flux_buffer_read_watcher_get_buffer (c->buffer_read_w))) + return NULL; + } + else + fb = c->read_buffer; + + if (read_line) { + if (!(ptr = flux_buffer_read_line (fb, lenp))) + return NULL; + } + else { + if (!(ptr = flux_buffer_read (fb, len, lenp))) + return NULL; + } + + return ptr; +} + +const char *flux_subprocess_read (flux_subprocess_t *p, + const char *stream, + int len, int *lenp) +{ + return subprocess_read (p, stream, len, lenp, false); +} + +const char *flux_subprocess_read_line (flux_subprocess_t *p, + const char *stream, + int *lenp) +{ + return subprocess_read (p, stream, 0, lenp, true); +} + +flux_future_t *flux_subprocess_kill (flux_subprocess_t *p, int signum) +{ + flux_future_t *f = NULL; + + if (!p || p->magic != SUBPROCESS_MAGIC || !signum) { + errno = EINVAL; + return NULL; + } + + if (p->kill_signum) { + /* XXX right errno? */ + errno = EBUSY; + return NULL; + } + + if (p->state != FLUX_SUBPROCESS_RUNNING) { + /* XXX right errno? 
*/ + errno = EINVAL; + return NULL; + } + + if (p->local) { + f = flux_future_create (NULL, NULL); + if (kill (p->pid, signum) < 0) + flux_future_fulfill_error (f, errno, NULL); + else + flux_future_fulfill (f, NULL, NULL); + } + else { + if (!(f = remote_kill (p, signum))) { + int save_errno = errno; + f = flux_future_create (NULL, NULL); + flux_future_fulfill_error (f, save_errno, NULL); + } + } + p->kill_signum = signum; + return f; +} + +void flux_subprocess_ref (flux_subprocess_t *p) +{ + if (p && p->magic == SUBPROCESS_MAGIC) + p->refcount++; +} + +void flux_subprocess_unref (flux_subprocess_t *p) +{ + if (p && p->magic == SUBPROCESS_MAGIC) { + if (--p->refcount == 0) + subprocess_free (p); + } +} + +void flux_subprocess_destroy (void *arg) +{ + flux_subprocess_t *p = arg; + flux_subprocess_unref (p); +} + +flux_subprocess_state_t flux_subprocess_state (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + return p->state; +} + +const char *flux_subprocess_state_string (flux_subprocess_state_t state) +{ + switch (state) + { + case FLUX_SUBPROCESS_INIT: + return "Init"; + case FLUX_SUBPROCESS_STARTED: + return "Started"; + case FLUX_SUBPROCESS_EXEC_FAILED: + return "Exec Failed"; + case FLUX_SUBPROCESS_RUNNING: + return "Running"; + case FLUX_SUBPROCESS_EXITED: + return "Exited"; + case FLUX_SUBPROCESS_FAILED: + return "Failed"; + } + return NULL; +} + +int flux_subprocess_rank (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + if (p->local) { + errno = EINVAL; + return -1; + } + return p->rank; +} + +int flux_subprocess_fail_errno (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + if (p->state != FLUX_SUBPROCESS_EXEC_FAILED + && p->state != FLUX_SUBPROCESS_FAILED) { + errno = EINVAL; + return -1; + } + if (p->state == FLUX_SUBPROCESS_EXEC_FAILED) + return p->exec_failed_errno; + else + return 
p->failed_errno; +} + +int flux_subprocess_status (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + if (p->state != FLUX_SUBPROCESS_EXITED) { + errno = EINVAL; + return -1; + } + return p->status; +} + +int flux_subprocess_exit_code (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + if (p->state != FLUX_SUBPROCESS_EXITED) { + errno = EINVAL; + return -1; + } + if (!WIFEXITED (p->status)) { + errno = EINVAL; + return -1; + } + return WEXITSTATUS (p->status); +} + +int flux_subprocess_signaled (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + if (p->state != FLUX_SUBPROCESS_EXITED) { + errno = EINVAL; + return -1; + } + if (!WIFSIGNALED (p->status)) { + errno = EINVAL; + return -1; + } + return WTERMSIG (p->status); +} + +pid_t flux_subprocess_pid (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + return p->pid; +} + +flux_cmd_t * flux_subprocess_get_cmd (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return NULL; + } + return p->cmd; +} + +flux_reactor_t * flux_subprocess_get_reactor (flux_subprocess_t *p) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return NULL; + } + return p->reactor; +} + +int flux_subprocess_set_context (flux_subprocess_t *p, const char *name, void *x) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return -1; + } + return zhash_insert (p->aux, name, x); +} + +void * flux_subprocess_get_context (flux_subprocess_t *p, const char *name) +{ + if (!p || p->magic != SUBPROCESS_MAGIC) { + errno = EINVAL; + return NULL; + } + return zhash_lookup (p->aux, name); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/subprocess.h b/src/common/subprocess/subprocess.h new file mode 100644 index 000000000000..be019e4f0df2 --- 
/dev/null +++ b/src/common/subprocess/subprocess.h @@ -0,0 +1,356 @@ +#ifndef _FLUX_CORE_SUBPROCESS_H +#define _FLUX_CORE_SUBPROCESS_H + +#include + +/* + * flux_cmd_t: An object that defines a command to be run, either + * remotely or as a child of the current process. Includes cmdline + * arguments, environment, and working directory. A flux_cmd_t is + * used to create Flux subprocesses. + */ +typedef struct flux_command flux_cmd_t; + +/* + * flux_subprocess_t: A subprocess is an instantiation of a command + * as a remote or local process. A subprocess has a state (e.g. + * initialized, starting, running, exited, completed), a PID, and + * a rank if running remotely. + */ +typedef struct flux_subprocess flux_subprocess_t; + +/* flux_subprocess_server_t: Handler for a subprocess remote server */ +typedef struct flux_subprocess_server flux_subprocess_server_t; + +/* + * Subprocess states, on changes, will lead to calls to + * on_state_change below. + * + * Possible state changes: + * + * init -> started + * started -> exec failed + * started -> running + * running -> exited + * any state -> failed + */ +typedef enum { + FLUX_SUBPROCESS_INIT, /* initial state */ + FLUX_SUBPROCESS_STARTED, /* fork() has been issued/requested */ + FLUX_SUBPROCESS_EXEC_FAILED, /* exec(2) has failed, only for rexec() */ + FLUX_SUBPROCESS_RUNNING, /* exec(2) has been called */ + FLUX_SUBPROCESS_EXITED, /* process has exited */ + FLUX_SUBPROCESS_FAILED, /* internal failure, catch all for + * all other errors */ +} flux_subprocess_state_t; + +/* + * Subprocess flags + */ +enum { + /* flux_exec(): let parent stdin, stdout, stderr, carry to child. + * Do not create "STDIN", "STDOUT", or "STDERR" channels. Subsequently, + * flux_subprocess_write()/close()/read()/read_line() will fail on + * streams of "STDIN", "STDOUT", or "STDERR". 
+ */ + FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH = 1, + /* flux_exec(): call setpgrp() before exec(2) */ + FLUX_SUBPROCESS_FLAGS_SETPGRP = 2, +}; + +/* + * Typedefs for subprocess hooks and callbacks: + * + */ +typedef void (*flux_subprocess_f) (flux_subprocess_t *p); +typedef void (*flux_subprocess_output_f) (flux_subprocess_t *p, + const char *stream); +typedef void (*flux_subprocess_state_f) (flux_subprocess_t *p, + flux_subprocess_state_t state); + +/* + * Functions for event-driven subprocess handling: + * + */ +typedef struct { + flux_subprocess_f on_completion; /* Process exited and all I/O + * complete, will not be + * called if EXEC_FAILED or + * FAILED states reached. + */ + flux_subprocess_state_f on_state_change; /* Process state change */ + flux_subprocess_output_f on_channel_out; /* Read from channel when ready */ + flux_subprocess_output_f on_stdout; /* Read of stdout is ready */ + flux_subprocess_output_f on_stderr; /* Read of stderr is ready */ +} flux_subprocess_ops_t; + +/* + * General support: + */ + +/* Start a subprocess server on the handle `h`. Registers message + * handlers, etc for remote execution. "prefix" is the topic prefix + * used to listen for this service, e.g. `broker` would listen + * for `broker.exec`. + */ +flux_subprocess_server_t *flux_subprocess_server_start (flux_t *h, + const char *prefix, + const char *local_uri, + uint32_t rank); + +/* Stop a subprocess server / cleanup flux_subprocess_server_t */ +void flux_subprocess_server_stop (flux_subprocess_server_t *s); + +/* Terminate all subprocesses started by a sender id */ +int flux_subprocess_server_terminate_by_uuid (flux_subprocess_server_t *s, + const char *id); + +/* General output callback that will send output from the subprocess + * to stdout or stderr. Set to `on_stdout` and/or `on_stderr` in + * flux_subprocess_ops_t. Can also be used for 'on_channel_out' + * callback, sending all output to stdout. 
+ */ +void flux_subprocess_output (flux_subprocess_t *p, const char *stream); + +/* + * Commands: + */ + +/* + * Create a cmd object, from which subprocesses can be created + */ +flux_cmd_t * flux_cmd_create (int argc, char *argv[], char **env); + +/* + * Create a copy of a cmd object. + */ +flux_cmd_t * flux_cmd_copy (const flux_cmd_t *cmd); + +/* + * Destroy and free command object `cmd` + */ +void flux_cmd_destroy (flux_cmd_t *cmd); + +/* + * Append formatted string to argv of `cmd`. + */ +int flux_cmd_argv_append (flux_cmd_t *cmd, const char *fmt, ...); + +/* + * Return the current argument count for `cmd`. + */ +int flux_cmd_argc (const flux_cmd_t *cmd); + +/* + * Return the current argument at index n (range 0 to argc - 1) + */ +const char *flux_cmd_arg (const flux_cmd_t *cmd, int n); + +/* + * Set a single environment variable (name) to formatted string `fmt`. + * If `overwrite` is non-zero then overwrite any existing setting for `name`. + */ +int flux_cmd_setenvf (flux_cmd_t *cmd, int overwrite, + const char *name, const char *fmt, ...); + +/* + * Unset environment variable `name` in the command object `cmd`. + */ +void flux_cmd_unsetenv (flux_cmd_t *cmd, const char *name); + +/* + * Return current value for environment variable `name` as set in + * command object `cmd`. If environment variable is not set then NULL + * is returned. + */ +const char *flux_cmd_getenv (const flux_cmd_t *cmd, const char *name); + +/* + * Set/get the working directory for the command `cmd`. + */ +int flux_cmd_setcwd (flux_cmd_t *cmd, const char *cwd); +const char *flux_cmd_getcwd (const flux_cmd_t *cmd); + +/* + * Request a channel for communication between process and caller. + * Callers can write to the subprocess via flux_subprocess_write() + * and read from it via flux_subprocess_read(), which is typically + * called from a callback set in 'on_channel_out'. 
+ * + * The `name` argument is also used as the name of an environment variable + * in the subprocess environment that is set to the file descriptor number + * of the process side of the socketpair. E.g. name = "FLUX_PMI" would + * result in the environment variable "FLUX_PMI_FD=N" set in the process + * environment. + */ +int flux_cmd_add_channel (flux_cmd_t *cmd, const char *name); + +/* + * Set generic string options for command object `cmd`. As with environment + * variables, this function adds the option `var` with value `val` to + * the options array for this command. This can be used to enable optional + * behavior for executed processes (e.g. setpgrp(2)) + * + * String options, note that name indicates the 'name' argument used + * in flux_cmd_add_channel() above. + * + * name + "_BUFSIZE" = buffer size + * STDIN_BUFSIZE = buffer size + * STDOUT_BUFSIZE = buffer size + * STDERR_BUFSIZE = buffer size + * + * By default, stdio and channels use an internal buffer of 1 meg. + * The buffer size can be adjusted with this option. + */ +int flux_cmd_setopt (flux_cmd_t *cmd, const char *var, const char *val); +const char *flux_cmd_getopt (flux_cmd_t *cmd, const char *var); + + + +/* + * Subprocesses: + */ + +/* + * Asynchronously create a new subprocess described by command object + * `cmd`. flux_exec() creates a new subprocess locally, + * flux_rexec() creates a new subprocess on Flux rank + * `rank`. Callbacks in `ops` structure that are non-NULL will be + * called to process state changes, I/O, and completion. + * + * 'rank' can be set to FLUX_NODEID_ANY. If 'rank' < 0 (and != + * FLUX_NODEID_ANY), then flux_rexec() calls flux_exec(). + * + * This function may return NULL (with errno set) on invalid argument(s) + * (EINVAL), or failure of underlying Flux messaging calls. Otherwise, + * a valid subprocess object is returned, though there is no guarantee + * the subprocess has started running anywhere by the time the call returns. 
+ * + */ +flux_subprocess_t *flux_exec (flux_reactor_t *r, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops); + +flux_subprocess_t *flux_rexec (flux_t *h, int rank, int flags, + const flux_cmd_t *cmd, + flux_subprocess_ops_t *ops); + + +/* + * Write data to "stream" stream of subprocess `p`. 'stream' can be + * "STDIN" or the name of a stream specified with + * flux_cmd_add_channel(). If 'stream' is NULL, defaults to "STDIN". + * + * Returns the total amount of data successfully buffered. + */ +int flux_subprocess_write (flux_subprocess_t *p, const char *stream, + const char *buf, size_t len); + +/* + * Close "stream" stream of subprocess `p` and schedule EOF to be sent. + * 'stream' can be "STDIN" or the name of a stream specified with + * flux_cmd_add_channel(). If 'stream' is NULL, defaults to "STDIN". + */ +int flux_subprocess_close (flux_subprocess_t *p, const char *stream); + +/* + * Read up to `len` bytes of unread data from stream `stream`. To + * read all data, specify 'len' of -1. 'stream' can be "STDOUT", + * "STDERR", or the name of a stream specified with + * flux_cmd_add_channel(). If 'stream' is NULL, defaults to + * "STDOUT". + * + * Returns pointer to buffer on success and NULL on error with errno + * set. If reading from "STDOUT" or "STDERR", buffer is guaranteed + * to be NUL terminated. User shall not free returned pointer. + * Length of buffer returned can optionally be returned in 'lenp'. A + * length of 0 indicates that the subprocess has closed this stream. + */ +const char *flux_subprocess_read (flux_subprocess_t *p, + const char *stream, + int len, int *lenp); + +/* + * Read a line of unread data from stream `stream`. 'stream' can be + * "STDOUT", "STDERR", or the name of a stream specified with + * flux_cmd_add_channel(). If 'stream' is NULL, defaults to + * "STDOUT". + * + * Returns pointer to buffer on success and NULL on error with errno + * set. 
If reading from "STDOUT" or "STDERR", buffer is guaranteed + * to be NUL terminated. User shall not free returned pointer. + * Length of buffer returned can optionally be returned in 'lenp'. + */ +const char *flux_subprocess_read_line (flux_subprocess_t *p, + const char *stream, + int *lenp); + +/* + * Create RPC to send signal `signo` to subprocess `p`. + * This call returns a flux_future_t. Use flux_future_then(3) to register + * a continuation callback when the kill operation is complete, or + * flux_future_wait_for(3) to block until the kill operation is complete. + */ +flux_future_t *flux_subprocess_kill (flux_subprocess_t *p, int signo); + +/* + * Add/remove a reference to subprocess object `p`. The subprocess object + * is destroyed once the last reference is removed. + */ +void flux_subprocess_ref (flux_subprocess_t *p); +void flux_subprocess_unref (flux_subprocess_t *p); +void flux_subprocess_destroy (void *arg); + +/* Return current state value of subprocess. Note this may differ + * from state returned in on_state_change callback, as a subprocess + * may have already transitioned past that point (e.g. the callback + * received a transition change to RUNNING, but the child subprocess + * has already EXITED). 
+ */ +flux_subprocess_state_t flux_subprocess_state (flux_subprocess_t *p); + +/* Return string value of state of subprocess + */ +const char *flux_subprocess_state_string (flux_subprocess_state_t state); + +int flux_subprocess_rank (flux_subprocess_t *p); + +/* Specific for FLUX_SUBPROCESS_EXEC_FAILED and + * FLUX_SUBPROCESS_FAILED error states + */ +int flux_subprocess_fail_errno (flux_subprocess_t *p); + +/* For FLUX_SUBPROCESS_EXITED state */ +int flux_subprocess_status (flux_subprocess_t *p); + +/* For FLUX_SUBPROCESS_EXITED state */ +int flux_subprocess_exit_code (flux_subprocess_t *p); + +/* For FLUX_SUBPROCESS_EXITED state */ +int flux_subprocess_signaled (flux_subprocess_t *p); + +pid_t flux_subprocess_pid (flux_subprocess_t *p); + +/* Return the command object associated with subprocess `p`. + */ +flux_cmd_t *flux_subprocess_get_cmd (flux_subprocess_t *p); + +/* Return the reactor object associated with subprocess `p`. + */ +flux_reactor_t * flux_subprocess_get_reactor (flux_subprocess_t *p); + +/* + * Set arbitrary context `ctx` with name `name` on subprocess object `p`. + * + * Returns 0 on success + */ +int flux_subprocess_set_context (flux_subprocess_t *p, + const char *name, void *ctx); + +/* + * Return pointer to any context associated with `p` under `name`. If + * no such context exists, then NULL is returned. 
+ */ +void *flux_subprocess_get_context (flux_subprocess_t *p, const char *name); + +#endif /* !_FLUX_CORE_SUBPROCESS_H */ diff --git a/src/common/subprocess/subprocess_private.h b/src/common/subprocess/subprocess_private.h new file mode 100644 index 000000000000..cb5168955143 --- /dev/null +++ b/src/common/subprocess/subprocess_private.h @@ -0,0 +1,120 @@ +#ifndef _SUBPROCESS_PRIVATE_H +#define _SUBPROCESS_PRIVATE_H + +#include "subprocess.h" + +#define SUBPROCESS_MAGIC 0xbeefcafe + +#define SUBPROCESS_SERVER_MAGIC 0xbeefbeef + +#define SUBPROCESS_DEFAULT_BUFSIZE 1048576 + +#define CHANNEL_MAGIC 0xcafebeef + +#define CHANNEL_READ 0x01 +#define CHANNEL_WRITE 0x02 +#define CHANNEL_FD 0x04 + +struct subprocess_channel { + int magic; + + flux_subprocess_t *p; + flux_subprocess_output_f output_f; + char *name; + int flags; + + /* caller info */ + bool eof_sent_to_caller; /* eof sent to user */ + bool closed; + + /* local */ + int parent_fd; + int child_fd; + flux_watcher_t *buffer_write_w; + flux_watcher_t *buffer_read_w; + + /* remote */ + flux_buffer_t *write_buffer; + flux_buffer_t *read_buffer; + bool write_eof_sent; + bool read_eof_received; + flux_watcher_t *in_prep_w; + flux_watcher_t *in_idle_w; + flux_watcher_t *in_check_w; + flux_watcher_t *out_prep_w; + flux_watcher_t *out_idle_w; + flux_watcher_t *out_check_w; +}; + +struct flux_subprocess { + int magic; + + flux_t *h; + flux_reactor_t *reactor; + uint32_t rank; + int flags; + int kill_signum; + bool local; /* This is a local process, not remote. 
*/ + + int refcount; + pid_t pid; + + flux_subprocess_ops_t ops; /* Callbacks registered for this proc */ + + flux_cmd_t *cmd; /* readonly/o copy of the command */ + + zhash_t *aux; /* hash for auxillary data */ + + zhash_t *channels; /* hash index by name to channel info */ + int channels_eof_expected; /* number of eofs to expect */ + int channels_eof_sent; /* counter to avoid loop checks */ + + int status; /* Raw status from waitpid(2), valid if exited */ + int exec_failed_errno; /* Holds errno from exec(2) if exec() failed */ + + flux_subprocess_state_t state; + flux_subprocess_state_t state_reported; /* for on_state_change */ + flux_watcher_t *state_prep_w; + flux_watcher_t *state_idle_w; + flux_watcher_t *state_check_w; + + bool completed; /* process has exited and i/o is complete */ + flux_watcher_t *completed_prep_w; + flux_watcher_t *completed_idle_w; + flux_watcher_t *completed_check_w; + + /* local */ + + /* fds[0] is parent/user, fds[1] is child */ + int sync_fds[2]; /* socketpair for fork/exec sync */ + flux_watcher_t *child_w; + + /* remote */ + + flux_future_t *f; /* primary future reactor */ + bool remote_completed; /* if remote has completed */ + int failed_errno; /* Holds errno if FAILED state reached */ +}; + +struct flux_subprocess_server { + int magic; + flux_t *h; + flux_reactor_t *r; + char *local_uri; + uint32_t rank; + zhash_t *subprocesses; + flux_msg_handler_t **handlers; +}; + +void subprocess_check_completed (flux_subprocess_t *p); + +void state_change_start (flux_subprocess_t *p); + +void channel_destroy (void *arg); + +struct subprocess_channel *channel_create (flux_subprocess_t *p, + flux_subprocess_output_f output_f, + const char *name, + int flags); + +#endif /* !_SUBPROCESS_PRIVATE_H */ diff --git a/src/common/subprocess/util.c b/src/common/subprocess/util.c new file mode 100644 index 000000000000..bfac83a5a508 --- /dev/null +++ b/src/common/subprocess/util.c @@ -0,0 +1,94 @@ 
+/*****************************************************************************\ + * Copyright (c) 2017 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/fdwalk.h" +#include "src/common/libutil/base64.h" + +#include "subprocess.h" +#include "subprocess_private.h" +#include "util.h" + +void init_pair_fds (int *fds) +{ + fds[0] = -1; + fds[1] = -1; +} + +void close_pair_fds (int *fds) +{ + if (!fds) + return; + if (fds[0] != -1) + close (fds[0]); + if (fds[1] != -1) + close (fds[1]); +} + +int cmd_option_bufsize (flux_subprocess_t *p, const char *name) +{ + char *var; + const char *val; + int rv = -1; + + if (asprintf (&var, "%s_BUFSIZE", name) < 0) { + log_err ("asprintf"); + goto cleanup; + } + + if ((val = flux_cmd_getopt (p->cmd, var))) { + char *endptr; + errno = 0; + rv = strtol (val, &endptr, 10); + if (errno + || endptr[0] != '\0' + || rv <= 0) { + rv = -1; + errno = EINVAL; + goto cleanup; + } + } + else + rv = SUBPROCESS_DEFAULT_BUFSIZE; + +cleanup: + free (var); + return rv; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/util.h b/src/common/subprocess/util.h new file mode 100644 index 000000000000..4baf474f2cdc --- /dev/null +++ b/src/common/subprocess/util.h @@ -0,0 +1,12 @@ +#ifndef _SUBPROCESS_UTIL_H +#define _SUBPROCESS_UTIL_H + +#include "subprocess.h" + +void init_pair_fds (int *fds); + +void close_pair_fds (int *fds); + +int cmd_option_bufsize (flux_subprocess_t *p, const char *name); + +#endif /* !_SUBPROCESS_UTIL_H */ From b27d541271c5d4176b61ebb8a366403bdd2c714c Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 13 Aug 2018 11:03:18 -0700 Subject: [PATCH 04/20] command/subprocess: Add subprocess unit tests --- src/common/subprocess/Makefile.am | 34 + src/common/subprocess/test/cmd.c | 227 ++++ src/common/subprocess/test/subprocess.c | 1575 +++++++++++++++++++++++ 
src/common/subprocess/test/test_echo.c | 146 +++ 4 files changed, 1982 insertions(+) create mode 100644 src/common/subprocess/test/cmd.c create mode 100644 src/common/subprocess/test/subprocess.c create mode 100644 src/common/subprocess/test/test_echo.c diff --git a/src/common/subprocess/Makefile.am b/src/common/subprocess/Makefile.am index b69cca70a8cf..91940e82e384 100644 --- a/src/common/subprocess/Makefile.am +++ b/src/common/subprocess/Makefile.am @@ -28,3 +28,37 @@ libsubprocess_la_SOURCES = \ fluxcoreinclude_HEADERS = \ subprocess.h + +TESTS = \ + test_cmd.t \ + test_subprocess.t + +check_PROGRAMS = \ + $(TESTS) \ + test_echo + +TEST_EXTENSIONS = .t +T_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ + $(top_srcdir)/config/tap-driver.sh + +test_ldadd = \ + $(top_builddir)/src/common/subprocess/libsubprocess.la \ + $(top_builddir)/src/common/libflux-internal.la \ + $(top_builddir)/src/common/libflux-core.la \ + $(top_builddir)/src/common/libtap/libtap.la + +test_cppflags = \ + $(AM_CPPFLAGS) \ + -I$(top_srcdir)/src/common/libtap + +test_cmd_t_SOURCES = test/cmd.c +test_cmd_t_CPPFLAGS = $(test_cppflags) +test_cmd_t_LDADD = $(test_ldadd) + +test_subprocess_t_SOURCES = test/subprocess.c +test_subprocess_t_CPPFLAGS = \ + -DTEST_SUBPROCESS_DIR=\"$(top_builddir)/src/common/subprocess/\" \ + $(test_cppflags) +test_subprocess_t_LDADD = $(test_ldadd) + +test_echo_SOURCES = test/test_echo.c diff --git a/src/common/subprocess/test/cmd.c b/src/common/subprocess/test/cmd.c new file mode 100644 index 000000000000..033dec2ede11 --- /dev/null +++ b/src/common/subprocess/test/cmd.c @@ -0,0 +1,227 @@ + +#include +#include + +#include "src/common/libtap/tap.h" +#include "src/common/subprocess/command.h" + +/* + * Check basic flux_cmd_create () with args + */ +void check_basic_create () +{ + char **av; + char * argv[] = { + "test", + "--option=foo", + "bar", + NULL + }; + int argc = (sizeof (argv)/sizeof (argv[0])) - 1; + char * env[] = { + "FOO=bar", + "PATH=/bin", + NULL + }; + 
flux_cmd_t *cmd; + + diag ("simple flux_cmd_create (argc, argv, env)"); + cmd = flux_cmd_create (argc, argv, env); + ok (cmd != NULL, "flux_cmd_create ()"); + av = flux_cmd_argv_expand (cmd); + ok (av != NULL, "flux_cmd_argv_expand ()"); + is (av[0], "test", "av[0] == test"); + is (av[1], "--option=foo", "av[1] == --option=foo"); + is (av[2], "bar", "av[2] == bar"); + ok (av[3] == NULL, "av[3] == NULL"); + free (av); + is (flux_cmd_getenv (cmd, "FOO"), "bar", "flux_cmd_getenv"); + is (flux_cmd_getenv (cmd, "PATH"), "/bin", "flux_cmd_getenv"); + + flux_cmd_destroy (cmd); +} + + +void check_empty_cmd_attributes (flux_cmd_t *cmd) +{ + char **argv, **env; + + ok (flux_cmd_argc (cmd) == 0, "flux_cmd_argc"); + + argv = flux_cmd_argv_expand (cmd); + ok (argv != NULL, "flux_cmd_argv_expand returned an argv"); + ok (argv[0] == NULL, "argv is properly NULL terminated"); + free (argv); + + env = flux_cmd_env_expand (cmd); + ok (env != NULL, "flux_cmd_env_expand works"); + ok (env[0] == NULL, "flux_cmd_env_expand properly terminates env"); + free (env); + + ok (flux_cmd_getcwd (cmd) == NULL, + "flux_cmd_getcwd returns NULL"); +} + +/* + * Set some basic known cmd attributes for testing + */ +void set_cmd_attributes (flux_cmd_t *cmd) +{ + assert (flux_cmd_argc (cmd) == 0); + + // Append to argv + ok (flux_cmd_argv_append (cmd, "command") >= 0, + "flux_cmd_argv_append"); + ok (flux_cmd_argv_append (cmd, "foo") >= 0, + "flux_cmd_argv_append"); + ok (flux_cmd_argv_append (cmd, "bar") >= 0, + "flux_cmd_argv_append"); + + // Test setenvf + ok (flux_cmd_setenvf (cmd, 0, "PATH", "/bin:/usr/bin") >= 0, + "flux_cmd_setenvf (PATH)"); + + ok (flux_cmd_setcwd (cmd, "/tmp") >= 0, + "flux_cmd_setcwd (/tmp)"); + ok (flux_cmd_add_channel (cmd, "MY_FD") >= 0, + "flux_cmd_add_channel"); + ok (flux_cmd_setopt (cmd, "OPTION", "VALUE") >= 0, + "flux_cmd_setopt"); +} + +/* set alternate way, to ensure alternate ways also work */ +void set_cmd_attributes2 (flux_cmd_t *cmd) +{ + char *env[] = { 
"PATH=/bin:/usr/bin", NULL }; + + ok (flux_cmd_set_env (cmd, env) == 0, + "flux_cmd_set_env"); +} + +void check_cmd_attributes (flux_cmd_t *cmd) +{ + char **argv, **env; + const char *arg = NULL; + + ok (flux_cmd_argc (cmd) == 3, "flux_cmd_argc"); + + argv = flux_cmd_argv_expand (cmd); + ok (argv != NULL, "flux_cmd_argv_expand returned an argv"); + ok (argv[3] == NULL, "argv is properly NULL terminated"); + is (argv[0], "command", "argv[0] is correct"); + is (argv[1], "foo", "argv[1] is correct"); + is (argv[2], "bar", "argv[2] is correct"); + free (argv); + + ok (flux_cmd_arg (cmd, 3) == NULL + && errno == EINVAL, + "flux_cmd_arg returns EINVAL on bad range"); + arg = flux_cmd_arg (cmd, 0); + ok (arg != NULL + && !strcmp (arg, "command"), + "flux_cmd_arg returns correct argv[0]"); + arg = flux_cmd_arg (cmd, 1); + ok (arg != NULL + && !strcmp (arg, "foo"), + "flux_cmd_arg returns correct argv[1]"); + arg = flux_cmd_arg (cmd, 2); + ok (arg != NULL + && !strcmp (arg, "bar"), + "flux_cmd_arg returns correct argv[2]"); + + is (flux_cmd_getenv (cmd, "PATH"), "/bin:/usr/bin", + "flux_cmd_getenv"); + + env = flux_cmd_env_expand (cmd); + ok (env != NULL, "flux_cmd_env_expand works"); + ok (env[1] == NULL, "flux_cmd_env_expand properly terminates env"); + is (env[0], "PATH=/bin:/usr/bin", + "first entry of env is as expected"); + free (env); + + is (flux_cmd_getcwd (cmd), "/tmp", + "flux_cmd_getcwd"); + is (flux_cmd_getopt (cmd, "OPTION"), "VALUE", + "flux_cmd_getopt (cmd, 'OPTION') == VALUE"); +} + +int main (int argc, char *argv[]) +{ + char *s; + flux_cmd_t *cmd, *copy; + + plan (NO_PLAN); + + diag ("Basic flux_cmd_create"); + check_basic_create (); + + diag ("Create a flux_cmd_t and fill it with known values"); + // Create an empty command then fill it with nonsense: + cmd = flux_cmd_create (0, NULL, NULL); + ok (cmd != NULL, "flux_cmd_create (0, NULL, NULL)"); + check_empty_cmd_attributes (cmd); + set_cmd_attributes (cmd); + + diag ("Ensure flux_cmd_t contains expected 
values and test interfaces"); + // Check the nonsense + check_cmd_attributes (cmd); + + set_cmd_attributes2 (cmd); + + diag ("Ensure flux_cmd_t contains expected values again"); + check_cmd_attributes (cmd); + + // Test unsetenv with throwaway var + diag ("Test setenv/getenv/unsetenv"); + ok (flux_cmd_setenvf (cmd, 1, "FOO", "%d", 42) >= 0, + "flux_cmd_setenvf (FOO=42)"); + is (flux_cmd_getenv (cmd, "FOO"), "42", + "flux_cmd_getenv (FOO) == 42"); + flux_cmd_unsetenv (cmd, "FOO"); + ok (flux_cmd_getenv (cmd, "FOO") == NULL, + "flux_cmd_unsetenv works"); + + // Test env overwrite + ok (flux_cmd_setenvf (cmd, 0, "FOO", "%d", 42) >= 0, + "flux_cmd_setenvf (FOO=42)"); + is (flux_cmd_getenv (cmd, "FOO"), "42", + "flux_cmd_getenv (FOO) == 42"); + ok (flux_cmd_setenvf (cmd, 0, "FOO", "%d", 24) < 0, + "flux_cmd_setenvf (FOO=24) no overwrite fails"); + ok (flux_cmd_setenvf (cmd, 1, "FOO", "%d", 24) >= 0, + "flux_cmd_setenvf (FOO=24, overwrite=true)"); + is (flux_cmd_getenv (cmd, "FOO"), "24", + "flux_cmd_getenv (FOO) == 24"); + flux_cmd_unsetenv (cmd, "FOO"); + + diag ("Copy a flux_cmd_t and and ensure it matches source cmd"); + copy = flux_cmd_copy (cmd); + ok (copy != NULL, "flux_cmd_copy"); + check_cmd_attributes (copy); + flux_cmd_destroy (copy); + + diag ("Convert flux_cmd_t to/from JSON"); + s = flux_cmd_tojson (cmd); + ok (s != NULL, "flux_cmd_tojson (%d bytes)", strlen (s)); + if (s) { + json_error_t error; + diag (s); + copy = flux_cmd_fromjson (s, &error); + free (s); + ok (copy != NULL, "flux_cmd_fromjson returned a new cmd"); + if (copy) { + check_cmd_attributes (copy); + flux_cmd_destroy (copy); + } + else + diag ("%d:%d: %s", error.line, error.column, error.text); + } + flux_cmd_destroy (cmd); + + done_testing (); + return 0; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/test/subprocess.c b/src/common/subprocess/test/subprocess.c new file mode 100644 index 000000000000..915db3c94ae3 --- /dev/null +++ 
b/src/common/subprocess/test/subprocess.c @@ -0,0 +1,1575 @@ +#include + +#include +#include +#include + +#include "src/common/libtap/tap.h" +#include "src/common/subprocess/subprocess.h" + +int completion_cb_count; +int completion_fail_cb_count; +int stdout_output_cb_count; +int stderr_output_cb_count; +int output_default_stream_cb_count; +int multiple_lines_stdout_output_cb_count; +int multiple_lines_stderr_output_cb_count; +int env_passed_cb_count; +int completion_sigterm_cb_count; +int stdout_eof_cb_count; +int stderr_eof_cb_count; +int state_change_cb_count; +int channel_fd_env_cb_count; +int channel_in_cb_count; +int channel_in_and_out_cb_count; +int multiple_lines_channel_cb_count; +int channel_nul_terminate_cb_count; + +static int fdcount (void) +{ + int fd, fdlimit = sysconf (_SC_OPEN_MAX); + int count = 0; + for (fd = 0; fd < fdlimit; fd++) { + if (fcntl (fd, F_GETFD) != -1) + count++; + } + return count; +} + +void completion_cb (flux_subprocess_t *p) +{ + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_EXITED, + "subprocess state == EXITED in completion handler"); + ok (flux_subprocess_status (p) != -1, + "subprocess status is valid"); + ok (flux_subprocess_exit_code (p) == 0, + "subprocess exit code is 0"); + completion_cb_count++; +} + +void test_basic (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd, *cmd2; + flux_reactor_t *r2; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + ok ((flux_subprocess_pid (p) > (pid_t) 0), + "flux_exec() started pid %ld", (pid_t) flux_subprocess_pid (p)); + ok ((cmd2 = flux_subprocess_get_cmd (p)) != NULL, + "flux_subprocess_get_cmd success"); + ok ((r2 = 
flux_subprocess_get_reactor (p)) != NULL, + "flux_subprocess_get_reactor success"); + ok (r == r2, + "flux_subprocess_get_reactor returns correct reactor"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_destroy (p); +} + +void completion_fail_cb (flux_subprocess_t *p) +{ + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_EXITED, + "subprocess state == EXITED in completion handler"); + ok (flux_subprocess_status (p) != -1, + "subprocess status is valid"); + ok (flux_subprocess_exit_code (p) == 1, + "subprocess exit code is 1"); + completion_fail_cb_count++; +} + +void test_basic_fail (flux_reactor_t *r) +{ + char *av[] = { "/bin/false", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_fail_cb + }; + completion_fail_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_fail_cb_count == 1, "completion fail callback called 1 time"); + flux_subprocess_destroy (p); +} + +void test_basic_errors (flux_reactor_t *r) +{ + flux_t *h_hack = (flux_t *)0x12345678; + char *avgood[] = { "/bin/true", NULL }; + char *avbad[] = { NULL }; + flux_cmd_t *cmd; + + ok (!flux_subprocess_server_start (NULL, NULL, NULL, 0) + && errno == EINVAL, + "flux_subprocess_server_start fails with NULL pointer inputs"); + ok (flux_exec (NULL, 0, NULL, NULL) == NULL + && errno == EINVAL, + "flux_exec fails with NULL pointer inputs"); + ok (flux_exec (r, 1234, NULL, NULL) == NULL + && errno == EINVAL, + "flux_exec fails with invalid flag"); + ok (flux_rexec (NULL, 0, 0, NULL, 
NULL) == NULL + && errno == EINVAL, + "flux_rexec fails with NULL pointer inputs"); + ok (flux_rexec (h_hack, 0, 1, NULL, NULL) == NULL + && errno == EINVAL, + "flux_rexec fails with invalid flag"); + + ok ((cmd = flux_cmd_create (0, avbad, NULL)) != NULL, + "flux_cmd_create with 0 args works"); + ok (flux_rexec (h_hack, 0, 0, cmd, NULL) == NULL + && errno == EINVAL, + "flux_rexec fails with cmd with zero args"); + flux_cmd_destroy (cmd); + + ok ((cmd = flux_cmd_create (1, avgood, NULL)) != NULL, + "flux_cmd_create with 1 arg works"); + ok (flux_rexec (h_hack, 0, 0, cmd, NULL) == NULL + && errno == EINVAL, + "flux_rexec fails with cmd with no cwd"); + flux_cmd_destroy (cmd); + + ok (flux_subprocess_write (NULL, "STDIN", "foo", 3) < 0 + && errno == EINVAL, + "flux_subprocess_write fails with NULL pointer inputs"); + ok (flux_subprocess_close (NULL, "STDIN") < 0 + && errno == EINVAL, + "flux_subprocess_close fails with NULL pointer inputs"); + ok (flux_subprocess_read (NULL, "STDOUT", -1, NULL) == NULL + && errno == EINVAL, + "flux_subprocess_read fails with NULL pointer inputs"); + ok (flux_subprocess_read_line (NULL, "STDOUT", NULL) == NULL + && errno == EINVAL, + "flux_subprocess_read_line fails with NULL pointer inputs"); + ok (flux_subprocess_kill (NULL, 0) == NULL + && errno == EINVAL, + "flux_subprocess_kill fails with NULL pointer inputs"); + ok ((int)flux_subprocess_state (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_state fails with NULL pointer inputs"); + ok (flux_subprocess_rank (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_rank fails with NULL pointer inputs"); + ok (flux_subprocess_fail_errno (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_fail_errno fails with NULL pointer inputs"); + ok (flux_subprocess_status (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_status fails with NULL pointer inputs"); + ok (flux_subprocess_exit_code (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_exit_code fails with NULL pointer inputs"); + ok
(flux_subprocess_signaled (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_signaled fails with NULL pointer inputs"); + ok (flux_subprocess_pid (NULL) < 0 + && errno == EINVAL, + "flux_subprocess_pid fails with NULL pointer inputs"); + ok (flux_subprocess_get_cmd (NULL) == NULL + && errno == EINVAL, + "flux_subprocess_get_cmd fails with NULL pointer inputs"); + ok (flux_subprocess_get_reactor (NULL) == NULL + && errno == EINVAL, + "flux_subprocess_get_reactor fails with NULL pointer inputs"); + ok (flux_subprocess_set_context (NULL, "foo", "bar") < 0 + && errno == EINVAL, + "flux_subprocess_set_context fails with NULL pointer inputs"); + ok (flux_subprocess_get_context (NULL, "foo") == NULL + && errno == EINVAL, + "flux_subprocess_get_context fails with NULL pointer inputs"); +} + +void test_errors (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + ok (flux_subprocess_write (p, NULL, NULL, 0) < 0 + && errno == EINVAL, + "flux_subprocess_write returns EINVAL on bad input"); + ok (flux_subprocess_write (p, "foo", "foo", 3) < 0 + && errno == EINVAL, + "flux_subprocess_write returns EINVAL on bad stream"); + ok (flux_subprocess_close (p, "foo") < 0 + && errno == EINVAL, + "flux_subprocess_close returns EINVAL on bad stream"); + ok (flux_subprocess_read (p, NULL, 0, NULL) == NULL + && errno == EINVAL, + "flux_subprocess_read returns EINVAL on bad input"); + ok (flux_subprocess_read (p, "foo", -1, NULL) == NULL + && errno == EINVAL, + "flux_subprocess_read returns EINVAL on bad stream"); + ok (flux_subprocess_read_line (p, "foo", NULL) == NULL + && errno == 
EINVAL, + "flux_subprocess_read_line returns EINVAL on bad stream"); + ok (flux_subprocess_kill (p, 0) == NULL + && errno == EINVAL, + "flux_subprocess_kill returns EINVAL on illegal signum"); + ok (flux_subprocess_rank (p) < 0, + "flux_subprocess_rank fails b/c subprocess is local"); + ok (flux_subprocess_fail_errno (p) < 0, + "subprocess fail errno fails b/c subprocess not failed"); + ok (flux_subprocess_status (p) < 0, + "subprocess status fails b/c subprocess not yet exited"); + ok (flux_subprocess_exit_code (p) < 0, + "subprocess exit_code fails b/c subprocess not yet exited"); + ok (flux_subprocess_signaled (p) < 0, + "subprocess signaled fails b/c subprocess not yet exited"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + + ok (flux_subprocess_write (p, NULL, "foo", 3) < 0 + && errno == EPIPE, + "flux_subprocess_write returns EPIPE b/c process already completed"); + + flux_subprocess_destroy (p); +} + +void output_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + char cmpbuf[1024]; + int lenp = 0; + int *counter; + + if (!strcasecmp (stream, "STDOUT")) + counter = &stdout_output_cb_count; + else if (!strcasecmp (stream, "STDERR")) + counter = &stderr_output_cb_count; + else { + ok (false, "unexpected stream %s", stream); + return; + } + + if ((*counter) == 0) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + sprintf (cmpbuf, "%s:hi\n", stream); + + ok (!strcmp (ptr, cmpbuf), + "flux_subprocess_read_line returned correct data", stream); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + (*counter)++; +} + +void test_basic_stdout (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "hi", NULL };
+ flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + stderr_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 2, "stdout output callback called 2 times"); + ok (stderr_output_cb_count == 0, "stderr output callback called 0 times"); + flux_subprocess_destroy (p); +} + +void test_basic_stderr (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-E", "hi", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stderr = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + stderr_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + ok ((flux_subprocess_pid (p) > (pid_t) 0), + "flux_exec() started pid %ld", (long) flux_subprocess_pid (p)); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 0, "stdout output callback called 0 times"); + ok (stderr_output_cb_count == 2, "stderr output callback called 2 times"); + flux_subprocess_destroy (p); +} + +void test_basic_stdout_and_stderr (flux_reactor_t *r) +{ + char *av[] = {
TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "-E", "hi", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (5, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_cb, + .on_stderr = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + stderr_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 2, "stdout output callback called 2 times"); + ok (stderr_output_cb_count == 2, "stderr output callback called 2 times"); + flux_subprocess_destroy (p); +} + +void test_basic_default_output (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "-E", "hi", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (5, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_destroy (p); +} + +void output_default_stream_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + char cmpbuf[1024]; + int lenp = 0; + + if (output_default_stream_cb_count == 0) { + ptr = flux_subprocess_read_line (p, NULL, &lenp); + ok (ptr != 
NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", "STDOUT"); + + sprintf (cmpbuf, "%s:hi\n", stream); + + ok (!strcmp (ptr, cmpbuf), + "flux_subprocess_read_line returned correct data", "STDOUT"); + } + else { + ptr = flux_subprocess_read (p, NULL, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", "STDOUT"); + } + + output_default_stream_cb_count++; +} + +void test_basic_stdout_default_stream (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "hi", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_default_stream_cb + }; + completion_cb_count = 0; + output_default_stream_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (output_default_stream_cb_count == 2, "stdout output default stream callback called 2 times"); + flux_subprocess_destroy (p); +} + +void test_basic_stdin (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "-E", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "STDIN", "hi", 2) == 2, + "flux_subprocess_write 
success"); + + ok (flux_subprocess_close (p, "STDIN") == 0, + "flux_subprocess_close success"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 2, "stdout output callback called 2 times"); + flux_subprocess_destroy (p); +} + +void test_basic_stdin_default_stream (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "-E", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, NULL, "hi", 2) == 2, + "flux_subprocess_write success"); + + ok (flux_subprocess_close (p, NULL) == 0, + "flux_subprocess_close success"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 2, "stdout output callback called 2 times"); + flux_subprocess_destroy (p); +} + +void output_no_newline_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + char cmpbuf[1024]; + int lenp = 0; + int *counter; + + if (!strcasecmp (stream, "STDOUT")) + counter = &stdout_output_cb_count; + else if (!strcasecmp (stream, "STDERR")) + counter = &stderr_output_cb_count; + else { + ok (false, "unexpected stream %s", stream); + return; + } + + if ((*counter) == 0) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read_line on %s read 0 lines", stream); + + ptr = flux_subprocess_read (p, 
stream, -1, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read on %s read success", stream); + + sprintf (cmpbuf, "%s:hi", stream); + + ok (!strcmp (ptr, cmpbuf), + "flux_subprocess_read returned correct data"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + (*counter)++; +} + +void test_basic_no_newline (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-P", "-O", "-E", "-n", "hi", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (6, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_no_newline_cb, + .on_stderr = output_no_newline_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + stderr_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (stdout_output_cb_count == 2, "stdout output callback called 2 times"); + ok (stderr_output_cb_count == 2, "stderr output callback called 2 times"); + flux_subprocess_destroy (p); +} + +void multiple_lines_output_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + int *counter; + + if (!strcasecmp (stream, "STDOUT")) + counter = &multiple_lines_stdout_output_cb_count; + else if (!strcasecmp (stream, "STDERR")) + counter = &multiple_lines_stderr_output_cb_count; + else { + ok (false, "unexpected stream %s", stream); + return; + } + + if ((*counter) == 0) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok 
(!strcmp (ptr, "foo\n"), + "flux_subprocess_read_line returned correct data", stream); + } + else if ((*counter) == 1) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strcmp (ptr, "bar\n"), + "flux_subprocess_read_line returned correct data"); + } + else if ((*counter) == 2) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strcmp (ptr, "bo\n"), + "flux_subprocess_read_line returned correct data", stream); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + (*counter)++; +} + +void test_basic_multiple_lines (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-O", "-E", "-n", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = multiple_lines_output_cb, + .on_stderr = multiple_lines_output_cb + }; + completion_cb_count = 0; + multiple_lines_stdout_output_cb_count = 0; + multiple_lines_stderr_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "STDIN", "foo\n", 4) == 4, + "flux_subprocess_write success"); + + ok (flux_subprocess_write (p, "STDIN", "bar\n", 4) == 4, + "flux_subprocess_write success"); + + ok (flux_subprocess_write (p, "STDIN", "bo\n", 3) == 3, + "flux_subprocess_write success"); + + ok (flux_subprocess_close (p, "STDIN") == 0, + "flux_subprocess_close success"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, 
"completion callback called 1 time"); + ok (multiple_lines_stdout_output_cb_count == 4, "stdout output callback called 4 times"); + ok (multiple_lines_stderr_output_cb_count == 4, "stderr output callback called 4 times"); + flux_subprocess_destroy (p); +} + +void test_write_after_close (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-O", "-E", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (3, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = output_cb + }; + completion_cb_count = 0; + stdout_output_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "STDIN", "hi", 2) == 2, + "flux_subprocess_write success"); + + ok (flux_subprocess_close (p, "STDIN") == 0, + "flux_subprocess_close success"); + + ok (flux_subprocess_write (p, "STDIN", "hi", 2) < 0 + && errno == EPIPE, + "flux_subprocess_write failed with EPIPE after a close"); + + flux_subprocess_destroy (p); +} + +void env_passed_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + ok (!strcasecmp (stream, "STDOUT"), + "env_passed_cb called with correct stream"); + + if (!env_passed_cb_count) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strncmp (ptr, "FOOBAR=foobaz", 13), + "environment variable FOOBAR in subprocess"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + env_passed_cb_count++; +} + +void test_env_passed (flux_reactor_t *r) +{ + char *av[] = { "/usr/bin/env", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, 
NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_setenvf (cmd, 1, "FOOBAR", "foobaz") == 0, + "flux_cmd_setenvf"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = env_passed_cb + }; + completion_cb_count = 0; + env_passed_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (env_passed_cb_count == 2, "env passed callback called 2 times"); + flux_subprocess_destroy (p); +} + +void completion_sigterm_cb (flux_subprocess_t *p) +{ + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_EXITED, + "subprocess state == EXITED in completion handler"); + ok (flux_subprocess_status (p) != -1, + "subprocess status is valid"); + ok (flux_subprocess_signaled (p) == SIGTERM, + "subprocess terminated by SIGTERM"); + flux_reactor_stop (flux_subprocess_get_reactor (p)); + completion_sigterm_cb_count++; +} + +void test_kill (flux_reactor_t *r) +{ + char *av[] = { "/bin/sleep", "1000", NULL }; + flux_cmd_t *cmd = NULL; + flux_subprocess_t *p = NULL; + flux_future_t *f = NULL; + + ok ((cmd = flux_cmd_create (2, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_sigterm_cb + }; + completion_sigterm_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + f = flux_subprocess_kill (p, SIGTERM); + ok (f != NULL, "flux_subprocess_kill returns future_t"); + ok (flux_future_wait_for (f, 0.)
== 0, + "future fulfilled immediately for local process"); + ok (flux_subprocess_kill (p, SIGINT) == NULL + && errno == EBUSY, + "flux_subprocess_kill returns EBUSY, trying to kill again"); + + ok (flux_future_get (f, NULL) == 0, "flux_future_get (f) returns 0"); + ok (flux_reactor_run (r, 0) == 0, "reactor_run exits normally"); + ok (completion_sigterm_cb_count == 1, "completion sigterm callback called 1 time"); + flux_subprocess_destroy (p); +} + +void eof_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + int *counter; + + if (!strcasecmp (stream, "STDOUT")) + counter = &stdout_eof_cb_count; + else if (!strcasecmp (stream, "STDERR")) + counter = &stderr_eof_cb_count; + else { + ok (false, "unexpected stream %s", stream); + return; + } + + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + + (*counter)++; +} + +void test_kill_eofs (flux_reactor_t *r) +{ + char *av[] = { "/bin/sleep", "1000", NULL }; + flux_cmd_t *cmd = NULL; + flux_subprocess_t *p = NULL; + flux_future_t *f = NULL; + + ok ((cmd = flux_cmd_create (2, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_sigterm_cb, + .on_stdout = eof_cb, + .on_stderr = eof_cb, + }; + completion_sigterm_cb_count = 0; + stdout_eof_cb_count = 0; + stderr_eof_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + f = flux_subprocess_kill (p, SIGTERM); + ok (f != NULL, "flux_subprocess_kill returns future_t"); + ok (flux_future_wait_for (f, 0.) 
== 0, + "future fulfilled immediately for local process"); + + ok (flux_future_get (f, NULL) == 0, "flux_future_get (f) returns 0"); + ok (flux_reactor_run (r, 0) == 0, "reactor_run exits normally"); + ok (completion_sigterm_cb_count == 1, "completion sigterm callback called 1 time"); + ok (stdout_eof_cb_count == 1, "stdout eof callback called 1 times"); + ok (stderr_eof_cb_count == 1, "stderr eof callback called 1 times"); + flux_subprocess_destroy (p); +} + +void state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + if (state_change_cb_count == 0) + ok (state == FLUX_SUBPROCESS_STARTED, + "subprocess state == STARTED in state change handler"); + else if (state_change_cb_count == 1) + ok (state == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING in state change handler"); + else + ok (state == FLUX_SUBPROCESS_EXITED, + "subprocess state == EXITED in state change handler"); + state_change_cb_count++; +} + +void test_state_change (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_change_cb + }; + completion_cb_count = 0; + state_change_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (state_change_cb_count == 3, "state change callback called 3 times"); + flux_subprocess_destroy (p); +} + +void test_state_strings (void) +{ + ok (!strcasecmp (flux_subprocess_state_string (FLUX_SUBPROCESS_INIT), "Init"), + "flux_subprocess_state_string returns correct string"); + ok (!strcasecmp (flux_subprocess_state_string 
(FLUX_SUBPROCESS_STARTED), "Started"), + "flux_subprocess_state_string returns correct string"); + ok (!strcasecmp (flux_subprocess_state_string (FLUX_SUBPROCESS_RUNNING), "Running"), + "flux_subprocess_state_string returns correct string"); + ok (!strcasecmp (flux_subprocess_state_string (FLUX_SUBPROCESS_EXITED), "Exited"), + "flux_subprocess_state_string returns correct string"); + ok (!strcasecmp (flux_subprocess_state_string (FLUX_SUBPROCESS_EXEC_FAILED), "Exec Failed"), + "flux_subprocess_state_string returns correct string"); + ok (!flux_subprocess_state_string (100), + "flux_subprocess_state_string returns NULL on bad state"); +} + +void test_exec_fail (flux_reactor_t *r) +{ + char *av_eacces[] = { "/", NULL }; + char *av_enoent[] = { "/usr/bin/foobarbaz", NULL }; + flux_cmd_t *cmd = NULL; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av_eacces, NULL)) != NULL, "flux_cmd_create"); + + p = flux_exec (r, 0, cmd, NULL); + ok (p == NULL + && errno == EACCES, + "flux_exec failed with EACCES"); + + flux_cmd_destroy (cmd); + + ok ((cmd = flux_cmd_create (1, av_enoent, NULL)) != NULL, "flux_cmd_create"); + + p = flux_exec (r, 0, cmd, NULL); + ok (p == NULL + && errno == ENOENT, + "flux_exec failed with ENOENT"); + + flux_cmd_destroy (cmd); +} + +void test_context (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + char *extra = "mydata"; + char *tmp; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + ok (flux_subprocess_set_context (p, "extra", extra) == 0, + "flux_subprocess_set_context success"); + ok ((tmp = flux_subprocess_get_context (p, "extra")) != NULL, + 
"flux_subprocess_get_context success"); + ok (tmp == extra, + "flux_subprocess_get_context returned correct pointer"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_destroy (p); +} + +void test_refcount (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + char *extra = "mydata"; + char *tmp; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + ok (flux_subprocess_set_context (p, "extra", extra) == 0, + "flux_subprocess_set_context success"); + flux_subprocess_ref (p); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_unref (p); + + /* normally this should fail, but we've increased the refcount so + * subprocess should not be destroyed */ + ok ((tmp = flux_subprocess_get_context (p, "extra")) != NULL, + "flux_subprocess_get_context success"); + ok (tmp == extra, + "flux_subprocess_get_context returned correct pointer"); + + flux_subprocess_unref (p); +} + +void channel_fd_env_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + ok (!strcasecmp (stream, "STDOUT"), + "channel_fd_env_cb called with correct stream"); + + if (!channel_fd_env_cb_count) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strncmp (ptr, "FOO_FD=", 7), + "environment variable FOO_FD created in subprocess"); + } + else { + ptr = flux_subprocess_read 
(p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + channel_fd_env_cb_count++; +} + +void test_channel_fd_env (flux_reactor_t *r) +{ + char *av[] = { "/usr/bin/env", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "FOO") == 0, + "flux_cmd_add_channel success adding channel FOO"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_stdout = channel_fd_env_cb + }; + completion_cb_count = 0; + channel_fd_env_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (channel_fd_env_cb_count == 2, "channel fd callback called 2 times"); + flux_subprocess_destroy (p); +} + +void channel_in_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + ok (!strcasecmp (stream, "STDOUT"), + "channel_in_cb called with correct stream"); + + if (!channel_in_cb_count) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp == 7, + "flux_subprocess_read_line on %s success", stream); + + ok (!memcmp (ptr, "foobar\n", 7), + "read on channel returned correct data"); + + ok (flux_subprocess_close (p, "TEST_CHANNEL") == 0, + "flux_subprocess_close success"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + channel_in_cb_count++; +} + +void test_channel_fd_in (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-c", "TEST_CHANNEL", "-O", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok 
((cmd = flux_cmd_create (4, av, NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "TEST_CHANNEL") == 0, + "flux_cmd_add_channel success adding channel TEST_CHANNEL"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_channel_out = NULL, + .on_stdout = channel_in_cb, + .on_stderr = flux_subprocess_output + }; + completion_cb_count = 0; + channel_in_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "TEST_CHANNEL", "foobar", 6) == 6, + "flux_subprocess_write success"); + + /* close after we get output */ + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (channel_in_cb_count == 2, "channel in callback called 2 times"); + flux_subprocess_destroy (p); +} + +void channel_in_and_out_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + ok (!strcasecmp (stream, "TEST_CHANNEL"), + "channel_in_and_out_cb called with correct stream"); + + if (!channel_in_and_out_cb_count) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp == 7, + "flux_subprocess_read_line on %s success", stream); + + ok (!memcmp (ptr, "foobaz\n", 7), + "read on channel returned correct data"); + + ok (flux_subprocess_close (p, "TEST_CHANNEL") == 0, + "flux_subprocess_close success"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + channel_in_and_out_cb_count++; +} + +void test_channel_fd_in_and_out (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-c", "TEST_CHANNEL", "-C", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (4, av, NULL)) 
!= NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "TEST_CHANNEL") == 0, + "flux_cmd_add_channel success adding channel TEST_CHANNEL"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_channel_out = channel_in_and_out_cb, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output + }; + completion_cb_count = 0; + channel_in_and_out_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "TEST_CHANNEL", "foobaz", 6) == 6, + "flux_subprocess_write success"); + + /* don't call flux_subprocess_close() here, we'll race with data + * coming back, call in callback */ + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (channel_in_and_out_cb_count == 2, "channel out callback called 2 times"); + flux_subprocess_destroy (p); +} + +void channel_multiple_lines_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + ok (!strcasecmp (stream, "TEST_CHANNEL"), + "channel_multiple_lines_cb called with correct stream"); + + if (multiple_lines_channel_cb_count == 0) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strcmp (ptr, "bob\n"), + "flux_subprocess_read_line returned correct data", stream); + } + else if (multiple_lines_channel_cb_count == 1) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strcmp (ptr, "dan\n"), + "flux_subprocess_read_line returned correct data %s", stream); + } + else if (multiple_lines_channel_cb_count == 2) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && 
lenp > 0, + "flux_subprocess_read_line on %s success", stream); + + ok (!strcmp (ptr, "jo\n"), + "flux_subprocess_read_line returned correct data", stream); + + ok (flux_subprocess_close (p, "TEST_CHANNEL") == 0, + "flux_subprocess_close success"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + multiple_lines_channel_cb_count++; +} + +void test_channel_multiple_lines (flux_reactor_t *r) +{ + char *av[] = { TEST_SUBPROCESS_DIR "test_echo", "-c", "TEST_CHANNEL", "-C", "-n", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (5, av, NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "TEST_CHANNEL") == 0, + "flux_cmd_add_channel success adding channel TEST_CHANNEL"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_channel_out = channel_multiple_lines_cb, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output + }; + completion_cb_count = 0; + multiple_lines_channel_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "TEST_CHANNEL", "bob\n", 4) == 4, + "flux_subprocess_write success"); + + ok (flux_subprocess_write (p, "TEST_CHANNEL", "dan\n", 4) == 4, + "flux_subprocess_write success"); + + ok (flux_subprocess_write (p, "TEST_CHANNEL", "jo\n", 3) == 3, + "flux_subprocess_write success"); + + /* don't call flux_subprocess_close() here, we'll race with data + * coming back, call in callback */ + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + ok (multiple_lines_channel_cb_count == 4, "channel output callback called 4 times"); + flux_subprocess_destroy (p); +} + +void 
channel_nul_terminate_cb (flux_subprocess_t *p, const char *stream) +{ + const char *ptr; + int lenp = 0; + + if (!channel_nul_terminate_cb_count) { + ptr = flux_subprocess_read_line (p, stream, &lenp); + ok (ptr != NULL + && lenp == 7, + "flux_subprocess_read_line on %s success", stream); + + ok (!memcmp (ptr, "foobaz\n\0", 8), + "read on channel returned correct data"); + + ok (flux_subprocess_close (p, "TEST_CHANNEL") == 0, + "flux_subprocess_close success"); + } + else { + ptr = flux_subprocess_read (p, stream, -1, &lenp); + ok (ptr != NULL + && lenp == 0, + "flux_subprocess_read on %s read EOF", stream); + } + + channel_nul_terminate_cb_count++; +} + +void test_bufsize (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "TEST_CHANNEL") == 0, + "flux_cmd_add_channel success adding channel TEST_CHANNEL"); + + ok (flux_cmd_setopt (cmd, "STDIN_BUFSIZE", "1024") == 0, + "flux_cmd_setopt set STDIN_BUFSIZE success"); + + ok (flux_cmd_setopt (cmd, "STDOUT_BUFSIZE", "1024") == 0, + "flux_cmd_setopt set STDOUT_BUFSIZE success"); + + ok (flux_cmd_setopt (cmd, "STDERR_BUFSIZE", "1024") == 0, + "flux_cmd_setopt set STDERR_BUFSIZE success"); + + ok (flux_cmd_setopt (cmd, "TEST_CHANNEL_BUFSIZE", "1024") == 0, + "flux_cmd_setopt set TEST_CHANNEL_BUFSIZE success"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_channel_out = flux_subprocess_output, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output + }; + completion_cb_count = 0; + p = flux_exec (r, 0, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 
time"); + flux_subprocess_destroy (p); +} + +void test_bufsize_error (flux_reactor_t *r) +{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + ok (flux_cmd_add_channel (cmd, "TEST_CHANNEL") == 0, + "flux_cmd_add_channel success adding channel TEST_CHANNEL"); + + ok (flux_cmd_setopt (cmd, "TEST_CHANNEL_BUFSIZE", "ABCD") == 0, + "flux_cmd_setopt set TEST_CHANNEL_BUFSIZE success"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_channel_out = flux_subprocess_output, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output + }; + p = flux_exec (r, 0, cmd, &ops); + ok (p == NULL + && errno == EINVAL, + "flux_exec fails with EINVAL due to bad bufsize input"); +} + +void test_flag_stdio_fallthrough (flux_reactor_t *r) +{ + char *av[] = { "echo", "foo", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + int lenp; + + ok ((cmd = flux_cmd_create (2, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + ok (flux_subprocess_write (p, "STDIN", "foo", 3) < 0 + && errno == EINVAL, + "flux_subprocess_write fails on STDIN"); + + ok (flux_subprocess_read (p, "STDOUT", -1, &lenp) == NULL + && errno == EINVAL, + "flux_subprocess_read fails on STDOUT"); + + ok (flux_subprocess_read (p, "STDERR", -1, &lenp) == NULL + && errno == EINVAL, + "flux_subprocess_read fails on STDERR"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_destroy (p); +} + +void test_flag_setpgrp (flux_reactor_t *r) 
+{ + char *av[] = { "/bin/true", NULL }; + flux_cmd_t *cmd; + flux_subprocess_t *p = NULL; + + ok ((cmd = flux_cmd_create (1, av, NULL)) != NULL, "flux_cmd_create"); + + flux_subprocess_ops_t ops = { + .on_completion = completion_cb + }; + completion_cb_count = 0; + p = flux_exec (r, FLUX_SUBPROCESS_FLAGS_SETPGRP, cmd, &ops); + ok (p != NULL, "flux_exec"); + + ok (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING, + "subprocess state == RUNNING after flux_exec"); + + int rc = flux_reactor_run (r, 0); + ok (rc == 0, "flux_reactor_run returned zero status"); + ok (completion_cb_count == 1, "completion callback called 1 time"); + flux_subprocess_destroy (p); +} + +int main (int argc, char *argv[]) +{ + flux_reactor_t *r; + int start_fdcount, end_fdcount; + + plan (NO_PLAN); + + // Create shared reactor for all tests + ok ((r = flux_reactor_create (FLUX_REACTOR_SIGCHLD)) != NULL, + "flux_reactor_create"); + + start_fdcount = fdcount (); + + diag ("basic"); + test_basic (r); + diag ("basic_fail"); + test_basic_fail (r); + diag ("basic_errors"); + test_basic_errors (r); + diag ("errors"); + test_errors (r); + diag ("basic_stdout"); + test_basic_stdout (r); + diag ("basic_stderr"); + test_basic_stderr (r); + diag ("basic_stdout_and_stderr"); + test_basic_stdout_and_stderr (r); + diag ("basic_default_output"); + test_basic_default_output (r); + diag ("basic_stdout_default_stream"); + test_basic_stdout_default_stream (r); + diag ("basic_stdin"); + test_basic_stdin (r); + diag ("basic_stdin_default_stream"); + test_basic_stdin_default_stream (r); + diag ("basic_no newline"); + test_basic_no_newline (r); + diag ("basic_multiple_lines"); + test_basic_multiple_lines (r); + diag ("write_after_close"); + test_write_after_close (r); + diag ("env_passed"); + test_env_passed (r); + diag ("kill"); + test_kill (r); + diag ("kill_eofs"); + test_kill_eofs (r); + diag ("state_change"); + test_state_change (r); + diag ("state_strings"); + test_state_strings (); + diag ("exec_fail"); + 
test_exec_fail (r); + diag ("context"); + test_context (r); + diag ("refcount"); + test_refcount (r); + diag ("channel_fd_env"); + test_channel_fd_env (r); + diag ("channel_fd_in"); + test_channel_fd_in (r); + diag ("channel_fd_in_and_out"); + test_channel_fd_in_and_out (r); + diag ("channel_multiple_lines"); + test_channel_multiple_lines (r); + diag ("bufsize"); + test_bufsize (r); + diag ("bufsize_error"); + test_bufsize_error (r); + diag ("flag_stdio_fallthrough"); + test_flag_stdio_fallthrough (r); + diag ("flag_setpgrp"); + test_flag_setpgrp (r); + + end_fdcount = fdcount (); + + ok (start_fdcount == end_fdcount, + "no file descriptors leaked"); + + flux_reactor_destroy (r); + done_testing (); + return 0; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/src/common/subprocess/test/test_echo.c b/src/common/subprocess/test/test_echo.c new file mode 100644 index 000000000000..59362735c58e --- /dev/null +++ b/src/common/subprocess/test/test_echo.c @@ -0,0 +1,146 @@ +/* simple tool that outputs args to stdout/stderr or both depending on + * options + */ +#include +#include +#include +#include +#include + +int out = 0; +int err = 0; +int channel_out = 0; +int no_newline = 0; +char *channel_name; +int channel = 0; +int fd = STDIN_FILENO; +int prefix = 0; + +void output (const char *str) +{ + if (channel && channel_out) { + char prefixbuf[1024]; + int plen; + + plen = sprintf (prefixbuf, + "%s%s%s", + prefix ? channel_name : "", + prefix ? ":" : "", + str); + if (write (fd, prefixbuf, plen) < 0) { + perror ("write"); + exit (1); + } + } + if (out) { + fprintf (stdout, + "%s%s", + prefix ? "STDOUT:" : "", + str); + fflush (stdout); + } + if (err) { + fprintf (stderr, + "%s%s", + prefix ? 
"STDERR:" : "", + str); + fflush (stderr); + } +} + +int +main (int argc, char *argv[]) +{ + int bytes = 0; + + while (1) { + int c = getopt (argc, argv, "c:COEnPb:"); + if (c < 0) + break; + + switch (c) { + case 'O': + out++; + break; + case 'E': + err++; + break; + case 'n': + no_newline++; + break; + case 'c': + channel++; + channel_name = optarg; + break; + case 'C': + channel_out++; + break; + case 'P': + prefix++; + break; + case 'b': + bytes = atoi (optarg); + break; + } + } + + if ((out + err) == 0 + && (!channel && !channel_out)) { + fprintf (stderr, "must specify a way to output"); + exit (1); + } + + if (channel) { + const char *fdstr; + char channelstr[1024]; + + sprintf (channelstr, "%s_FD", channel_name); + + if (!(fdstr = getenv (channelstr))) { + perror ("getenv"); + exit (1); + } + + fd = atoi (fdstr); + } + + if (optind != argc) { + while (optind < argc) { + char outbuf[1024]; + + sprintf (outbuf, + "%s%s", + argv[optind], + no_newline ? "" : "\n"); + + output (outbuf); + optind++; + } + } + else { + char buf[1024]; + int total = 0; + int len; + + memset (buf, '\0', 1024); + while ((len = read (fd, buf, 1024)) > 0) { + char outbuf[1024]; + + sprintf (outbuf, + "%s%s", + buf, + no_newline ? "" : "\n"); + + output (outbuf); + + total += len; + + if (bytes && total >= bytes) + break; + + memset (buf, '\0', 1024); + } + } + + exit (0); +} From 26a81a83e3e9aabdd3fb91a4f6dca341de1bedbf Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 13 Aug 2018 11:03:56 -0700 Subject: [PATCH 05/20] broker: Support new "exec2" module Add support of the new "exec2" module, which is similar to the old "exec" module but uses the new flux subprocess library. This module will be used for transition and eventually be renamed to "exec". 
--- src/broker/Makefile.am | 2 + src/broker/broker.c | 4 ++ src/broker/exec2.c | 84 ++++++++++++++++++++++++++++++++++++++++++ src/broker/exec2.h | 19 ++++++++++ 4 files changed, 109 insertions(+) create mode 100644 src/broker/exec2.c create mode 100644 src/broker/exec2.h diff --git a/src/broker/Makefile.am b/src/broker/Makefile.am index a8543c37eed2..ef82a91868fa 100644 --- a/src/broker/Makefile.am +++ b/src/broker/Makefile.am @@ -40,6 +40,8 @@ flux_broker_SOURCES = \ heaptrace.c \ exec.h \ exec.c \ + exec2.h \ + exec2.c \ ping.h \ ping.c \ rusage.h \ diff --git a/src/broker/broker.c b/src/broker/broker.c index 5cfe51b3c20d..7d01b54e01c9 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -82,6 +82,7 @@ #include "runlevel.h" #include "heaptrace.h" #include "exec.h" +#include "exec2.h" #include "ping.h" #include "rusage.h" #include "boot_config.h" @@ -611,6 +612,8 @@ int main (int argc, char *argv[]) log_err_exit ("sequence_hash_initialize"); if (exec_initialize (ctx.h, ctx.sm, rank, ctx.attrs) < 0) log_err_exit ("exec_initialize"); + if (exec2_initialize (ctx.h, rank, ctx.attrs) < 0) + log_err_exit ("exec2_initialize"); if (ping_initialize (ctx.h, "cmb") < 0) log_err_exit ("ping_initialize"); if (rusage_initialize (ctx.h, "cmb") < 0) @@ -1294,6 +1297,7 @@ static void cmb_disconnect_cb (flux_t *h, flux_msg_handler_t *mh, if (flux_msg_get_route_first (msg, &sender) == 0) { exec_terminate_subprocesses_by_uuid (h, sender); + exec2_terminate_subprocesses_by_uuid (h, sender); free (sender); } /* no response */ diff --git a/src/broker/exec2.c b/src/broker/exec2.c new file mode 100644 index 000000000000..0eb06310c33f --- /dev/null +++ b/src/broker/exec2.c @@ -0,0 +1,84 @@ +/*****************************************************************************\ + * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. 
+ * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include + +#include "src/common/subprocess/subprocess.h" +#include "src/common/subprocess/command.h" +#include "src/common/libutil/log.h" +#include "src/common/libutil/base64.h" + +#include "attr.h" +#include "exec2.h" + +static void exec2_finalize (void *arg) +{ + flux_subprocess_server_t *s = arg; + flux_subprocess_server_stop (s); +} + +int exec2_terminate_subprocesses_by_uuid (flux_t *h, const char *id) +{ + flux_subprocess_server_t *s = flux_aux_get (h, "flux::exec2"); + + if (!s) { + flux_log (h, LOG_DEBUG, "no server_ctx found"); + return -1; + } + + if (flux_subprocess_server_terminate_by_uuid (s, id) < 0) { + flux_log_error (h, "flux_subprocess_server_terminate_by_uuid"); + return -1; + } + + return 0; +} + +int exec2_initialize (flux_t *h, uint32_t rank, attr_t *attrs) +{ + flux_subprocess_server_t *s = NULL; + const char *local_uri; + + if (attr_get (attrs, "local-uri", 
&local_uri, NULL) < 0) + goto cleanup; + if (!(s = flux_subprocess_server_start (h, "cmb", local_uri, rank))) + goto cleanup; + flux_aux_set (h, "flux::exec2", s, exec2_finalize); + return 0; +cleanup: + flux_subprocess_server_stop (s); + return -1; +} + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/src/broker/exec2.h b/src/broker/exec2.h new file mode 100644 index 000000000000..404eaac71d51 --- /dev/null +++ b/src/broker/exec2.h @@ -0,0 +1,19 @@ +#ifndef BROKER_EXEC2_H +#define BROKER_EXEC2_H + +#include +#include +#include "src/common/subprocess/subprocess.h" +#include "attr.h" + +/* Kill any processes started by disconnecting client. + */ +int exec2_terminate_subprocesses_by_uuid (flux_t *h, const char *id); + +int exec2_initialize (flux_t *h, uint32_t rank, attr_t *attrs); + +#endif /* BROKER_EXEC2_H */ + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ From 3692c56534b6e199235d0fd7c0a9aa41deed23c0 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 13 Aug 2018 11:07:19 -0700 Subject: [PATCH 06/20] t/: Add flux subprocess remote tests Add unit tests to test the flux subprocess library's remote execution. 
--- t/Makefile.am | 23 ++++- t/rexec/rexec.c | 186 +++++++++++++++++++++++++++++++++++++++++ t/rexec/rexec_ps.c | 126 ++++++++++++++++++++++++++++ t/rexec/rexec_signal.c | 135 ++++++++++++++++++++++++++++++ t/t0005-rexec.t | 185 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 654 insertions(+), 1 deletion(-) create mode 100644 t/rexec/rexec.c create mode 100644 t/rexec/rexec_ps.c create mode 100644 t/rexec/rexec_signal.c create mode 100755 t/t0005-rexec.t diff --git a/t/Makefile.am b/t/Makefile.am index 0ca59b0dbeac..521858a88360 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -45,6 +45,7 @@ TESTS = \ t0003-module.t \ t0004-event.t \ t0005-exec.t \ + t0005-rexec.t \ t0007-ping.t \ t0008-attr.t \ t0009-dmesg.t \ @@ -140,6 +141,7 @@ check_SCRIPTS = \ t0003-module.t \ t0004-event.t \ t0005-exec.t \ + t0005-rexec.t \ t0007-ping.t \ t0008-attr.t \ t0009-dmesg.t \ @@ -228,7 +230,10 @@ check_PROGRAMS = \ request/treq \ barrier/tbarrier \ wreck/rcalc \ - reactor/reactorcat + reactor/reactorcat \ + rexec/rexec \ + rexec/rexec_signal \ + rexec/rexec_ps check_LTLIBRARIES = \ module/parent.la \ @@ -273,6 +278,7 @@ test_ldadd = \ $(top_builddir)/src/common/libflux-internal.la \ $(top_builddir)/src/common/libflux-core.la \ $(top_builddir)/src/common/libtap/libtap.la \ + $(top_builddir)/src/common/libflux-optparse.la \ $(ZMQ_LIBS) $(LIBPTHREAD) test_cppflags = \ @@ -427,3 +433,18 @@ reactor_reactorcat_SOURCES = reactor/reactorcat.c reactor_reactorcat_CPPFLAGS = $(test_cppflags) reactor_reactorcat_LDADD = \ $(test_ldadd) $(LIBDL) $(LIBUTIL) + +rexec_rexec_SOURCES = rexec/rexec.c +rexec_rexec_CPPFLAGS = $(test_cppflags) +rexec_rexec_LDADD = \ + $(test_ldadd) $(LIBDL) $(LIBUTIL) + +rexec_rexec_signal_SOURCES = rexec/rexec_signal.c +rexec_rexec_signal_CPPFLAGS = $(test_cppflags) +rexec_rexec_signal_LDADD = \ + $(test_ldadd) $(LIBDL) $(LIBUTIL) + +rexec_rexec_ps_SOURCES = rexec/rexec_ps.c +rexec_rexec_ps_CPPFLAGS = $(test_cppflags) +rexec_rexec_ps_LDADD = \ + $(test_ldadd) 
$(LIBDL) $(LIBUTIL) diff --git a/t/rexec/rexec.c b/t/rexec/rexec.c new file mode 100644 index 000000000000..00b49c38202b --- /dev/null +++ b/t/rexec/rexec.c @@ -0,0 +1,186 @@ +/*****************************************************************************\ + * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/read_all.h" +#include "src/common/subprocess/subprocess.h" + +extern char **environ; + +static struct optparse_option cmdopts[] = { + { .name = "rank", .key = 'r', .has_arg = 1, .arginfo = "rank", + .usage = "Specify rank for test" }, + { .name = "outputstates", .key = 's', .has_arg = 0, .arginfo = "NONE", + .usage = "Output state changes as they occur" }, + { .name = "stdin2stream", .key = 'i', .has_arg = 1, .arginfo = "CHANNEL", + .usage = "Read in stdin and forward to subprocess channel" }, + OPTPARSE_TABLE_END +}; + +optparse_t *opts; + +int exit_code = 0; + +void completion_cb (flux_subprocess_t *p) +{ + int ec = flux_subprocess_exit_code (p); + + if (ec > exit_code) + exit_code = ec; +} + +void state_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + if (optparse_getopt (opts, "outputstates", NULL) > 0) + printf ("%s\n", flux_subprocess_state_string (state)); + + if (state == FLUX_SUBPROCESS_EXEC_FAILED + || state == FLUX_SUBPROCESS_FAILED) { + fprintf (stderr, "rank %d: %s: %s\n", + flux_subprocess_rank (p), + flux_subprocess_state_string (state), + strerror (flux_subprocess_fail_errno (p))); + + /* just so we fail non-zero */ + if (!exit_code) + exit_code++; + } +} + +void stdin2stream (flux_subprocess_t *p, const char *stream) +{ + char *buf = NULL; + int tmp, len; + + if ((len = read_all (STDIN_FILENO, (void **)&buf)) < 0) + log_err_exit ("read_all"); + + if (len) { + if ((tmp = flux_subprocess_write (p, stream, buf, len)) < 0) + log_err_exit ("flux_subprocess_write"); + + if (tmp != len) + log_err_exit ("overflow in write"); + } + + /* do not close for channel, b/c can race w/ data coming back */ + if (!strcmp (stream, "STDIN")) { + if 
(flux_subprocess_close (p, stream) < 0) + log_err_exit ("flux_subprocess_close"); + } + + free (buf); +} + +int main (int argc, char *argv[]) +{ + flux_t *h; + flux_reactor_t *reactor; + flux_cmd_t *cmd; + char *cwd; + flux_subprocess_t *p = NULL; + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_cb, + .on_channel_out = NULL, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output, + }; + const char *optargp; + int optindex; + int rank = 0; + + log_init ("rexec"); + + opts = optparse_create ("rexec"); + if (optparse_add_option_table (opts, cmdopts) != OPTPARSE_SUCCESS) + log_msg_exit ("optparse_add_option_table"); + if ((optindex = optparse_parse_args (opts, argc, argv)) < 0) + exit (1); + + if (optparse_getopt (opts, "rank", &optargp) > 0) + rank = atoi (optargp); + + if (optindex == argc) { + optparse_print_usage (opts); + exit (1); + } + + /* all args to cmd */ + if (!(cmd = flux_cmd_create (argc - optindex, &argv[optindex], environ))) + log_err_exit ("flux_cmd_create"); + + if (!(cwd = get_current_dir_name ())) + log_err_exit ("get_current_dir_name"); + + if (flux_cmd_setcwd (cmd, cwd) < 0) + log_err_exit ("flux_cmd_setcwd"); + + if (optparse_getopt (opts, "stdin2stream", &optargp) > 0) { + if (strcmp (optargp, "STDIN") + && strcmp (optargp, "STDOUT") + && strcmp (optargp, "STDERR")) { + if (flux_cmd_add_channel (cmd, optargp) < 0) + log_err_exit ("flux_cmd_add_channel"); + ops.on_channel_out = flux_subprocess_output; + } + } + + if (!(h = flux_open (NULL, 0))) + log_err_exit ("flux_open"); + + if (!(reactor = flux_get_reactor (h))) + log_err_exit ("flux_get_reactor"); + + if (!(p = flux_rexec (h, rank, 0, cmd, &ops))) + log_err_exit ("flux_rexec"); + + if (optparse_getopt (opts, "stdin2stream", &optargp) > 0) + stdin2stream (p, optargp); + + if (flux_reactor_run (reactor, 0) < 0) + log_err_exit ("flux_reactor_run"); + + /* Clean up. 
+ */ + flux_subprocess_destroy (p); + flux_close (h); + log_fini (); + + return exit_code; +} + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/t/rexec/rexec_ps.c b/t/rexec/rexec_ps.c new file mode 100644 index 000000000000..91273de4879b --- /dev/null +++ b/t/rexec/rexec_ps.c @@ -0,0 +1,126 @@ +/*****************************************************************************\ + * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "src/common/libutil/log.h" +#include "src/common/libutil/read_all.h" +#include "src/common/subprocess/subprocess.h" + +extern char **environ; + +static struct optparse_option cmdopts[] = { + { .name = "rank", .key = 'r', .has_arg = 1, .arginfo = "rank", + .usage = "Specify rank for test" }, + OPTPARSE_TABLE_END +}; + +void output (int rank, json_t *procs) +{ + size_t index; + json_t *value; + + if (!json_is_array (procs)) + log_msg_exit ("procs returned is not an array"); + + json_array_foreach (procs, index, value) { + int pid; + char *sender; + + if (json_unpack (value, "{ s:i s:s }", + "pid", &pid, + "sender", &sender) < 0) + log_msg_exit ("json_unpack"); + + printf ("%s\t%d\t%d\n", sender, rank, pid); + } + +} + +int main (int argc, char *argv[]) +{ + flux_t *h; + flux_reactor_t *reactor; + const char *optargp; + int rank; + flux_future_t *f; + optparse_t *opts; + int optindex; + int resp_rank; + json_t *resp_procs; + + log_init ("rexec_ps"); + + opts = optparse_create ("rexec_ps"); + if (optparse_add_option_table (opts, cmdopts) != OPTPARSE_SUCCESS) + log_msg_exit ("optparse_add_option_table"); + if ((optindex = optparse_parse_args (opts, argc, argv)) < 0) + exit (1); + + if (optparse_getopt (opts, "rank", &optargp) > 0) { + rank = atoi (optargp); + } else { + optparse_print_usage (opts); + exit (1); + } + + if (!(h = flux_open (NULL, 0))) + log_err_exit ("flux_open"); + + if (!(reactor = flux_get_reactor (h))) + log_err_exit ("flux_get_reactor"); + + if (!(f = flux_rpc (h, "cmb.rexec.processes", NULL, rank, 0))) + log_err_exit ("flux_rpc"); + + if (flux_rpc_get_unpack (f, "{ s:i s:o }", + "rank", &resp_rank, + "procs", &resp_procs) < 0) + log_err_exit ("flux_rpc_get_unpack"); + + if (rank != resp_rank) + 
log_err_exit ("invalid rank returned = %d", resp_rank); + + output (rank, resp_procs); + + /* Clean up. + */ + flux_close (h); + log_fini (); + + return 0; +} + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/t/rexec/rexec_signal.c b/t/rexec/rexec_signal.c new file mode 100644 index 000000000000..b63da541abe0 --- /dev/null +++ b/t/rexec/rexec_signal.c @@ -0,0 +1,135 @@ +/*****************************************************************************\ + * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include + +#include "src/common/libutil/log.h" +#include "src/common/subprocess/subprocess.h" + +extern char **environ; + +int exit_code = 0; + +void completion_cb (flux_subprocess_t *p) +{ + int ec = flux_subprocess_exit_code (p); + int sig; + + if (ec > exit_code) + exit_code = ec; + + if ((sig = flux_subprocess_signaled (p)) < 0) + log_err_exit ("flux_subprocess_signaled"); + printf ("subprocess terminated by signal %d\n", sig); +} + +void signal_result (flux_future_t *f, void *arg) +{ + if (flux_future_get (f, NULL) < 0) + log_err_exit ("flux_subprocess_kill error"); + flux_future_destroy (f); +} + +void state_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + if (state == FLUX_SUBPROCESS_EXEC_FAILED + || state == FLUX_SUBPROCESS_FAILED) { + fprintf (stderr, "rank %d: %s: %s\n", + flux_subprocess_rank (p), + flux_subprocess_state_string (state), + strerror (flux_subprocess_fail_errno (p))); + + /* just so we fail non-zero */ + if (!exit_code) + exit_code++; + } + + if (state == FLUX_SUBPROCESS_RUNNING) { + flux_future_t *f; + if (!(f = flux_subprocess_kill (p, SIGTERM))) + log_err_exit ("flux_subprocess_kill"); + if (flux_future_then (f, -1., signal_result, p) < 0) + log_err_exit ("flux_future_then"); + } +} + +int main (int argc, char *argv[]) +{ + flux_t *h; + flux_reactor_t *reactor; + flux_cmd_t *cmd; + char *cwd; + flux_subprocess_t *p = NULL; + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_cb, + .on_channel_out = NULL, + .on_stdout = flux_subprocess_output, + .on_stderr = flux_subprocess_output, + }; + + log_init ("rexec_signal"); + + if (!(cmd = flux_cmd_create (argc - 1, &argv[1], environ))) + log_err_exit ("flux_cmd_create"); + + if (!(cwd = get_current_dir_name ())) + 
log_err_exit ("get_current_dir_name"); + + if (flux_cmd_setcwd (cmd, cwd) < 0) + log_err_exit ("flux_cmd_setcwd"); + + if (!(h = flux_open (NULL, 0))) + log_err_exit ("flux_open"); + + if (!(reactor = flux_get_reactor (h))) + log_err_exit ("flux_get_reactor"); + + /* always to rank 1 */ + if (!(p = flux_rexec (h, 1, 0, cmd, &ops))) + log_err_exit ("flux_rexec"); + + if (flux_reactor_run (reactor, 0) < 0) + log_err_exit ("flux_reactor_run"); + + /* Clean up. + */ + flux_subprocess_destroy (p); + flux_close (h); + log_fini (); + + return exit_code; +} + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/t/t0005-rexec.t b/t/t0005-rexec.t new file mode 100755 index 000000000000..5a6dc0a6ca90 --- /dev/null +++ b/t/t0005-rexec.t @@ -0,0 +1,185 @@ +#!/bin/sh +# + +test_description='Test broker rexec functionality + + +Test rexec functionality +' + +. `dirname $0`/sharness.sh +SIZE=4 +test_under_flux ${SIZE} minimal + +TEST_SUBPROCESS_DIR=${FLUX_BUILD_DIR}/src/common/subprocess + +test_expect_success 'basic rexec functionality (process success)' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec /bin/true +' + +test_expect_success 'basic rexec functionality (process fail)' ' + ! 
${FLUX_BUILD_DIR}/t/rexec/rexec /bin/false +' + +test_expect_success 'basic rexec - cwd correct' ' + (cd /tmp && + cwd=`${FLUX_BUILD_DIR}/t/rexec/rexec pwd` && + test "$cwd" = "/tmp") +' + +test_expect_success 'basic rexec - env passed through' ' + export FOO_BAR_BAZ=10 && + ${FLUX_BUILD_DIR}/t/rexec/rexec env > output && + grep "FOO_BAR_BAZ=10" output +' + +test_expect_success 'basic rexec functionality (echo stdout)' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec ${TEST_SUBPROCESS_DIR}/test_echo -P -O foobar.stdout > output && + echo "STDOUT:foobar.stdout" > expected && + test_cmp expected output +' + +test_expect_success 'basic rexec functionality (echo stderr)' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec ${TEST_SUBPROCESS_DIR}/test_echo -P -E foobar.stderr > output 2>&1 && + echo "STDERR:foobar.stderr" > expected && + test_cmp expected output +' + +test_expect_success 'basic rexec functionality (echo stdout/err)' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec ${TEST_SUBPROCESS_DIR}/test_echo -O -E foobar.stdouterr > output 2>&1 && + echo "foobar.stdouterr" > expected && + echo "foobar.stdouterr" >> expected && + test_cmp expected output +' + +test_expect_success 'basic rexec invalid rank' ' + ! ${FLUX_BUILD_DIR}/t/rexec/rexec -r 32 /bin/true > output 2>&1 && + grep -q "No route to host" output +' + +test_expect_success 'basic rexec fail exec()' ' + ! ${FLUX_BUILD_DIR}/t/rexec/rexec / > output 2>&1 && + grep -q "Permission denied" output +' + +test_expect_success 'basic rexec fail exec() EACCES' ' + ! ${FLUX_BUILD_DIR}/t/rexec/rexec / > output 2>&1 && + grep -q "Permission denied" output +' + +test_expect_success 'basic rexec fail exec() ENOENT' ' + ! 
${FLUX_BUILD_DIR}/t/rexec/rexec /usr/bin/foobarbaz > output 2>&1 && + grep -q "No such file or directory" output +' + +test_expect_success 'basic rexec propogates exit code()' ' + test_expect_code 0 ${FLUX_BUILD_DIR}/t/rexec/rexec /bin/true && + test_expect_code 1 ${FLUX_BUILD_DIR}/t/rexec/rexec /bin/false && + test_expect_code 2 ${FLUX_BUILD_DIR}/t/rexec/rexec sh -c "exit 2" && + test_expect_code 3 ${FLUX_BUILD_DIR}/t/rexec/rexec sh -c "exit 3" +' + +test_expect_success 'basic rexec functionality (check state changes)' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec -s /bin/true > output && + echo "Started" > expected && + echo "Running" >> expected && + echo "Exited" >> expected && + test_cmp expected output +' + +test_expect_success 'basic rexec fail exec() (check state changes)' ' + ! ${FLUX_BUILD_DIR}/t/rexec/rexec -s / > output && + echo "Started" > expected && + echo "Exec Failed" >> expected && + test_cmp expected output +' + +test_expect_success 'basic rexec stdin' ' + echo -n "hello" | ${FLUX_BUILD_DIR}/t/rexec/rexec -i STDIN ${TEST_SUBPROCESS_DIR}/test_echo -O -E > output 2>&1 && + echo "hello" > expected && + echo "hello" >> expected && + test_cmp expected output +' + +test_expect_success 'basic rexec stdin / stdout multiple lines' ' + /bin/echo -en "foo\nbar\nbaz\n" | ${FLUX_BUILD_DIR}/t/rexec/rexec -i STDIN ${TEST_SUBPROCESS_DIR}/test_echo -O -n > output 2>&1 && + echo "foo" > expected && + echo "bar" >> expected && + echo "baz" >> expected && + test_cmp expected output +' + +test_expect_success 'basic rexec stdin / stdout long lines' ' + dd if=/dev/urandom bs=4096 count=1 | base64 --wrap=0 >expected && + ${FLUX_BUILD_DIR}/t/rexec/rexec cat expected > output && + test_cmp expected output +' + +# pipe in /dev/null, we don't care about stdin for this test +test_expect_success 'rexec check channel FD created' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec -i TEST_CHANNEL /usr/bin/env < /dev/null > output 2>&1 && + grep "TEST_CHANNEL_FD=" output +' + +# execbasic does not close 
TEST_CHANNEL, so we tell test_echo max +# bytes we're feeding in +test_expect_success 'rexec channel input' ' + echo -n "foobar" | ${FLUX_BUILD_DIR}/t/rexec/rexec -i TEST_CHANNEL ${TEST_SUBPROCESS_DIR}/test_echo -c TEST_CHANNEL -P -O -b 6 > output 2>&1 && + echo "STDOUT:foobar" > expected && + test_cmp expected output +' + +# execbasic does not close TEST_CHANNEL, so we tell test_echo max +# bytes we're feeding in +test_expect_success 'rexec channel input and output' ' + echo -n "foobaz" | ${FLUX_BUILD_DIR}/t/rexec/rexec -i TEST_CHANNEL ${TEST_SUBPROCESS_DIR}/test_echo -c TEST_CHANNEL -P -C -b 6 > output 2>&1 && + echo "TEST_CHANNEL:foobaz" > expected && + test_cmp expected output +' + +# execbasic does not close TEST_CHANNEL, so we tell test_echo max +# bytes we're feeding in +test_expect_success 'rexec channel input and output multiple lines' ' + /bin/echo -en "foo\nbar\nbaz\n" | ${FLUX_BUILD_DIR}/t/rexec/rexec -i TEST_CHANNEL ${TEST_SUBPROCESS_DIR}/test_echo -c TEST_CHANNEL -C -n -b 6 > output 2>&1 && + echo "foo" > expected && + echo "bar" >> expected && + echo "baz" >> expected && + test_cmp expected output +' + +test_expect_success 'rexec kill' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec_signal /bin/sleep 10 > output 2>&1 && + grep "subprocess terminated by signal 15" output +' + +test_expect_success NO_CHAIN_LINT 'rexec ps works' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec -r 1 sleep 100 & + pid1=$! + ${FLUX_BUILD_DIR}/t/rexec/rexec -r 1 sleep 100 & + pid2=$! + sleep 1 && + ${FLUX_BUILD_DIR}/t/rexec/rexec_ps -r 1 > output && + count=`cat output | wc -l` && + kill -TERM $pid1 && + kill -TERM $pid2 && + test "$count" = "2" +' + +test_expect_success 'disconnect terminates all running processes' ' + ${FLUX_BUILD_DIR}/t/rexec/rexec -r 1 sleep 100 & + pid1=$! + ${FLUX_BUILD_DIR}/t/rexec/rexec -r 1 sleep 100 & + pid2=$! 
+ sleep 1 && + ${FLUX_BUILD_DIR}/t/rexec/rexec_ps -r 1 > output && + count=`cat output | wc -l` && + test "$count" = "2" && + sleep 1 && + kill -TERM $pid1 && + kill -TERM $pid2 && + ${FLUX_BUILD_DIR}/t/rexec/rexec_ps -r 1 > output && + count=`cat output | wc -l` && + test "$count" = "0" +' + +test_done From d7fab10d0275fdde4a00e09ba32fe84c40a2f39c Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 20 Aug 2018 16:21:36 -0700 Subject: [PATCH 07/20] modules/wreck/: Convert remote exec code Convert to use cmb.exec2 service via the new flux subprocess library, instead of using the cmb.exec service. --- src/modules/wreck/job.c | 262 ++++++++++++++++++++-------------------- 1 file changed, 133 insertions(+), 129 deletions(-) diff --git a/src/modules/wreck/job.c b/src/modules/wreck/job.c index 15b5c69819f5..49afdbcad68b 100644 --- a/src/modules/wreck/job.c +++ b/src/modules/wreck/job.c @@ -48,7 +48,7 @@ #include "src/common/libutil/log.h" #include "src/common/libutil/fdwalk.h" -#include "src/common/libsubprocess/zio.h" +#include "src/common/subprocess/subprocess.h" #include "rcalc.h" #include "wreck_job.h" @@ -537,190 +537,194 @@ static int flux_attr_get_int (flux_t *h, const char *attr, int *valp) return (0); } -static void spawn_io_cb (flux_t *h, struct wreck_job *job, - const flux_msg_t *msg) +static void completion_cb (flux_subprocess_t *p) { - const char *stream = "stdout"; - const char *json_str; - void *data = NULL; - int level = LOG_INFO; - int len; + flux_t *h; + struct wreck_job *job = NULL; + int tmp; - if (flux_msg_get_string (msg, &json_str) < 0) - return; + if (!(h = flux_subprocess_get_context (p, "handle"))) + goto cleanup; - if ((len = zio_json_decode (json_str, &data, NULL)) < 0) { - flux_log_error (h, "wrexecd: io decode"); + if (!(job = flux_subprocess_get_context (p, "job"))) { + flux_log_error (h, "%s: flux_subprocess_get_context", __FUNCTION__); + goto cleanup; } - if (len > 0) { - (void) flux_msg_unpack (msg, "{s:s}", "name", &stream); - if 
(strcmp (stream, "stderr") == 0) - level = LOG_ERR; - flux_log (h, level, "job%ju: wrexecd says: %s", - (uintmax_t) job->id, - (char *) data); + + /* skip output text if exit code == 0 */ + if (!(tmp = flux_subprocess_exit_code (p))) + goto cleanup; + + if (tmp > 0) { + flux_log_error (h, "job%ju: wrexecd: Exit %d", + (uintmax_t) job->id, tmp); } - free (data); - return; + else if (tmp < 0) { + if ((tmp = flux_subprocess_signaled (p)) < 0) + flux_log_error (h, "job%ju: unknown exit status", (uintmax_t) job->id); + else + flux_log_error (h, "job%ju: wrexecd: %s", + (uintmax_t) job->id, strsignal (tmp)); + + } + +cleanup: + wreck_job_destroy (job); + flux_subprocess_destroy (p); } -static void cmb_exec_cb (flux_future_t *f, void *arg) +static void state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t state) { - int64_t pid = 0; - const char *type = NULL; - const char *state = NULL; - int status = 1; - const flux_msg_t *msg; - flux_t *h = flux_future_get_flux (f); - struct wreck_job *job = arg; + flux_t *h; + struct wreck_job *job; - if (flux_future_get (f, (const void **)&msg) < 0) { - flux_log_error (h, "cmb_exec_cb: flux_future_get"); - flux_future_destroy (f); + if (!(h = flux_subprocess_get_context (p, "handle"))) return; - } - if (flux_msg_unpack (msg, "{s?s,s?s,s?i,s:i}", - "type", &type, "state", &state, - "status", &status, "pid", &pid) < 0) { - flux_log_error (h, "cmb_exec_cb: flux_msg_unpack"); - flux_future_destroy (f); + + if (!(job = flux_subprocess_get_context (p, "job"))) { + flux_log_error (h, "%s: flux_subprocess_get_context", __FUNCTION__); return; } - if (type && strcmp (type, "io") == 0) - spawn_io_cb (h, job, msg); - else if (state && strcmp (state, "Exited") == 0) { - if (WIFSIGNALED (status)) - flux_log_error (h, "job%ju: wrexecd: %s", - (uintmax_t) job->id, - strsignal (WTERMSIG (status))); - else if (WEXITSTATUS (status) != 0) - flux_log_error (h, "job%ju: wrexecd: Exit %d", - (uintmax_t) job->id, WEXITSTATUS (status)); - - 
wreck_job_destroy (job); - /* Done with this job, it is safe to destroy future */ - flux_future_destroy (f); - return; + // XXX: Update job state to failed + if (state == FLUX_SUBPROCESS_EXEC_FAILED) { + flux_log_error (h, "spawn: job%ju: wrexecd exec failure", (uintmax_t) job->id); + flux_subprocess_destroy (p); + } + else if (state == FLUX_SUBPROCESS_FAILED) { + flux_log_error (h, "spawn: job%ju: wrexecd failure", (uintmax_t) job->id); + flux_subprocess_destroy (p); } - else flux_log (h, LOG_ERR, "job%ju: unknown state %s", - (uintmax_t) job->id, state ? state : "NULL"); - flux_future_reset (f); - return; } -static json_t *wrexecd_cmdline_create (flux_t *h, struct wreck_job *job) +static void io_cb (flux_subprocess_t *p, const char *stream) { - json_t *o, *s; + int lenp = 0; + const char *ptr; + + if ((ptr = flux_subprocess_read_line (p, stream, &lenp)) + && lenp > 0) { + flux_t *h; + struct wreck_job *job; + int level = LOG_INFO; + + if (!(h = flux_subprocess_get_context (p, "handle"))) + return; + + if (!(job = flux_subprocess_get_context (p, "job"))) { + flux_log_error (h, "%s: flux_subprocess_get_context", __FUNCTION__); + return; + } + + if (!strcasecmp (stream, "STDERR")) + level = LOG_ERR; + + flux_log (h, level, + "job%ju: wrexecd says: %s", + (uintmax_t) job->id, ptr); + } +} + +static flux_cmd_t *wrexecd_cmd_create (flux_t *h, struct wreck_job *job) +{ + flux_cmd_t *cmd = NULL; const char *wrexecd_path; + char *cwd = NULL; char buf [4096]; int n; - if (!(wrexecd_path = flux_attr_get (h, "wrexec.wrexecd_path", NULL))) { - flux_log_error (h, "wrexecd_cmdline_create: flux_attr_get"); - return (NULL); - } - if (!(o = json_array ())) { - flux_log_error (h, "wrexecd_cmdline_create: json_array"); - return (NULL); + if (!(cmd = flux_cmd_create (0, NULL, NULL))) { + flux_log_error (h, "wrexecd_cmd_create: flux_cmd_create"); + goto error; } - if (!(s = json_string (wrexecd_path))) { - flux_log_error (h, "wrexecd_cmdline_create: json_string"); + if (!(wrexecd_path 
= flux_attr_get (h, "wrexec.wrexecd_path", NULL))) { + flux_log_error (h, "wrexecd_cmd_create: flux_attr_get"); goto error; } - if (json_array_append_new (o, s) < 0) { - json_decref (s); - flux_log_error (h, "wrexecd_cmdline_create: json_array_append_new"); + if (flux_cmd_argv_append (cmd, "%s", wrexecd_path) < 0) { + flux_log_error (h, "wrexecd_cmd_create: flux_cmd_argv_append"); goto error; } - n = snprintf (buf, sizeof(buf), "--lwj-id=%ju", (uintmax_t) job->id); if ((n >= sizeof (buf)) || (n < 0)) { flux_log_error (h, "failed to append id to cmdline for job%ju\n", (uintmax_t) job->id); goto error; } - json_array_append_new (o, json_string (buf)); - + if (flux_cmd_argv_append (cmd, "%s", buf) < 0) { + flux_log_error (h, "wrexecd_cmd_create: flux_cmd_argv_append"); + goto error; + } n = snprintf (buf, sizeof (buf), "--kvs-path=%s", job->kvs_path); if ((n >= sizeof (buf)) || (n < 0)) { flux_log_error (h, "failed to append kvspath to cmdline for job%ju\n", (uintmax_t) job->id); goto error; } - json_array_append_new (o, json_string (buf)); + if (flux_cmd_argv_append (cmd, "%s", buf) < 0) { + flux_log_error (h, "wrexecd_cmd_create: flux_cmd_argv_append"); + goto error; + } + /* flux_rexec() requires cwd to be set */ + if (!(cwd = get_current_dir_name ())) { + flux_log_error (h, "wrexecd_cmd_create: get_current_dir_name"); + goto error; + } + if (flux_cmd_setcwd (cmd, cwd) < 0) { + flux_log_error (h, "wrexecd_cmd_create: flux_cmd_setcwd"); + goto error; + } - return (o); + free (cwd); + return (cmd); error: - json_decref (o); + free (cwd); + flux_cmd_destroy (cmd); return (NULL); } -static void spawn_continuation (flux_future_t *f, void *arg) +static int spawn_exec_handler (flux_t *h, struct wreck_job *job) { - flux_t *h = flux_future_get_flux (f); - struct wreck_job *job = arg; - const char *state = NULL; - int rank; - pid_t pid; - - /* State should be either "Running" or "Exec Failed". 
In latter - * case the pid key will not be included in the message, which - * is why it is optional below - */ - if (flux_rpc_get_unpack (f, "{s:i,s?i,s:s}", - "rank", &rank, "pid", &pid, - "state", &state) < 0) { - flux_log_error (h, "spawn: rpc_unpack"); - goto err; - } - if (strcmp (state, "Exec Failure") == 0) { - flux_log_error (h, "spawn: job%ju: wrexecd exec failure", - (uintmax_t) job->id); - // XXX: Update job state to failed - goto err; - } - else if (strcmp (state, "Running") != 0) { - flux_log_error (h, "spawn: wrexecd for job %ju unexpected state %s", - (uintmax_t) job->id, state); - goto err; - } - - /* Reset future. setup continuation for remaining cmb.exec responses */ - flux_future_reset (f); - if (flux_future_then (f, -1., cmb_exec_cb, job) < 0) - flux_log_error (h, "spawn_continuation: flux_future_then"); - return; + flux_cmd_t *cmd = NULL; + flux_subprocess_t *p = NULL; + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_change_cb, + .on_channel_out = NULL, + .on_stdout = io_cb, + .on_stderr = io_cb + }; -err: - flux_future_destroy (f); - return; -} + if (!(cmd = wrexecd_cmd_create (h, job))) { + flux_log_error (h, "wrexecd_cmd_create"); + goto error; + } -static int spawn_exec_handler (flux_t *h, struct wreck_job *job) -{ - flux_future_t *f = NULL; - json_t *cmdline = wrexecd_cmdline_create (h, job); + if (!(p = flux_rexec (h, FLUX_NODEID_ANY, 0, cmd, &ops))) { + flux_log_error (h, "flux_rexec"); + goto error; + } - if (!cmdline) - return (-1); - if (!(f = flux_rpc_pack (h, "cmb.exec", FLUX_NODEID_ANY, 0, - "{s:o}", "cmdline", cmdline))) { - flux_log_error (h, "spawn_exec_handler: flux_rpc"); + if (flux_subprocess_set_context (p, "handle", h) < 0) { + flux_log_error (h, "flux_subprocess_set_context"); goto error; } - if (flux_future_then (f, -1., spawn_continuation, job) < 0) { - flux_log_error (h, "spawn_exec_handler: flux_future_then"); + + if (flux_subprocess_set_context (p, "job", job) < 0) { + 
flux_log_error (h, "flux_subprocess_set_context"); goto error; } + /* Take a reference on this job since it is now embedded in - * a future. + * a flux rexec. */ + flux_cmd_destroy (cmd); wreck_job_incref (job); return (0); + error: - json_decref (cmdline); - flux_future_destroy (f); + flux_cmd_destroy (cmd); + flux_subprocess_destroy (p); return (-1); } From 622156b96cada676383f59bc2267575b7e4972d1 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 21 Aug 2018 17:25:30 -0700 Subject: [PATCH 08/20] modules/cron/: Convert remote exec code Convert to use cmb.exec2 service via the new flux subprocess library, instead of using the cmb.exec service. --- src/modules/cron/cron.c | 25 ++- src/modules/cron/task.c | 415 +++++++++++++++++++--------------------- src/modules/cron/task.h | 4 +- 3 files changed, 212 insertions(+), 232 deletions(-) diff --git a/src/modules/cron/cron.c b/src/modules/cron/cron.c index 821e345580cc..71e5ece92c75 100644 --- a/src/modules/cron/cron.c +++ b/src/modules/cron/cron.c @@ -73,7 +73,7 @@ static int cron_entry_stop (cron_entry_t *e); static void cron_entry_completion_handler (flux_t *h, cron_task_t *t, void *arg); static void cron_entry_io_cb (flux_t *h, cron_task_t *t, void *arg, - bool is_stderr, void *data, int datalen, bool eof); + bool is_stderr, const char *data, int datalen, bool eof); static int cron_entry_run_task (cron_entry_t *e); static int cron_entry_defer (cron_entry_t *e); @@ -151,21 +151,28 @@ int cron_entry_schedule_task (cron_entry_t *e) /**************************************************************************/ -static void cron_entry_loglines (flux_t *h, cron_entry_t *e, int level, char *s) +static void cron_entry_logline (flux_t *h, cron_entry_t *e, int level, + const char *s) { - char *p, *saveptr = NULL; - while ((p = strtok_r (s, "\n", &saveptr))) { - flux_log (h, level, "cron-%ju[%s]: rank=%d: command=\"%s\": \"%s\"", - e->id, e->name, e->rank, e->command, p); - s = NULL; + /* XXX - this is awful, needs work */ + 
char *tmp = NULL; + if (strchr (s, '\n')) { + if ((tmp = strdup (s))) { + char *p = strchr (tmp, '\n'); + *p = '\0'; + } + s = tmp; } + flux_log (h, level, "cron-%ju[%s]: rank=%d: command=\"%s\": \"%s\"", + e->id, e->name, e->rank, e->command, s); + free (tmp); } static void cron_entry_io_cb (flux_t *h, cron_task_t *t, void *arg, - bool is_stderr, void *data, int datalen, bool eof) + bool is_stderr, const char *data, int datalen, bool eof) { if (data) - cron_entry_loglines (h, arg, is_stderr ? LOG_ERR : LOG_INFO, data); + cron_entry_logline (h, arg, is_stderr ? LOG_ERR : LOG_INFO, data); } /* Push task t onto the front of the completed tasks list for diff --git a/src/modules/cron/task.c b/src/modules/cron/task.c index c35749ad18d7..44504e666879 100644 --- a/src/modules/cron/task.c +++ b/src/modules/cron/task.c @@ -29,17 +29,18 @@ #include #include #include +#include #include +#include #include "src/common/libutil/log.h" -#include "src/common/libsubprocess/zio.h" +#include "src/common/subprocess/subprocess.h" #include "task.h" struct cron_task { flux_t * h; /* flux handle used to create this task */ - struct flux_match match; /* match object for message handler */ - flux_msg_handler_t * mh; /* msg handler specific to this task */ + flux_subprocess_t *p; /* flux subprocess */ int rank; /* rank on which task is being run */ pid_t pid; /* remote process id */ @@ -75,9 +76,7 @@ struct cron_task { void cron_task_destroy (cron_task_t *t) { - flux_msg_handler_stop (t->mh); - flux_msg_handler_destroy (t->mh); - t->mh = NULL; + flux_subprocess_destroy (t->p); flux_watcher_destroy (t->timeout_w); t->timeout_w = NULL; free (t->state); @@ -94,7 +93,6 @@ cron_task_t *cron_task_new (flux_t *h, cron_task_complete_f cb, void *arg) return NULL; } t->h = h; - t->match = FLUX_MATCH_RESPONSE; t->completion_cb = cb; t->arg = arg; clock_gettime (CLOCK_REALTIME, &t->createtime); @@ -132,72 +130,106 @@ static void cron_task_state_update (cron_task_t *t, const char *fmt, ...) 
va_end (ap); } -static int io_handler (flux_t *h, cron_task_t *t, const flux_msg_t *msg) +static void timeout_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) { - const char *stream = "stdout"; - const char *json_str; - void *data = NULL; - bool eof; - bool is_stderr = false; - int len; - - if (flux_msg_get_string (msg, &json_str) < 0) - return -1; + cron_task_t *t = arg; + t->timedout = 1; + if (t->timeout_cb) + t->timeout_cb (t->h, t, t->arg); + else + cron_task_kill (t, SIGTERM); +} - if ((len = zio_json_decode (json_str, &data, &eof)) < 0) { - flux_log_error (h, "io decode"); - free (data); - return (-1); +static void cron_task_timeout_start (cron_task_t *t) +{ + flux_watcher_t *w; + flux_reactor_t *r; + if (t->timeout <= 0.0) + return; + r = flux_get_reactor (t->h); + if (!(w = flux_timer_watcher_create (r, t->timeout, 0.0, timeout_cb, t))) { + flux_log_error (t->h, "task_timeout_start"); + return; } - (void) flux_msg_unpack (msg, "{s:s}", "name", &stream); - if (strcmp (stream, "stderr") == 0) - is_stderr = true; + flux_watcher_start (w); + t->timeout_w = w; +} - if (t->io_cb) - (*t->io_cb) (h, t, t->arg, is_stderr, data, len, eof); +void cron_task_set_timeout (cron_task_t *t, double to, cron_task_state_f cb) +{ + t->timeout_cb = cb; + t->timeout = to; + if (t->started) + cron_task_timeout_start (t); +} - if (eof) { - if (is_stderr) - t->stderr_closed = 1; - else - t->stdout_closed = 1; - } - free (data); - return (0); +static void cron_task_rexec_failed (cron_task_t *t, int errnum) +{ + t->rexec_failed = 1; + t->rexec_errno = errnum; + cron_task_state_update (t, "Rexec Failure"); } -static int state_handler (flux_t *h, cron_task_t *t, const flux_msg_t *msg) +static void cron_task_handle_completion (flux_subprocess_t *p, cron_task_t *t) { - const char *state; + clock_gettime (CLOCK_REALTIME, &t->endtime); + flux_watcher_destroy (t->timeout_w); + t->timeout_w = NULL; + flux_subprocess_destroy (t->p); + t->p = NULL; - if (flux_msg_unpack 
(msg, "{s:s}", "state", &state) < 0) { - flux_log_error (h, "unable to get exec state"); - return -1; - } - cron_task_state_update (t, state); + /* Call completion handler for this entry */ + if (t->completion_cb) + (*t->completion_cb) (t->h, t, t->arg); +} + +static void completion_cb (flux_subprocess_t *p) +{ + cron_task_t *t = flux_subprocess_get_context (p, "task"); + + assert (t); + + cron_task_handle_completion (p, t); +} + +static void state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + cron_task_t *t = flux_subprocess_get_context (p, "task"); + + assert (t); - if (strcmp (state, "Running") == 0) { + cron_task_state_update (t, flux_subprocess_state_string (state)); + + if (state == FLUX_SUBPROCESS_STARTED) { + t->started = 1; + clock_gettime (CLOCK_REALTIME, &t->starttime); + if (t->timeout >= 0.0) + cron_task_timeout_start (t); + } + else if (state == FLUX_SUBPROCESS_RUNNING) { clock_gettime (CLOCK_REALTIME, &t->runningtime); t->running = 1; - (void) flux_msg_unpack (msg, "{s:i, s:i}", - "pid", &t->pid, "rank", &t->rank); + t->pid = flux_subprocess_pid (p); + t->rank = flux_subprocess_rank (p); } - else if (strcmp (state, "Exec Failure") == 0) { - if (flux_msg_unpack (msg, "{s:i}", "exec_errno", &t->exec_errno) < 0) { - flux_log_error (h, - "cron task: state handler unable to get exec errno"); - t->exec_errno = 0; - } + else if (state == FLUX_SUBPROCESS_EXEC_FAILED) { + t->exec_errno = flux_subprocess_fail_errno (p); t->exited = 1; t->stderr_closed = t->stdout_closed = 1; + cron_task_handle_completion (p, t); errno = t->exec_errno; } - else if (strcmp (state, "Exited") == 0) { + else if (state == FLUX_SUBPROCESS_FAILED) { + cron_task_rexec_failed (t, flux_subprocess_fail_errno (p)); t->exited = 1; - if (flux_msg_unpack (msg, "{s:i}", "status", &t->status) < 0) { - flux_log_error (h, "cron task: state handler failed to get status"); - t->status = 0; - } + t->stderr_closed = t->stdout_closed = 1; + cron_task_handle_completion (p, t); + 
errno = t->rexec_errno; + } + else if (state == FLUX_SUBPROCESS_EXITED) { + t->exited = 1; + t->status = flux_subprocess_status (p); if (WIFSIGNALED (t->status)) cron_task_state_update (t, "%s", strsignal (WTERMSIG (t->status))); else if (WEXITSTATUS (t->status) != 0) @@ -205,227 +237,168 @@ static int state_handler (flux_t *h, cron_task_t *t, const flux_msg_t *msg) } if (t->state_cb) - (*t->state_cb) (h, t, t->arg); - - return (0); - + (*t->state_cb) (t->h, t, t->arg); } -static void cron_task_rexec_failed (cron_task_t *t, int errnum) +static void io_cb (flux_subprocess_t *p, const char *stream) { - t->rexec_failed = 1; - t->rexec_errno = errnum; - cron_task_state_update (t, "Rexec Failure"); -} + cron_task_t *t = flux_subprocess_get_context (p, "task"); + const char *ptr = NULL; + int lenp; + bool is_stderr = false; + bool eof = false; -static void cron_task_handle_completion (cron_task_t *t) -{ - clock_gettime (CLOCK_REALTIME, &t->endtime); - /* - * Disable message handling for this task. Task will be destroyed - * later. 
- */ - flux_msg_handler_stop (t->mh); - flux_msg_handler_destroy (t->mh); - t->mh = NULL; - flux_watcher_destroy (t->timeout_w); - t->timeout_w = NULL; + assert (t); - /* Call completion handler for this entry */ - if (t->completion_cb) - (*t->completion_cb) (t->h, t, t->arg); -} + if (!strcmp (stream, "STDERR")) + is_stderr = true; -static void exec_handler (flux_t *h, flux_msg_handler_t *w, - const flux_msg_t *msg, void *arg) -{ - struct cron_task *t = arg; - const char *json_str; - const char *topic; - const char *type; - json_t *resp = NULL; - json_error_t error; - - if (flux_response_decode (msg, &topic, &json_str) < 0) { - cron_task_rexec_failed (t, errno); - flux_log_error (h, "cron task: exec handler"); - goto out; - } - else if (!json_str || (resp = json_loads (json_str, 0, &error)) == NULL) { - errno = EPROTO; - cron_task_rexec_failed (t, errno); - flux_log_error (h, "cron task: json decode: %s", error.text); + if (!(ptr = flux_subprocess_read_line (p, stream, &lenp))) { + flux_log_error (t->h, "%s: flux_subprocess_read_line", __FUNCTION__); + return; } - else if (json_unpack (resp, "{s:s}", "type", &type) == 0 - && strcmp (type, "io") == 0) { - if (io_handler (h, t, msg) < 0) - goto out; + + if (!lenp) { + if (!(ptr = flux_subprocess_read (p, stream, -1, &lenp))) { + flux_log_error (t->h, "%s: flux_subprocess_read_line", __FUNCTION__); + return; + } + if (!lenp) + eof = true; } - else if (state_handler (h, t, msg) < 0) - goto out; - - if (cron_task_completed (t)) - cron_task_handle_completion (t); -out: - if (resp) - json_decref (resp); -} -static flux_msg_t *kill_request_create (cron_task_t *t, int sig) -{ - int e = 0; - flux_msg_t *msg; - char *s = NULL;; - json_t *o = json_pack ("{s:i, s:i}", "pid", t->pid, "signum", sig); - if (o == NULL) - return NULL; + if (t->io_cb && lenp) + (*t->io_cb) (t->h, t, t->arg, is_stderr, ptr, lenp, eof); - s = json_dumps (o, JSON_COMPACT); - if (!(msg = flux_request_encode ("cmb.exec.signal", s)) - || 
(flux_msg_set_nodeid (msg, t->rank, 0) < 0)) { - e = errno; - flux_msg_destroy (msg); - errno = e; - msg = NULL; + if (eof) { + if (is_stderr) + t->stderr_closed = 1; + else + t->stdout_closed = 1; } - json_decref (o); - free (s); - return (msg); } int cron_task_kill (cron_task_t *t, int sig) { flux_t *h = t->h; - flux_msg_t *msg; + flux_future_t *f; - if (!t->started || t->exited) { + if (!t->running || t->exited) { errno = EINVAL; return -1; } - msg = kill_request_create (t, sig); - if (!msg || flux_send (h, msg, 0) < 0) { - flux_log_error (h, "cron_task_kill"); + if (!(f = flux_subprocess_kill (t->p, sig))) { + flux_log_error (h, "%s: flux_subprocess_kill", __FUNCTION__); return (-1); } + /* ignore response */ + flux_future_destroy (f); return (0); } -static void timeout_cb (flux_reactor_t *r, flux_watcher_t *w, - int revents, void *arg) -{ - cron_task_t *t = arg; - t->timedout = 1; - if (t->timeout_cb) - t->timeout_cb (t->h, t, t->arg); - else - cron_task_kill (t, SIGTERM); -} - -static void cron_task_timeout_start (cron_task_t *t) -{ - flux_watcher_t *w; - flux_reactor_t *r; - if (t->timeout <= 0.0) - return; - r = flux_get_reactor (t->h); - if (!(w = flux_timer_watcher_create (r, t->timeout, 0.0, timeout_cb, t))) { - flux_log_error (t->h, "task_timeout_start"); - return; - } - flux_watcher_start (w); - t->timeout_w = w; -} - -void cron_task_set_timeout (cron_task_t *t, double to, cron_task_state_f cb) -{ - t->timeout_cb = cb; - t->timeout = to; - if (t->started) - cron_task_timeout_start (t); -} -static json_t *exec_request_create (struct cron_task *t, +static flux_cmd_t *exec_cmd_create (struct cron_task *t, const char *command, const char *cwd, json_t *env) { - json_t *o; - json_t *cmdline = NULL; - - if ((o = json_object ()) == NULL) - return NULL; + flux_cmd_t *cmd = NULL; + char *tmp_cwd = NULL; - if ((cmdline = json_pack ("[s,s,s]", "sh", "-c", command)) == NULL - || (json_object_set_new (o, "cmdline", cmdline) < 0)) { - json_decref (cmdline); - goto 
fail; + if (!(cmd = flux_cmd_create (0, NULL, NULL))) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_create"); + goto error; } - - if (cwd) { - json_t *x = json_string (cwd); - if (x == NULL || json_object_set_new (o, "cwd", x) < 0) { - json_decref (x); - goto fail; + if (flux_cmd_argv_append (cmd, "%s", "sh") < 0) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_argv_append"); + goto error; + } + if (flux_cmd_argv_append (cmd, "%s", "-c") < 0) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_argv_append"); + goto error; + } + if (flux_cmd_argv_append (cmd, "%s", command) < 0) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_argv_append"); + goto error; + } + if (!cwd) { + /* flux_rexec() requires a cwd */ + if (!(tmp_cwd = get_current_dir_name ())) { + flux_log_error (t->h, "exec_cmd_create: get_get_current_dir_name"); + goto error; + } + cwd = tmp_cwd; + } + if (flux_cmd_setcwd (cmd, cwd) < 0) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_setcwd"); + goto error; + } + if (env) { + /* obj is a JSON object */ + const char *key; + json_t *value; + + json_object_foreach(env, key, value) { + const char *value_str = json_string_value (value); + if (!value_str) { + flux_log_error (t->h, "exec_cmd_create: json_string_value"); + errno = EPROTO; + goto error; + } + if (flux_cmd_setenvf (cmd, 1, key, "%s", value_str) < 0) { + flux_log_error (t->h, "exec_cmd_create: flux_cmd_setenvf"); + goto error; + } } } - if (env && json_object_set (o, "env", env) < 0) - goto fail; - return (o); -fail: - json_decref (o); + free (tmp_cwd); + return (cmd); + error: + free (tmp_cwd); + flux_cmd_destroy (cmd); return (NULL); } int cron_task_run (cron_task_t *t, - int rank, const char *cmd, const char *cwd, + int rank, const char *command, const char *cwd, json_t *env) { flux_t *h = t->h; - json_t *req = NULL; - char *json_str = NULL; - flux_msg_t *msg = NULL; + flux_subprocess_t *p = NULL; + flux_cmd_t *cmd; + flux_subprocess_ops_t ops = { + .on_completion = 
completion_cb, + .on_state_change = state_change_cb, + .on_channel_out = NULL, + .on_stdout = io_cb, + .on_stderr = io_cb + }; int rc = -1; - t->match.matchtag = flux_matchtag_alloc (h, 0); - if (t->match.matchtag == FLUX_MATCHTAG_NONE) - return -1; - t->match.topic_glob = "cmb.exec"; - t->mh = flux_msg_handler_create (h, t->match, exec_handler, t); - if (!t->mh) - return -1; - - if (!(req = exec_request_create (t, cmd, cwd, env))) + if (!(cmd = exec_cmd_create (t, command, cwd, env))) goto done; - if (!(json_str = json_dumps (req, JSON_COMPACT))) - goto done; - if (!(msg = flux_request_encode ("cmb.exec", json_str))) - goto done; - if (flux_msg_set_nodeid (msg, rank, 0) < 0) - goto done; - if (flux_msg_set_matchtag (msg, t->match.matchtag) < 0) + + if (!(p = flux_rexec (h, rank, 0, cmd, &ops))) { + cron_task_rexec_failed (t, errno); goto done; - if ((rc = flux_send (h, msg, 0)) < 0) { - flux_log_error (h, "cron_task_run: flux_send"); + } + + if (flux_subprocess_set_context (p, "task", t) < 0) { + flux_log_error (h, "flux_subprocess_set_context"); goto done; } - t->started = 1; - clock_gettime (CLOCK_REALTIME, &t->starttime); - cron_task_state_update (t, "Started"); - rc = 0; - flux_msg_handler_start (t->mh); - if (t->timeout >= 0.0) - cron_task_timeout_start (t); + t->p = p; + rc = 0; done: - json_decref (req); - free (json_str); if (rc < 0) { t->rexec_errno = errno; cron_task_state_update (t, "Rexec Failure"); + flux_subprocess_destroy (p); } - flux_msg_destroy (msg); + flux_cmd_destroy (cmd); return (rc); } diff --git a/src/modules/cron/task.h b/src/modules/cron/task.h index f5dffc949d33..9393a4d31cc6 100644 --- a/src/modules/cron/task.h +++ b/src/modules/cron/task.h @@ -35,7 +35,7 @@ typedef struct cron_task cron_task_t; /* io callback fn for cron task */ typedef void (*cron_task_io_f) (flux_t *h, cron_task_t *t, void *arg, - bool is_stderr, void *data, int datalen, + bool is_stderr, const char *data, int datalen, bool eof); /* task state change handler for 
cron task, check state with @@ -77,7 +77,7 @@ void cron_task_set_timeout (cron_task_t *t, double to, cron_task_state_f cb); * efficiency). */ int cron_task_run (cron_task_t *t, - int rank, const char *cmd, + int rank, const char *command, const char *cwd, json_t *env); From a975b442dfb978e25ac183206133ddcd74fdc4d7 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 22 Aug 2018 14:19:43 -0700 Subject: [PATCH 09/20] t/t0015-cron.t: Update tests for subprocess change --- t/t0015-cron.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t/t0015-cron.t b/t/t0015-cron.t index 5c11bc459eeb..624cf888037e 100755 --- a/t/t0015-cron.t +++ b/t/t0015-cron.t @@ -118,8 +118,8 @@ test_expect_success 'cron entry launch failure recorded' ' sleep 0.1 && test_debug "flux cron dump ${id} >&2" && cron_entry_check ${id} stopped true && - cron_entry_check ${id} task.1.state "Rexec Failure" && - cron_entry_check ${id} task.1.rexec_errno 113 + cron_entry_check ${id} task.1.state "Exec Failure" && + cron_entry_check ${id} task.1.exec_errno 113 ' test_expect_success 'flux-cron event works' ' id=$(flux_cron event t.cron.trigger flux event pub t.cron.complete) && From c98b4231a75292ea31a994c7c846dda3658e3cc4 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 24 Aug 2018 08:00:35 -0700 Subject: [PATCH 10/20] cmd/flux-ps: Remove command and flux ps tests The original tool was developed primarily for testing. For the new cmb.exec2 broker, a test-specific ps tool was developed.
--- src/cmd/Makefile.am | 1 - src/cmd/flux-ps | 155 -------------------------------------------- t/t0005-exec.t | 64 ------------------ 3 files changed, 220 deletions(-) delete mode 100755 src/cmd/flux-ps diff --git a/src/cmd/Makefile.am b/src/cmd/Makefile.am index 87db8829331f..30923a8cb9dc 100644 --- a/src/cmd/Makefile.am +++ b/src/cmd/Makefile.am @@ -59,7 +59,6 @@ dist_fluxcmd_SCRIPTS = \ flux-wreckrun \ flux-wreck \ flux-exec \ - flux-ps \ flux-cron \ flux-aggregate \ flux-hostlist diff --git a/src/cmd/flux-ps b/src/cmd/flux-ps deleted file mode 100755 index 9de92d825f3b..000000000000 --- a/src/cmd/flux-ps +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env lua ---[[-------------------------------------------------------------------------- - * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at - * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). - * LLNL-CODE-658032 All rights reserved. - * - * This file is part of the Flux resource manager framework. - * For details, see https://github.com/flux-framework. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the license, or (at your option) - * any later version. - * - * Flux is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
- * See also: http://www.gnu.org/licenses/ - ---------------------------------------------------------------------------]] - -- - -- flux-ps: simple frontend for `cmb.processes` service - -- -------------------------------------------------------------------------------- --- Modules: -------------------------------------------------------------------------------- -local flux = require 'flux' -local posix = require 'flux.posix' -local hostlist = require 'flux.hostlist' - -local prog = string.match (arg[0], "([^/]+)$") -local shortprog = prog:match ("flux%-(.+)$") -local verbose = false - -local usage = -[[ -Usage: %s [OPTIONS] - -List subprocesses managed by flux-broker. - - -h, --help Display this message - -v, --verbose Be verbose. - -r, --rank=NODESET Target only ranks in NODESET - -]] - - --- --- Termination state needs to remain a global for access from --- signal handler functions. See setup_signal_handlers() below. --- -terminate = false - -------------------------------------------------------------------------------- --- Local functions: -------------------------------------------------------------------------------- --- --- -local function say (fmt, ...) - if not verbose then return end - io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) -end - -local function warn (fmt, ...) - io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) -end - -local function die (fmt, ...) 
- io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) - os.exit (1) -end - -local function display_usage () - io.stdout:write (string.format (usage, prog)) - os.exit (0) -end - -local function get_ranklist (f, r) - if r == "all" or not r then r = '0-'..f.size-1 end - return hostlist.new ('['..r..']') -end - -local header = "OWNER RANK PID COMMAND" -local fmt = "%-5.5s %8d %9d %s" -local function print_process_info (procs) - print (header) - for _,p in pairs (procs) do - print (fmt:format (p.sender or "none", p.rank, p.pid, p.cmdline[1])) - end -end -------------------------------------------------------------------------------- --- Main program: -------------------------------------------------------------------------------- --- Parse cmdline args: --- -local getopt = require 'flux.alt_getopt' .get_opts -local opts, optind = getopt (arg, "r:vh", - { rank = "r", verbose = "v", help = "h" }) - -if opts.h then display_usage () end -if opts.v then verbose = true end - --- Create new local broker connection --- -local f, err = flux.new() -if not f then die ("Connecting to flux failed: %s\n", err) end - -local ranks = get_ranklist (f, opts.r) -local procs = {} -local size = #ranks -local count = 0 - --- Set up msghandler for process listing responses --- -local mh, err = f:msghandler { - pattern = "*.processes", - msgtypes = { flux.MSGTYPE_RESPONSE }, - - handler = function (f, msg, mh) - if msg.errnum ~= 0 then - warn ("Error: %s\n", posix.errno (msg.errnum)) - elseif not msg.data then - warn ("Error: empty message!\n") - else - local resp = msg.data - local rank = resp.rank - for _,p in pairs (resp.procs) do - p.rank = rank - table.insert (procs, p) - end - end - count = count + 1 - if count == size then f:reactor_stop() end - end - -} - --- Send requests to configured ranks --- -for i in ranks:next() do - local matchtag, err = f:send ("cmb.processes", {}, i ) - if not matchtag then error (err) end -end - --- Begin reactor loop: --- -local r = f:reactor() - 
-print_process_info (procs) - --- vi: ts=4 sw=4 expandtab diff --git a/t/t0005-exec.t b/t/t0005-exec.t index c934647981f4..fa4366d019af 100755 --- a/t/t0005-exec.t +++ b/t/t0005-exec.t @@ -159,70 +159,6 @@ test_expect_success 'signal forwarding works' ' test_expect_code 143 run_timeout 5 ./test_signal.sh TERM ' -flux_exec_bg() { flux exec "$@" stderr && - grep "No route to host" stderr -' - -test_expect_success 'process listing with valid and invalid ranks' ' - flux_exec_bg -r 0,$(invalid_rank) sleep 100 && - q=$lastpid && - sleep 1 && - flux ps -r 0,$(invalid_rank) 1> stdout 2> stderr && - count1=$(grep -c sleep stdout) && - count2=$(grep -c "No route to host" stderr) && - kill -INT $q && - test "$count1" = "1" && - test "$count2" = "1" && - test_expect_code 130 wait $q && - test "$(flux ps -r 0,$(invalid_rank) | grep -c sleep)" = "0" -' - -test_expect_success 'flux-exec disconnect terminates all running processes' ' - flux_exec_bg -r0-3 sleep 100 && - q=$lastpid && - sleep 1 && - count=$(flux ps | grep -c sleep) && - kill -9 $q && - test "$count" = "4" && - test_expect_code 137 wait $q && - test "$(flux ps | grep -c sleep)" = "0" -' - test_expect_success 'flux-exec: stdin bcast' ' count=$(echo Hello | flux exec -r0-3 cat | grep -c Hello) && test "$count" = "4" From c7ea138c508211ed5c52f25f0c7f464e985f2f6a Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 22 Aug 2018 16:26:10 -0700 Subject: [PATCH 11/20] cmd/flux-exec: Replace flux-exec command. Replace flux-exec command with a new flux-exec command based on the new subprocess library. 
--- src/cmd/Makefile.am | 4 +- src/cmd/flux-exec | 298 -------------------------------------- src/cmd/flux-exec.c | 345 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 347 insertions(+), 300 deletions(-) delete mode 100755 src/cmd/flux-exec create mode 100644 src/cmd/flux-exec.c diff --git a/src/cmd/Makefile.am b/src/cmd/Makefile.am index 30923a8cb9dc..8d680b9b3a24 100644 --- a/src/cmd/Makefile.am +++ b/src/cmd/Makefile.am @@ -58,7 +58,6 @@ dist_fluxcmd_SCRIPTS = \ flux-submit \ flux-wreckrun \ flux-wreck \ - flux-exec \ flux-cron \ flux-aggregate \ flux-hostlist @@ -73,7 +72,8 @@ fluxcmd_PROGRAMS = \ flux-kvs \ flux-start \ flux-jstat \ - flux-job + flux-job \ + flux-exec if ENABLE_JOBSPEC fluxcmd_PROGRAMS += \ diff --git a/src/cmd/flux-exec b/src/cmd/flux-exec deleted file mode 100755 index 0e3221414b2e..000000000000 --- a/src/cmd/flux-exec +++ /dev/null @@ -1,298 +0,0 @@ -#!/usr/bin/env lua - -------------------------------------------------------------------------------- --- Modules: -------------------------------------------------------------------------------- -local flux = require 'flux' -local decode = require 'flux.base64' .decode -local encode = require 'flux.base64' .encode -local posix = require 'flux.posix' -local timer = require 'flux.timer' -local hostlist = require 'flux.hostlist' - -local prog = string.match (arg[0], "([^/]+)$") -local shortprog = prog:match ("flux%-(.+)$") -local verbose = false - - --- --- Termination state needs to remain a global for access from --- signal handler functions. See setup_signal_handlers() below. --- -terminate = false - -------------------------------------------------------------------------------- --- Local functions: -------------------------------------------------------------------------------- --- --- -local function say (fmt, ...) - if not verbose then return end - io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) -end - -local function warn (fmt, ...) 
- io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) -end - -local function die (fmt, ...) - io.stderr:write (string.format ("%s: "..fmt, shortprog, ...)) - os.exit (1) -end - -local function program_state_create (f, n) - local f = f - local s = { - size = n or 1, - nexited = 0, - nstarted = 0, - nclosed = { stdout = 0, stderr = 0 }, - running = {}, - status = {}, - code = {}, - matchtag = {}, - } - local T = {} - - local function write (data) - if data.data then data.data = encode (data.data) end - for rank,pid in pairs(s.running) do - f:send ("cmb.exec.write", { pid = pid, stdin = data }, rank) - end - end - - local function activate_stdin () - f:iowatcher { - fd = posix.fileno (io.input()), - handler = function (iow, data) - write (data) - end - } - end - - function increment_nstarted () - s.nstarted = s.nstarted + 1 - if s.nstarted == s.size then - activate_stdin () - end - end - - - function T.size (n) - if n then s.size = n end - return s.size - end - function T.matchtag (mt, rank) - if rank then - s.matchtag [mt] = rank - return rank - else - return s.matchtag [mt] - end - end - function T.exited (resp) - s.nexited = s.nexited + 1 - s.code [resp.rank] = resp.code - s.status [resp.rank] = resp.status - end - function T.started (rank, pid) - increment_nstarted () - s.running [rank] = pid - end - function T.killall (f, signum) - say ("sending signal %d to %d running processes\n", - signum, s.nstarted - s.nexited) - for rank,pid in pairs (s.running) do - local mt, err = f:send ("cmb.exec.signal", - { pid = pid, signum = signum }, - rank) - if not mt then say ("failed to signal rank %d: %s\n", rank, err) end - end - end - function T.failed (rank, errnum) - increment_nstarted () - s.nexited = s.nexited + 1 - s.nclosed.stdout = s.nclosed.stdout + 1 - s.nclosed.stderr = s.nclosed.stderr + 1 - s.code [rank] = 68 -- EX_NOHOST - s.status [rank] = 68 - end - function T.eof (rank, name) - s.nclosed [name] = s.nclosed [name] + 1 - end - function T.complete () - 
if s.nexited == s.size and - s.nclosed.stdout == s.size and - s.nclosed.stderr == s.size then - return true - end - return false - end - function T.status (rank) - if rank then return s.status [rank] end - local rv = 0 - for k,v in pairs (s.status) do - if v > rv then rv = v end - end - return rv - end - function T.exit_code (rank) - if rank then return s.code [rank] end - local rv = 0 - for k,v in pairs (s.code) do - if v > rv then rv = v end - end - return rv - end - return T -end - -local function get_ranklist (f, r) - if r == "all" or not r then r = '0-'..f.size-1 end - return hostlist.new ('['..r..']') -end - -------------------------------------------------------------------------------- --- Main program: -------------------------------------------------------------------------------- --- Parse cmdline args: --- -local getopt = require 'flux.alt_getopt' .get_opts -local opts, optind = getopt (arg, "d:r:vln", - { rank = "r", verbose = "v", dir = "d", labelio = "l", - noinput = 'n' }) - -if opts.v then verbose = true end -if opts.n then io.input ("/dev/null") end -if not arg[optind] then die ("Command to run required\n") end -local cmdline = {} -for i = optind, #arg do - table.insert (cmdline, arg[i]) -end - -local sigtimer - --- Start in-program timer: -local tt = timer.new() -local t = timer.new() - --- Create new connection to local cmbd: --- -local f, err = flux.new() -if not f then die ("Connecting to flux failed: %s\n", err) end - -local ranks = get_ranklist (f, opts.r) -local state = program_state_create (f, #ranks) - - --- Set up msghandler for exec responses --- -local mh, err = f:msghandler { - pattern = "*.exec", - msgtypes = { flux.MSGTYPE_RESPONSE }, - - handler = function (f, zmsg, mh) - if zmsg.errnum ~= 0 then - local rank = state.matchtag (zmsg.matchtag) - warn ("Error: rank %d: %s\n", rank, posix.errno (zmsg.errnum)) - state.failed (rank, zmsg.errnum) - if state.complete() then - f:reactor_stop () - end - return - end - - local resp = 
zmsg.data - if not resp then return end - --say ("%03fms: rank %d %s\n", t:get0() * 1000, resp.rank or -1, resp.state or "error") - -- - if resp.type == "io" then - local dst = resp.name == "stdout" and io.stdout or io.stderr - if resp.data then - local lines = decode (resp.data) - if opts.l then - lines:gsub ('([^\n]+\n?)', function (s) - dst:write (resp.rank..": "..s) - end) - else - dst:write (lines) - end - end - if resp.eof then - state.eof (resp.rank, resp.name) - --io.close (dst) - if state.complete() then f:reactor_stop () end - end - elseif resp.state == "Running" then - state.started (resp.rank, resp.pid) - elseif resp.state == "Exited" or resp.state == "Exec Failure" then - if resp.state == "Exec Failure" then - warn ("Error: rank %d: %s\n", - resp.rank, posix.errno (resp.exec_errno)) - end - state.exited (resp) - if state.complete() then - f:reactor_stop () - end - else - warn ("got unexpected msg!\n") - end - end - -} - -local s, err = f:sighandler { - sigmask = { posix.SIGINT, posix.SIGTERM }, - handler = function (f, s, sig) - terminate = true - state.killall (f, sig) - f:reactor_stop() - end -} - - --- Begin reactor loop: --- -local sigtimer = nil - -t:set() -say ("%03fms: Starting %s on %s\n", t:get0() * 1000, cmdline[1], tostring(ranks)) - -local env = posix.getenv() - -local cwd = opts.d or posix.getcwd() - -local msg = { - cmdline = cmdline, - env = env, - cwd = cwd -} - -for i in ranks:next() do - local matchtag, err = f:send ("cmb.exec", msg, i ) - if not matchtag then error (err) end - state.matchtag (matchtag, i) -end -say ("%03fms: Sent all requests\n", t:get0() * 1000) - -repeat - local r = f:reactor() - if not terminate then break end - -- - -- If we catch a signal then lwj:watch() will be interrupted. - -- Check to see if we should terminate the job now: - -- - if not sigtimer then - sigtimer = timer.new() - sigtimer:get() - elseif sigtimer:get() < 1.0 then - say ("Detaching from job. 
Processes may still be running\n"); - os.exit (0); - end - terminate = false -until false - -say ("%03fms: %d tasks complete with code %d\n", t:get0() * 1000, state.size(), state.exit_code()) -os.exit (state.exit_code()) - --- vi: ts=4 sw=4 expandtab diff --git a/src/cmd/flux-exec.c b/src/cmd/flux-exec.c new file mode 100644 index 000000000000..38ab185760fd --- /dev/null +++ b/src/cmd/flux-exec.c @@ -0,0 +1,345 @@ +/*****************************************************************************\ + * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at + * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). + * LLNL-CODE-658032 All rights reserved. + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the license, or (at your option) + * any later version. + * + * Flux is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * See also: http://www.gnu.org/licenses/ +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/common/libutil/xzmalloc.h" +#include "src/common/libutil/monotime.h" +#include "src/common/libutil/nodeset.h" +#include "src/common/libutil/log.h" +#include "src/common/subprocess/subprocess.h" + +static struct optparse_option cmdopts[] = { + { .name = "rank", .key = 'r', .has_arg = 1, .arginfo = "NODESET", + .usage = "Specify specific target ranks. Default is \"all\"" }, + { .name = "dir", .key = 'd', .has_arg = 1, .arginfo = "PATH", + .usage = "Set the working directory to PATH" }, + { .name = "labelio", .key = 'l', .has_arg = 0, + .usage = "Label lines of output with the source RANK" }, + { .name = "noinput", .key = 'n', .has_arg = 0, + .usage = "Redirect stdin from /dev/null" }, + { .name = "verbose", .key = 'v', .has_arg = 0, + .usage = "Run with more verbosity." 
}, + OPTPARSE_TABLE_END +}; + +extern char **environ; + +uint32_t rank_count; +uint32_t started = 0; +uint32_t exited = 0; +int exit_code = 0; + +zlist_t *subprocesses; + +optparse_t *opts = NULL; + +flux_watcher_t *stdin_w; +int stdin_fd = STDIN_FILENO; + +void completion_cb (flux_subprocess_t *p) +{ + int ec = flux_subprocess_exit_code (p); + + if ((ec = flux_subprocess_exit_code (p)) < 0) { + /* bash standard, signals + 128 */ + if ((ec = flux_subprocess_signaled (p)) >= 0) + ec += 128; + } + if (ec > exit_code) + exit_code = ec; +} + +void state_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + if (state == FLUX_SUBPROCESS_RUNNING) { + started++; + /* see FLUX_SUBPROCESS_FAILED case below */ + (void)flux_subprocess_set_context (p, "started", p); + } + else if (state == FLUX_SUBPROCESS_EXITED) + exited++; + else if (state == FLUX_SUBPROCESS_EXEC_FAILED) { + /* EXEC_FAILED means RUNNING never reached, so must increment started */ + started++; + exited++; + } + else if (state == FLUX_SUBPROCESS_FAILED) { + /* FLUX_SUBPROCESS_FAILED is a catch all error case, no way to + * know if process started or not. So we cheat with a + * subprocess context setting. + */ + if (flux_subprocess_get_context (p, "started") == NULL) + started++; + exited++; + } + + if (started == rank_count) + flux_watcher_start (stdin_w); + if (exited == rank_count) + flux_watcher_stop (stdin_w); + + if (state == FLUX_SUBPROCESS_EXEC_FAILED + || state == FLUX_SUBPROCESS_FAILED) { + int errnum = flux_subprocess_fail_errno (p); + int ec = 1; + + log_err ("Error: rank %d: %s", flux_subprocess_rank (p), strerror (errnum)); + + /* bash standard, 126 for permission/access denied, 127 for + * command not found. 68 (EX_NOHOST) for No route to host. 
+ */ + if (errnum == EPERM || errnum == EACCES) + ec = 126; + else if (errnum == ENOENT) + ec = 127; + else if (errnum == EHOSTUNREACH) + ec = 68; + + if (ec > exit_code) + exit_code = ec; + } +} + +void output_cb (flux_subprocess_t *p, const char *stream) +{ + FILE *fstream = !strcasecmp (stream, "STDERR") ? stderr : stdout; + const char *ptr; + int lenp; + + if (!(ptr = flux_subprocess_read_line (p, stream, &lenp))) + log_err_exit ("flux_subprocess_output: read_line"); + + /* if process exited, read remaining stuff or EOF, otherwise + * wait for future newline */ + if (!lenp + && flux_subprocess_state (p) == FLUX_SUBPROCESS_EXITED) { + + if (!(ptr = flux_subprocess_read (p, stream, -1, &lenp))) + log_err_exit ("flux_subprocess_output: read_line"); + } + + if (lenp) { + if (optparse_getopt (opts, "labelio", NULL) > 0) + fprintf (fstream, "%d: ", flux_subprocess_rank (p)); + fwrite (ptr, lenp, 1, fstream); + } +} + +static void stdin_cb (flux_reactor_t *r, flux_watcher_t *w, + int revents, void *arg) +{ + flux_buffer_t *fb = flux_buffer_read_watcher_get_buffer (w); + flux_subprocess_t *p; + const char *ptr; + int lenp; + + if (!(ptr = flux_buffer_read (fb, -1, &lenp))) + log_err_exit ("flux_buffer_read"); + + if (lenp) { + p = zlist_first (subprocesses); + while (p) { + if (flux_subprocess_state (p) == FLUX_SUBPROCESS_INIT + || flux_subprocess_state (p) == FLUX_SUBPROCESS_STARTED + || flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING) { + if (flux_subprocess_write (p, "STDIN", ptr, lenp) < 0) + log_err_exit ("flux_subprocess_write"); + } + p = zlist_next (subprocesses); + } + } + else { + p = zlist_first (subprocesses); + while (p) { + if (flux_subprocess_close (p, "STDIN") < 0) + log_err_exit ("flux_subprocess_close"); + p = zlist_next (subprocesses); + } + flux_watcher_stop (stdin_w); + } +} + +static void signal_cb (int signum) +{ + flux_subprocess_t *p = zlist_first (subprocesses); + while (p) { + if (optparse_getopt (opts, "verbose", NULL) > 0) + fprintf 
(stderr, "sending signal %d to %d running processes\n", + signum, started - exited); + if (flux_subprocess_state (p) == FLUX_SUBPROCESS_RUNNING) { + flux_future_t *f = flux_subprocess_kill (p, signum); + if (!f) { + if (optparse_getopt (opts, "verbose", NULL) > 0) + fprintf (stderr, "failed to signal rank %d: %s\n", + flux_subprocess_rank (p), strerror (errno)); + } + /* don't care about response */ + flux_future_destroy (f); + } + p = zlist_next (subprocesses); + } +} + +int main (int argc, char *argv[]) +{ + const char *optargp; + int optindex; + flux_t *h; + flux_reactor_t *r; + nodeset_t *ns; + nodeset_iterator_t *nsitr; + uint32_t rank; + flux_cmd_t *cmd; + char *cwd = NULL; + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_cb, + .on_channel_out = NULL, + .on_stdout = output_cb, + .on_stderr = output_cb, + }; + struct timespec t0; + + log_init ("flux-exec"); + + opts = optparse_create ("flux-exec"); + if (optparse_add_option_table (opts, cmdopts) != OPTPARSE_SUCCESS) + log_msg_exit ("optparse_add_option_table"); + if ((optindex = optparse_parse_args (opts, argc, argv)) < 0) + exit (1); + + if (optindex == argc) { + optparse_print_usage (opts); + exit (1); + } + + if (!(cmd = flux_cmd_create (argc - optindex, &argv[optindex], environ))) + log_err_exit ("flux_cmd_create"); + + if (optparse_getopt (opts, "dir", &optargp) > 0) { + if (!(cwd = strdup (optargp))) + log_err_exit ("strdup"); + } + else { + if (!(cwd = get_current_dir_name ())) + log_err_exit ("get_current_dir_name"); + } + + if (flux_cmd_setcwd (cmd, cwd) < 0) + log_err_exit ("flux_cmd_setcwd"); + + if (!(h = flux_open (NULL, 0))) + log_err_exit ("flux_open"); + + if (!(r = flux_get_reactor (h))) + log_err_exit ("flux_get_reactor"); + + if (optparse_getopt (opts, "rank", &optargp) > 0 + && strcmp (optargp, "all")) { + if (!(ns = nodeset_create_string (optargp))) + log_err_exit ("nodeset_create_string"); + if (flux_get_size (h, &rank_count) < 0) + log_err_exit 
("flux_get_size"); + } + else { + if (flux_get_size (h, &rank_count) < 0) + log_err_exit ("flux_get_size"); + if (!(ns = nodeset_create_range (0, rank_count - 1))) + log_err_exit ("nodeset_create_range"); + } + + monotime (&t0); + if (optparse_getopt (opts, "verbose", NULL) > 0) { + const char *argv0 = flux_cmd_arg (cmd, 0); + fprintf (stderr, "%03fms: Starting %s on %s\n", + monotime_since (t0), argv0, nodeset_string (ns)); + } + + if (!(subprocesses = zlist_new ())) + log_err_exit ("zlist_new"); + + if (!(nsitr = nodeset_iterator_create (ns))) + log_err_exit ("nodeset_iterator_create"); + + while ((rank = nodeset_next (nsitr)) != NODESET_EOF) { + flux_subprocess_t *p; + if (!(p = flux_rexec (h, rank, 0, cmd, &ops))) + log_err_exit ("flux_rexec"); + if (zlist_append (subprocesses, p) < 0) + log_err_exit ("zlist_append"); + if (!zlist_freefn (subprocesses, p, flux_subprocess_destroy, true)) + log_err_exit ("zlist_freefn"); + } + + if (optparse_getopt (opts, "verbose", NULL) > 0) + fprintf (stderr, "%03fms: Sent all requests\n", monotime_since (t0)); + + if (optparse_getopt (opts, "noinput", NULL) > 0) { + if ((stdin_fd = open ("/dev/null", O_RDONLY)) < 0) + log_err_exit ("open"); + } + + if (!(stdin_w = flux_buffer_read_watcher_create (r, stdin_fd, + 1 << 20, stdin_cb, + 0, NULL))) + log_err_exit ("flux_buffer_read_watcher_create"); + + if (signal (SIGINT, signal_cb) == SIG_ERR) + log_err_exit ("signal"); + + if (signal (SIGTERM, signal_cb) == SIG_ERR) + log_err_exit ("signal"); + + if (flux_reactor_run (r, 0) < 0) + log_err_exit ("flux_reactor_run"); + + if (optparse_getopt (opts, "verbose", NULL) > 0) + fprintf (stderr, "%03fms: %d tasks complete with code %d\n", + monotime_since (t0), exited, exit_code); + + /* Clean up. 
+ */ + free (cwd); + flux_close (h); + optparse_destroy (opts); + log_fini (); + zlist_destroy (&subprocesses); + + return exit_code; +} + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ From 332f76c2b4f5df04494357560770360aaa177a5e Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 24 Aug 2018 09:33:40 -0700 Subject: [PATCH 12/20] broker/: Replace exec module with exec2 module Rename all instances of "exec2" to "exec" as appropriate. --- src/broker/Makefile.am | 2 - src/broker/broker.c | 8 +- src/broker/exec.c | 517 ++--------------------------------------- src/broker/exec.h | 5 +- src/broker/exec2.c | 84 ------- src/broker/exec2.h | 19 -- 6 files changed, 28 insertions(+), 607 deletions(-) delete mode 100644 src/broker/exec2.c delete mode 100644 src/broker/exec2.h diff --git a/src/broker/Makefile.am b/src/broker/Makefile.am index ef82a91868fa..a8543c37eed2 100644 --- a/src/broker/Makefile.am +++ b/src/broker/Makefile.am @@ -40,8 +40,6 @@ flux_broker_SOURCES = \ heaptrace.c \ exec.h \ exec.c \ - exec2.h \ - exec2.c \ ping.h \ ping.c \ rusage.h \ diff --git a/src/broker/broker.c b/src/broker/broker.c index 7d01b54e01c9..00099cd29f54 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -66,8 +66,6 @@ #include "src/common/libutil/monotime.h" #include "src/common/libpmi/pmi.h" #include "src/common/libpmi/pmi_strerror.h" -#include "src/common/libsubprocess/zio.h" -#include "src/common/libsubprocess/subprocess.h" #include "heartbeat.h" #include "module.h" @@ -82,7 +80,6 @@ #include "runlevel.h" #include "heaptrace.h" #include "exec.h" -#include "exec2.h" #include "ping.h" #include "rusage.h" #include "boot_config.h" @@ -610,9 +607,7 @@ int main (int argc, char *argv[]) log_msg_exit ("heaptrace_initialize"); if (sequence_hash_initialize (ctx.h) < 0) log_err_exit ("sequence_hash_initialize"); - if (exec_initialize (ctx.h, ctx.sm, rank, ctx.attrs) < 0) - log_err_exit ("exec_initialize"); - if (exec2_initialize (ctx.h, rank, ctx.attrs) < 0) + if
(exec_initialize (ctx.h, rank, ctx.attrs) < 0) log_err_exit ("exec2_initialize"); if (ping_initialize (ctx.h, "cmb") < 0) log_err_exit ("ping_initialize"); @@ -1297,7 +1292,6 @@ static void cmb_disconnect_cb (flux_t *h, flux_msg_handler_t *mh, if (flux_msg_get_route_first (msg, &sender) == 0) { exec_terminate_subprocesses_by_uuid (h, sender); - exec2_terminate_subprocesses_by_uuid (h, sender); free (sender); } /* no response */ diff --git a/src/broker/exec.c b/src/broker/exec.c index dc7bc0fc03c4..4ac48a0e278b 100644 --- a/src/broker/exec.c +++ b/src/broker/exec.c @@ -32,518 +32,51 @@ #include #include -#include "src/common/libsubprocess/zio.h" -#include "src/common/libsubprocess/subprocess.h" +#include "src/common/subprocess/subprocess.h" +#include "src/common/subprocess/command.h" #include "src/common/libutil/log.h" +#include "src/common/libutil/base64.h" #include "attr.h" #include "exec.h" -typedef struct { - flux_t *h; - flux_msg_handler_t **handlers; - struct subprocess_manager *sm; - uint32_t rank; - const char *local_uri; -} exec_t; - -static char *prepare_exit_payload (exec_t *x, struct subprocess *p) -{ - int n; - json_t *resp; - char *s; - - if (!(resp = json_pack ("{s:i s:i s:s s:i s:i}", - "rank", x->rank, - "pid", subprocess_pid (p), - "state", subprocess_state_string (p), - "status", subprocess_exit_status (p), - "code", subprocess_exit_code (p)))) { - errno = ENOMEM; - goto error; - } - if ((n = subprocess_signaled (p))) { - json_t *o = json_integer (n); - if (!o || json_object_set_new (resp, "signal", o) < 0) { - json_decref (o); - errno = ENOMEM; - goto error; - } - - } - if ((n = subprocess_exec_error (p))) { - json_t *o = json_integer (n); - if (!o || json_object_set_new (resp, "exec_errno", o) < 0) { - json_decref (o); - errno = ENOMEM; - goto error; - } - } - if (!(s = json_dumps (resp, 0))) { - errno = ENOMEM; - goto error; - } - json_decref (resp); - return s; -error: - json_decref (resp); - return NULL; -} - -/* Handler for child exit 
(registered with libsubprocess). - * Respond to user with exit status, etc. - * using orig. request message stashed in subprocess context. - */ -static int child_exit_handler (struct subprocess *p) -{ - exec_t *x = subprocess_get_context (p, "exec_ctx"); - flux_msg_t *msg = (flux_msg_t *) subprocess_get_context (p, "msg"); - char *s = NULL; - - assert (x != NULL); - assert (msg != NULL); - - if (!(s = prepare_exit_payload (x, p))) { - if (flux_respond (x->h, msg, errno, NULL) < 0) - flux_log_error (x->h, "%s: flux_respond", __FUNCTION__); - goto done; - } - if (flux_respond (x->h, msg, 0, s) < 0) - flux_log_error (x->h, "%s: flux_respond", __FUNCTION__); -done: - free (s); - flux_msg_destroy (msg); - subprocess_destroy (p); - return (0); -} - -static char *prepare_io_payload (exec_t *x, const char *json_str) -{ - json_t *resp; - json_t *o; - char *s; - - if (!(resp = json_loads (json_str, 0, NULL))) { - errno = EPROTO; - goto error; - } - if (!(o = json_integer (x->rank)) - || json_object_set_new (resp, "rank", o) < 0) { - json_decref (o); - errno = ENOMEM; - goto error; - } - if (!(s = json_dumps (resp, 0))) { - errno = ENOMEM; - goto error; - } - json_decref (resp); - return s; -error: - json_decref (resp); - return NULL; -} - -/* Handler for child stdio (registered with libsubprocess). - * Respond to user with zio-formatted data, tacking on the rank. - * using orig. request message stashed in subprocess context. 
- */ -static int child_io_cb (struct subprocess *p, const char *json_str) -{ - exec_t *x = subprocess_get_context (p, "exec_ctx"); - flux_msg_t *msg = subprocess_get_context (p, "msg"); - char *s; - - assert (x != NULL); - assert (msg != NULL); - - if (!(s = prepare_io_payload (x, json_str))) { - if (flux_respond (x->h, msg, errno, NULL) < 0) - flux_log_error (x->h, "%s: flux_respond", __FUNCTION__); - goto done; - } - if (flux_respond (x->h, msg, 0, s) < 0) - flux_log_error (x->h, "%s: flux_respond", __FUNCTION__); -done: - free (s); - return (0); // return value is not checked in libsubprocess -} - -static struct subprocess * -subprocess_get_pid (struct subprocess_manager *sm, int pid) -{ - struct subprocess *p = NULL; - p = subprocess_manager_first (sm); - while (p) { - if (pid == subprocess_pid (p)) - return (p); - p = subprocess_manager_next (sm); - } - return (NULL); -} - -static int write_to_child (struct subprocess *p, const char *s) -{ - int len; - void *data = NULL; - bool eof; - int rc = -1; - - /* XXX: We use zio_json_decode() here for convenience. Probably - * this should be bubbled up as a subprocess IO json spec with - * encode/decode functions. 
- */ - if ((len = zio_json_decode (s, &data, &eof)) < 0) - goto done; - if (subprocess_write (p, data, len, eof) < 0) - goto done; - rc = 0; -done: - free (data); - return rc; -} - -static void write_request_cb (flux_t *h, flux_msg_handler_t *mh, - const flux_msg_t *msg, void *arg) -{ - exec_t *x = arg; - json_t *o; - char *s = NULL; - int pid; - int errnum = 0; - struct subprocess *p; - - if (flux_request_unpack (msg, NULL, "{s:i s:o}", "pid", &pid, - "stdin", &o) < 0) { - errnum = errno; - goto out; - } - if (!(p = subprocess_get_pid (x->sm, pid))) { - errnum = ENOENT; - goto out; - } - if (!(s = json_dumps (o, 0))) { - errnum = EPROTO; - goto out; - } - if (write_to_child (p, s) < 0) { - errnum = errno; - goto out; - } -out: - free (s); - if (flux_respond_pack (h, msg, "{ s:i }", "code", errnum) < 0) - flux_log_error (h, "write_request_cb: flux_respond_pack"); -} - -static void signal_request_cb (flux_t *h, flux_msg_handler_t *mh, - const flux_msg_t *msg, void *arg) -{ - exec_t *x = arg; - int pid; - int errnum = EPROTO; - int signum = SIGTERM; - struct subprocess *p; - - if (flux_request_unpack (msg, NULL, "{s:i s?:i}", - "pid", &pid, - "signum", &signum) < 0) { - errnum = errno; - goto out; - } - p = subprocess_manager_first (x->sm); - while (p) { - if (pid == subprocess_pid (p)) { - errnum = 0; - /* Send signal to entire process group */ - if (kill (-pid, signum) < 0) - errnum = errno; - } - p = subprocess_manager_next (x->sm); - } -out: - if (flux_respond_pack (h, msg, "{ s:i }", "code", errnum) < 0) - flux_log_error (h, "signal_request_cb: flux_respond_pack"); -} - -static int do_setpgrp (struct subprocess *p) -{ - if (setpgrp () < 0) - fprintf (stderr, "setpgrp: %s", strerror (errno)); - return (0); -} - - -static int prepare_subprocess (exec_t *x, - json_t *args, - json_t *env, - const char *cwd, - const flux_msg_t *msg, - struct subprocess **pp) -{ - struct subprocess *p; - const char *s; - flux_msg_t *copy = NULL; - const char *key; - size_t index; - 
json_t *o; - - if (!(p = subprocess_create (x->sm))) - goto error; - if (subprocess_add_hook (p, SUBPROCESS_COMPLETE, child_exit_handler) < 0) - goto error; - if (subprocess_add_hook (p, SUBPROCESS_PRE_EXEC, do_setpgrp) < 0) - goto error; - if (subprocess_set_io_callback (p, child_io_cb) < 0) - goto error; - /* Save context for subprocess callbacks. - * Include request message for multiple responses. - */ - if (!(copy = flux_msg_copy (msg, true))) - goto error; - if (subprocess_set_context (p, "msg", (void *) copy) < 0) - goto error; - subprocess_set_context (p, "exec_ctx", x); - /* Command and arguments - */ - json_array_foreach (args, index, o) { - if (!(s = json_string_value (o))) { - errno = EPROTO; - goto error; - } - if (subprocess_argv_append (p, s) < 0) - goto error; - } - /* Environment - */ - if (env) { - json_object_foreach (env, key, o) { - if (!(s = json_string_value (o))) { - errno = EPROTO; - goto error; - } - if (subprocess_setenv (p, key, s, 1) < 0) - goto error; - } - } - else { - if (subprocess_set_environ (p, environ) < 0) - goto error; - } - if (subprocess_setenv (p, "FLUX_URI", x->local_uri, 1) < 0) - goto error; - /* Working directory - */ - if (cwd) { - if (subprocess_set_cwd (p, cwd) < 0) - goto error; - } - - *pp = p; - return 0; -error: - flux_msg_destroy (copy); - subprocess_destroy (p); - return -1; -} - -static void exec_request_cb (flux_t *h, flux_msg_handler_t *mh, - const flux_msg_t *msg, void *arg) -{ - exec_t *x = arg; - json_t *args; - json_t *env = NULL; - const char *cwd = NULL; - struct subprocess *p; - - if (flux_request_unpack (msg, NULL, "{s:o s?:o s?:s}", - "cmdline", &args, - "env", &env, - "cwd", &cwd) < 0) - goto error; - if (prepare_subprocess (x, args, env, cwd, msg, &p) < 0) - goto error; - - if (subprocess_fork (p) < 0) { - /* - * Fork error, respond directly to exec client with error - * (There will be no subprocess to reap) - */ - goto error; - } - - if (subprocess_exec (p) >= 0) { - /* - * Successful exec 
response. - * For "Exec Failure" allow that state to be transmitted - * to caller on completion handler (which will be called - * immediately) - */ - if (flux_respond_pack (h, msg, "{s:i s:i s:s}", - "rank", x->rank, - "pid", subprocess_pid (p), - "state", subprocess_state_string (p)) < 0) - flux_log_error (h, "%s: flux_respond", __FUNCTION__); - } - return; -error: - if (flux_respond (h, msg, errno, NULL) < 0) - flux_log_error (h, "%s: flux_respond", __FUNCTION__); -} - -static char *subprocess_sender (struct subprocess *p) +static void exec_finalize (void *arg) { - char *sender; - flux_msg_t *msg = subprocess_get_context (p, "msg"); - if (!msg || flux_msg_get_route_first (msg, &sender) < 0) - return NULL; - return (sender); + flux_subprocess_server_t *s = arg; + flux_subprocess_server_stop (s); } int exec_terminate_subprocesses_by_uuid (flux_t *h, const char *id) { - exec_t *x = flux_aux_get (h, "flux::exec"); + flux_subprocess_server_t *s = flux_aux_get (h, "flux::exec"); - struct subprocess *p = subprocess_manager_first (x->sm); - while (p) { - char *sender; - if ((sender = subprocess_sender (p))) { - pid_t pid; - if ((strcmp (id, sender) == 0) - && ((pid = subprocess_pid (p)) > (pid_t) 0)) { - /* Kill process group for subprocess p */ - flux_log (x->h, LOG_INFO, - "Terminating PGRP %ld", (unsigned long) pid); - if (kill (-pid, SIGKILL) < 0) - flux_log_error (x->h, "killpg"); - } - free (sender); - } - p = subprocess_manager_next (x->sm); + if (!s) { + flux_log (h, LOG_DEBUG, "no server_ctx found"); + return -1; } - return (0); -} - -static json_t *subprocess_json_info (struct subprocess *p) -{ - int i; - char buf [MAXPATHLEN]; - const char *cwd; - char *sender = NULL; - json_t *info = NULL; - json_t *args = NULL; - if ((cwd = subprocess_get_cwd (p)) == NULL) { - if (!(cwd = getcwd (buf, MAXPATHLEN-1))) - goto error; - } - if (!(args = json_array ())) { - errno = ENOMEM; - goto error; - } - for (i = 0; i < subprocess_get_argc (p); i++) { - json_t *o; - if (!(o 
= json_string (subprocess_get_arg (p, i))) - || json_array_append_new (args, o) < 0) { - json_decref (o); - json_decref (args); - errno = ENOMEM; - goto error; - } - } - if (!(info = json_pack ("{s:i s:s s:o}", - "pid", subprocess_pid (p), - "cwd", cwd, - "cmdline", args))) { - json_decref (args); - errno = ENOMEM; - goto error; - } - if ((sender = subprocess_sender (p))) { - json_t *o; - if (!(o = json_string (sender)) - || json_object_set_new (info, "sender", o) < 0) { - json_decref (o); - free (sender); - errno = ENOMEM; - goto error; - } - free (sender); + if (flux_subprocess_server_terminate_by_uuid (s, id) < 0) { + flux_log_error (h, "flux_subprocess_server_terminate_by_uuid"); + return -1; } - return (info); -error: - json_decref (info); - return NULL; -} - -static void ps_request_cb (flux_t *h, flux_msg_handler_t *mh, - const flux_msg_t *msg, void *arg) -{ - struct subprocess *p; - exec_t *x = arg; - json_t *procs; - if (!(procs = json_array ())) { - errno = ENOMEM; - goto error; - } - p = subprocess_manager_first (x->sm); - while (p) { - json_t *o; - if (!(o = subprocess_json_info (p)) - || json_array_append_new (procs, o) < 0) { - json_decref (o); - errno = ENOMEM; - goto error; - } - p = subprocess_manager_next (x->sm); - } - if (flux_respond_pack (h, msg, "{s:i s:o}", "rank", x->rank, - "procs", procs) < 0) - flux_log_error (h, "%s: flux_respond_pack", __FUNCTION__); - return; -error: - if (flux_respond (h, msg, errno, NULL) < 0) - flux_log_error (h, "%s: flux_respond", __FUNCTION__); - json_decref (procs); + return 0; } -static const struct flux_msg_handler_spec htab[] = { - { FLUX_MSGTYPE_REQUEST, "cmb.exec", exec_request_cb, 0 }, - { FLUX_MSGTYPE_REQUEST, "cmb.exec.signal", signal_request_cb, 0 }, - { FLUX_MSGTYPE_REQUEST, "cmb.exec.write", write_request_cb, 0 }, - { FLUX_MSGTYPE_REQUEST, "cmb.processes", ps_request_cb, 0 }, - FLUX_MSGHANDLER_TABLE_END, -}; - -static void exec_finalize (void *arg) +int exec_initialize (flux_t *h, uint32_t rank, 
attr_t *attrs) { - exec_t *x = arg; - flux_msg_handler_delvec (x->handlers); - free (x); -} + flux_subprocess_server_t *s = NULL; + const char *local_uri; -int exec_initialize (flux_t *h, struct subprocess_manager *sm, - uint32_t rank, attr_t *attrs) -{ - exec_t *x = calloc (1, sizeof (*x)); - if (!x) { - errno = ENOMEM; - return -1; - } - x->h = h; - x->sm = sm; - x->rank = rank; - if (attr_get (attrs, "local-uri", &x->local_uri, NULL) < 0) { - free (x); - return -1; - } - if (flux_msg_handler_addvec (h, htab, x, &x->handlers) < 0) { - free (x); - return -1; - } - flux_aux_set (h, "flux::exec", x, exec_finalize); + if (attr_get (attrs, "local-uri", &local_uri, NULL) < 0) + goto cleanup; + if (!(s = flux_subprocess_server_start (h, "cmb", local_uri, rank))) + goto cleanup; + flux_aux_set (h, "flux::exec", s, exec_finalize); return 0; +cleanup: + flux_subprocess_server_stop (s); + return -1; } /* diff --git a/src/broker/exec.h b/src/broker/exec.h index 67bf0c24ddaf..961c107c2154 100644 --- a/src/broker/exec.h +++ b/src/broker/exec.h @@ -3,15 +3,14 @@ #include #include -#include "src/common/libsubprocess/subprocess.h" +#include "src/common/subprocess/subprocess.h" #include "attr.h" /* Kill any processes started by disconnecting client. */ int exec_terminate_subprocesses_by_uuid (flux_t *h, const char *id); -int exec_initialize (flux_t *h, struct subprocess_manager *sm, - uint32_t rank, attr_t *attrs); +int exec_initialize (flux_t *h, uint32_t rank, attr_t *attrs); #endif /* BROKER_EXEC_H */ diff --git a/src/broker/exec2.c b/src/broker/exec2.c deleted file mode 100644 index 0eb06310c33f..000000000000 --- a/src/broker/exec2.c +++ /dev/null @@ -1,84 +0,0 @@ -/*****************************************************************************\ - * Copyright (c) 2014 Lawrence Livermore National Security, LLC. Produced at - * the Lawrence Livermore National Laboratory (cf, AUTHORS, DISCLAIMER.LLNS). - * LLNL-CODE-658032 All rights reserved. 
- * - * This file is part of the Flux resource manager framework. - * For details, see https://github.com/flux-framework. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the license, or (at your option) - * any later version. - * - * Flux is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - * See also: http://www.gnu.org/licenses/ -\*****************************************************************************/ - -#if HAVE_CONFIG_H -#include "config.h" -#endif -#include -#include -#include -#include -#include -#include - -#include "src/common/subprocess/subprocess.h" -#include "src/common/subprocess/command.h" -#include "src/common/libutil/log.h" -#include "src/common/libutil/base64.h" - -#include "attr.h" -#include "exec2.h" - -static void exec2_finalize (void *arg) -{ - flux_subprocess_server_t *s = arg; - flux_subprocess_server_stop (s); -} - -int exec2_terminate_subprocesses_by_uuid (flux_t *h, const char *id) -{ - flux_subprocess_server_t *s = flux_aux_get (h, "flux::exec2"); - - if (!s) { - flux_log (h, LOG_DEBUG, "no server_ctx found"); - return -1; - } - - if (flux_subprocess_server_terminate_by_uuid (s, id) < 0) { - flux_log_error (h, "flux_subprocess_server_terminate_by_uuid"); - return -1; - } - - return 0; -} - -int exec2_initialize (flux_t *h, uint32_t rank, attr_t *attrs) -{ - flux_subprocess_server_t *s = NULL; - const char *local_uri; - - if (attr_get (attrs, "local-uri", 
&local_uri, NULL) < 0) - goto cleanup; - if (!(s = flux_subprocess_server_start (h, "cmb", local_uri, rank))) - goto cleanup; - flux_aux_set (h, "flux::exec2", s, exec2_finalize); - return 0; -cleanup: - flux_subprocess_server_stop (s); - return -1; -} - -/* - * vi:tabstop=4 shiftwidth=4 expandtab - */ diff --git a/src/broker/exec2.h b/src/broker/exec2.h deleted file mode 100644 index 404eaac71d51..000000000000 --- a/src/broker/exec2.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef BROKER_EXEC2_H -#define BROKER_EXEC2_H - -#include -#include -#include "src/common/subprocess/subprocess.h" -#include "attr.h" - -/* Kill any processes started by disconnecting client. - */ -int exec2_terminate_subprocesses_by_uuid (flux_t *h, const char *id); - -int exec2_initialize (flux_t *h, uint32_t rank, attr_t *attrs); - -#endif /* BROKER_EXEC2_H */ - -/* - * vi:tabstop=4 shiftwidth=4 expandtab - */ From 0e80603d6362f768280e996f1308c9a59c4b491b Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 24 Aug 2018 11:12:24 -0700 Subject: [PATCH 13/20] cmd/builtin/proxy: Use new subprocess library --- src/cmd/builtin/proxy.c | 86 +++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 34 deletions(-) diff --git a/src/cmd/builtin/proxy.c b/src/cmd/builtin/proxy.c index 1825b37aa2e2..c58bebac59dd 100644 --- a/src/cmd/builtin/proxy.c +++ b/src/cmd/builtin/proxy.c @@ -47,7 +47,7 @@ #include "src/common/libutil/xzmalloc.h" #include "src/common/libutil/oom.h" #include "src/common/libutil/cleanup.h" -#include "src/common/libsubprocess/subprocess.h" +#include "src/common/subprocess/subprocess.h" #define LISTEN_BACKLOG 5 @@ -59,8 +59,7 @@ typedef struct { flux_reactor_t *reactor; uid_t session_owner; zhash_t *subscriptions; - struct subprocess_manager *sm; - struct subprocess *p; + flux_subprocess_t *p; bool oneshot; int exit_code; } proxy_ctx_t; @@ -106,10 +105,6 @@ static void ctx_destroy (proxy_ctx_t *ctx) if (ctx) { zlist_destroy (&ctx->clients); zhash_destroy 
(&ctx->subscriptions); - if (ctx->sm) - subprocess_manager_destroy (ctx->sm); - if (ctx->reactor) - flux_reactor_destroy (ctx->reactor); free (ctx); } } @@ -127,11 +122,6 @@ static proxy_ctx_t *ctx_create (flux_t *h) if (!(ctx->subscriptions = zhash_new ())) oom (); ctx->session_owner = geteuid (); - if (!(ctx->sm = subprocess_manager_create ())) - log_err_exit ("subprocess_manager_create"); - if (subprocess_manager_set (ctx->sm, SM_REACTOR, ctx->reactor) < 0) - log_err_exit ("subprocess_manager_set reactor"); - subprocess_manager_set (ctx->sm, SM_WAIT_FLAGS, WNOHANG); return ctx; } @@ -822,14 +812,19 @@ static char *findjob (const char *job) return uri; } -static int child_cb (struct subprocess *p) +static void completion_cb (flux_subprocess_t *p) { - proxy_ctx_t *ctx = subprocess_get_context (p, "ctx"); + proxy_ctx_t *ctx = flux_subprocess_get_context (p, "ctx"); - ctx->exit_code = subprocess_exit_code (p); + assert (ctx); + + if ((ctx->exit_code = flux_subprocess_exit_code (p)) < 0) { + /* bash standard, signals + 128 */ + if ((ctx->exit_code = flux_subprocess_signaled (p)) >= 0) + ctx->exit_code += 128; + } flux_reactor_stop (ctx->reactor); - subprocess_destroy (p); - return 0; + flux_subprocess_destroy (p); } static int child_create (proxy_ctx_t *ctx, int ac, char **av, const char *workpath) @@ -837,42 +832,65 @@ static int child_create (proxy_ctx_t *ctx, int ac, char **av, const char *workpa const char *shell = getenv ("SHELL"); char *argz = NULL; size_t argz_len = 0; - struct subprocess *p = NULL; + flux_subprocess_t *p = NULL; + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = NULL, + .on_channel_out = NULL, + .on_stdout = NULL, + .on_stderr = NULL, + }; + flux_cmd_t *cmd = NULL; int i; if (!shell) shell = "/bin/sh"; + if (!(cmd = flux_cmd_create (0, NULL, environ))) + goto error; + + if (flux_cmd_argv_append (cmd, "%s", shell) < 0) + goto error; + for (i = 0; i < ac; i++) { if (argz_add (&argz, &argz_len, av[i]) != 0) { 
errno = ENOMEM; goto error; } } - if (argz) + if (argz) { argz_stringify (argz, argz_len, ' '); - if (!(p = subprocess_create (ctx->sm)) - || subprocess_set_context (p, "ctx", ctx) < 0 - || subprocess_add_hook (p, SUBPROCESS_COMPLETE, child_cb) < 0 - || subprocess_argv_append (p, shell) < 0 - || (argz && subprocess_argv_append (p, "-c") < 0) - || (argz && subprocess_argv_append (p, argz) < 0) - || subprocess_set_environ (p, environ) < 0 - || subprocess_setenvf (p, "FLUX_URI", 1, - "local://%s", workpath) < 0 - || subprocess_run (p) < 0) + if (flux_cmd_argv_append (cmd, "-c") < 0) + goto error; + + if (flux_cmd_argv_append (cmd, argz) < 0) + goto error; + } + + if (flux_cmd_setenvf (cmd, 1, "FLUX_URI", "local://%s", workpath) < 0) goto error; - if (argz) - free (argz); + /* We want stdio fallthrough so subprocess can capture tty if + * necessary (i.e. an interactive shell) + */ + if (!(p = flux_exec (ctx->reactor, + FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH, + cmd, + &ops))) + goto error; + + if (flux_subprocess_set_context (p, "ctx", ctx) < 0) + goto error; + + flux_cmd_destroy (cmd); + ctx->p = p; return 0; error: if (p) - subprocess_destroy (p); - if (argz) - free (argz); + flux_subprocess_destroy (p); + flux_cmd_destroy (cmd); return -1; } From 1bbbe692cc89fa7e768488785741c846ae33445e Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Sat, 25 Aug 2018 09:34:42 -0700 Subject: [PATCH 14/20] cmd/flux-start: Use new subprocess library --- src/cmd/flux-start.c | 209 ++++++++++++++++++++++++------------------- 1 file changed, 118 insertions(+), 91 deletions(-) diff --git a/src/cmd/flux-start.c b/src/cmd/flux-start.c index b227728f03a6..917dab057137 100644 --- a/src/cmd/flux-start.c +++ b/src/cmd/flux-start.c @@ -41,7 +41,7 @@ #include "src/common/libutil/setenvf.h" #include "src/common/libpmi/simple_server.h" #include "src/common/libpmi/dgetline.h" -#include "src/common/libsubprocess/subprocess.h" +#include "src/common/subprocess/subprocess.h" #define 
DEFAULT_KILLER_TIMEOUT 2.0 @@ -49,7 +49,7 @@ static struct { double killer_timeout; flux_reactor_t *reactor; flux_watcher_t *timer; - struct subprocess_manager *sm; + zlist_t *subprocesses; optparse_t *opts; int size; int count; @@ -62,10 +62,8 @@ static struct { struct client { int rank; - int fd; - struct subprocess *p; - flux_watcher_t *w; - char buf[SIMPLE_MAX_PROTO_LINE]; + flux_subprocess_t *p; + flux_cmd_t *cmd; }; void killer (flux_reactor_t *r, flux_watcher_t *w, int revents, void *arg); @@ -278,54 +276,92 @@ char *find_broker (const char *searchpath) void killer (flux_reactor_t *r, flux_watcher_t *w, int revents, void *arg) { - struct subprocess *p; + flux_subprocess_t *p; - p = subprocess_manager_first (ctx.sm); + p = zlist_first (ctx.subprocesses); while (p) { - if (subprocess_pid (p)) - (void)subprocess_kill (p, SIGKILL); - p = subprocess_manager_next (ctx.sm); + flux_future_t *f = flux_subprocess_kill (p, SIGKILL); + if (f) + flux_future_destroy (f); + p = zlist_next (ctx.subprocesses); } } -static int child_report (struct subprocess *p) +static void completion_cb (flux_subprocess_t *p) { - struct client *cli = subprocess_get_context (p, "cli"); - pid_t pid = subprocess_pid (p); - int sig; - - if ((sig = subprocess_stopped (p))) - log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (sig)); - else if ((subprocess_continued (p))) - log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (SIGCONT)); - else if ((sig = subprocess_signaled (p))) - log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (sig)); - else if (subprocess_exited (p)) { - int rc = subprocess_exit_code (p); - if (rc >= 128) - log_msg ("%d (pid %d) exited with rc=%d (%s)", cli->rank, pid, rc, - strsignal (rc - 128)); - else if (rc > 0) - log_msg ("%d (pid %d) exited with rc=%d", cli->rank, pid, rc); - } else - log_msg ("%d (pid %d) status=%d", cli->rank, pid, - subprocess_exit_status (p)); - return 0; -} + struct client *cli = flux_subprocess_get_context (p, "cli"); + int rc; -static int 
child_exit (struct subprocess *p) -{ - struct client *cli = subprocess_get_context (p, "cli"); - int rc = subprocess_exit_code (p); + assert (cli); - if (ctx.exit_rc < rc) + if ((rc = flux_subprocess_exit_code (p)) < 0) { + /* bash standard, signals + 128 */ + if ((rc = flux_subprocess_signaled (p)) >= 0) + rc += 128; + } + + if (rc > ctx.exit_rc) ctx.exit_rc = rc; + if (--ctx.count > 0) flux_watcher_start (ctx.timer); else flux_watcher_stop (ctx.timer); + client_destroy (cli); - return 0; +} + +static void state_cb (flux_subprocess_t *p, flux_subprocess_state_t state) +{ + struct client *cli = flux_subprocess_get_context (p, "cli"); + + assert (cli); + + if (state == FLUX_SUBPROCESS_FAILED) { + log_errn (errno, "%d FAILED", cli->rank); + if (--ctx.count > 0) + flux_watcher_start (ctx.timer); + else + flux_watcher_stop (ctx.timer); + client_destroy (cli); + } + else if (state == FLUX_SUBPROCESS_EXITED) { + pid_t pid = flux_subprocess_pid (p); + int status; + + if ((status = flux_subprocess_status (p)) >= 0) { + if (WIFSTOPPED (status)) /* WSTOPSIG is only meaningful when WIFSTOPPED is true */ + log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (WSTOPSIG (status))); + else if (WIFCONTINUED (status)) + log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (SIGCONT)); + else if (WIFSIGNALED (status)) + log_msg ("%d (pid %d) %s", cli->rank, pid, strsignal (WTERMSIG (status))); + else if (WIFEXITED (status)) + log_msg ("%d (pid %d) exited with rc=%d", cli->rank, pid, WEXITSTATUS (status)); + } else + log_msg ("%d (pid %d) exited, unknown status", cli->rank, pid); + } +} + +void channel_cb (flux_subprocess_t *p, const char *stream) +{ + struct client *cli = flux_subprocess_get_context (p, "cli"); + const char *ptr; + int rc, lenp; + + assert (cli); + assert (!strcmp (stream, "PMI")); + + if (!(ptr = flux_subprocess_read_line (p, stream, &lenp))) + log_err_exit ("%s: flux_subprocess_read_line", __FUNCTION__); + + if (lenp) { + rc = pmi_simple_server_request (ctx.pmi.srv, ptr, cli); + if (rc < 0) + log_err_exit ("%s:
pmi_simple_server_request", __FUNCTION__); + if (rc == 1) + (void) flux_subprocess_close (p, stream); + } } void add_args_list (char **argz, size_t *argz_len, optparse_t *opt, const char *name) @@ -352,7 +388,7 @@ char *create_scratch_dir (const char *session_id) static int pmi_response_send (void *client, const char *buf) { struct client *cli = client; - return dputline (cli->fd, buf); + return flux_subprocess_write (cli->p, "PMI", buf, strlen (buf)); } static void pmi_debug_trace (void *client, const char *buf) @@ -361,23 +397,6 @@ static void pmi_debug_trace (void *client, const char *buf) fprintf (stderr, "%d: %s", cli->rank, buf); } -void pmi_simple_cb (flux_reactor_t *r, flux_watcher_t *w, - int revents, void *arg) -{ - struct client *cli = arg; - int rc; - if (dgetline (cli->fd, cli->buf, sizeof (cli->buf)) < 0) - log_err_exit ("%s", __FUNCTION__); - rc = pmi_simple_server_request (ctx.pmi.srv, cli->buf, cli); - if (rc < 0) - log_err_exit ("%s", __FUNCTION__); - if (rc == 1) { - close (cli->fd); - cli->fd = -1; - flux_watcher_stop (w); - } -} - int pmi_kvs_put (void *arg, const char *kvsname, const char *key, const char *val) { @@ -454,17 +473,13 @@ struct client *client_create (const char *broker_path, const char *scratch_dir, int rank, const char *cmd_argz, size_t cmd_argz_len) { struct client *cli = xzmalloc (sizeof (*cli)); - int client_fd; + char *arg; char * argz = NULL; size_t argz_len = 0; cli->rank = rank; - cli->fd = -1; - if (!(cli->p = subprocess_create (ctx.sm))) + if (!(cli->cmd = flux_cmd_create (0, NULL, environ))) goto fail; - subprocess_set_context (cli->p, "cli", cli); - subprocess_add_hook (cli->p, SUBPROCESS_COMPLETE, child_exit); - subprocess_add_hook (cli->p, SUBPROCESS_STATUS, child_report); add_args_list (&argz, &argz_len, ctx.opts, "wrap"); argz_add (&argz, &argz_len, broker_path); char *run_dir = xasprintf ("%s/%d", scratch_dir, rank); @@ -480,22 +495,20 @@ struct client *client_create (const char *broker_path, const char 
*scratch_dir, if (rank == 0 && cmd_argz) argz_append (&argz, &argz_len, cmd_argz, cmd_argz_len); /* must be last arg */ - subprocess_set_args_from_argz (cli->p, argz, argz_len); + arg = argz_next (argz, argz_len, NULL); + while (arg) { + if (flux_cmd_argv_append (cli->cmd, arg) < 0) + log_err_exit ("flux_cmd_argv_append"); + arg = argz_next (argz, argz_len, arg); + } free (argz); - subprocess_set_environ (cli->p, environ); - - if ((cli->fd = subprocess_socketpair (cli->p, &client_fd)) < 0) - goto fail; - subprocess_set_context (cli->p, "client", cli); - cli->w = flux_fd_watcher_create (ctx.reactor, cli->fd, FLUX_POLLIN, - pmi_simple_cb, cli); - if (!cli->w) - goto fail; - flux_watcher_start (cli->w); - subprocess_setenvf (cli->p, "PMI_FD", 1, "%d", client_fd); - subprocess_setenvf (cli->p, "PMI_RANK", 1, "%d", rank); - subprocess_setenvf (cli->p, "PMI_SIZE", 1, "%d", ctx.size); + if (flux_cmd_add_channel (cli->cmd, "PMI") < 0) + log_err_exit ("flux_cmd_add_channel"); + if (flux_cmd_setenvf (cli->cmd, 1, "PMI_RANK", "%d", rank) < 0) + log_err_exit ("flux_cmd_setenvf"); + if (flux_cmd_setenvf (cli->cmd, 1, "PMI_SIZE", "%d", ctx.size) < 0) + log_err_exit ("flux_cmd_setenvf"); return cli; fail: client_destroy (cli); @@ -505,24 +518,23 @@ struct client *client_create (const char *broker_path, const char *scratch_dir, void client_destroy (struct client *cli) { if (cli) { - flux_watcher_destroy (cli->w); - if (cli->fd != -1) - close (cli->fd); if (cli->p) - subprocess_destroy (cli->p); + flux_subprocess_destroy (cli->p); + if (cli->cmd) + flux_cmd_destroy (cli->cmd); free (cli); } } void client_dumpargs (struct client *cli) { - int i, argc = subprocess_get_argc (cli->p); + int i, argc = flux_cmd_argc (cli->cmd); char *az = NULL; size_t az_len = 0; int e; for (i = 0; i < argc; i++) - if ((e = argz_add (&az, &az_len, subprocess_get_arg (cli->p, i))) != 0) + if ((e = argz_add (&az, &az_len, flux_cmd_arg (cli->cmd, i))) != 0) log_errn_exit (e, "argz_add"); argz_stringify (az, 
az_len, ' '); log_msg ("%d: %s", cli->rank, az); @@ -555,7 +567,24 @@ void pmi_server_finalize (void) int client_run (struct client *cli) { - return subprocess_run (cli->p); + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = state_cb, + .on_channel_out = channel_cb, + .on_stdout = NULL, + .on_stderr = NULL, + }; + /* We want stdio fallthrough so subprocess can capture tty if + * necessary (i.e. an interactive shell) + */ + if (!(cli->p = flux_exec (ctx.reactor, + FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH, + cli->cmd, + &ops))) + log_err_exit ("flux_exec"); + if (flux_subprocess_set_context (cli->p, "cli", cli) < 0) + log_err_exit ("flux_subprocess_set_context"); + return 0; } /* Start an internal PMI server, and then launch "size" number of @@ -580,10 +609,8 @@ int start_session (const char *cmd_argz, size_t cmd_argz_len, ctx.killer_timeout, 0., killer, NULL))) log_err_exit ("flux_timer_watcher_create"); - if (!(ctx.sm = subprocess_manager_create ())) - log_err_exit ("subprocess_manager_create"); - if (subprocess_manager_set (ctx.sm, SM_REACTOR, ctx.reactor) < 0) - log_err_exit ("subprocess_manager_set reactor"); + if (!(ctx.subprocesses = zlist_new ())) + log_err_exit ("zlist_new"); session_id = xasprintf ("%d", getpid ()); if (optparse_hasopt (ctx.opts, "scratchdir")) @@ -607,7 +634,7 @@ int start_session (const char *cmd_argz, size_t cmd_argz_len, continue; } if (client_run (cli) < 0) - log_err_exit ("subprocess_run"); + log_err_exit ("client_run"); ctx.count++; } if (flux_reactor_run (ctx.reactor, 0) < 0) @@ -618,7 +645,7 @@ int start_session (const char *cmd_argz, size_t cmd_argz_len, free (session_id); free (scratch_dir); - subprocess_manager_destroy (ctx.sm); + zlist_destroy (&ctx.subprocesses); flux_watcher_destroy (ctx.timer); flux_reactor_destroy (ctx.reactor); From c0ece9d9cc569f30b4dd7e986599630285011f33 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Fri, 24 Aug 2018 13:48:53 -0700 Subject: [PATCH 15/20] broker/: 
Convert to new subprocess library Convert runlevel to launch local subprocesses via the new subprocess library. Remove lingering uses of libsubprocess in main broker as well. --- src/broker/broker.c | 14 +-- src/broker/runlevel.c | 249 ++++++++++++++++++++++++------------------ src/broker/runlevel.h | 3 - 3 files changed, 142 insertions(+), 124 deletions(-) diff --git a/src/broker/broker.c b/src/broker/broker.c index 00099cd29f54..d7490d5dacdd 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -136,10 +136,6 @@ typedef struct { flux_t *enclosing_h; runlevel_t *runlevel; - /* Subprocess management - */ - struct subprocess_manager *sm; - char *init_shell_cmd; size_t init_shell_cmd_len; } broker_ctx_t; @@ -338,10 +334,6 @@ int main (int argc, char *argv[]) init_attrs (ctx.attrs, getpid()); - if (!(ctx.sm = subprocess_manager_create ())) - oom (); - subprocess_manager_set (ctx.sm, SM_WAIT_FLAGS, WNOHANG); - parse_command_line_arguments(argc, argv, &ctx, &sec_typemask); /* Record the instance owner: the effective uid of the broker. 
@@ -396,8 +388,6 @@ int main (int argc, char *argv[]) if (flux_set_reactor (ctx.h, ctx.reactor) < 0) log_err_exit ("flux_set_reactor"); - subprocess_manager_set (ctx.sm, SM_REACTOR, ctx.reactor); - /* Prepare signal handling */ broker_handle_signals (&ctx, sigwatchers); @@ -566,7 +556,6 @@ int main (int argc, char *argv[]) log_err_exit ("conf.pmi_library_path is not set"); runlevel_set_size (ctx.runlevel, size); - runlevel_set_subprocess_manager (ctx.runlevel, ctx.sm); runlevel_set_callback (ctx.runlevel, runlevel_cb, &ctx); runlevel_set_io_callback (ctx.runlevel, runlevel_io_cb, &ctx); runlevel_set_flux (ctx.runlevel, ctx.h); @@ -716,7 +705,6 @@ int main (int argc, char *argv[]) } runlevel_destroy (ctx.runlevel); free (ctx.init_shell_cmd); - subprocess_manager_destroy (ctx.sm); return exit_rc; } @@ -829,7 +817,7 @@ static void runlevel_io_cb (runlevel_t *r, const char *name, const char *msg, void *arg) { broker_ctx_t *ctx = arg; - int loglevel = !strcmp (name, "stderr") ? LOG_ERR : LOG_INFO; + int loglevel = !strcmp (name, "STDERR") ? 
LOG_ERR : LOG_INFO; int runlevel = runlevel_get_level (r); flux_log (ctx->h, loglevel, "rc%d: %s", runlevel, msg); diff --git a/src/broker/runlevel.c b/src/broker/runlevel.c index 65298605d944..1bf79b79e8dd 100644 --- a/src/broker/runlevel.c +++ b/src/broker/runlevel.c @@ -34,7 +34,7 @@ #include #include -#include "src/common/libsubprocess/zio.h" +#include "src/common/subprocess/subprocess.h" #include "src/common/libutil/log.h" #include "src/common/libutil/xzmalloc.h" #include "src/common/libutil/monotime.h" @@ -42,7 +42,8 @@ #include "runlevel.h" struct level { - struct subprocess *subprocess; + flux_subprocess_t *p; + flux_cmd_t *cmd; struct timespec start; double timeout; flux_watcher_t *timer; @@ -50,7 +51,6 @@ struct level { struct runlevel { int level; - struct subprocess_manager *sm; flux_t *h; struct level rc[4]; runlevel_cb_f cb; @@ -78,8 +78,10 @@ void runlevel_destroy (runlevel_t *r) if (r) { int i; for (i = 0; i < 4; i++) { - if (r->rc[i].subprocess) - subprocess_destroy (r->rc[i].subprocess); + if (r->rc[i].p) + flux_subprocess_destroy (r->rc[i].p); + if (r->rc[i].cmd) + flux_cmd_destroy (r->rc[i].cmd); flux_watcher_destroy (r->rc[i].timer); } free (r); @@ -196,12 +198,6 @@ void runlevel_set_size (runlevel_t *r, uint32_t size) assert (n < sizeof (r->nodeset)); } -void runlevel_set_subprocess_manager (runlevel_t *r, - struct subprocess_manager *sm) -{ - r->sm = sm; -} - void runlevel_set_callback (runlevel_t *r, runlevel_cb_f cb, void *arg) { r->cb = cb; @@ -218,34 +214,139 @@ static void runlevel_timeout (flux_reactor_t *reactor, flux_watcher_t *w, int revents, void *arg) { runlevel_t *r = arg; + flux_future_t *f; flux_log (r->h, LOG_ERR, "runlevel %d timeout, sending SIGTERM", r->level); - subprocess_kill (r->rc[r->level].subprocess, SIGTERM); + if (!(f = flux_subprocess_kill (r->rc[r->level].p, SIGTERM))) + flux_log_error (r->h, "flux_subprocess_kill"); + /* don't care about response */ + flux_future_destroy (f); +} + +/* See POSIX 2008 Volume 3 
Shell and Utilities, Issue 7 + * Section 2.8.2 Exit status for shell commands (page 2315) + */ +static void completion_cb (flux_subprocess_t *p) +{ + runlevel_t *r = flux_subprocess_get_context (p, "runlevel"); + const char *exit_string = NULL; + int rc; + + if ((rc = flux_subprocess_exit_code (p)) < 0) { + /* bash standard, signals + 128 */ + if ((rc = flux_subprocess_signaled (p)) >= 0) { + rc += 128; + exit_string = strsignal (rc); + } + } + else { + if (rc) + exit_string = "Exited with non-zero status"; + else + exit_string = "Exited"; + } + + assert (r->rc[r->level].p == p); + r->rc[r->level].p = NULL; + + flux_watcher_stop (r->rc[r->level].timer); + + if (r->cb) { + double elapsed = monotime_since (r->rc[r->level].start) / 1000; + r->cb (r, r->level, rc, elapsed, exit_string, r->cb_arg); + } + flux_subprocess_destroy (p); +} + +static void io_cb (flux_subprocess_t *p, const char *stream) +{ + runlevel_t *r; + const char *ptr; + int lenp; + + r = flux_subprocess_get_context (p, "runlevel_t"); + if (!r) + return; + + if (!(ptr = flux_subprocess_read_line (p, stream, &lenp))) { + flux_log_error (r->h, "%s: flux_subprocess_read_line", __FUNCTION__); + return; + } + + if (!lenp) { + if (!(ptr = flux_subprocess_read (p, stream, -1, &lenp))) { + flux_log_error (r->h, "%s: flux_subprocess_read", __FUNCTION__); + return; + } + } + + if (lenp && r->io_cb) + r->io_cb (r, stream, ptr, r->io_cb_arg); } static int runlevel_start_subprocess (runlevel_t *r, int level) { - if (r->rc[level].subprocess) { - if (subprocess_run (r->rc[level].subprocess) < 0) - return -1; + flux_subprocess_t *p = NULL; + + assert (r->h != NULL); + + if (r->rc[level].cmd) { + flux_subprocess_ops_t ops = { + .on_completion = completion_cb, + .on_state_change = NULL, + .on_channel_out = NULL, + .on_stdout = NULL, + .on_stderr = NULL, + }; + flux_reactor_t *reactor = flux_get_reactor (r->h); + int flags = 0; + + /* set alternate io callback for levels 1 and 3 */ + if (level == 1 || level == 3) { + 
ops.on_stdout = io_cb; + ops.on_stderr = io_cb; + } + else + flags |= FLUX_SUBPROCESS_FLAGS_STDIO_FALLTHROUGH; + + /* FLUX_NODEID_ANY maps to -1, use -999 */ + if (!(p = flux_rexec (r->h, + -999, + flags, + r->rc[level].cmd, + &ops))) + goto error; + + if (flux_subprocess_set_context (p, "runlevel", r) < 0) + goto error; + + if (level == 1 || level == 3) { + if (flux_subprocess_set_context (p, "runlevel_t", r) < 0) + goto error; + } + monotime (&r->rc[level].start); if (r->rc[level].timeout > 0.) { - assert (r->h != NULL); - flux_reactor_t *reactor = flux_get_reactor (r->h); flux_watcher_t *w; if (!(w = flux_timer_watcher_create (reactor, r->rc[level].timeout, 0., runlevel_timeout, r))) - return -1; + goto error; flux_watcher_start (w); r->rc[level].timer = w; flux_log (r->h, LOG_INFO, "runlevel %d (%.1fs) timer started", level, r->rc[level].timeout); } + + r->rc[level].p = p; } else { if (r->cb) r->cb (r, r->level, 0, 0., "Not configured", r->cb_arg); } return 0; + +error: + flux_subprocess_destroy (p); + return -1; } int runlevel_set_level (runlevel_t *r, int level) @@ -276,123 +377,55 @@ int runlevel_get_level (runlevel_t *r) return r->level; } -/* See POSIX 2008 Volume 3 Shell and Utilities, Issue 7 - * Section 2.8.2 Exit status for shell commands (page 2315) - */ -static int subprocess_cb (struct subprocess *p) -{ - runlevel_t *r = subprocess_get_context (p, "runlevel"); - int rc = subprocess_exit_code (p); - const char *exit_string = subprocess_exit_string (p); - - assert (r->rc[r->level].subprocess == p); - r->rc[r->level].subprocess = NULL; - - flux_watcher_stop (r->rc[r->level].timer); - - if (r->cb) { - double elapsed = monotime_since (r->rc[r->level].start) / 1000; - r->cb (r, r->level, rc, elapsed, exit_string, r->cb_arg); - } - subprocess_destroy (p); - - return 0; -} - -/* Note: return value of this function is ignored by libsubprocess. - * Also: zio_json_decode() returns -1 on error, 0 on eof, strlen(s) on - * success; caller must free 's'. 
- */ -static int subprocess_io_cb (struct subprocess *p, const char *json_str) -{ - runlevel_t *r; - json_t *o = NULL; - const char *name; - int len; - bool eof; - char *s = NULL, *argz = NULL, *line = NULL; - size_t argz_len; - - r = subprocess_get_context (p, "runlevel_t"); - assert (r != NULL); - - if (!r->io_cb) - goto done; - /* N.B. libsubprocess tacks "name" etc. onto zio-encoded JSON output - */ - if (!(o = json_loads (json_str, 0, NULL))) - goto done; - if (json_unpack (o, "{s:s}", "name", &name) < 0) - goto done; - len = zio_json_decode (json_str, (void **)&s, &eof); - if (len <= 0 || !s || !*s || s[len] != '\0') - goto done; - if (argz_create_sep (s, '\n', &argz, &argz_len) != 0) - goto done; - while ((line = argz_next (argz, argz_len, line)) && *line) - r->io_cb (r, name, line, r->io_cb_arg); -done: - free (s); - free (argz); - json_decref (o); - return 0; -} - int runlevel_set_rc (runlevel_t *r, int level, const char *cmd_argz, size_t cmd_argz_len, const char *local_uri) { - struct subprocess *p = NULL; + flux_subprocess_t *p = NULL; + flux_cmd_t *cmd = NULL; const char *shell = getenv ("SHELL"); if (!shell) shell = "/bin/bash"; - if (level < 1 || level > 3 || r->rc[level].subprocess != NULL || !r->sm) { + if (level < 1 || level > 3 || r->rc[level].p != NULL) { errno = EINVAL; goto error; } // Only wrap in a shell if there is only one argument bool shell_wrap = argz_count (cmd_argz, cmd_argz_len) < 2; - if ((p = subprocess_create (r->sm)) == NULL) - goto error; - if ((subprocess_set_context (p, "runlevel", r)) < 0) - goto error; - if ((subprocess_add_hook (p, SUBPROCESS_COMPLETE, subprocess_cb)) < 0) + if (!(cmd = flux_cmd_create (0, NULL, environ))) goto error; if (shell_wrap || !cmd_argz) { - if ((subprocess_argv_append (p, shell)) < 0) + if (flux_cmd_argv_append (cmd, shell) < 0) goto error; } if (shell_wrap) { - if (cmd_argz && subprocess_argv_append (p, "-c") < 0) + if (cmd_argz && flux_cmd_argv_append (cmd, "-c") < 0) goto error; } - if (cmd_argz 
&& subprocess_argv_append_argz (p, cmd_argz, cmd_argz_len) < 0) - goto error; - if (subprocess_set_environ (p, environ) < 0) - goto error; - if (subprocess_unsetenv (p, "PMI_FD") < 0) - goto error; - if (subprocess_unsetenv (p, "PMI_RANK") < 0) - goto error; - if (subprocess_unsetenv (p, "PMI_SIZE") < 0) - goto error; - if (local_uri && subprocess_setenv (p, "FLUX_URI", local_uri, 1) < 0) + if (cmd_argz) { + char *arg = argz_next (cmd_argz, cmd_argz_len, NULL); + while (arg) { + if (flux_cmd_argv_append (cmd, arg) < 0) + goto error; + arg = argz_next (cmd_argz, cmd_argz_len, arg); + } + } + flux_cmd_unsetenv (cmd, "PMI_FD"); + flux_cmd_unsetenv (cmd, "PMI_RANK"); + flux_cmd_unsetenv (cmd, "PMI_SIZE"); + if (local_uri && flux_cmd_setenvf (cmd, 1, "FLUX_URI", local_uri) < 0) goto error; - if (level == 1 || level == 3) { - if (subprocess_setenv (p, "FLUX_NODESET_MASK", r->nodeset, 1) < 0) - goto error; - if (subprocess_set_io_callback (p, subprocess_io_cb) < 0) - goto error; - if (subprocess_set_context (p, "runlevel_t", r) < 0) + if (flux_cmd_setenvf (cmd, 1, "FLUX_NODESET_MASK", r->nodeset) < 0) goto error; } - r->rc[level].subprocess = p; + r->rc[level].cmd = cmd; return 0; error: if (p) - subprocess_destroy (p); + flux_subprocess_destroy (p); + flux_cmd_destroy (cmd); return -1; } diff --git a/src/broker/runlevel.h b/src/broker/runlevel.h index bbc85173af9d..45c4db6458c7 100644 --- a/src/broker/runlevel.h +++ b/src/broker/runlevel.h @@ -2,7 +2,6 @@ #define _BROKER_RUNLEVEL_H #include "attr.h" -#include "src/common/libsubprocess/subprocess.h" #include #include // for size_t @@ -18,8 +17,6 @@ typedef void (*runlevel_io_cb_f)(runlevel_t *r, const char *name, */ runlevel_t *runlevel_create (void); int runlevel_register_attrs (runlevel_t *r, attr_t *attr); -void runlevel_set_subprocess_manager (runlevel_t *r, - struct subprocess_manager *sm); void runlevel_set_size (runlevel_t *r, uint32_t size); void runlevel_set_flux (runlevel_t *r, flux_t *h); void runlevel_destroy 
(runlevel_t *r); From aa0c394a2c65e55dc485591b20c8c60fc32f0d9d Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 28 Aug 2018 14:41:51 -0700 Subject: [PATCH 16/20] t/t0015-cron.t: Fix race between kill and dump After a cron delete --kill, there is a small race between the kill and cron knowing it has been killed. Add a loop that waits for the kill to be confirmed, but gives up after a long 5 seconds. --- t/t0015-cron.t | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/t/t0015-cron.t b/t/t0015-cron.t index 624cf888037e..8b6055eb22d2 100755 --- a/t/t0015-cron.t +++ b/t/t0015-cron.t @@ -56,6 +56,23 @@ test_expect_success 'cron interval --after= works' ' flux cron dump ${id} && cron_entry_check ${id} task.1.state Exited ' + +wait_cron_delete () { + i=0 + flux cron dump ${id} + while [ $? -eq 0 ] && [ $i -lt 50 ] + do + sleep 0.1 + i=$((i + 1)) + flux cron dump ${id} + done + if [ "$i" -eq "50" ] + then + return 1 + fi + return 0; +} + test_expect_success 'cron delete leaves running task - --kill works' ' id=$(flux_cron interval --after=.01s 0 sleep 100) && sleep .1 && @@ -64,7 +81,7 @@ test_expect_success 'cron delete leaves running task - --kill works' ' grep "sleep still running" delete.${id}.out && cron_entry_check ${id} task.1.state Running && flux cron delete --kill ${id} && - test_expect_code 1 flux cron dump ${id} + wait_cron_delete ' test_expect_success 'repeat count works' ' id=$(flux_cron interval -c1 .01s echo hi) && From 2bf7b66cd9405dc15a5d18825c7b3e67ffa72c70 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Tue, 28 Aug 2018 23:42:39 -0700 Subject: [PATCH 17/20] subprocess server debugging --- src/common/subprocess/server.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/subprocess/server.c b/src/common/subprocess/server.c index 65fa6eab5a56..1daf9a4f2f97 100644 --- a/src/common/subprocess/server.c +++ b/src/common/subprocess/server.c @@ -182,6 +182,7 @@ static void 
rexec_state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t goto error; } } else if (state == FLUX_SUBPROCESS_EXITED) { + flux_log (s->h, LOG_ERR, "server: pid %d, send exited\n", p->pid); if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:i}", "type", "state", "rank", s->rank, @@ -253,6 +254,7 @@ static int rexec_output_data (flux_subprocess_t *p, const char *stream, static int rexec_output_eof (flux_subprocess_t *p, const char *stream, flux_subprocess_server_t *s, flux_msg_t *msg) { + flux_log (s->h, LOG_ERR, "server: pid = %d, sending eof stream = %s\n", p->pid, stream); if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:s s:i}", "type", "output", "rank", s->rank, @@ -531,8 +533,11 @@ static void server_signal_cb (flux_t *h, flux_msg_handler_t *mh, if (!lookup_pid (s, pid)) goto error; - if (kill (pid, signum) < 0) + flux_log (s->h, LOG_ERR, "server: kill request: pid %d, kill %d\n", pid, signum); + if (kill (pid, signum) < 0) { + flux_log_error (s->h, "server: kill error %d\n", errno); goto error; + } error: if (flux_respond (h, msg, errno, NULL) < 0) From 2d51e8c3aaed636fc0e6942436481de252a58357 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 29 Aug 2018 07:47:49 -0700 Subject: [PATCH 18/20] more debug server subprocess --- src/common/subprocess/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/subprocess/server.c b/src/common/subprocess/server.c index 1daf9a4f2f97..f9d01335e9a8 100644 --- a/src/common/subprocess/server.c +++ b/src/common/subprocess/server.c @@ -182,7 +182,7 @@ static void rexec_state_change_cb (flux_subprocess_t *p, flux_subprocess_state_t goto error; } } else if (state == FLUX_SUBPROCESS_EXITED) { - flux_log (s->h, LOG_ERR, "server: pid %d, send exited\n", p->pid); + flux_log (s->h, LOG_ERR, "server: pid %d, send exited, eof expected %d, eof sent %d\n", p->pid, p->channels_eof_expected, p->channels_eof_sent); if (flux_respond_pack (s->h, msg, "{s:s s:i s:i s:i}", "type", "state", "rank", 
s->rank, From 9b7508c3bffa49845728fdd3b6c034752ad4a0b3 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 29 Aug 2018 08:21:55 -0700 Subject: [PATCH 19/20] remove a bunch of tests temporarily to make debug cycle faster and reduce output --- t/Makefile.am | 152 +----------------- t/t0015-cron.t | 412 ++++++++++++++++++++++++------------------------- 2 files changed, 208 insertions(+), 356 deletions(-) diff --git a/t/Makefile.am b/t/Makefile.am index 521858a88360..470ddd76737f 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -30,88 +30,7 @@ uninstall-local: $(RM) $(DESTDIR)$(luadir)/fluxometer/conf.lua TESTS = \ - shmem/backtoback.t \ - loop/handle.t \ - loop/dispatch.t \ - loop/reactor.t \ - loop/reduce.t \ - loop/log.t \ - rpc/rpc.t \ - rpc/mrpc.t \ - rolemask/loop.t \ - t0000-sharness.t \ - t0001-basic.t \ - t0002-request.t \ - t0003-module.t \ - t0004-event.t \ - t0005-exec.t \ - t0005-rexec.t \ - t0007-ping.t \ - t0008-attr.t \ - t0009-dmesg.t \ - t0010-generic-utils.t \ - t0011-content-cache.t \ - t0012-content-sqlite.t \ - t0013-config-file.t \ - t0014-runlevel.t \ - t0015-cron.t \ - t0016-cron-faketime.t \ - t0017-security.t \ - t1000-kvs.t \ - t1001-kvs-internals.t \ - t1002-kvs-watch.t \ - t1003-kvs-stress.t \ - t1004-kvs-namespace.t \ - t1005-kvs-security.t \ - t1006-kvs-getroot.t \ - t1101-barrier-basic.t \ - t1102-cmddriver.t \ - t1103-apidisconnect.t \ - t1104-kz.t \ - t1105-proxy.t \ - t1999-wreck-rcalc.t \ - t2000-wreck.t \ - t2000-wreck-env.t \ - t2000-wreck-dummy-sched.t \ - t2000-wreck-epilog.t \ - t2001-jsc.t \ - t2002-pmi.t \ - t2003-recurse.t \ - t2004-hydra.t \ - t2005-hwloc-basic.t \ - t2006-joblog.t \ - t2007-caliper.t \ - t2008-althash.t \ - t2009-hostlist.t \ - t2100-aggregate.t \ - t2200-job-ingest.t \ - t2201-job-cmd.t \ - t3000-mpi-basic.t \ - t3001-mpi-personalities.t \ - t4000-issues-test-driver.t \ - t5000-valgrind.t \ - lua/t0001-send-recv.t \ - lua/t0002-rpc.t \ - lua/t0003-events.t \ - lua/t0004-getattr.t \ - lua/t0007-alarm.t \ 
- lua/t0009-sequences.t \ - lua/t1000-reactor.t \ - lua/t1001-timeouts.t \ - lua/t1002-kvs.t \ - lua/t1003-iowatcher.t \ - lua/t1004-statwatcher.t \ - lua/t1005-fdwatcher.t - -if ENABLE_JOBSPEC -TESTS += \ - t0018-jobspec.t -endif - -if HAVE_PYTHON -TESTS += \ - $(top_builddir)/t/t9990-python-tests.t -endif + t0015-cron.t EXTRA_DIST= \ $(check_SCRIPTS) \ @@ -135,74 +54,7 @@ clean-local: rm -fr trash-directory.* test-results .prove *.broker.log */*.broker.log *.output check_SCRIPTS = \ - t0000-sharness.t \ - t0001-basic.t \ - t0002-request.t \ - t0003-module.t \ - t0004-event.t \ - t0005-exec.t \ - t0005-rexec.t \ - t0007-ping.t \ - t0008-attr.t \ - t0009-dmesg.t \ - t0010-generic-utils.t \ - t0011-content-cache.t \ - t0012-content-sqlite.t \ - t0013-config-file.t \ - t0014-runlevel.t \ - t0015-cron.t \ - t0016-cron-faketime.t \ - t0017-security.t \ - t1000-kvs.t \ - t1001-kvs-internals.t \ - t1002-kvs-watch.t \ - t1003-kvs-stress.t \ - t1004-kvs-namespace.t \ - t1005-kvs-security.t \ - t1006-kvs-getroot.t \ - t1101-barrier-basic.t \ - t1102-cmddriver.t \ - t1103-apidisconnect.t \ - t1104-kz.t \ - t1105-proxy.t \ - t1999-wreck-rcalc.t \ - t2000-wreck.t \ - t2000-wreck-env.t \ - t2000-wreck-dummy-sched.t \ - t2000-wreck-epilog.t \ - t2001-jsc.t \ - t2002-pmi.t \ - t2003-recurse.t \ - t2004-hydra.t \ - t2005-hwloc-basic.t \ - t2006-joblog.t \ - t2007-caliper.t \ - t2008-althash.t \ - t2009-hostlist.t \ - t2100-aggregate.t \ - t2200-job-ingest.t \ - t2201-job-cmd.t \ - t3000-mpi-basic.t \ - t3001-mpi-personalities.t \ - t4000-issues-test-driver.t \ - t5000-valgrind.t \ - issues/t0441-kvs-put-get.sh \ - issues/t0505-msg-handler-reg.lua \ - issues/t0821-kvs-segfault.sh \ - lua/t0001-send-recv.t \ - lua/t0002-rpc.t \ - lua/t0003-events.t \ - lua/t0004-getattr.t \ - lua/t0007-alarm.t \ - lua/t0009-sequences.t \ - lua/t1000-reactor.t \ - lua/t1001-timeouts.t \ - lua/t1002-kvs.t \ - lua/t1003-iowatcher.t \ - lua/t1004-statwatcher.t \ - lua/t1005-fdwatcher.t \ - 
t0018-jobspec.t \ - $(top_builddir)/t/t9990-python-tests.t + t0015-cron.t check_PROGRAMS = \ shmem/backtoback.t \ diff --git a/t/t0015-cron.t b/t/t0015-cron.t index 8b6055eb22d2..ca67ca7b8de3 100755 --- a/t/t0015-cron.t +++ b/t/t0015-cron.t @@ -83,215 +83,215 @@ test_expect_success 'cron delete leaves running task - --kill works' ' flux cron delete --kill ${id} && wait_cron_delete ' -test_expect_success 'repeat count works' ' - id=$(flux_cron interval -c1 .01s echo hi) && - sleep .02 && - cron_entry_check ${id} repeat 1 && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} stopped true -' -test_expect_success 'restarted job restarts repeat count' ' - id=$(flux_cron interval -c1 .01s echo repeat-count-check) && - sleep .1 && - cron_entry_check ${id} stopped true && - test $(flux dmesg | grep -c repeat-count-check) = 1 && - flux dmesg -c && - flux cron start ${id} && - sleep .1 && - test $(flux dmesg | grep -c repeat-count-check) = 1 -' -test_expect_success 'rank option works' ' - id=$(flux_cron interval -c1 -o rank=1 .01s flux getattr rank) && - sleep .1 && - cron_entry_check ${id} stopped true && - cron_entry_check ${id} rank 1 && - flux dmesg | grep "cron-${id}.*command=\"flux getattr rank\": \"1\"" -' -test_expect_success '--preserve-env option works' ' - export FOO=bar && - id=$(flux_cron interval --preserve-env -c1 -o rank=1 .01s printenv FOO) && - unset FOO && - sleep .1 && - cron_entry_check ${id} stopped true && - flux dmesg | grep "cron-${id}.*command=\"printenv FOO\": \"bar\"" -' -test_expect_success '--working-dir option works' ' - id=$(flux_cron interval -c1 -d /tmp .01s pwd) && - sleep .1 && - cron_entry_check ${id} stopped true && - flux dmesg | grep "cron-${id}.*command=\"pwd\": \"/tmp\"" -' +# test_expect_success 'repeat count works' ' +# id=$(flux_cron interval -c1 .01s echo hi) && +# sleep .02 && +# cron_entry_check ${id} repeat 1 && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} stopped true +# ' +# 
test_expect_success 'restarted job restarts repeat count' ' +# id=$(flux_cron interval -c1 .01s echo repeat-count-check) && +# sleep .1 && +# cron_entry_check ${id} stopped true && +# test $(flux dmesg | grep -c repeat-count-check) = 1 && +# flux dmesg -c && +# flux cron start ${id} && +# sleep .1 && +# test $(flux dmesg | grep -c repeat-count-check) = 1 +# ' +# test_expect_success 'rank option works' ' +# id=$(flux_cron interval -c1 -o rank=1 .01s flux getattr rank) && +# sleep .1 && +# cron_entry_check ${id} stopped true && +# cron_entry_check ${id} rank 1 && +# flux dmesg | grep "cron-${id}.*command=\"flux getattr rank\": \"1\"" +# ' +# test_expect_success '--preserve-env option works' ' +# export FOO=bar && +# id=$(flux_cron interval --preserve-env -c1 -o rank=1 .01s printenv FOO) && +# unset FOO && +# sleep .1 && +# cron_entry_check ${id} stopped true && +# flux dmesg | grep "cron-${id}.*command=\"printenv FOO\": \"bar\"" +# ' +# test_expect_success '--working-dir option works' ' +# id=$(flux_cron interval -c1 -d /tmp .01s pwd) && +# sleep .1 && +# cron_entry_check ${id} stopped true && +# flux dmesg | grep "cron-${id}.*command=\"pwd\": \"/tmp\"" +# ' -test_expect_success 'cron entry exec failure is recorded' ' - id=$(flux_cron interval -c1 0.01s notaprogram) && - sleep 0.1 && - test_debug "flux cron dump ${id} >&2" && - cron_entry_check ${id} stopped true && - cron_entry_check ${id} task.1.state "Failed" && - cron_entry_check ${id} task.1.code 127 -' -test_expect_success 'cron entry launch failure recorded' ' - id=$(flux_cron interval -o rank=99 -c1 0.01s hostname) && - sleep 0.1 && - test_debug "flux cron dump ${id} >&2" && - cron_entry_check ${id} stopped true && - cron_entry_check ${id} task.1.state "Exec Failure" && - cron_entry_check ${id} task.1.exec_errno 113 -' -test_expect_success 'flux-cron event works' ' - id=$(flux_cron event t.cron.trigger flux event pub t.cron.complete) && - cron_entry_check ${id} type event && - cron_entry_check ${id} stopped 
false && - cron_entry_check ${id} stats.count 0 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} task.1.state Exited && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 2 && - cron_entry_check ${id} task.1.state Exited && - flux cron stop ${id} && - cron_entry_check ${id} stopped true && - flux cron delete ${id} && - test_expect_code 1 flux cron dump ${id} -' -test_expect_success 'flux-cron event --nth works' ' - id=$(flux_cron event --nth=3 t.cron.trigger flux event pub t.cron.complete) && - test_when_finished "flux cron delete ${id}" && - cron_entry_check ${id} type event && - cron_entry_check ${id} stopped false && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.nth 3 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.counter 1 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.counter 2 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} typedata.counter 3 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} typedata.counter 4 -' +# test_expect_success 'cron entry exec failure is recorded' ' +# id=$(flux_cron interval -c1 0.01s notaprogram) && +# sleep 0.1 && +# test_debug "flux cron dump ${id} >&2" && +# cron_entry_check ${id} stopped true && +# cron_entry_check ${id} task.1.state "Failed" && +# 
cron_entry_check ${id} task.1.code 127 +# ' +# test_expect_success 'cron entry launch failure recorded' ' +# id=$(flux_cron interval -o rank=99 -c1 0.01s hostname) && +# sleep 0.1 && +# test_debug "flux cron dump ${id} >&2" && +# cron_entry_check ${id} stopped true && +# cron_entry_check ${id} task.1.state "Exec Failure" && +# cron_entry_check ${id} task.1.exec_errno 113 +# ' +# test_expect_success 'flux-cron event works' ' +# id=$(flux_cron event t.cron.trigger flux event pub t.cron.complete) && +# cron_entry_check ${id} type event && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} task.1.state Exited && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 2 && +# cron_entry_check ${id} task.1.state Exited && +# flux cron stop ${id} && +# cron_entry_check ${id} stopped true && +# flux cron delete ${id} && +# test_expect_code 1 flux cron dump ${id} +# ' +# test_expect_success 'flux-cron event --nth works' ' +# id=$(flux_cron event --nth=3 t.cron.trigger flux event pub t.cron.complete) && +# test_when_finished "flux cron delete ${id}" && +# cron_entry_check ${id} type event && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.nth 3 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.counter 1 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.counter 2 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron 
t.cron.complete \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} typedata.counter 3 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} typedata.counter 4 +# ' -test_expect_success 'flux-cron event --after works' ' - id=$(flux_cron event --after=3 t.cron.trigger flux event pub t.cron.complete) && - test_when_finished "flux cron delete ${id}" && - cron_entry_check ${id} type event && - cron_entry_check ${id} stopped false && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.after 3 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.counter 1 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.counter 2 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ - flux event pub t.cron.trigger && - flux cron dump ${id} && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} typedata.counter 3 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} typedata.counter 4 && - cron_entry_check ${id} stats.count 2 -' -test_expect_success 'flux-cron event --min-interval works' ' - id=$(flux_cron event --min-interval=.5s t.cron.trigger hostname) && - test_when_finished "flux cron delete ${id}" && - cron_entry_check ${id} type event && - cron_entry_check ${id} stopped false && - cron_entry_check ${id} stats.count 0 && - cron_entry_check ${id} typedata.min_interval 0.5 && - flux event pub t.cron.trigger && flux event pub t.cron.trigger && - cron_entry_check ${id} stats.count 1 && - sleep 0.5 && - 
cron_entry_check ${id} stats.count 2 -' -test_expect_success 'flux-cron can set timeout on tasks' ' - id=$(flux_cron event -o timeout=0.1 t.cron.trigger sleep 120) && - test_when_finished "flux cron delete ${id}" && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - sleep 0.1 && - i=0 && - while test $i -lt 5; do - cron_entry_check ${id} task.1.state Timeout - rc=$? - if test $rc -eq 0; then break; fi - sleep 0.1 - i=$((i+1)) - echo "cron-${id}: $i" - flux cron dump ${id} - done && - test $rc -eq 0 -' -test_expect_success 'flux-cron can set stop-on-failure' ' - id=$(flux_cron event -o stop-on-failure=3 t2.cron.trigger \ - "flux event pub t2.cron.complete && false" ) && - cron_entry_check ${id} type event && - cron_entry_check ${id} stopped false && - cron_entry_check ${id} stats.count 0 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ - flux event pub t2.cron.trigger && - flux cron dump ${id} && - cron_entry_check ${id} stats.count 1 && - cron_entry_check ${id} stats.failure 1 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ - flux event pub t2.cron.trigger && - cron_entry_check ${id} stats.count 2 && - cron_entry_check ${id} stats.failure 2 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ - flux event pub t2.cron.trigger && - cron_entry_check ${id} stats.count 3 && - cron_entry_check ${id} stats.failure 3 && - cron_entry_check ${id} stopped true -' +# test_expect_success 'flux-cron event --after works' ' +# id=$(flux_cron event --after=3 t.cron.trigger flux event pub t.cron.complete) && +# test_when_finished "flux cron delete ${id}" && +# cron_entry_check ${id} type event && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.after 3 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# 
cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.counter 1 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.counter 2 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ +# flux event pub t.cron.trigger && +# flux cron dump ${id} && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} typedata.counter 3 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} typedata.counter 4 && +# cron_entry_check ${id} stats.count 2 +# ' +# test_expect_success 'flux-cron event --min-interval works' ' +# id=$(flux_cron event --min-interval=.5s t.cron.trigger hostname) && +# test_when_finished "flux cron delete ${id}" && +# cron_entry_check ${id} type event && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# cron_entry_check ${id} typedata.min_interval 0.5 && +# flux event pub t.cron.trigger && flux event pub t.cron.trigger && +# cron_entry_check ${id} stats.count 1 && +# sleep 0.5 && +# cron_entry_check ${id} stats.count 2 +# ' +# test_expect_success 'flux-cron can set timeout on tasks' ' +# id=$(flux_cron event -o timeout=0.1 t.cron.trigger sleep 120) && +# test_when_finished "flux cron delete ${id}" && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# sleep 0.1 && +# i=0 && +# while test $i -lt 5; do +# cron_entry_check ${id} task.1.state Timeout +# rc=$? 
+# if test $rc -eq 0; then break; fi +# sleep 0.1 +# i=$((i+1)) +# echo "cron-${id}: $i" +# flux cron dump ${id} +# done && +# test $rc -eq 0 +# ' +# test_expect_success 'flux-cron can set stop-on-failure' ' +# id=$(flux_cron event -o stop-on-failure=3 t2.cron.trigger \ +# "flux event pub t2.cron.complete && false" ) && +# cron_entry_check ${id} type event && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ +# flux event pub t2.cron.trigger && +# flux cron dump ${id} && +# cron_entry_check ${id} stats.count 1 && +# cron_entry_check ${id} stats.failure 1 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ +# flux event pub t2.cron.trigger && +# cron_entry_check ${id} stats.count 2 && +# cron_entry_check ${id} stats.failure 2 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t2.cron t2.cron.complete \ +# flux event pub t2.cron.trigger && +# cron_entry_check ${id} stats.count 3 && +# cron_entry_check ${id} stats.failure 3 && +# cron_entry_check ${id} stopped true +# ' -## Reload cron module with sync enabled -test_expect_success 'flux module remove cron' ' - flux module remove cron -' -test_expect_success 'module load with sync' ' - flux module load cron sync=cron.sync sync_epsilon=0.025 -' -test_expect_success 'sync and sync_epsilon are set as expected' ' - flux cron sync | grep "cron\.sync.*epsilon=0.025" -' -test_expect_success 'tasks do not run until sync event' ' - id=$(flux_cron event t.cron.trigger flux event pub t.cron.complete) && - test_when_finished "flux cron delete ${id}" && - cron_entry_check ${id} stopped false && - cron_entry_check ${id} stats.count 0 && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ - flux event pub t.cron.trigger && - cron_entry_check ${id} task.1.state Deferred && - $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ - flux event pub cron.sync && - 
cron_entry_check ${id} stats.count 1 -' -test_expect_success 'flux cron sync can disable sync' ' - flux cron sync --disable && - flux cron sync | grep disabled -' -test_expect_success 'flux cron sync can enable sync' ' - flux cron sync cron.sync2 && - flux cron sync | grep cron.sync2 -' -test_expect_success 'flux cron sync can set epsilon' ' - flux cron sync --epsilon=42s cron.sync2 && - flux cron sync | grep 42.000s -' +# ## Reload cron module with sync enabled +# test_expect_success 'flux module remove cron' ' +# flux module remove cron +# ' +# test_expect_success 'module load with sync' ' +# flux module load cron sync=cron.sync sync_epsilon=0.025 +# ' +# test_expect_success 'sync and sync_epsilon are set as expected' ' +# flux cron sync | grep "cron\.sync.*epsilon=0.025" +# ' +# test_expect_success 'tasks do not run until sync event' ' +# id=$(flux_cron event t.cron.trigger flux event pub t.cron.complete) && +# test_when_finished "flux cron delete ${id}" && +# cron_entry_check ${id} stopped false && +# cron_entry_check ${id} stats.count 0 && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.trigger \ +# flux event pub t.cron.trigger && +# cron_entry_check ${id} task.1.state Deferred && +# $SHARNESS_TEST_SRCDIR/scripts/event-trace.lua t.cron t.cron.complete \ +# flux event pub cron.sync && +# cron_entry_check ${id} stats.count 1 +# ' +# test_expect_success 'flux cron sync can disable sync' ' +# flux cron sync --disable && +# flux cron sync | grep disabled +# ' +# test_expect_success 'flux cron sync can enable sync' ' +# flux cron sync cron.sync2 && +# flux cron sync | grep cron.sync2 +# ' +# test_expect_success 'flux cron sync can set epsilon' ' +# flux cron sync --epsilon=42s cron.sync2 && +# flux cron sync | grep 42.000s +# ' test_expect_success 'flux module remove cron' ' flux module remove cron ' From ad09dc1d0c686025aa48fe9b914b7541c72738a9 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 29 Aug 2018 08:50:50 -0700 Subject: [PATCH 20/20] 
subprocess local debugging --- src/common/subprocess/local.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/common/subprocess/local.c b/src/common/subprocess/local.c index 47b629cc49de..b233dd5646bc 100644 --- a/src/common/subprocess/local.c +++ b/src/common/subprocess/local.c @@ -102,8 +102,11 @@ static void local_output (struct subprocess_channel *c, { bool eof_set = false; + flux_log (c->p->h, LOG_ERR, "local: pid = %d, revents = %d, channels eof_expected %d, channels eof_sent %d", c->p->pid, revents, c->p->channels_eof_expected, c->p->channels_eof_sent); if (revents & FLUX_POLLIN) { flux_buffer_t *fb; + flux_log (c->p->h, LOG_ERR, "local: local_output: pid = %d, state = %s, eof_sent = %d", c->p->pid, flux_subprocess_state_string (c->p->state), c->eof_sent_to_caller); + if (!c->eof_sent_to_caller) { if (!(fb = flux_buffer_read_watcher_get_buffer (w))) { @@ -116,6 +119,7 @@ static void local_output (struct subprocess_channel *c, eof_set = true; c->p->channels_eof_sent++; } + flux_log (c->p->h, LOG_ERR, "local: local_output: pid = %d, state = %s, eof_set = %d", c->p->pid, flux_subprocess_state_string (c->p->state), eof_set); } output_cb (c->p, c->name); @@ -135,6 +139,8 @@ static void local_output (struct subprocess_channel *c, eof_set = true; c->p->channels_eof_sent++; } + + flux_log (c->p->h, LOG_ERR, "local: local_output - post cb: pid = %d, state = %s, eof_set = %d", c->p->pid, flux_subprocess_state_string (c->p->state), eof_set); } } else @@ -579,6 +585,7 @@ static void child_watch_cb (flux_reactor_t *r, flux_watcher_t *w, */ if (p->state == FLUX_SUBPROCESS_RUNNING) { p->state = FLUX_SUBPROCESS_EXITED; + flux_log (p->h, LOG_ERR, "local: child_watch - pid = %d, state = %s, exited", p->pid, flux_subprocess_state_string (p->state)); state_change_start (p); }