From 81342b9612c8c3fe8b5445d39a82949a389712ac Mon Sep 17 00:00:00 2001 From: Stephen Herbein Date: Mon, 16 Jan 2017 10:59:14 -0800 Subject: [PATCH] wrexecd: delay sending of complete event Send the 'wreck.state.complete' after job archival as opposed to before. This is done to prevent a race condition between wrexecd and other modules. Before, after wrexecd sent the 'complete' event it archived the job by modifying the KVS. Other modules that were subscribed to the 'complete' event could be activated and read from the job's KVS directory before/while wrexecd was archiving the job. Now, modules subscribed to the 'complete' state will only be activated after wrexecd has finished modifying the KVS. Closes #728 (see for more info) --- src/modules/wreck/wrexecd.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/modules/wreck/wrexecd.c b/src/modules/wreck/wrexecd.c index 442ebbf16d15..7a8915dea3ca 100644 --- a/src/modules/wreck/wrexecd.c +++ b/src/modules/wreck/wrexecd.c @@ -2328,6 +2328,7 @@ int main (int ac, char **av) { int code = 0; int parent_fd = -1; + int exec_rc = -1; struct prog_ctx *ctx = NULL; optparse_t *p; struct optparse_option opts [] = { @@ -2382,16 +2383,10 @@ int main (int ac, char **av) if (!prog_ctx_getopt (ctx, "no-pmi-server") && prog_ctx_initialize_pmi (ctx) < 0) wlog_fatal (ctx, 1, "failed to initialize pmi-server"); - if (exec_commands (ctx) == 0) { + exec_rc = exec_commands (ctx); - if (flux_reactor_run (flux_get_reactor (ctx->flux), 0) < 0) - wlog_err (ctx, "flux_reactor_run: %s", flux_strerror (errno)); - - rexec_state_change (ctx, "complete"); - wlog_msg (ctx, "job complete. exiting..."); - - lua_stack_call (ctx->lua_stack, "rexecd_exit"); - } + if (exec_rc == 0 && flux_reactor_run (flux_get_reactor (ctx->flux), 0) < 0) + wlog_err (ctx, "flux_reactor_run: %s", flux_strerror (errno)); if (ctx->nodeid == 0) { /* At final job state, archive the completed lwj back to the @@ -2399,7 +2394,13 @@ int main (int ac, char **av) */ if (archive_lwj (ctx) < 0) wlog_err (ctx, "archive_lwj failed"); + } + if (exec_rc == 0) { + rexec_state_change (ctx, "complete"); + wlog_msg (ctx, "job complete. exiting..."); + + lua_stack_call (ctx->lua_stack, "rexecd_exit"); } code = ctx->errnum;