From 12687c8c4068ad4315e64e76167fc8220823b359 Mon Sep 17 00:00:00 2001 From: no author Date: Fri, 21 Nov 2003 21:18:52 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create tag 'slurm-0-2-22-1'. --- META | 4 +- NEWS | 17 +++++- doc/man/man5/slurm.conf.5 | 15 ++++-- src/api/allocate.c | 6 ++- src/api/cancel.c | 3 +- src/api/complete.c | 3 +- src/api/config_info.c | 2 +- src/api/job_info.c | 3 +- src/api/job_step_info.c | 2 +- src/api/node_info.c | 2 +- src/api/partition_info.c | 2 +- src/api/reconfigure.c | 4 +- src/api/submit.c | 1 + src/common/log.c | 13 +++-- src/common/slurm_auth.c | 11 ++-- src/common/slurm_errno.c | 2 +- src/common/slurm_protocol_defs.c | 2 +- src/common/slurm_protocol_pack.c | 4 -- src/common/xsignal.c | 1 + src/plugins/auth/auth_munge.c | 1 + src/scontrol/scontrol.c | 2 +- src/slurmctld/job_mgr.c | 22 -------- src/slurmctld/node_mgr.c | 31 +++++++---- src/slurmd/interconnect.h | 1 - src/slurmd/mgr.c | 10 +++- src/slurmd/req.c | 5 ++ src/slurmd/shm.c | 43 ++++++++++----- src/slurmd/slurmd.c | 93 +++++++++++++++++--------------- src/slurmd/ulimits.c | 3 +- src/srun/io.c | 2 + src/srun/job.c | 20 +++++-- src/srun/job.h | 3 +- src/srun/opt.c | 2 +- 33 files changed, 201 insertions(+), 134 deletions(-) diff --git a/META b/META index c7dd47ae8a..17ee31df70 100644 --- a/META +++ b/META @@ -7,6 +7,6 @@ Name: slurm Major: 0 Minor: 2 - Micro: 21 - Version: 0.2.21 + Micro: 22 + Version: 0.2.22 Release: 1 diff --git a/NEWS b/NEWS index 56c73e8556..5668b55a69 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,19 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 0.2.22 +========================= + -- Fixes for reported problems: + - slurm/326: Node stays in completing state indefinitely. + - slurm/328: slurmd uses different shared memory key on restart. + - slurm/329: Job step processes may be left running when one task dies. + - slurm/334: slurmd segv with multiple simultaneous job steps. + -- Allow more digits for priority values in scontrol. + -- Applied various fixes for memory leaks. + -- Remove logic preventing DPCS from allocating jobs with more than + eight node segments. Fix for DPCS should now be in production. + -- Changed compact string for DRAINING state to "drng" from "drain." + * Changes in SLURM 0.2.21 ========================= -- Fixes for reported problems: @@ -9,6 +22,7 @@ documents those changes that are of interest to users and admins. - slurm/300: Possibly killing wrong job on slurmd restart - slurm/312: Freeing non-allocated memory and killing slurmd -- Assorted changes to support RedHat Enterprise Linux 3.0 and IA64 + -- Initial Elan4 and libelanctrl support (--with-elan). -- Slurmctld was sometimes inappropriately setting a job's priority to 1 when a node was down (even if up nodes could be used for the job when a running job completes) @@ -17,7 +31,8 @@ documents those changes that are of interest to users and admins. variable for `%J' expansion in TV bulk launch string. -- Fix several locking bugs in slurmd IO layer. -- Throttle back repetitious error messages in slurmd to avoid filling - slurm logfiles. + log files. 
+ * Changes in SLURM 0.2.20 ========================= diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index a0e8772145..b382aa50e9 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -205,10 +205,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and \fBSlurmdPort\fR must be different. .TP \fBSlurmdSpoolDir\fR -Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state -information is written. This must be a common pathname for all nodes, but -should represent a file which is local to each node (reference a local file -system). The default value is "/tmp/slurmd". +Fully qualified pathname of a directory into which the \fBslurmd\fR +daemon's state information and batch job script information are written. This +must be a common pathname for all nodes, but should represent a directory which +is local to each node (reference a local file system). The default value +is "/var/spool/slurmd". \fBNOTE\fR: This directory is also used to store \fBslurmd\fR's +shared memory lockfile, and \fBshould not be changed\fR unless the system +is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR is +changed and \fBslurmd\fR is restarted, the new daemon will attach to a +different shared memory region and lose track of any running jobs. .TP \fBSlurmdTimeout\fR The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR @@ -228,7 +233,7 @@ Fully qualified pathname of the file system available to user jobs for temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space. The default value is "/tmp". .TP -\fBWaitTimefR +\fBWaitTime\fR Specifies how many seconds the srun command should by default wait after the first task terminates before terminating all remaining tasks. The "--wait" option on the srun command line overrides this value.
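The NOTE added above follows directly from how slurmd locates its shared memory: shm.c (later in this patch) creates a lockfile under the spool directory and passes it to ftok(), so a different SlurmdSpoolDir yields a different IPC key and thus a fresh, empty segment on restart. The following minimal sketch is not part of the patch; the lockfile path is illustrative and must already exist for ftok() to succeed. It mirrors the ftok(s, 1) call in shm_cleanup():

    #include <stdio.h>
    #include <sys/ipc.h>

    int main(int argc, char **argv)
    {
            /* Illustrative path; slurmd builds its real lockfile name
             * from SlurmdSpoolDir via _create_ipc_name(). */
            const char *lockfile = (argc > 1) ? argv[1]
                                              : "/var/spool/slurmd/lock";
            key_t key = ftok(lockfile, 1); /* same proj_id as shm_cleanup() */

            if (key == (key_t) -1) {
                    perror("ftok");
                    return 1;
            }
            /* A different lockfile (different inode) gives a different key,
             * so a restarted slurmd would attach to a new, empty segment. */
            printf("%s => IPC key 0x%08lx\n", lockfile, (unsigned long) key);
            return 0;
    }

Running this against lockfiles in two different directories prints two different keys, which is why the man page now warns against relocating SlurmdSpoolDir while jobs are running.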
diff --git a/src/api/allocate.c b/src/api/allocate.c index 0895109f3f..cecf35dcf1 100644 --- a/src/api/allocate.c +++ b/src/api/allocate.c @@ -89,6 +89,7 @@ slurm_allocate_resources (job_desc_msg_t *req, if (rc == SLURM_SOCKET_ERROR) return SLURM_SOCKET_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) @@ -127,6 +128,7 @@ int slurm_job_will_run (job_desc_msg_t *req, if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_SOCKET_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) @@ -183,7 +185,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req, if (rc == SLURM_SOCKET_ERROR) return SLURM_SOCKET_ERROR; - + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) @@ -222,6 +224,7 @@ slurm_job_step_create (job_step_create_request_msg_t *req, if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) @@ -259,6 +262,7 @@ slurm_confirm_allocation (old_job_alloc_msg_t *req, if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch(resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) diff --git a/src/api/cancel.c b/src/api/cancel.c index fb364e2888..79258b7019 100644 --- a/src/api/cancel.c +++ b/src/api/cancel.c @@ -73,7 +73,8 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal) if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0) return SLURM_FAILURE; - if (rc) slurm_seterrno_ret(rc); + if (rc) + slurm_seterrno_ret(rc); return SLURM_SUCCESS; } diff --git a/src/api/complete.c b/src/api/complete.c index 3c529677c4..de75377c9d 100644 --- a/src/api/complete.c +++ b/src/api/complete.c @@ -85,7 +85,8 @@ slurm_complete_job_step ( uint32_t job_id, uint32_t step_id, if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0) return SLURM_ERROR; - if (rc) slurm_seterrno_ret(rc); + if (rc) + slurm_seterrno_ret(rc); return SLURM_PROTOCOL_SUCCESS; } diff --git a/src/api/config_info.c b/src/api/config_info.c index e43f5b928d..58020ab7d7 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -151,10 +151,10 @@ slurm_load_ctl_conf (time_t update_time, slurm_ctl_conf_t **confp) if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_BUILD_INFO: *confp = (slurm_ctl_conf_info_msg_t *) resp_msg.data; - slurm_free_cred(resp_msg.cred); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; diff --git a/src/api/job_info.c b/src/api/job_info.c index 0e46334481..3dfd658b4f 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -219,10 +219,10 @@ slurm_load_jobs (time_t update_time, job_info_msg_t **resp) if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_JOB_INFO: *resp = (job_info_msg_t *)resp_msg.data; - slurm_free_cred(resp_msg.cred); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; @@ -267,6 +267,7 @@ slurm_pid2jobid (pid_t job_pid, uint32_t *jobid) if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0) return SLURM_ERROR; + 
slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_JOB_ID: *jobid = ((job_id_response_msg_t *) resp_msg.data)->job_id; diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c index f07cea90b6..9f517b0a6c 100644 --- a/src/api/job_step_info.c +++ b/src/api/job_step_info.c @@ -125,10 +125,10 @@ slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id, if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_JOB_STEP_INFO: *resp = (job_step_info_response_msg_t *) resp_msg.data; - slurm_free_cred(resp_msg.cred); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; diff --git a/src/api/node_info.c b/src/api/node_info.c index f220ff62a6..91ce0d846f 100644 --- a/src/api/node_info.c +++ b/src/api/node_info.c @@ -129,10 +129,10 @@ slurm_load_node (time_t update_time, node_info_msg_t **resp) if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_NODE_INFO: *resp = (node_info_msg_t *) resp_msg.data; - slurm_free_cred(resp_msg.cred); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; diff --git a/src/api/partition_info.c b/src/api/partition_info.c index d9c01cbbb7..0d66f643c6 100644 --- a/src/api/partition_info.c +++ b/src/api/partition_info.c @@ -161,10 +161,10 @@ slurm_load_partitions (time_t update_time, partition_info_msg_t **resp) if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_PARTITION_INFO: *resp = (partition_info_msg_t *) resp_msg.data; - slurm_free_cred(resp_msg.cred); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index 37f027563c..b9109b428a 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -58,7 +58,8 @@ slurm_reconfigure ( void ) if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) return SLURM_ERROR; - if (rc) slurm_seterrno_ret(rc); + if (rc) + slurm_seterrno_ret(rc); return SLURM_PROTOCOL_SUCCESS; } @@ -129,6 +130,7 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req) if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0) slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR); + slurm_free_cred(resp_msg.cred); if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS) slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR); diff --git a/src/api/submit.c b/src/api/submit.c index c413cff506..f49926b20f 100644 --- a/src/api/submit.c +++ b/src/api/submit.c @@ -87,6 +87,7 @@ slurm_submit_batch_job (job_desc_msg_t *req, if (rc == SLURM_SOCKET_ERROR) return SLURM_ERROR; + slurm_free_cred(resp_msg.cred); switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; diff --git a/src/common/log.c b/src/common/log.c index 4d57f99a58..3c66406fb6 100644 --- a/src/common/log.c +++ b/src/common/log.c @@ -227,13 +227,13 @@ int log_init(char *prog, log_options_t opt, log_facility_t fac, char *logfile) void log_fini() { - if (!log) return; + if (!log) + return; + log_flush(); slurm_mutex_lock(&log_lock); - if (log->argv0) - xfree(log->argv0); - if (log->fpfx) - xfree(log->fpfx); + xfree(log->argv0); + xfree(log->fpfx); if (log->buf) cbuf_destroy(log->buf); if (log->fbuf) @@ -254,8 +254,7 @@ void 
log_reinit() void log_set_fpfx(char *prefix) { slurm_mutex_lock(&log_lock); - if (log->fpfx) - xfree(log->fpfx); + xfree(log->fpfx); if (!prefix) log->fpfx = xstrdup(""); else { diff --git a/src/common/slurm_auth.c b/src/common/slurm_auth.c index 5495030fa1..54b09da0c5 100644 --- a/src/common/slurm_auth.c +++ b/src/common/slurm_auth.c @@ -243,7 +243,7 @@ slurm_auth_context_create( const char *auth_type ) c->auth_errno = SLURM_SUCCESS; /* Copy the authentication type. */ - c->auth_type = strdup( auth_type ); + c->auth_type = xstrdup( auth_type ); if ( c->auth_type == NULL ) { debug3( "can't make local copy of authentication type" ); xfree( c ); @@ -299,7 +299,7 @@ slurm_auth_context_destroy( slurm_auth_context_t c ) } } - free( c->auth_type ); + xfree( c->auth_type ); xfree( c ); return SLURM_SUCCESS; @@ -340,7 +340,12 @@ slurm_auth_fini( void ) if ( g_context ) slurm_auth_context_destroy( g_context ); - free_slurm_conf( &conf ); + slurm_mutex_lock( &config_lock ); + if ( conf.slurmd_port ) { + free_slurm_conf( &conf ); + conf.slurmd_port = 0; + } + slurm_mutex_unlock( &config_lock ); } /* diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 92c63dcb1a..57076ecb35 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -92,7 +92,7 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURM_INVALID_PARTITION_NAME, "Invalid partition name specified" }, { ESLURM_DEFAULT_PARTITION_NOT_SET, - "System default partition not set" }, + "No partition specified or system default partition" }, { ESLURM_ACCESS_DENIED, "Access denied" }, { ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP, diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 19f4523265..94a399d56b 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -372,7 +372,7 @@ char *node_state_string_compact(enum node_states inx) "IDLE", "ALLOC", "DRAIN", - "DRAIN", + "DRNG", "COMP", "END" }; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index bf369be975..bc20895676 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -796,10 +796,6 @@ _unpack_resource_allocation_response_msg(resource_allocation_response_msg_t safe_unpack16(&tmp_ptr->num_cpu_groups, buffer); if (tmp_ptr->num_cpu_groups > 0) { - tmp_ptr->cpus_per_node = (uint32_t *) - xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups); - tmp_ptr->cpu_count_reps = (uint32_t *) - xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups); safe_unpack32_array((uint32_t **) & (tmp_ptr->cpus_per_node), &uint32_tmp, buffer); diff --git a/src/common/xsignal.c b/src/common/xsignal.c index 28d05470d3..8eb09aa21e 100644 --- a/src/common/xsignal.c +++ b/src/common/xsignal.c @@ -91,6 +91,7 @@ xsignal_sigset_create(int sigarray[], sigset_t *setp) int xsignal_save_mask(sigset_t *set) { + sigemptyset(set); return _sigmask(SIG_SETMASK, NULL, set); } diff --git a/src/plugins/auth/auth_munge.c b/src/plugins/auth/auth_munge.c index 11cefc3089..a46713716e 100644 --- a/src/plugins/auth/auth_munge.c +++ b/src/plugins/auth/auth_munge.c @@ -462,6 +462,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c) * Block all signals to allow munge_decode() to proceed * uninterrupted. 
(Testing for gnats slurm/223) */ + sigemptyset(&oset); sigfillset(&set); sigdelset(&set, SIGABRT); sigdelset(&set, SIGSEGV); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index c2d14173be..8177d76180 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1268,7 +1268,7 @@ _update_job (int argc, char *argv[]) } else if (strncasecmp(argv[i], "Priority=", 9) == 0) job_msg.priority = - (uint32_t) strtol(&argv[i][9], + (uint32_t) strtoll(&argv[i][9], (char **) NULL, 10); else if (strncasecmp(argv[i], "ReqProcs=", 9) == 0) job_msg.num_procs = diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7781a48682..fb0ea3dc01 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -122,7 +122,6 @@ static void _reset_step_bitmaps(struct job_record *job_ptr); static void _set_job_id(struct job_record *job_ptr); static void _set_job_prio(struct job_record *job_ptr); static bool _slurm_picks_nodes(job_desc_msg_t * job_specs); -static bool _too_many_fragments(bitstr_t *req_bitmap); static bool _top_priority(struct job_record *job_ptr); static int _validate_job_create_req(job_desc_msg_t * job_desc); static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, @@ -1415,10 +1414,6 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, error_code = ESLURM_REQUESTED_NODES_NOT_IN_PARTITION; goto cleanup; } - if (_too_many_fragments(req_bitmap)) { - error_code = ESLURM_TOO_MANY_REQUESTED_NODES; - goto cleanup; - } i = count_cpus(req_bitmap); if (i > job_desc->num_procs) job_desc->num_procs = i; @@ -3196,20 +3191,3 @@ void job_fini (void) xfree(job_hash_over); } -static bool _too_many_fragments(bitstr_t *req_bitmap) -{ -#ifdef MAX_NODE_FRAGMENTS - int i, frags=0; - int last_bit = 0, next_bit; - - for (i = 0; i < node_record_count; i++) { - next_bit = bit_test(req_bitmap, i); - if (next_bit == last_bit) - continue; - last_bit = next_bit; - if (next_bit && (++frags > MAX_NODE_FRAGMENTS)) - return true; - } -#endif - return false; -} diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 36f8ff6ea2..b46d705454 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -332,16 +332,21 @@ int dump_all_node_state ( void ) error ("Can't save state, error creating file %s %m", new_file); error_code = errno; - } - else { - if (write (log_fd, get_buf_data(buffer), - get_buf_offset(buffer)) != - get_buf_offset(buffer)) { - error ("Can't save state, error writing file %s %m", - new_file); - error_code = errno; + } else { + int pos = 0, nwrite = get_buf_offset(buffer), amount; + char *data = (char *)get_buf_data(buffer); + + while (nwrite > 0) { + amount = write(log_fd, &data[pos], nwrite); + if ((amount < 0) && (errno != EINTR)) { + error("Error writing file %s, %m", new_file); + error_code = errno; + break; + } + nwrite -= amount; + pos += amount; } - close (log_fd); + close(log_fd); } if (error_code) (void) unlink (new_file); @@ -1153,7 +1158,7 @@ validate_node_specs (char *node_name, uint32_t cpus, #endif if (node_ptr->node_state == NODE_STATE_UNKNOWN) { reset_job_priority(); - info ("validate_node_specs: node %s has registered", + debug("validate_node_specs: node %s has registered", node_name); if (job_count) node_ptr->node_state = NODE_STATE_ALLOCATED; @@ -1491,9 +1496,13 @@ void make_node_comp(struct node_record *node_ptr) node_ptr->name, node_state_string((enum node_states) node_ptr->node_state)); - } else { + } else if (!no_resp_flag) { node_ptr->node_state = NODE_STATE_COMPLETING | no_resp_flag; 
xfree(node_ptr->reason); + } else if ( (base_state == NODE_STATE_ALLOCATED) && + (node_ptr->run_job_cnt == 0) ) { + bit_set(idle_node_bitmap, inx); + node_ptr->node_state = NODE_STATE_IDLE | no_resp_flag; } } diff --git a/src/slurmd/interconnect.h b/src/slurmd/interconnect.h index d700b78834..c5461df757 100644 --- a/src/slurmd/interconnect.h +++ b/src/slurmd/interconnect.h @@ -29,7 +29,6 @@ #ifndef _INTERCONNECT_H_ #define _INTERCONNECT_H_ -#include "src/common/slurm_protocol_api.h" #include "src/slurmd/job.h" /* diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index aa9f918d12..d62153cbcf 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -555,13 +555,19 @@ _create_job_session(slurmd_job_t *job) return ESLURMD_FORK_FAILED; } + /* + * If the created job terminates immediately, the shared memory + * record can be purged before we can set the mpid and sid below. + * This does not truly indicate an error condition, but a rare + * timing anomaly. Thus we log the event using debug(). + */ job->jmgr_pid = getpid(); if (shm_update_step_mpid(job->jobid, job->stepid, getpid()) < 0) - error("shm_update_step_mpid: %m"); + debug("shm_update_step_mpid: %m"); job->smgr_pid = spid; if (shm_update_step_sid(job->jobid, job->stepid, spid) < 0) - error("shm_update_step_sid: %m"); + debug("shm_update_step_sid: %m"); /* * Read information from session manager slurmd diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 6490dcb70c..7b09998321 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -78,6 +78,7 @@ static int _run_prolog(uint32_t jobid, uid_t uid); static int _run_epilog(uint32_t jobid, uid_t uid); static int _wait_for_procs(uint32_t job_id); +static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER; void slurmd_req(slurm_msg_t *msg, slurm_addr *cli) @@ -86,12 +87,16 @@ slurmd_req(slurm_msg_t *msg, slurm_addr *cli) switch(msg->msg_type) { case REQUEST_BATCH_JOB_LAUNCH: + slurm_mutex_lock(&launch_mutex); _rpc_batch_job(msg, cli); slurm_free_job_launch_msg(msg->data); + slurm_mutex_unlock(&launch_mutex); break; case REQUEST_LAUNCH_TASKS: + slurm_mutex_lock(&launch_mutex); _rpc_launch_tasks(msg, cli); slurm_free_launch_tasks_request_msg(msg->data); + slurm_mutex_unlock(&launch_mutex); break; case REQUEST_KILL_TASKS: _rpc_kill_tasks(msg, cli); diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 5ab547f3cb..af4ac0f9fa 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -117,7 +117,8 @@ typedef struct shmem_struct { * static variables: * */ static sem_t *shm_lock; -static char *lockname; +static char *lockname; +static char *lockdir; static slurmd_shm_t *slurmd_shm; static int shmid; static pid_t attach_pid = (pid_t) 0; @@ -193,22 +194,25 @@ shm_fini(void) /* detach segment from local memory */ if (shmdt(slurmd_shm) < 0) { error("shmdt: %m"); - return -1; + goto error; } slurmd_shm = NULL; if (destroy && (shmctl(shmid, IPC_RMID, NULL) < 0)) { error("shmctl: %m"); - return -1; + goto error; } _shm_unlock(); if (destroy && (_shm_unlink_lock() < 0)) { error("_shm_unlink_lock: %m"); - return -1; + goto error; } return 0; + + error: + return -1; } void @@ -218,8 +222,13 @@ shm_cleanup(void) key_t key; int id = -1; - info("request to destroy shm lock [%s]", SHM_LOCKNAME); + if (!lockdir) + lockdir = xstrdup(conf->spooldir); + if ((s = _create_ipc_name(SHM_LOCKNAME))) { + + info("request to destroy shm lock [%s]", s); + key = ftok(s, 1); if (sem_unlink(s) < 0) error("sem_unlink: %m"); @@ -304,25 +313,24 @@ _is_valid_ipc_name(const char *name) return(1); } +/* + * Create IPC name by appending 
`name' to slurmd spooldir + * setting. + */ static char * _create_ipc_name(const char *name) { char *dst = NULL, *dir = NULL, *slash = NULL; int rc; + xassert (lockdir != NULL); + if ((rc = _is_valid_ipc_name(name)) != 1) fatal("invalid ipc name: `%s' %d", name, rc); else if (!(dst = xmalloc(PATH_MAX))) fatal("memory allocation failure"); -#if defined(POSIX_IPC_PREFIX) && defined(HAVE_POSIX_SEMS) - dir = POSIX_IPC_PREFIX; -#else - if ( !(dir = conf->spooldir) - && !(strlen(dir)) - && !(dir = getenv("TMPDIR"))) - dir = "/tmp"; -#endif /* POSIX_IPC_PREFIX */ + dir = lockdir; slash = (dir[strlen(dir) - 1] == '/') ? "" : "/"; @@ -1086,6 +1094,15 @@ _shm_lock_and_initialize() /* * Create locked semaphore (initial value == 0) */ + + /* + * Init lockdir to slurmd spooldir. + * Make sure it does not change for this instance of slurmd, + * even if spooldir does. + */ + if (!lockdir) + lockdir = xstrdup(conf->spooldir); + shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0); debug3("slurmd lockfile is \"%s\"", lockname); diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index cc490b40e4..f72fe83282 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -140,6 +141,10 @@ main (int argc, char *argv[]) log_init(argv[0], conf->log_opts, LOG_DAEMON, conf->logfile); + xsignal(SIGTERM, &_term_handler); + xsignal(SIGINT, &_term_handler); + xsignal(SIGHUP, &_hup_handler ); + /* * Run slurmd_init() here in order to report early errors * (with shared memory and public keyfile) @@ -163,6 +168,12 @@ main (int argc, char *argv[]) _kill_old_slurmd(); + /* + * Restore any saved revoked credential information + */ + if (_restore_cred_state(conf->vctx)) + return SLURM_FAILURE; + if (interconnect_node_init() < 0) fatal("Unable to initialize interconnect."); @@ -176,10 +187,6 @@ main (int argc, char *argv[]) if (send_registration_msg(SLURM_SUCCESS) < 0) error("Unable to register with slurm controller"); - xsignal(SIGTERM, &_term_handler); - xsignal(SIGINT, &_term_handler); - xsignal(SIGHUP, &_hup_handler ); - _install_fork_handlers(); list_install_fork_handlers(); @@ -203,34 +210,32 @@ main (int argc, char *argv[]) return 0; } + static void _msg_engine() { slurm_fd sock; - slurm_addr cli; - while (1) { - if (_shutdown) - break; - again: - if ((sock = slurm_accept_msg_conn(conf->lfd, &cli)) < 0) { - if (errno == EINTR) { - if (_shutdown) { - verbose("got shutdown request"); - break; - } - if (_reconfig) { - _reconfigure(); - verbose("got reconfigure request"); - } - goto again; - } - error("accept: %m"); + while (!_shutdown) { + slurm_addr *cli = xmalloc (sizeof (*cli)); + if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) { + _handle_connection(sock, cli); continue; } - if (sock > 0) - _handle_connection(sock, &cli); + /* + * Otherwise, accept() failed. 
+ */ + xfree (cli); + if (errno == EINTR) { + if (_reconfig) { + verbose("got reconfigure request"); + _reconfigure(); + } + continue; + } + error("accept: %m"); } + verbose("got shutdown request"); slurm_shutdown_msg_engine(conf->lfd); return; } @@ -336,6 +341,7 @@ _service_connection(void *arg) error ("close(%d): %m", con->fd); done: + xfree(con->cli_addr); xfree(con); slurm_free_msg(msg); _decrement_thd_count(); @@ -345,21 +351,24 @@ _service_connection(void *arg) int send_registration_msg(uint32_t status) { + int retval = SLURM_SUCCESS; slurm_msg_t req; slurm_msg_t resp; - slurm_node_registration_status_msg_t msg; + slurm_node_registration_status_msg_t *msg = xmalloc (sizeof (*msg)); - _fill_registration_msg(&msg); - msg.status = status; + _fill_registration_msg(msg); + msg->status = status; req.msg_type = MESSAGE_NODE_REGISTRATION_STATUS; - req.data = &msg; + req.data = msg; if (slurm_send_recv_controller_msg(&req, &resp) < 0) { error("Unable to register: %m"); - return SLURM_FAILURE; + retval = SLURM_FAILURE; } + slurm_free_node_registration_status_msg (msg); + /* XXX look at response msg */ @@ -374,7 +383,7 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg) job_step_t *s; int n; - msg->node_name = conf->hostname; + msg->node_name = xstrdup (conf->hostname); get_procs(&msg->cpus); get_memory(&msg->real_memory_size); @@ -642,10 +651,12 @@ _slurmd_init() return SLURM_FAILURE; /* - * Restore any saved revoked credential information + * Create slurmd spool directory if necessary. */ - if (_restore_cred_state(conf->vctx)) + if (_set_slurmd_spooldir() < 0) { + error("Unable to initialize slurmd spooldir"); return SLURM_FAILURE; + } /* * Cleanup shared memory if so configured @@ -662,18 +673,14 @@ _slurmd_init() /* * Initialize slurmd shared memory + * This *must* be called after _set_slurmd_spooldir() + * since the default location of the slurmd lockfile is + * _in_ the spooldir. + * */ if (shm_init(true) < 0) return SLURM_FAILURE; - /* - * Create slurmd spool directory if necessary. - */ - if (_set_slurmd_spooldir() < 0) { - error("Unable to initialize slurmd spooldir"); - return SLURM_FAILURE; - } - if (conf->daemonize && (chdir("/tmp") < 0)) { error("Unable to chdir to /tmp"); return SLURM_FAILURE; @@ -691,8 +698,8 @@ _restore_cred_state(slurm_cred_ctx_t ctx) int cred_fd, data_allocated, data_read = 0; Buf buffer = NULL; - if ((mkdir(conf->spooldir, 0755) < 0) && - (errno != EEXIST)) { + if ( (mkdir(conf->spooldir, 0755) < 0) + && (errno != EEXIST) ) { error("mkdir(%s): %m", conf->spooldir); return SLURM_ERROR; } @@ -727,7 +734,7 @@ static int _slurmd_fini() { save_cred_state(conf->vctx); - shm_fini(); + shm_fini(); return SLURM_SUCCESS; } diff --git a/src/slurmd/ulimits.c b/src/slurmd/ulimits.c index 92f4967d26..ca3155f99c 100644 --- a/src/slurmd/ulimits.c +++ b/src/slurmd/ulimits.c @@ -110,7 +110,8 @@ _set_limit(char **env, struct userlim *u) r.rlim_cur = (val == -1L) ? RLIM_INFINITY : (rlim_t) val; if (setrlimit(u->resource, &r) < 0) - error("setrlimit(%s,%ld): %m", name, (long)r.rlim_cur); + error("Can't propagate %s of %ld from submit host: %m", + name, (long)r.rlim_cur); } unsetenvp(env, u->var); diff --git a/src/srun/io.c b/src/srun/io.c index 08676aff8f..9926c156b5 100644 --- a/src/srun/io.c +++ b/src/srun/io.c @@ -593,9 +593,11 @@ _read_io_header(int fd, job_t *job, char *host) (hdr.type == SLURM_IO_STDERR ? 
"stderr" : "stdout"), host, hdr.taskid, fd ); + cbuf_destroy(cb); return SLURM_SUCCESS; fail: + cbuf_destroy(cb); close(fd); return SLURM_ERROR; } diff --git a/src/srun/job.c b/src/srun/job.c index cf27a2107c..d3b901a5d3 100644 --- a/src/srun/job.c +++ b/src/srun/job.c @@ -219,12 +219,15 @@ int job_rc(job_t *job) { int i; - int rc; + int rc = 0; - if (job->rc) return(job->rc); + if (job->rc >= 0) return(job->rc); - for (i = 0; i < opt.nprocs; i++) - job->rc |= job->tstatus[i]; + + for (i = 0; i < opt.nprocs; i++) { + if (job->rc < job->tstatus[i]) + job->rc = job->tstatus[i]; + } if ((rc = WEXITSTATUS(job->rc))) job->rc = rc; @@ -248,8 +251,12 @@ void job_fatal(job_t *job, const char *msg) void job_destroy(job_t *job, int error) { + if (job->removed) + return; + if (job->old_job) { debug("cancelling job step %u.%u", job->jobid, job->stepid); + slurm_kill_job_step(job->jobid, job->stepid, SIGKILL); slurm_complete_job_step(job->jobid, job->stepid, 0, error); } else if (!opt.no_alloc) { debug("cancelling job %u", job->jobid); @@ -263,6 +270,8 @@ job_destroy(job_t *job, int error) #ifdef HAVE_TOTALVIEW if (error) tv_launch_failure(); #endif + + job->removed = true; } @@ -389,7 +398,7 @@ _job_create_internal(allocation_info_t *info) job->state = SRUN_JOB_INIT; job->signaled = false; - job->rc = 0; + job->rc = -1; job->nodelist = xstrdup(info->nodelist); hl = hostlist_create(job->nodelist); @@ -398,6 +407,7 @@ _job_create_internal(allocation_info_t *info) job->jobid = info->jobid; job->stepid = info->stepid; job->old_job = false; + job->removed = false; /* * Initialize Launch and Exit timeout values diff --git a/src/srun/job.h b/src/srun/job.h index 6c254d331e..177f6c2b6b 100644 --- a/src/srun/job.h +++ b/src/srun/job.h @@ -75,6 +75,7 @@ typedef struct srun_job { uint32_t jobid; /* assigned job id */ uint32_t stepid; /* assigned step id */ bool old_job; /* run job step under previous allocation */ + bool removed; /* job has been removed from SLURM */ job_state_t state; /* job state */ pthread_mutex_t state_mutex; @@ -92,7 +93,7 @@ typedef struct srun_job { uint32_t **tids; /* host id => task ids mapping */ uint32_t *hostid; /* task id => host id mapping */ - slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */ + slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */ pthread_t sigid; /* signals thread tid */ diff --git a/src/srun/opt.c b/src/srun/opt.c index ca6b9fc401..23ff662d94 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -418,7 +418,7 @@ static void _opt_default() opt.exc_nodes = NULL; opt.max_launch_time = 60; /* 60 seconds to launch job */ opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */ - opt.msg_timeout = 2; /* Default launch msg timeout */ + opt.msg_timeout = 5; /* Default launch msg timeout */ mode = MODE_NORMAL;