diff --git a/META b/META index d40c15fc4b..67c9073476 100644 --- a/META +++ b/META @@ -9,8 +9,8 @@ Name: slurm Major: 0 Minor: 3 - Micro: 7 - Version: 0.3.7 + Micro: 14 + Version: 0.3.14 Release: 1 API_CURRENT: 5 API_AGE: 4 diff --git a/NEWS b/NEWS index ff4efc507a..541eb9293c 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,53 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 0.3.14 +========================= + -- Fix bug in bitfmt2int() which can go off allocated memory. + +* Changes in SLURM 0.3.13 +========================= + -- "sinfo -i" no longer aborts when nodes have associated features + +* Changes in SLURM 0.3.12 +========================= + -- Always report job scheduling details (so prolog/epilog can find + constraints). + +* Changes in SLURM 0.3.11 +========================= + -- Fixes for reported problems: + - slurm/538: user tasks block writing to stdio + -- Added srun option --disable-status,-X to disable srun status feature + and instead forward SIGINT immediately to job upon receipt of Ctrl-C. + -- Fix for bogus slurmd error message "Unable to put task N into pgrp..." + -- Fix case where slurmd may erroneously detect shared memory entry + as "stale" and delete entry for unkillable or slow-to-exit job. + -- (qsnet) Fix for running slurmd on node without an elan3 adapter. + +* Changes in SLURM 0.3.10 +========================= + -- Move startup script from "/etc/rc.d/init.d/slurm" to "/etc/init.d/slurm" + -- Avoid creating orphaned process group when starting job processes. + -- Remove calls in auth/munge plugin deprecated by munge-0.4. 
+ +* Changes in SLURM 0.3.9 +======================== + -- Fixes for reported problems: + - slurm/512: Let job steps run on DRAINING nodes + - slurm/513: Gracefully deal with UIDs missing from passwd file + -- Create new allocation as needed for debugger in case old allocation + has been purged + -- Fix bug in scheduling jobs when a processor count is specified + and FastSchedule=0 and the cluster is heterogeneous. + -- Fix srun bug when --input, --output and --error are all "none" + -- Allow single task id to be selected with --input, --output, and --error. + -- Create shared memory segment for Elan statistics when using the + switch/elan plugin. + +* Changes in SLURM 0.3.8 +======================== + -- More fixes necessary for TotalView. * Changes in SLURM 0.3.7 ======================== diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index ab63f8f4ba..becb6ef018 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -171,6 +171,7 @@ Possible values are"YES" and "NO". Set the job's required features on nodes specified value. Multiple values may be comma separated if all features are required (AND operation) or separated by "|" if any of the specified features are required (OR operation). +Value may be cleared with blank data value, "Features=". .TP \fIJobId\fP= Identify the job to be updated. This specification is required. @@ -196,6 +197,7 @@ Set the job's priority to the specified value. \fIReqNodeList\fP= Set the job's list of required node. Multiple node names may be specified using simple node range expressions (e.g. "lx[10-20]"). +Value may be cleared with blank data value, "ReqNodeList=". .TP \fIReqNodes\fP= Set the job's count of required nodes to the specified value. @@ -232,8 +234,9 @@ changing its underlying state. \fBSPECIFICATIONS FOR UPDATE AND DELETE COMMANDS, PARTITIONS\fR .TP \fIAllowGroups\fP= -Identify the user groups which may use this partition. Multiple groups -may be specified in a comma separated list. 
+Identify the user groups which may use this partition. +Multiple groups may be specified in a comma separated list. +To permit all groups to use the partition, specify "AllowGroups=ALL". .TP \fIDefault\fP= Specify if this partition is to be used by jobs which do not explicitly @@ -249,6 +252,7 @@ Possible values are"YES" and "NO". Identify the node(s) to be associated with this partition. Multiple node names may be specified using simple node range expressions (e.g. "lx[10-20]"). Note that jobs may only be associated with one partition at any time. +Specify a blank data value to remove all nodes from a partition: "Nodes=". .TP \fIPartitionName\fP= Identify the partition to be updated. This specification is required. diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index ece94140f6..26a080132e 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -223,15 +223,24 @@ the job. By default only errors are displayed. .TP \fB\-W\fR, \fB\-\-wait\fR=\fIseconds\fR Specify how long to wait after the first task terminates before terminating -all remaining tasks. The default value is unlimited. This can be useful to -insure that a job is terminated in a timely fashion in the event that one -or more tasks terminate prematurely. +all remaining tasks. A value of 0 indicates an unlimited wait (a warning will +be issued after 60 seconds). The default value is set by the WaitTime +parameter in the slurm configuration file (see \fBslurm.conf(5)\fR). This +option can be useful to ensure that a job is terminated in a timely fashion +in the event that one or more tasks terminate prematurely. .TP \fB\-q\fR, \fB\-\-quit-on-interrupt\fR Quit immediately on single SIGINT (Ctrl-C). Use of this option disables the status feature normally available when \fBsrun\fR receives a single Ctrl-C and causes \fBsrun\fR to instead immediately terminate the -running job. 
+.TP +\fB\-X\fR, \fB\-\-disable-status\fR +Disable the display of task status when srun receives a single SIGINT +(Ctrl-C). Instead immediately forward the SIGINT to the running job. +A second Ctrl-C in one second will forcibly terminate the job and +\fBsrun\fR will immediately exit. May also be set via the environment +variable SLURM_DISABLE_STATUS. .TP \fB\-Q\fR, \fB\-\-quiet\fR Quiet operation. Suppress informational messages. Errors will still @@ -604,6 +613,9 @@ SLURM_TIMELIMIT .TP SLURM_WAIT \fB\-W, \-\-wait\fR=\fIseconds\fR +.TP +SLURM_DISABLE_STATUS +\fB\-X, \-\-disable-status\fR .PP Additionally, .B srun diff --git a/etc/init.d.slurm b/etc/init.d.slurm index 4a5af65f29..467de96c7c 100644 --- a/etc/init.d.slurm +++ b/etc/init.d.slurm @@ -16,6 +16,11 @@ # $Id$ +BINDIR=/usr/bin +CONFDIR=/etc/slurm +LIBDIR=/usr/lib +SBINDIR=/usr/sbin + # Source function library. [ -f /etc/rc.d/init.d/functions ] || exit 0 . /etc/rc.d/init.d/functions @@ -28,14 +33,17 @@ else SLURMD_OPTIONS="" fi -[ -f /etc/slurm/slurm.conf ] || exit 1 +[ -f $CONFDIR/slurm.conf ] || exit 1 + +# setup library paths for slurm and munge support +export LD_LIBRARY_PATH="$LIBDIR:$LD_LIBRARY_PATH" RETVAL=0 start() { echo -n "starting $1: " unset HOME MAIL USER USERNAME - daemon /usr/sbin/$1 $2 + daemon $SBINDIR/$1 $2 RETVAL=$? echo touch /var/lock/subsys/slurm @@ -52,7 +60,7 @@ stop() { } startall() { - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"` start $prog ${!optvar} done @@ -68,7 +76,7 @@ slurmstatus() { local rpid local pidfile - pidfile=`grep -i ${base}pid /etc/slurm/slurm.conf | grep -v '^ *#'` + pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'` if [ $? 
= 0 ]; then pidfile=${pidfile##*=} pidfile=${pidfile%#*} @@ -123,33 +131,34 @@ case "$1" in startall ;; stop) - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do stop $prog done ;; status) - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do slurmstatus $prog done ;; restart) - $0 stop && $0 start + $0 stop + $0 start ;; condrestart) if [ -f /var/lock/subsys/slurm ]; then - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do stop $prog start $prog done fi ;; reconfig) - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do killproc $prog -HUP done ;; test) - for prog in `scontrol show daemons`; do + for prog in `$BINDIR/scontrol show daemons`; do echo "$prog runs here" done ;; diff --git a/slurm.spec.in b/slurm.spec.in index a5f9f9e47c..d0631a2e06 100644 --- a/slurm.spec.in +++ b/slurm.spec.in @@ -24,8 +24,12 @@ Requires: openssl >= 0.9.6 # %if %{?_with_debug:1}%{!?_with_debug:0} %define _enable_debug --enable-debug - %define __os_install_post /usr/lib/rpm/brp-compress %endif +# +# Never allow rpm to strip binaries as this will break +# parallel debugging capability +# +%define __os_install_post /usr/lib/rpm/brp-compress %package devel @@ -108,16 +112,16 @@ rm -rf "$RPM_BUILD_ROOT" mkdir -p "$RPM_BUILD_ROOT" DESTDIR="$RPM_BUILD_ROOT" make install -install -D -m755 etc/init.d.slurm $RPM_BUILD_ROOT/etc/rc.d/init.d/slurm +install -D -m755 etc/init.d.slurm $RPM_BUILD_ROOT/etc/init.d/slurm install -D -m644 etc/slurm.conf.example $RPM_BUILD_ROOT/etc/slurm/slurm.conf # Delete unpackaged files: -rm -f $RPM_BUILD_ROOT/usr/lib/slurm/*.{a,la} +rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/*.{a,la} # Build file lists for optional plugin packages for plugin in auth_munge auth_authd switch_elan; do LIST=./${plugin}.files touch $LIST - test -f $RPM_BUILD_ROOT/usr/lib/slurm/${plugin}.so && + test -f 
$RPM_BUILD_ROOT/%{_libdir}/slurm/${plugin}.so && echo %{_libdir}/slurm/${plugin}.so > $LIST done @@ -154,7 +158,7 @@ rm -rf $RPM_BUILD_ROOT %{_libdir}/slurm/sched_builtin.so %{_libdir}/slurm/switch_none.so %dir %{_libdir}/slurm/src -%config(noreplace) /etc/rc.d/init.d/slurm +%config(noreplace) /etc/init.d/slurm %config(noreplace) /etc/slurm/slurm.conf ############################################################################# @@ -189,29 +193,24 @@ rm -rf $RPM_BUILD_ROOT ############################################################################# %pre -#if [ -x /etc/rc.d/init.d/slurm ]; then -# if /etc/rc.d/init.d/slurm status | grep -q running; then -# /etc/rc.d/init.d/slurm stop +#if [ -x /etc/init.d/slurm ]; then +# if /etc/init.d/slurm status | grep -q running; then +# /etc/init.d/slurm stop # fi #fi %post /sbin/ldconfig %{_libdir} -if [ -x /etc/rc.d/init.d/slurm ]; then - [ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm +if [ $1 = 1 ]; then [ -x /sbin/chkconfig ] && /sbin/chkconfig --add slurm -# if ! /etc/rc.d/init.d/slurm status | grep -q running \ -# || ! /etc/rc.d/init.d/slurm test | grep -q slurmctld; then -# /etc/rc.d/init.d/slurm start -# fi fi %preun -if [ "$1" = 0 ]; then - if [ -x /etc/rc.d/init.d/slurm ]; then +if [ $1 = 0 ]; then + if [ -x /etc/init.d/slurm ]; then [ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm - if /etc/rc.d/init.d/slurm status | grep -q running; then - /etc/rc.d/init.d/slurm stop + if /etc/init.d/slurm status | grep -q running; then + /etc/init.d/slurm stop fi fi fi @@ -224,6 +223,11 @@ fi %changelog +* Fri Oct 01 2004 Mark Grondona +- don't delete and add service in %post +* Wed Aug 04 2004 Mark Grondona +- don't allow rpm to strip binaries since this breaks interface to parallel + debuggers. 
* Thu Jul 29 2004 Morris Jette - added checkpoint_none.so and jobcomp_script.so plugins * Fri Mar 07 2004 Mark Grondona diff --git a/src/common/bitstring.c b/src/common/bitstring.c index c7fc9598d7..5080f9776b 100644 --- a/src/common/bitstring.c +++ b/src/common/bitstring.c @@ -624,7 +624,8 @@ bitfmt2int (char *bit_str_ptr) if (bit_str_ptr == NULL) return NULL; size = strlen (bit_str_ptr) + 1; - bit_int_ptr = xmalloc ( sizeof (int *) * size); + bit_int_ptr = xmalloc ( sizeof (int *) * + (size * 2 + 1)); /* more than enough space */ if (bit_int_ptr == NULL) return NULL; @@ -653,6 +654,7 @@ bitfmt2int (char *bit_str_ptr) sum = 0; } } + assert(bit_inx < (size*2+1)); bit_int_ptr[bit_inx] = -1; return bit_int_ptr; } diff --git a/src/common/cbuf.c b/src/common/cbuf.c index bfcb23cf62..9ea53ebe06 100644 --- a/src/common/cbuf.c +++ b/src/common/cbuf.c @@ -1,9 +1,9 @@ /***************************************************************************** * $Id$ ***************************************************************************** - * $LSDId: cbuf.c,v 1.32 2003/01/03 21:08:19 dun Exp $ + * $LSDId: cbuf.c,v 1.35 2005/01/13 00:41:17 dun Exp $ ***************************************************************************** - * Copyright (C) 2002-2003 The Regents of the University of California. + * Copyright (C) 2002-2005 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Chris Dunlap . 
* @@ -330,7 +330,7 @@ cbuf_size (cbuf_t cb) assert(cb != NULL); cbuf_mutex_lock(cb); assert(cbuf_is_valid(cb)); - size = cb->size; + size = cb->maxsize; cbuf_mutex_unlock(cb); return(size); } @@ -344,7 +344,7 @@ cbuf_free (cbuf_t cb) assert(cb != NULL); cbuf_mutex_lock(cb); assert(cbuf_is_valid(cb)); - nfree = cb->size - cb->used; + nfree = cb->maxsize - cb->used; cbuf_mutex_unlock(cb); return(nfree); } @@ -381,6 +381,12 @@ cbuf_lines_used (cbuf_t cb) int cbuf_reused (cbuf_t cb) { +/* If (O > R) + * n = O - R + * else + * n = (O - 0) + ((S+1) - R). + * (S+1) is used since data[] contains 'size' bytes + a 1-byte sentinel. + */ int reused; assert(cb != NULL); @@ -988,12 +994,11 @@ cbuf_write_from_fd (cbuf_t dst, int srcfd, int len, int *ndropped) if (len == -1) { /* * Try to use all of the free buffer space available for writing. - * If it is all in use, try to grab another chunk and limit the - * amount of data being overwritten. + * If it is all in use, try to grab another chunk. */ len = dst->size - dst->used; if (len == 0) { - len = MIN(dst->size, CBUF_CHUNK); + len = CBUF_CHUNK; } } if (len > 0) { @@ -1275,7 +1280,7 @@ cbuf_get_fd (void *dstbuf, int *psrcfd, int len) do { n = read(*psrcfd, dstbuf, len); - } while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); + } while ((n < 0) && (errno == EINTR)); return(n); } diff --git a/src/common/cbuf.h b/src/common/cbuf.h index f2368ef2fa..6ca1f68267 100644 --- a/src/common/cbuf.h +++ b/src/common/cbuf.h @@ -1,7 +1,9 @@ /***************************************************************************** * $Id$ ***************************************************************************** - * Copyright (C) 2002-2003 The Regents of the University of California. + * $LSDId: cbuf.h,v 1.22 2005/01/19 22:35:57 dun Exp $ + ***************************************************************************** + * Copyright (C) 2002-2005 The Regents of the University of California. 
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Chris Dunlap . * @@ -104,14 +106,14 @@ void cbuf_flush (cbuf_t cb); int cbuf_size (cbuf_t cb); /* - * Returns the current size of the buffer allocated to [cb] - * (ie, the number of bytes in can currently hold). + * Returns the maximum size of the buffer allocated to [cb] + * (ie, the number of bytes it can currently hold). */ int cbuf_free (cbuf_t cb); /* - * Returns the number of bytes in [cb] available for writing before - * unread data is overwritten (unless the cbuf is able to resize itself). + * Returns the number of bytes in [cb] available for writing before unread + * data is overwritten (assuming the cbuf can resize itself if needed). */ int cbuf_used (cbuf_t cb); diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index c6353b6ca2..9b9f4bc0db 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -902,6 +902,9 @@ int slurm_send_rc_msg(slurm_msg_t *msg, int rc) { slurm_msg_t resp_msg; return_code_msg_t rc_msg; + + if (msg->conn_fd < 0) + return (ENOTCONN); rc_msg.return_code = rc; diff --git a/src/plugins/auth/munge/auth_munge.c b/src/plugins/auth/munge/auth_munge.c index 46ac99dc3f..7be9bd237f 100644 --- a/src/plugins/auth/munge/auth_munge.c +++ b/src/plugins/auth/munge/auth_munge.c @@ -452,7 +452,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c) error ("Munge decode failed: %s %s", munge_ctx_strerror(ctx), retry ? 
"(retrying ...)": ""); - if ((e = EMUNGE_SOCKET) && retry--) + if ((e == EMUNGE_SOCKET) && retry--) goto again; /* @@ -545,22 +545,9 @@ _print_cred_info(munge_info_t *mi) if (mi->encoded > 0) info ("ENCODED: %s", ctime_r(&mi->encoded, buf)); + if (mi->decoded > 0) info ("DECODED: %s", ctime_r(&mi->decoded, buf)); - - if ( (mi->cipher > MUNGE_CIPHER_NONE) - && (mi->cipher < MUNGE_CIPHER_LAST_ENTRY) ) - info ("CIPHER: %s", munge_cipher_strings[mi->cipher]); - - if ( (mi->mac > MUNGE_MAC_NONE) - && (mi->mac < MUNGE_MAC_LAST_ENTRY) ) { - info ("MAC: %s", munge_mac_strings[mi->mac]); - /* - * Only print ZIP if MAC is valid. - * (because ZIP == NONE could be valid) - */ - info ("ZIP: %s", munge_zip_strings[mi->zip]); - } } diff --git a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c index 0783e815f5..4c33a6bd5c 100644 --- a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c +++ b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c @@ -162,7 +162,7 @@ _get_user_name(uint32_t user_id, char *user_name, int buf_size) else snprintf(user_name, buf_size, "Unknown"); cache_uid = user_id; - snprintf(cache_name, sizeof(cache_name), user_info->pw_name); + snprintf(cache_name, sizeof(cache_name), user_name); } } @@ -218,7 +218,7 @@ int slurm_jobcomp_log_record ( struct job_record *job_ptr ) job_state_string(job_state), job_ptr->partition, lim_str, start_str, end_str, job_ptr->nodes); - tot_size = (strlen(job_rec) + 1); + tot_size = strlen(job_rec); while ( offset < tot_size ) { wrote = write(job_comp_fd, job_rec + offset, diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c index 6865c526ea..a47ad0c315 100644 --- a/src/plugins/switch/elan/qsw.c +++ b/src/plugins/switch/elan/qsw.c @@ -35,6 +35,8 @@ #include #include #include +#include +#include #include #include #include @@ -56,8 +58,17 @@ * want to include here since we are using the new * version-nonspecific libelanctrl. * (XXX: What is the equivalent in libelanctrl?) 
+ * + * slurm/482: the elan USER context range is now split + * into two segments, regular user context and RMS + * context ranges. Do not allow a context range + * (lowcontext -- highcontext) to span these two segments, + * as this will generate and elan initialization error + * when MPI tries to attach to the capability. For now, + * restrict SLURM's range to the RMS one (starting at 0x400) + * */ -# define ELAN_USER_BASE_CONTEXT_NUM 0x020 +# define ELAN_USER_BASE_CONTEXT_NUM 0x400 /* act. RMS_BASE_CONTEXT_NUM */ # define ELAN_USER_TOP_CONTEXT_NUM 0x7ff # define Version cap_version @@ -108,6 +119,7 @@ #define QSW_CTX_END ELAN_USER_TOP_CONTEXT_NUM - 1 #define QSW_CTX_INVAL (-1) + /* * We are going to some trouble to keep these defs private so slurm * hackers not interested in the interconnect details can just pass around @@ -152,6 +164,8 @@ static qsw_libstate_t qsw_internal_state = NULL; static pthread_mutex_t qsw_lock = PTHREAD_MUTEX_INITIALIZER; static elanhost_config_t elanconf = NULL; +static int shmid = -1; + /* * Allocate a qsw_libstate_t. @@ -664,6 +678,10 @@ qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, bitstr_t *nodeset, int qsw_prgdestroy(qsw_jobinfo_t jobinfo) { + + if (shmid >= 0) + shmctl (shmid, IPC_RMID, NULL); + if (rms_prgdestroy(jobinfo->j_prognum) < 0) { /* translate errno values to more descriptive ones */ switch (errno) { @@ -698,6 +716,50 @@ qsw_prog_fini(qsw_jobinfo_t jobinfo) #endif } +/* Key for Elan stats shared memory segment is the + * rms.o program description number, left shifted 9 less 1 + * to avoid conflicts with MPI shared memory + */ +static int elan_statkey (int prgid) +{ + return ((prgid << 9) - 1); +} + +/* + * Return the statkey to caller if shared memory was created + */ +int qsw_statkey (qsw_jobinfo_t jobinfo) +{ + return (shmid > 0 ? 
elan_statkey (jobinfo->j_prognum) : -1); +} + +/* + * Create shared memory segment for Elan stats use + * (ELAN_STATKEY env var is set in switch_elan.c) + */ +static int +_qsw_shmem_create (qsw_jobinfo_t jobinfo, uid_t uid) +{ + struct shmid_ds shm; + ELAN_CAPABILITY *cap = &jobinfo->j_cap; + key_t key = elan_statkey (jobinfo->j_prognum); + int maxLocal = cap->HighContext - cap->LowContext + 2; + int pgsize = getpagesize (); + + if ((shmid = shmget (key, pgsize * (maxLocal + 1), IPC_CREAT)) < 0) + return (error ("Failed to create Elan state shmem: %m")); + + /* Ensure permissions on segment allow user read/write access + */ + shm.shm_perm.uid = uid; + shm.shm_perm.mode = 0600; + + if (shmctl (shmid, IPC_SET, &shm) < 0) + return (error ("Failed to set perms on Elan state shm: %m")); + + return (0); +} + /* * Process 2: Create the context and make capability available to children. */ @@ -785,6 +847,14 @@ qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid) goto fail; } + + /* + * Create shared memory for libelan state + * Failure to create shared memory is not a fatal error. 
+ */ + _qsw_shmem_create (jobinfo, uid); + + /* note: _elan3_fini() destroys context and makes capability unavail */ /* do it in qsw_prog_fini() after app terminates */ return 0; diff --git a/src/plugins/switch/elan/qsw.h b/src/plugins/switch/elan/qsw.h index f2caba7703..cbfd2d14da 100644 --- a/src/plugins/switch/elan/qsw.h +++ b/src/plugins/switch/elan/qsw.h @@ -120,4 +120,7 @@ int qsw_gethost_bynodeid(char *host, int len, int elanid); char * qsw_capability_string(qsw_jobinfo_t j, char *buf, size_t len); void qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo); + /* Return Elan shared memory state key */ +int qsw_statkey (qsw_jobinfo_t jobinfo); + #endif /* _QSW_INCLUDED */ diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c index 989d1d0f7d..84fb7ca60e 100644 --- a/src/plugins/switch/elan/switch_elan.c +++ b/src/plugins/switch/elan/switch_elan.c @@ -368,7 +368,7 @@ static int _have_elan3 (void) #else struct stat st; - if (stat ("/proc/qsnet/elan3", &st) < 0) + if (stat ("/proc/qsnet/elan3/device0", &st) < 0) return (0); return (1); @@ -623,6 +623,7 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env, uint32_t nodeid, uint32_t procid, uint32_t nnodes, uint32_t nprocs, uint32_t rank ) { + int id = -1; debug3("nodeid=%lu nnodes=%lu procid=%lu nprocs=%lu rank=%lu", (unsigned long) nodeid, (unsigned long) nnodes, (unsigned long) procid, (unsigned long) nprocs, @@ -645,6 +646,14 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env, if (setenvpf(env, "RMS_NPROCS", "%lu", (unsigned long) nprocs) < 0) return SLURM_ERROR; + /* + * Tell libelan the key to use for Elan state shmem segment + */ + if ((id = qsw_statkey (jobinfo)) > 0) + setenvpf (env, "ELAN_STATKEY", "0x%x", id); + + + return SLURM_SUCCESS; } diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index f087aef7ac..24c1837cc6 100644 --- a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -491,7 +491,6 @@ static void _sinfo_list_delete(void *data) { 
sinfo_data_t *sinfo_ptr = data; - xfree(sinfo_ptr->features); hostlist_destroy(sinfo_ptr->nodes); xfree(sinfo_ptr); } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a6615e0aa6..f1873821ba 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1402,6 +1402,7 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) kill_tasks_msg->signal = signal; agent_args->msg_args = kill_tasks_msg; + agent_args->node_count = 1; /* slurm/477 be sure to update node_count */ agent_queue_request(agent_args); return; } @@ -2445,7 +2446,7 @@ void pack_job(struct job_record *dump_job_ptr, Buf buffer) pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); detail_ptr = dump_job_ptr->details; - if (detail_ptr && dump_job_ptr->job_state == JOB_PENDING) + if (detail_ptr) _pack_job_details(detail_ptr, buffer); else _pack_job_details(NULL, buffer); @@ -2924,11 +2925,13 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) if (job_specs->features && detail_ptr) { if (super_user) { xfree(detail_ptr->features); - detail_ptr->features = job_specs->features; - info("update_job: setting features to %s for " - "job_id %u", job_specs->features, - job_specs->job_id); - job_specs->features = NULL; + if (job_specs->features[0] != '\0') { + detail_ptr->features = job_specs->features; + job_specs->features = NULL; + info("update_job: setting features to %s for " + "job_id %u", job_specs->features, + job_specs->job_id); + } } else { error("Attempt to change features for job %u", job_specs->job_id); @@ -2953,7 +2956,6 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) info("update_job: setting partition to %s for " "job_id %u", job_specs->partition, job_specs->job_id); - job_specs->partition = NULL; } else { error("Attempt to change partition for job %u", job_specs->job_id); @@ -2962,7 +2964,10 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } if (job_specs->req_nodes && detail_ptr) { - if (super_user) { + if (job_specs->req_nodes[0] == '\0') { + 
xfree(detail_ptr->req_nodes); + FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); + } else if (super_user) { if (node_name2bitmap(job_specs->req_nodes, false, &req_bitmap)) { error("Invalid node list for job_update: %s", diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 91aeaf6532..7c2f9bc7f6 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -704,10 +704,12 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, (!bit_super_set(*req_bitmap, avail_bitmap))) continue; if ((avail_nodes < min_nodes) || - (avail_cpus < req_cpus) || ((max_nodes > min_nodes) && (avail_nodes < max_nodes))) continue; /* Keep accumulating nodes */ + if (slurmctld_conf.fast_schedule + && (avail_cpus < req_cpus)) + continue; /* Keep accumulating CPUs */ if (shared) pick_code = _pick_best_load(avail_bitmap, @@ -755,7 +757,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, /* determine if job could possibly run (if all configured * nodes available) */ if ((!runable_ever || !runable_avail) && - (total_nodes >= min_nodes) && (total_cpus >= req_cpus) && + (total_nodes >= min_nodes) && ((*req_bitmap == NULL) || (bit_super_set(*req_bitmap, total_bitmap)))) { if (!runable_avail) { diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index 2abd84493d..c13f82bdb2 100644 --- a/src/slurmctld/partition_mgr.c +++ b/src/slurmctld/partition_mgr.c @@ -776,18 +776,28 @@ int update_part(update_part_msg_t * part_desc) if (part_desc->allow_groups != NULL) { xfree(part_ptr->allow_groups); - part_ptr->allow_groups = xstrdup(part_desc->allow_groups); - info("update_part: setting allow_groups to %s for partition %s", - part_desc->allow_groups, part_desc->name); xfree(part_ptr->allow_uids); - part_ptr->allow_uids = - _get_groups_members(part_desc->allow_groups); + if ((strcasecmp(part_desc->allow_groups, "ALL") == 0) || + (part_desc->allow_groups[0] == '\0')) { + info("update_part: setting 
allow_groups to ALL for partition %s", + part_desc->name); + } else { + part_ptr->allow_groups = part_desc->allow_groups; + part_desc->allow_groups = NULL; + info("update_part: setting allow_groups to %s for partition %s", + part_ptr->allow_groups, part_desc->name); + part_ptr->allow_uids = + _get_groups_members(part_ptr->allow_groups); + } } if (part_desc->nodes != NULL) { - char *backup_node_list; - backup_node_list = part_ptr->nodes; - part_ptr->nodes = xstrdup(part_desc->nodes); + char *backup_node_list = part_ptr->nodes; + + if (part_desc->nodes[0] == '\0') + part_ptr->nodes = NULL; /* avoid empty string */ + else + part_ptr->nodes = xstrdup(part_desc->nodes); error_code = _build_part_bitmap(part_ptr); if (error_code) { diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index c2bb7913f6..cd621d6ca0 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -377,11 +377,6 @@ _pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ) { step_spec->node_list, job_ptr->job_id); goto cleanup; } - if (bit_super_set (nodes_picked, avail_node_bitmap) == 0) { - info ("_pick_step_nodes: some requested node %s down", - step_spec->node_list); - goto cleanup; - } } else if (step_spec->relative) { /* Remove first (step_spec->relative) nodes from diff --git a/src/slurmd/fname.c b/src/slurmd/fname.c index 0a6d129445..d5975c98ea 100644 --- a/src/slurmd/fname.c +++ b/src/slurmd/fname.c @@ -56,6 +56,10 @@ fname_create(slurmd_job_t *job, const char *format, int taskid) char *name = NULL; char *orig = xstrdup(format); char *p, *q; + int id; + + if (((id = fname_single_task_io (format)) >= 0) && (taskid != id)) + return (xstrdup ("/dev/null")); /* If format doesn't specify an absolute pathname, * use cwd @@ -158,14 +162,36 @@ fname_free(void *name) { xfree(name); } + +/* + * Return >= 0 if fmt specifies "single task only" IO + * i.e. 
if it specifies a single integer only + */ +int fname_single_task_io (const char *fmt) +{ + unsigned long taskid; + char *p; + + taskid = strtoul (fmt, &p, 10); + + if (*p == '\0') + return ((int) taskid); + + return (-1); +} + int fname_trunc_all(slurmd_job_t *job, const char *fmt) { int i, rc = SLURM_SUCCESS; char *fname; ListIterator filei; - List files = list_create((ListDelF)fname_free); + List files = NULL; + + if (fname_single_task_io (fmt) >= 0) + return (0); + files = list_create((ListDelF)fname_free); for (i = 0; i < job->ntasks; i++) { fname = fname_create(job, fmt, job->task[i]->gid); if (!list_find_first(files, (ListFindF) find_fname, fname)) diff --git a/src/slurmd/fname.h b/src/slurmd/fname.h index eaf2b41f55..ecc33f1b6e 100644 --- a/src/slurmd/fname.h +++ b/src/slurmd/fname.h @@ -31,5 +31,6 @@ char *fname_create(slurmd_job_t *job, const char *fmt, int taskid); int fname_trunc_all(slurmd_job_t *job, const char *fmt); +int fname_single_task_io (const char *fmt); #endif /* !_SLURMD_FNAME_H */ diff --git a/src/slurmd/io.c b/src/slurmd/io.c index 8b19832404..6087591533 100644 --- a/src/slurmd/io.c +++ b/src/slurmd/io.c @@ -455,6 +455,23 @@ _obj_set_unwritable(io_obj_t *obj) obj->ops->writable = NULL; } +static char * +_local_filename (char *fname, int taskid) +{ + int id; + + if (fname == NULL) + return (NULL); + + if ((id = fname_single_task_io (fname)) < 0) + return (fname); + + if (id != taskid) + return ("/dev/null"); + + return (NULL); +} + static int _io_add_connecting(slurmd_job_t *job, task_info_t *t, srun_info_t *srun, slurmd_io_type_t type) @@ -462,7 +479,7 @@ _io_add_connecting(slurmd_job_t *job, task_info_t *t, srun_info_t *srun, io_obj_t *obj = NULL; int sock = -1; - debug3("in io_add_connecting"); + debug2 ("adding connecting %s for task %d", _io_str[type], t->gid); if ((sock = (int) slurm_open_stream(&srun->ioaddr)) < 0) { error("connect io: %m"); @@ -479,7 +496,7 @@ _io_add_connecting(slurmd_job_t *job, task_info_t *t, srun_info_t *srun, 
obj->ops = _ops_copy(&connecting_client_ops); _io_write_header(obj->arg, srun); - if ((type == CLIENT_STDOUT) && !srun->ifname) { + if ((type == CLIENT_STDOUT) && !_local_filename(srun->ifname, t->gid)) { struct io_info *io = obj->arg; /* This is the only read-write capable client * at this time: a connected CLIENT_STDOUT @@ -502,23 +519,25 @@ static int _io_prepare_one(slurmd_job_t *j, task_info_t *t, srun_info_t *s) { int retval = SLURM_SUCCESS; + char *fname = NULL; + /* Try hard to get stderr connected to something */ if ( (_open_output_file(j, t, s->efname, CLIENT_STDERR) < 0) && (_io_add_connecting(j, t, s, CLIENT_STDERR) < 0) ) retval = SLURM_FAILURE; - if (s->ofname) { - if (_open_output_file(j, t, s->ofname, CLIENT_STDOUT) < 0) + if ((fname = _local_filename (s->ofname, t->gid))) { + if (_open_output_file(j, t, fname, CLIENT_STDOUT) < 0) retval = SLURM_FAILURE; } else { _io_add_connecting(j, t, s, CLIENT_STDOUT); } - if (s->ifname) { + if ((fname = _local_filename (s->ifname, t->gid))) { if (_open_stdin_file(j, t, s) < 0) retval = SLURM_FAILURE; - } else if (s->ofname) { + } else if (_local_filename (s->ofname, t->gid)) { _io_add_connecting(j, t, s, CLIENT_STDIN); } @@ -610,16 +629,20 @@ _open_output_file(slurmd_job_t *job, task_info_t *t, char *fmt, int fd = -1; io_obj_t *obj = NULL; int flags = O_APPEND|O_WRONLY; - char *fname ; + char *fname = NULL; + + xassert((type == CLIENT_STDOUT) || (type == CLIENT_STDERR)); if (fmt == NULL) return SLURM_ERROR; - xassert((type == CLIENT_STDOUT) || (type == CLIENT_STDERR)); + if (!_local_filename (fmt, t->gid)) + return SLURM_ERROR; fname = fname_create(job, fmt, t->gid); if ((fd = _open_task_file(fname, flags)) > 0) { - debug("opened `%s' for %s fd %d", fname, _io_str[type], fd); + debug2 ("opened `%s' for task %d %s fd %d", + fname, t->gid, _io_str[type], fd); obj = _io_obj(job, t, fd, type); _obj_set_unreadable(obj); xassert(obj->ops->writable != NULL); diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 
b0d06fc061..4e3df250a8 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -332,7 +332,8 @@ job_signal_tasks(slurmd_job_t *job, int signal) { int n = job->ntasks; while (--n >= 0) { - if (kill(job->task[n]->pid, signal) < 0) { + if ((job->task[n]->pid > (pid_t) 0) + && (kill(job->task[n]->pid, signal) < 0)) { if (errno != EEXIST) { error("job %d.%d: kill task %d: %m", job->jobid, job->stepid, n); diff --git a/src/slurmd/req.c b/src/slurmd/req.c index cfb00ca557..8fd06ff503 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -617,9 +617,14 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) goto done; } - if (kill(-step->sid, req->signal) < 0) + if ((step->sid > (pid_t) 0) + && (kill(-step->sid, req->signal) < 0)) rc = errno; + if ((step->task_list->pid > (pid_t) 0) + && (kill (-step->task_list->pid, req->signal) < 0)) + rc = errno; + if (rc == SLURM_SUCCESS) verbose("Sent signal %d to %u.%u", req->signal, req->job_id, req->job_step_id); @@ -873,8 +878,13 @@ _kill_all_active_steps(uint32_t jobid, int sig) step_cnt++; debug2("signal %d to job %u (pg:%d)", sig, jobid, s->sid); - if (kill(-s->sid, sig) < 0) + if ((s->sid > (pid_t) 0) + && (kill(-s->sid, sig) < 0)) error("kill jid %d sid %d: %m", s->jobid, s->sid); + if ((s->task_list->pid > (pid_t) 0) + && (kill(-s->task_list->pid, sig) < 0)) + error("kill jid %d pgrp %d: %m", s->jobid, + s->task_list->pid); } list_destroy(steps); if (step_cnt == 0) diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index a3b9858efa..c5d594134b 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -484,7 +484,8 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal) continue; } - if (kill(t->pid, signo) < 0) { + if ((t->pid > (pid_t) 0) + && (kill(t->pid, signo) < 0)) { error ("kill %u.%u task %d pid %ld: %m", jobid, stepid, t->id, (long)t->pid); retval = errno; @@ -666,7 +667,8 @@ shm_update_step_addrs(uint32_t jobid, uint32_t stepid, debug3("Going to send shm update signal to %ld", (long) s->mpid); - if 
((s->mpid > 0) && (kill(s->mpid, SIGHUP) < 0)) { + if ((s->mpid > (pid_t) 0) + && (kill(s->mpid, SIGHUP) < 0)) { slurm_seterrno(EPERM); retval = SLURM_FAILURE; } @@ -905,10 +907,17 @@ _shm_clear_stale_entries(void) int count = 0; for (i = 0; i < MAX_JOB_STEPS; i++) { job_step_t *s = &slurmd_shm->step[i]; + task_t *t = s->task_list; + if (s->state == SLURMD_JOB_UNUSED) continue; + + while (t->next && t->id != 0) + t = t->next; - if ((s->sid > (pid_t) 0) && (kill(-s->sid, 0) != 0)) { + if ( (s->sid > (pid_t) 0) + && (kill(-s->sid, 0) != 0) + && (kill(-t->pid, 0) != 0)) { debug ("Clearing stale job %u.%u from shm", s->jobid, s->stepid); _shm_clear_step(s); diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index dfb2ea4149..e781d311a4 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -271,6 +271,7 @@ _become_user(slurmd_job_t *job) static int _exec_all_tasks(slurmd_job_t *job) { + char c; int i; int fd = job->fdpair[1]; @@ -286,13 +287,26 @@ _exec_all_tasks(slurmd_job_t *job) return error ("Unable to block signals"); for (i = 0; i < job->ntasks; i++) { - pid_t pid = fork(); + int fdpair[2]; + pid_t pid; - if (pid < 0) { + if (pipe (fdpair) < 0) + error ("exec_all_tasks: pipe: %m"); + + if ((pid = fork ()) < 0) { error("fork: %m"); return SLURM_ERROR; - } else if (pid == 0) /* child */ + } else if (pid == 0) { /* child */ + /* + * Stall exec until pgid is set by parent + */ + if (read (fdpair[0], &c, sizeof (c)) != 1) + error ("pgrp child read failed: %m"); + close (fdpair[0]); + close (fdpair[1]); + _exec_task(job, i); + } /* Parent continues: */ @@ -310,6 +324,22 @@ _exec_all_tasks(slurmd_job_t *job) job->task[i]->pid = pid; + /* + * Set this child's pgid to pid of first task + */ + if (setpgid (pid, job->task[0]->pid) < 0) + error ("Unable to put task %d (pid %ld) into pgrp %ld", + i, pid, job->task[0]->pid); + + /* + * Now it's ok to unblock this child, so it may call exec + */ + if (write (fdpair[1], &c, sizeof (c)) != 1) + error ("write to unblock task %d 
failed", i); + + close (fdpair[0]); + close (fdpair[1]); + /* * Prepare process for attach by parallel debugger * (if specified and able) @@ -594,7 +624,7 @@ _pdebug_trace_process(slurmd_job_t *job, pid_t pid) if (job->task_flags & TASK_PARALLEL_DEBUG) { int status; waitpid(pid, &status, WUNTRACED); - if (kill(pid, SIGSTOP) < 0) + if ((pid > (pid_t) 0) && (kill(pid, SIGSTOP) < 0)) error("kill(%lu): %m", (unsigned long) pid); if (_PTRACE(PTRACE_DETACH, pid, NULL, 0)) error("ptrace(%lu): %m", (unsigned long) pid); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index b04815ee19..06f5b3752c 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -132,6 +132,8 @@ existing_allocation(void) job.uid = getuid(); if (slurm_confirm_allocation(&job, &resp) < 0) { + if (opt.parallel_debug) + return NULL; /* create new allocation as needed */ if (errno == ESLURM_ALREADY_DONE) error ("SLURM job %u has expired.", job.job_id); else diff --git a/src/srun/fname.c b/src/srun/fname.c index c047d79c83..fa2c4f8bb0 100644 --- a/src/srun/fname.c +++ b/src/srun/fname.c @@ -41,14 +41,17 @@ fname_create(job_t *job, char *format) if ((format == NULL) || (strncasecmp(format, "all", (size_t) 3) == 0) || (strncmp(format, "-", (size_t) 1) == 0) ) { - fname->type = IO_ALL; - fname->name = NULL; - return fname; + /* "all" explicitly sets IO_ALL and is the default */ + return (fname); } if (strncasecmp(format, "none", (size_t) 4) == 0) { - fname->type = IO_NONE; - fname->name = "/dev/null"; + /* + * Set type to IO_PER_TASK so that /dev/null is opened + * on every node, which should be more efficient + */ + fname->type = IO_PER_TASK; + fname->name = xstrdup ("/dev/null"); return fname; } @@ -56,7 +59,11 @@ fname_create(job_t *job, char *format) if ((*p == '\0') && ((int) taskid < opt.nprocs)) { fname->type = IO_ONE; fname->taskid = (uint32_t) taskid; - fname->name = NULL; + /* Set the name string to pass to slurmd + * to the taskid requested, so that tasks with + * no IO can open 
/dev/null. + */ + fname->name = xstrdup (format); return fname; } @@ -123,3 +130,12 @@ fname_destroy(io_filename_t *f) xfree(f->name); xfree(f); } + +char * +fname_remote_string (io_filename_t *f) +{ + if ((f->type == IO_PER_TASK) || (f->type == IO_ONE)) + return (xstrdup (f->name)); + + return (NULL); +} diff --git a/src/srun/fname.h b/src/srun/fname.h index d6dfcd0441..32752d35f3 100644 --- a/src/srun/fname.h +++ b/src/srun/fname.h @@ -52,5 +52,7 @@ typedef struct srun_job * srun_job_t; io_filename_t * fname_create(srun_job_t job, char *format); void fname_destroy(io_filename_t *fname); +char * fname_remote_string (io_filename_t *fname); + #endif /* !_FNAME_H */ diff --git a/src/srun/io.c b/src/srun/io.c index 40cdf1a1dd..96ecb48eae 100644 --- a/src/srun/io.c +++ b/src/srun/io.c @@ -227,41 +227,34 @@ _flush_io(job_t *job) debug3("Read %dB from tasks, wrote %dB", nbytes, nwritten); } +static int +_initial_fd_state (io_filename_t *f, int task) +{ + if (f->type == IO_ALL) + return (WAITING_FOR_IO); + if (f->type == IO_ONE && f->taskid == task) + return (WAITING_FOR_IO); + + return (IO_DONE); +} + static void _io_thr_init(job_t *job, struct pollfd *fds) { - int out_fd_state = WAITING_FOR_IO; - int err_fd_state = WAITING_FOR_IO; int i; xassert(job != NULL); - /* - * XXX: Handle job->ofname/efname == IO_ONE - */ - _set_iofds_nonblocking(job); - if (job->ofname->type == IO_ALL) - out_fd_state = WAITING_FOR_IO; - else { - if (job->ifname->type != IO_ALL) - out_fd_state = IO_DONE; - else - out_fd_state = WAITING_FOR_IO; - - if (!opt.efname) - err_fd_state = IO_DONE; - } - - if ((job->efname->type == IO_ALL) && (err_fd_state != IO_DONE)) { - err_fd_state = WAITING_FOR_IO; - } else - err_fd_state = IO_DONE; - for (i = 0; i < opt.nprocs; i++) { - job->out[i] = out_fd_state; - job->err[i] = err_fd_state; + int instate = _initial_fd_state (job->ifname, i); + job->out[i] = _initial_fd_state (job->ofname, i); + job->err[i] = _initial_fd_state (job->efname, i); + + if 
(job->out[i] != WAITING_FOR_IO) + job->out[i] = instate; + } for (i = 0; i < job->niofds; i++) @@ -467,20 +460,30 @@ _fopen(char *filename) return fp; } +static int +_is_local_file (io_filename_t *fname) +{ + if (fname->name == NULL) + return (0); + + return ((fname->type != IO_PER_TASK) && (fname->type != IO_ONE)); +} + + int open_streams(job_t *job) { - if ((job->ifname->type != IO_PER_TASK) && job->ifname->name) + if (_is_local_file (job->ifname)) job->stdinfd = _stdin_open(job->ifname->name); else job->stdinfd = STDIN_FILENO; - if ((job->ofname->type != IO_PER_TASK) && job->ofname->name) + if (_is_local_file (job->ofname)) job->outstream = _fopen(job->ofname->name); else job->outstream = stdout; - if ((job->efname->type != IO_PER_TASK) && job->efname->name) + if (_is_local_file (job->efname)) job->errstream = _fopen(job->efname->name); else job->errstream = stderr; diff --git a/src/srun/launch.c b/src/srun/launch.c index beb32631d8..caddf84d58 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -130,12 +130,9 @@ launch(void *arg) r->slurmd_debug = opt.slurmd_debug; r->switch_job = job->switch_job; - if (job->ofname->type == IO_PER_TASK) - r->ofname = job->ofname->name; - if (job->efname->type == IO_PER_TASK) - r->efname = job->efname->name; - if (job->ifname->type == IO_PER_TASK) - r->ifname = job->ifname->name; + r->ofname = fname_remote_string (job->ofname); + r->efname = fname_remote_string (job->efname); + r->ifname = fname_remote_string (job->ifname); if (opt.parallel_debug) r->task_flags |= TASK_PARALLEL_DEBUG; @@ -222,8 +219,12 @@ static void _join_attached_threads (int nthreads, thd_t *th) { int i; void *retval; - for (i = 0; i < nthreads; i++) - pthread_join (th[i].thread, &retval); + if (!opt.parallel_debug) + return; + for (i = 0; i < nthreads; i++) { + if (th[i].thread != (pthread_t) NULL) + pthread_join (th[i].thread, &retval); + } return; } @@ -294,6 +295,7 @@ static void _p_launch(slurm_msg_t *req, job_t *job) if (job->ntask[i] == 0) { /* 
No tasks for this node */ debug("Node %s is unused",job->host[i]); job->host_state[i] = SRUN_HOST_REPLIED; + thd[i].thread = (pthread_t) NULL; continue; } diff --git a/src/srun/opt.c b/src/srun/opt.c index e18b976d0c..c3dd653757 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -401,6 +401,7 @@ static void _opt_default() opt.max_wait = slurm_get_wait_time(); opt.quit_on_intr = false; + opt.disable_status = false; opt.quiet = 0; _verbose = 0; @@ -416,7 +417,7 @@ static void _opt_default() opt.contiguous = false; opt.nodelist = NULL; opt.exc_nodes = NULL; - opt.max_launch_time = 60; /* 60 seconds to launch job */ + opt.max_launch_time = 120;/* 120 seconds to launch job */ opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */ opt.msg_timeout = 5; /* Default launch msg timeout */ @@ -473,6 +474,7 @@ env_vars_t env_vars[] = { {"SLURM_STDOUTMODE", OPT_STRING, &opt.ofname, NULL }, {"SLURM_TIMELIMIT", OPT_INT, &opt.time_limit, NULL }, {"SLURM_WAIT", OPT_INT, &opt.max_wait, NULL }, + {"SLURM_DISABLE_STATUS",OPT_INT, &opt.disable_status,NULL }, {NULL, 0, NULL, NULL} }; @@ -623,6 +625,7 @@ static void _opt_args(int argc, char **argv) {"nodelist", required_argument, 0, 'w'}, {"wait", required_argument, 0, 'W'}, {"exclude", required_argument, 0, 'x'}, + {"disable-status", no_argument, 0,'X'}, {"no-allocate", no_argument, 0, 'Z'}, {"quit-on-interrupt", no_argument, 0, 'q'}, {"quiet", no_argument, 0, 'Q'}, @@ -645,7 +648,7 @@ static void _opt_args(int argc, char **argv) {NULL, 0, 0, 0} }; char *opt_string = "+a:Abc:C:d:D:e:Hi:IjJ:klm:n:N:" - "o:Op:Qr:st:T:uvVw:W:x:Zq"; + "o:Op:Qr:st:T:uvVw:W:x:XZq"; char **rest = NULL; opt.progname = xbasename(argv[0]); @@ -809,6 +812,9 @@ static void _opt_args(int argc, char **argv) if (!_valid_node_list(&opt.exc_nodes)) exit(1); break; + case (int)'X': + opt.disable_status = true; + break; case (int)'Z': opt.no_alloc = true; break; @@ -1246,7 +1252,7 @@ static bool _under_parallel_debugger (void) static void _usage(void) { 
printf("\ -Usage: srun [-N nnodes] [-n ntasks] [-i in] [-i in] [-e err] [-e err]\n\ +Usage: srun [-N nnodes] [-n ntasks] [-i in] [-o out] [-e err]\n\ [-c ncpus] [-r n] [-p partition] [--hold] [-t minutes]\n\ [-D path] [--immediate] [--overcommit] [--no-kill]\n\ [--share] [--label] [--unbuffered] [-m dist] [-J jobname]\n\ @@ -1290,6 +1296,7 @@ Parallel run options:\n\ -W, --wait=sec seconds to wait after first task exits\n\ before killing job\n\ -q, --quit-on-interrupt quit on single Ctrl-C\n\ + -X, --disable-status Disable Ctrl-C status feature\n\ -v, --verbose verbose mode (multiple -v's increase verbosity)\n\ -Q, --quiet quiet mode (suppress informational messages)\n\ -d, --slurmd-debug=level slurmd debug level\n\ diff --git a/src/srun/opt.h b/src/srun/opt.h index 281429f924..637e2295aa 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -133,6 +133,7 @@ typedef struct srun_options { bool share; /* --share, -s */ int max_wait; /* --wait, -W */ bool quit_on_intr; /* --quit-on-interrupt, -q */ + bool disable_status; /* --disable-status, -X */ int quiet; bool parallel_debug; /* srun controlled by debugger */ bool debugger_test; /* --debugger-test */ diff --git a/src/srun/signals.c b/src/srun/signals.c index 6947bcf7d0..737b2e52ba 100644 --- a/src/srun/signals.c +++ b/src/srun/signals.c @@ -195,7 +195,7 @@ _handle_intr(job_t *job, time_t *last_intr, time_t *last_intr_sent) pthread_exit (0); } - if ((time(NULL) - *last_intr) > 1) { + if (((time(NULL) - *last_intr) > 1) && !opt.disable_status) { info("interrupt (one more within 1 sec to abort)"); if (mode != MODE_ATTACH) report_task_status(job); @@ -228,15 +228,15 @@ _sig_thr(void *arg) sigset_t set; time_t last_intr = 0; time_t last_intr_sent = 0; - int signo; + int signo, err; while (!_sig_thr_done(job)) { xsignal_sigset_create(srun_sigarray, &set); - if (sigwait(&set, &signo) < 0) { - if (errno != EINTR) - error ("sigwait: %m"); + if ((err = sigwait(&set, &signo)) != 0) { + if (err != EINTR) + error ("sigwait: %s", 
slurm_strerror (err)); continue; } diff --git a/src/srun/srun.c b/src/srun/srun.c index 804c9ed3ad..02511842ba 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -96,7 +96,7 @@ static int _run_batch_job (void); static void _run_job_script(job_t *job); static int _set_batch_script_env(job_t *job); static int _set_rlimit_env(void); -static char *_sprint_task_cnt(job_t *job); +static char *_task_count_string(job_t *job); static void _switch_standalone(job_t *job); static int _become_user (void); @@ -210,7 +210,7 @@ int srun(int ac, char **av) setenvf("SLURM_JOBID=%u", job->jobid); setenvf("SLURM_NPROCS=%d", opt.nprocs); setenvf("SLURM_NNODES=%d", job->nhosts); - setenvf("SLURM_TASKS_PER_NODE=%s", (task_cnt = _sprint_task_cnt(job))); + setenvf("SLURM_TASKS_PER_NODE=%s", task_cnt = _task_count_string (job)); setenvf("SLURM_DISTRIBUTION=%s", format_distribution_t (opt.distribution)); @@ -274,12 +274,12 @@ int srun(int ac, char **av) } static char * -_sprint_task_cnt(job_t *job) +_task_count_string (job_t *job) { int i, last_val, last_cnt; - char *task_str = xstrdup(""); char tmp[16]; - + char *str = xstrdup (""); + last_val = job->ntask[0]; last_cnt = 1; for (i=1; i<job->nhosts; i++) { @@ -290,7 +290,7 @@ _sprint_task_cnt(job_t *job) sprintf(tmp, "%d(x%d),", last_val, last_cnt); else sprintf(tmp, "%d,", last_val); - xstrcat(task_str, tmp); + xstrcat(str, tmp); last_val = job->ntask[i]; last_cnt = 1; } @@ -299,8 +299,8 @@ _sprint_task_cnt(job_t *job) sprintf(tmp, "%d(x%d)", last_val, last_cnt); else sprintf(tmp, "%d", last_val); - xstrcat(task_str, tmp); - return task_str; + xstrcat(str, tmp); + return (str); } static void @@ -367,6 +367,8 @@ _run_batch_job(void) return SLURM_ERROR; } + _set_batch_script_env (NULL); + if (!(req = job_desc_msg_create_from_opts (script))) fatal ("Unable to create job request"); @@ -560,44 +562,22 @@ static int _set_batch_script_env(job_t *job) { int rc = SLURM_SUCCESS; - char *dist = NULL, *task_cnt; + char *dist = NULL; + char *p; struct
utsname name; - if (job->jobid > 0) { - if (setenvf("SLURM_JOBID=%u", job->jobid)) { - error("Unable to set SLURM_JOBID environment"); - rc = SLURM_FAILURE; - } - } - - if (job->nhosts > 0) { - if (setenvf("SLURM_NNODES=%u", job->nhosts)) { - error("Unable to set SLURM_NNODES environment var"); - rc = SLURM_FAILURE; - } - } - - if (job->nodelist) { - if (setenvf("SLURM_NODELIST=%s", job->nodelist)) { - error("Unable to set SLURM_NODELIST environment var."); - rc = SLURM_FAILURE; - } - } - if (opt.nprocs_set && setenvf("SLURM_NPROCS=%u", opt.nprocs)) { error("Unable to set SLURM_NPROCS environment variable"); rc = SLURM_FAILURE; } - if ( opt.cpus_set && setenvf("SLURM_CPUS_PER_TASK=%u", opt.cpus_per_task) ) { error("Unable to set SLURM_CPUS_PER_TASK"); rc = SLURM_FAILURE; } - - if (opt.distribution != SRUN_DIST_UNKNOWN) { + if (job && opt.distribution != SRUN_DIST_UNKNOWN) { dist = (opt.distribution == SRUN_DIST_BLOCK) ? "block" : "cyclic"; @@ -625,11 +605,41 @@ _set_batch_script_env(job_t *job) rc = SLURM_FAILURE; } - if (setenvf("SLURM_TASKS_PER_NODE=%s", (task_cnt = _sprint_task_cnt(job)))) { - error("Unable to set SLURM_TASKS_PER_NODE environment variable"); - rc = SLURM_FAILURE; + /* + * If no job has been allocated yet, just return. We are + * submitting a batch job. 
+ */ + if (job == NULL) + return (rc); + + + if (job->jobid > 0) { + if (setenvf("SLURM_JOBID=%u", job->jobid)) { + error("Unable to set SLURM_JOBID environment"); + rc = SLURM_FAILURE; + } + } + + if (job->nhosts > 0) { + if (setenvf("SLURM_NNODES=%u", job->nhosts)) { + error("Unable to set SLURM_NNODES environment var"); + rc = SLURM_FAILURE; + } + } + + if (job->nodelist) { + if (setenvf("SLURM_NODELIST=%s", job->nodelist)) { + error("Unable to set SLURM_NODELIST environment var."); + rc = SLURM_FAILURE; + } + } + if ((p = _task_count_string (job))) { + if (setenvf ("SLURM_TASKS_PER_NODE=%s", p)) { + error ("Can't set SLURM_TASKS_PER_NODE env variable"); + rc = SLURM_FAILURE; + } + xfree (p); } - xfree(task_cnt); uname(&name); if (strcasecmp(name.sysname, "AIX") == 0) { diff --git a/testsuite/slurm_unit/slurmctld/Makefile.am b/testsuite/slurm_unit/slurmctld/Makefile.am index f594507524..195837611c 100644 --- a/testsuite/slurm_unit/slurmctld/Makefile.am +++ b/testsuite/slurm_unit/slurmctld/Makefile.am @@ -7,7 +7,6 @@ TESTS = # even if they can't be run stand-alone. 
check_PROGRAMS = \ security_2_2 \ - security_2_4 \ $(TESTS) INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common diff --git a/testsuite/slurm_unit/slurmctld/security_2_1.csh b/testsuite/slurm_unit/slurmctld/security_2_1.csh index d42ba5ac57..4e2bea917d 100755 --- a/testsuite/slurm_unit/slurmctld/security_2_1.csh +++ b/testsuite/slurm_unit/slurmctld/security_2_1.csh @@ -2,7 +2,7 @@ setenv CONFIG /etc/slurm/slurm.conf setenv DEPLOY /usr -echo "Insure that files are not user writable" +echo "Insure that executable files are not user writable" ls -ld $DEPLOY/bin/srun ls -ld $DEPLOY/bin/sinfo ls -ld $DEPLOY/bin/squeue @@ -21,18 +21,18 @@ grep JobCredential $CONFIG ls -ld /etc/slurm/slurm.key ls -ld /etc/slurm/slurm.cert +echo "Plugin directory and its contents must be non-writable" grep PluginDir $CONFIG ls -ld /usr/lib/slurm ls -l /usr/lib/slurm -grep Prioritize $CONFIG -#echo "Prioritize will move to a plugin" - grep Prolog $CONFIG #ls -ld /admin/sbin/slurm.prolog +echo "Spool and log files must be non-writeable" grep SlurmdSpoolDir $CONFIG -ls -ld /tmp/slurmd - +ls -ld /var/spool/slurm grep StateSaveLocation $CONFIG ls -ld /usr/local/tmp/slurm/adev +grep SlurmctldLogFile $CONFIG +ls -ld /var/log/slurm/slurmctld.log diff --git a/testsuite/slurm_unit/slurmctld/security_2_4.c b/testsuite/slurm_unit/slurmctld/security_2_4.c deleted file mode 100644 index 92d889fbbb..0000000000 --- a/testsuite/slurm_unit/slurmctld/security_2_4.c +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include - -/* Attempt to run a job with the incorrect user id and confirm an error */ -int -main (int argc, char *argv[]) -{ - int error_code; - job_desc_msg_t job_mesg; - resource_allocation_and_run_response_msg_t* run_resp_msg ; - - slurm_init_job_desc_msg( &job_mesg ); - job_mesg. user_id = getuid() + 1; - job_mesg. min_nodes = 1; - job_mesg. 
task_dist = SLURM_DIST_CYCLIC; - - error_code = slurm_allocate_resources_and_run ( &job_mesg , - &run_resp_msg ); - if (error_code == SLURM_SUCCESS) { - fprintf (stderr, "ERROR: The allocate succeeded\n"); - exit(1); - } else if ((error_code = slurm_get_errno()) != ESLURM_USER_ID_MISSING) { - fprintf (stderr, - "ERROR: Wrong error code received: %s instead of %s\n", - slurm_strerror(error_code), "ESLURM_USER_ID_MISSING"); - exit(1); - } else { - printf ("SUCCESS!\n"); - printf ("The allocate request was rejected as expected.\n"); - printf ("Check SlurmctldLog for an error message.\n"); - exit(0); - } -} - diff --git a/testsuite/slurm_unit/slurmd/security_3_1.c b/testsuite/slurm_unit/slurmd/security_3_1.c index 59e8cac2a3..b874425760 100644 --- a/testsuite/slurm_unit/slurmd/security_3_1.c +++ b/testsuite/slurm_unit/slurmd/security_3_1.c @@ -23,7 +23,7 @@ int main(int argc, char *argv[]) { batch_job_launch_msg_t launch_msg; int uid; - uint32_t jid; + uint32_t jid, cpu_arr[1]; if (argc != 2) { _usage(argv[0]); @@ -40,6 +40,10 @@ int main(int argc, char *argv[]) launch_msg.job_id = jid; launch_msg.uid = uid; launch_msg.nodes = argv[1]; + launch_msg.num_cpu_groups = 1; + cpu_arr[0] = 1; + launch_msg.cpus_per_node = cpu_arr; + launch_msg.cpu_count_reps = cpu_arr; launch_msg.err = "/dev/null"; launch_msg.in = "/dev/null"; launch_msg.out = "/dev/null";