Permalink
Browse files

This commit was manufactured by cvs2svn to create tag 'slurm-0-3-9-1'.
  • Loading branch information...
1 parent 8e60bcf commit 62cd8e71c032033467da424115cff70ed6a3445e no author committed Nov 12, 2004
View
4 META
@@ -9,8 +9,8 @@
Name: slurm
Major: 0
Minor: 3
- Micro: 7
- Version: 0.3.7
+ Micro: 9
+ Version: 0.3.9
Release: 1
API_CURRENT: 5
API_AGE: 4
View
17 NEWS
@@ -1,6 +1,23 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.3.9
+========================
+ -- Fixes for reported problems:
+ - slurm/512: Let job steps run on DRAINING nodes
+ - slurm/513: Gracefully deal with UIDs missing from passwd file
+ -- Create new allocation as needed for debugger in case old allocation
+ has been purged
+ -- Fix bug in scheduling jobs when a processor count is specified
+ and FastSchedule=0 and the cluster is heterogeneous.
+ -- Fix srun bug when --input, --output and --error are all "none"
+ -- Allow single task id to be selected with --input, --output, and --error.
+ -- Create shared memory segment for Elan statistics when using the
+ switch/elan plugin.
+
+* Changes in SLURM 0.3.8
+========================
+ -- More fixes necessary for TotalView.
* Changes in SLURM 0.3.7
========================
View
@@ -171,6 +171,7 @@ Possible values are"YES" and "NO".
Set the job's required features on nodes specified value. Multiple values
may be comma separated if all features are required (AND operation) or
separated by "|" if any of the specified features are required (OR operation).
+Value may be cleared with blank data value, "Features=".
.TP
\fIJobId\fP=<id>
Identify the job to be updated. This specification is required.
@@ -196,6 +197,7 @@ Set the job's priority to the specified value.
\fIReqNodeList\fP=<nodes>
Set the job's list of required nodes. Multiple node names may be specified using
simple node range expressions (e.g. "lx[10-20]").
+Value may be cleared with blank data value, "ReqNodeList=".
.TP
\fIReqNodes\fP=<count>
Set the job's count of required nodes to the specified value.
@@ -232,8 +234,9 @@ changing its underlying state.
\fBSPECIFICATIONS FOR UPDATE AND DELETE COMMANDS, PARTITIONS\fR
.TP
\fIAllowGroups\fP=<name>
-Identify the user groups which may use this partition. Multiple groups
-may be specified in a comma separated list.
+Identify the user groups which may use this partition.
+Multiple groups may be specified in a comma separated list.
+To permit all groups to use the partition, specify "AllowGroups=ALL".
.TP
\fIDefault\fP=<yes|no>
Specify if this partition is to be used by jobs which do not explicitly
@@ -249,6 +252,7 @@ Possible values are"YES" and "NO".
Identify the node(s) to be associated with this partition. Multiple node names
may be specified using simple node range expressions (e.g. "lx[10-20]").
Note that jobs may only be associated with one partition at any time.
+Specify a blank data value to remove all nodes from a partition: "Nodes=".
.TP
\fIPartitionName\fP=<name>
Identify the partition to be updated. This specification is required.
View
@@ -223,9 +223,11 @@ the job. By default only errors are displayed.
.TP
\fB\-W\fR, \fB\-\-wait\fR=\fIseconds\fR
Specify how long to wait after the first task terminates before terminating
-all remaining tasks. The default value is unlimited. This can be useful to
-insure that a job is terminated in a timely fashion in the event that one
-or more tasks terminate prematurely.
+all remaining tasks. A value of 0 indicates an unlimited wait (a warning will
+be issued after 60 seconds). The default value is set by the WaitTime
+parameter in the slurm configuration file (see \fBslurm.conf(5)\fR). This
+option can be useful to ensure that a job is terminated in a timely fashion
+in the event that one or more tasks terminate prematurely.
.TP
\fB\-q\fR, \fB\-\-quit-on-interrupt\fR
Quit immediately on single SIGINT (Ctrl-C). Use of this option
View
@@ -16,6 +16,11 @@
# $Id$
+BINDIR=/usr/bin
+CONFDIR=/etc/slurm
+LIBDIR=/usr/lib
+SBINDIR=/usr/sbin
+
# Source function library.
[ -f /etc/rc.d/init.d/functions ] || exit 0
. /etc/rc.d/init.d/functions
@@ -28,14 +33,17 @@ else
SLURMD_OPTIONS=""
fi
-[ -f /etc/slurm/slurm.conf ] || exit 1
+[ -f $CONFDIR/slurm.conf ] || exit 1
+
+# setup library paths for slurm and munge support
+export LD_LIBRARY_PATH="$LIBDIR:$LD_LIBRARY_PATH"
RETVAL=0
start() {
echo -n "starting $1: "
unset HOME MAIL USER USERNAME
- daemon /usr/sbin/$1 $2
+ daemon $SBINDIR/$1 $2
RETVAL=$?
echo
touch /var/lock/subsys/slurm
@@ -52,7 +60,7 @@ stop() {
}
startall() {
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"`
start $prog ${!optvar}
done
@@ -68,7 +76,7 @@ slurmstatus() {
local rpid
local pidfile
- pidfile=`grep -i ${base}pid /etc/slurm/slurm.conf | grep -v '^ *#'`
+ pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'`
if [ $? = 0 ]; then
pidfile=${pidfile##*=}
pidfile=${pidfile%#*}
@@ -123,33 +131,34 @@ case "$1" in
startall
;;
stop)
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
done
;;
status)
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
slurmstatus $prog
done
;;
restart)
- $0 stop && $0 start
+ $0 stop
+ $0 start
;;
condrestart)
if [ -f /var/lock/subsys/slurm ]; then
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
start $prog
done
fi
;;
reconfig)
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
killproc $prog -HUP
done
;;
test)
- for prog in `scontrol show daemons`; do
+ for prog in `$BINDIR/scontrol show daemons`; do
echo "$prog runs here"
done
;;
View
@@ -24,8 +24,12 @@ Requires: openssl >= 0.9.6
#
%if %{?_with_debug:1}%{!?_with_debug:0}
%define _enable_debug --enable-debug
- %define __os_install_post /usr/lib/rpm/brp-compress
%endif
+#
+# Never allow rpm to strip binaries as this will break
+# parallel debugging capability
+#
+%define __os_install_post /usr/lib/rpm/brp-compress
%package devel
@@ -111,13 +115,13 @@ DESTDIR="$RPM_BUILD_ROOT" make install
install -D -m755 etc/init.d.slurm $RPM_BUILD_ROOT/etc/rc.d/init.d/slurm
install -D -m644 etc/slurm.conf.example $RPM_BUILD_ROOT/etc/slurm/slurm.conf
# Delete unpackaged files:
-rm -f $RPM_BUILD_ROOT/usr/lib/slurm/*.{a,la}
+rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/*.{a,la}
# Build file lists for optional plugin packages
for plugin in auth_munge auth_authd switch_elan; do
LIST=./${plugin}.files
touch $LIST
- test -f $RPM_BUILD_ROOT/usr/lib/slurm/${plugin}.so &&
+ test -f $RPM_BUILD_ROOT/%{_libdir}/slurm/${plugin}.so &&
echo %{_libdir}/slurm/${plugin}.so > $LIST
done
@@ -197,17 +201,12 @@ rm -rf $RPM_BUILD_ROOT
%post
/sbin/ldconfig %{_libdir}
-if [ -x /etc/rc.d/init.d/slurm ]; then
- [ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm
+if [ $1 = 1 ]; then
[ -x /sbin/chkconfig ] && /sbin/chkconfig --add slurm
-# if ! /etc/rc.d/init.d/slurm status | grep -q running \
-# || ! /etc/rc.d/init.d/slurm test | grep -q slurmctld; then
-# /etc/rc.d/init.d/slurm start
-# fi
fi
%preun
-if [ "$1" = 0 ]; then
+if [ $1 = 0 ]; then
if [ -x /etc/rc.d/init.d/slurm ]; then
[ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm
if /etc/rc.d/init.d/slurm status | grep -q running; then
@@ -224,6 +223,11 @@ fi
%changelog
+* Fri Oct 01 2004 Mark Grondona <mgrondona@llnl.gov>
+- don't delete and add service in %post
+* Wed Aug 04 2004 Mark Grondona <mgrondona@llnl.gov>
+- don't allow rpm to strip binaries since this breaks interface to parallel
+ debuggers.
* Thu Jul 29 2004 Morris Jette <jette1@llnl.gov>
- added checkpoint_none.so and jobcomp_script.so plugins
* Fri Mar 07 2004 Mark Grondona <mgrondona@llnl.gov>
@@ -902,6 +902,9 @@ int slurm_send_rc_msg(slurm_msg_t *msg, int rc)
{
slurm_msg_t resp_msg;
return_code_msg_t rc_msg;
+
+ if (msg->conn_fd < 0)
+ return (ENOTCONN);
rc_msg.return_code = rc;
@@ -452,7 +452,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c)
error ("Munge decode failed: %s %s",
munge_ctx_strerror(ctx), retry ? "(retrying ...)": "");
- if ((e = EMUNGE_SOCKET) && retry--)
+ if ((e == EMUNGE_SOCKET) && retry--)
goto again;
/*
@@ -162,7 +162,7 @@ _get_user_name(uint32_t user_id, char *user_name, int buf_size)
else
snprintf(user_name, buf_size, "Unknown");
cache_uid = user_id;
- snprintf(cache_name, sizeof(cache_name), user_info->pw_name);
+ snprintf(cache_name, sizeof(cache_name), user_name);
}
}
@@ -218,7 +218,7 @@ int slurm_jobcomp_log_record ( struct job_record *job_ptr )
job_state_string(job_state),
job_ptr->partition, lim_str, start_str,
end_str, job_ptr->nodes);
- tot_size = (strlen(job_rec) + 1);
+ tot_size = strlen(job_rec);
while ( offset < tot_size ) {
wrote = write(job_comp_fd, job_rec + offset,
@@ -35,6 +35,8 @@
#include <sys/param.h>
#include <sys/types.h>
#include <sys/wait.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
#include <syslog.h>
#include <errno.h>
#include <string.h>
@@ -56,8 +58,17 @@
* want to include here since we are using the new
* version-nonspecific libelanctrl.
* (XXX: What is the equivalent in libelanctrl?)
+ *
+ * slurm/482: the elan USER context range is now split
+ * into two segments, regular user context and RMS
+ * context ranges. Do not allow a context range
+ * (lowcontext -- highcontext) to span these two segments,
+ * as this will generate an elan initialization error
+ * when MPI tries to attach to the capability. For now,
+ * restrict SLURM's range to the RMS one (starting at 0x400)
+ *
*/
-# define ELAN_USER_BASE_CONTEXT_NUM 0x020
+# define ELAN_USER_BASE_CONTEXT_NUM 0x400 /* act. RMS_BASE_CONTEXT_NUM */
# define ELAN_USER_TOP_CONTEXT_NUM 0x7ff
# define Version cap_version
@@ -108,6 +119,7 @@
#define QSW_CTX_END ELAN_USER_TOP_CONTEXT_NUM - 1
#define QSW_CTX_INVAL (-1)
+
/*
* We are going to some trouble to keep these defs private so slurm
* hackers not interested in the interconnect details can just pass around
@@ -152,6 +164,8 @@ static qsw_libstate_t qsw_internal_state = NULL;
static pthread_mutex_t qsw_lock = PTHREAD_MUTEX_INITIALIZER;
static elanhost_config_t elanconf = NULL;
+static int shmid = -1;
+
/*
* Allocate a qsw_libstate_t.
@@ -664,6 +678,10 @@ qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, bitstr_t *nodeset,
int
qsw_prgdestroy(qsw_jobinfo_t jobinfo)
{
+
+ if (shmid >= 0)
+ shmctl (shmid, IPC_RMID, NULL);
+
if (rms_prgdestroy(jobinfo->j_prognum) < 0) {
/* translate errno values to more descriptive ones */
switch (errno) {
@@ -698,6 +716,50 @@ qsw_prog_fini(qsw_jobinfo_t jobinfo)
#endif
}
+/* Key for Elan stats shared memory segment is the
+ * rms.o program description number, left-shifted by 9, minus 1,
+ * to avoid conflicts with MPI shared memory
+ */
+static int elan_statkey (int prgid)
+{
+ return ((prgid << 9) - 1);
+}
+
+/*
+ * Return the statkey to caller if shared memory was created
+ */
+int qsw_statkey (qsw_jobinfo_t jobinfo)
+{
+ return (shmid > 0 ? elan_statkey (jobinfo->j_prognum) : -1);
+}
+
+/*
+ * Create shared memory segment for Elan stats use
+ * (ELAN_STATKEY env var is set in switch_elan.c)
+ */
+static int
+_qsw_shmem_create (qsw_jobinfo_t jobinfo, uid_t uid)
+{
+ struct shmid_ds shm;
+ ELAN_CAPABILITY *cap = &jobinfo->j_cap;
+ key_t key = elan_statkey (jobinfo->j_prognum);
+ int maxLocal = cap->HighContext - cap->LowContext + 2;
+ int pgsize = getpagesize ();
+
+ if ((shmid = shmget (key, pgsize * (maxLocal + 1), IPC_CREAT)) < 0)
+ return (error ("Failed to create Elan state shmem: %m"));
+
+ /* Ensure permissions on segment allow user read/write access
+ */
+ shm.shm_perm.uid = uid;
+ shm.shm_perm.mode = 0600;
+
+ if (shmctl (shmid, IPC_SET, &shm) < 0)
+ return (error ("Failed to set perms on Elan state shm: %m"));
+
+ return (0);
+}
+
/*
* Process 2: Create the context and make capability available to children.
*/
@@ -785,6 +847,14 @@ qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid)
goto fail;
}
+
+ /*
+ * Create shared memory for libelan state
+ * Failure to create shared memory is not a fatal error.
+ */
+ _qsw_shmem_create (jobinfo, uid);
+
+
/* note: _elan3_fini() destroys context and makes capability unavail */
/* do it in qsw_prog_fini() after app terminates */
return 0;
Oops, something went wrong.

0 comments on commit 62cd8e7

Please sign in to comment.