Skip to content

Commit

Permalink
This commit was manufactured by cvs2svn to create tag
Browse files Browse the repository at this point in the history
'slurm-0-3-9-1'.
  • Loading branch information
no author committed Nov 12, 2004
1 parent 8e60bcf commit 62cd8e7
Show file tree
Hide file tree
Showing 31 changed files with 376 additions and 195 deletions.
4 changes: 2 additions & 2 deletions META
Expand Up @@ -9,8 +9,8 @@
Name: slurm
Major: 0
Minor: 3
Micro: 7
Version: 0.3.7
Micro: 9
Version: 0.3.9
Release: 1
API_CURRENT: 5
API_AGE: 4
Expand Down
17 changes: 17 additions & 0 deletions NEWS
@@ -1,6 +1,23 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.

* Changes in SLURM 0.3.9
========================
-- Fixes for reported problems:
- slurm/512: Let job steps run on DRAINING nodes
- slurm/513: Gracefully deal with UIDs missing from passwd file
-- Create new allocation as needed for debugger in case old allocation
has been purged
-- Fix bug in scheduling jobs when a processor count is specified
and FastSchedule=0 and the cluster is heterogeneous.
-- Fix srun bug when --input, --output and --error are all "none"
-- Allow single task id to be selected with --input, --output, and --error.
-- Create shared memory segment for Elan statistics when using the
switch/elan plugin.

* Changes in SLURM 0.3.8
========================
-- More fixes necessary for TotalView.

* Changes in SLURM 0.3.7
========================
Expand Down
8 changes: 6 additions & 2 deletions doc/man/man1/scontrol.1
Expand Up @@ -171,6 +171,7 @@ Possible values are"YES" and "NO".
Set the job's required features on nodes to the specified value. Multiple values
may be comma separated if all features are required (AND operation) or
separated by "|" if any of the specified features are required (OR operation).
Value may be cleared with blank data value, "Features=".
.TP
\fIJobId\fP=<id>
Identify the job to be updated. This specification is required.
Expand All @@ -196,6 +197,7 @@ Set the job's priority to the specified value.
\fIReqNodeList\fP=<nodes>
Set the job's list of required nodes. Multiple node names may be specified using
simple node range expressions (e.g. "lx[10-20]").
Value may be cleared with blank data value, "ReqNodeList=".
.TP
\fIReqNodes\fP=<count>
Set the job's count of required nodes to the specified value.
Expand Down Expand Up @@ -232,8 +234,9 @@ changing its underlying state.
\fBSPECIFICATIONS FOR UPDATE AND DELETE COMMANDS, PARTITIONS\fR
.TP
\fIAllowGroups\fP=<name>
Identify the user groups which may use this partition. Multiple groups
may be specified in a comma separated list.
Identify the user groups which may use this partition.
Multiple groups may be specified in a comma separated list.
To permit all groups to use the partition specify "AllowGroups=ALL".
.TP
\fIDefault\fP=<yes|no>
Specify if this partition is to be used by jobs which do not explicitly
Expand All @@ -249,6 +252,7 @@ Possible values are"YES" and "NO".
Identify the node(s) to be associated with this partition. Multiple node names
may be specified using simple node range expressions (e.g. "lx[10-20]").
Note that jobs may only be associated with one partition at any time.
Specify a blank data value to remove all nodes from a partition: "Nodes=".
.TP
\fIPartitionName\fP=<name>
Identify the partition to be updated. This specification is required.
Expand Down
8 changes: 5 additions & 3 deletions doc/man/man1/srun.1
Expand Up @@ -223,9 +223,11 @@ the job. By default only errors are displayed.
.TP
\fB\-W\fR, \fB\-\-wait\fR=\fIseconds\fR
Specify how long to wait after the first task terminates before terminating
all remaining tasks. The default value is unlimited. This can be useful to
insure that a job is terminated in a timely fashion in the event that one
or more tasks terminate prematurely.
all remaining tasks. A value of 0 indicates an unlimited wait (a warning will
be issued after 60 seconds). The default value is set by the WaitTime
parameter in the slurm configuration file (see \fBslurm.conf(5)\fR). This
option can be useful to ensure that a job is terminated in a timely fashion
in the event that one or more tasks terminate prematurely.
.TP
\fB\-q\fR, \fB\-\-quit-on-interrupt\fR
Quit immediately on single SIGINT (Ctrl-C). Use of this option
Expand Down
29 changes: 19 additions & 10 deletions etc/init.d.slurm
Expand Up @@ -16,6 +16,11 @@

# $Id$

BINDIR=/usr/bin
CONFDIR=/etc/slurm
LIBDIR=/usr/lib
SBINDIR=/usr/sbin

# Source function library.
[ -f /etc/rc.d/init.d/functions ] || exit 0
. /etc/rc.d/init.d/functions
Expand All @@ -28,14 +33,17 @@ else
SLURMD_OPTIONS=""
fi

[ -f /etc/slurm/slurm.conf ] || exit 1
[ -f $CONFDIR/slurm.conf ] || exit 1

# setup library paths for slurm and munge support
export LD_LIBRARY_PATH="$LIBDIR:$LD_LIBRARY_PATH"

RETVAL=0

start() {
echo -n "starting $1: "
unset HOME MAIL USER USERNAME
daemon /usr/sbin/$1 $2
daemon $SBINDIR/$1 $2
RETVAL=$?
echo
touch /var/lock/subsys/slurm
Expand All @@ -52,7 +60,7 @@ stop() {
}

startall() {
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
optvar=`echo ${prog}_OPTIONS | tr "a-z" "A-Z"`
start $prog ${!optvar}
done
Expand All @@ -68,7 +76,7 @@ slurmstatus() {
local rpid
local pidfile

pidfile=`grep -i ${base}pid /etc/slurm/slurm.conf | grep -v '^ *#'`
pidfile=`grep -i ${base}pid $CONFDIR/slurm.conf | grep -v '^ *#'`
if [ $? = 0 ]; then
pidfile=${pidfile##*=}
pidfile=${pidfile%#*}
Expand Down Expand Up @@ -123,33 +131,34 @@ case "$1" in
startall
;;
stop)
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
done
;;
status)
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
slurmstatus $prog
done
;;
restart)
$0 stop && $0 start
$0 stop
$0 start
;;
condrestart)
if [ -f /var/lock/subsys/slurm ]; then
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
stop $prog
start $prog
done
fi
;;
reconfig)
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
killproc $prog -HUP
done
;;
test)
for prog in `scontrol show daemons`; do
for prog in `$BINDIR/scontrol show daemons`; do
echo "$prog runs here"
done
;;
Expand Down
24 changes: 14 additions & 10 deletions slurm.spec.in
Expand Up @@ -24,8 +24,12 @@ Requires: openssl >= 0.9.6
#
%if %{?_with_debug:1}%{!?_with_debug:0}
%define _enable_debug --enable-debug
%define __os_install_post /usr/lib/rpm/brp-compress
%endif
#
# Never allow rpm to strip binaries as this will break
# parallel debugging capability
#
%define __os_install_post /usr/lib/rpm/brp-compress


%package devel
Expand Down Expand Up @@ -111,13 +115,13 @@ DESTDIR="$RPM_BUILD_ROOT" make install
install -D -m755 etc/init.d.slurm $RPM_BUILD_ROOT/etc/rc.d/init.d/slurm
install -D -m644 etc/slurm.conf.example $RPM_BUILD_ROOT/etc/slurm/slurm.conf
# Delete unpackaged files:
rm -f $RPM_BUILD_ROOT/usr/lib/slurm/*.{a,la}
rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/*.{a,la}

# Build file lists for optional plugin packages
for plugin in auth_munge auth_authd switch_elan; do
LIST=./${plugin}.files
touch $LIST
test -f $RPM_BUILD_ROOT/usr/lib/slurm/${plugin}.so &&
test -f $RPM_BUILD_ROOT/%{_libdir}/slurm/${plugin}.so &&
echo %{_libdir}/slurm/${plugin}.so > $LIST
done

Expand Down Expand Up @@ -197,17 +201,12 @@ rm -rf $RPM_BUILD_ROOT

%post
/sbin/ldconfig %{_libdir}
if [ -x /etc/rc.d/init.d/slurm ]; then
[ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm
if [ $1 = 1 ]; then
[ -x /sbin/chkconfig ] && /sbin/chkconfig --add slurm
# if ! /etc/rc.d/init.d/slurm status | grep -q running \
# || ! /etc/rc.d/init.d/slurm test | grep -q slurmctld; then
# /etc/rc.d/init.d/slurm start
# fi
fi

%preun
if [ "$1" = 0 ]; then
if [ $1 = 0 ]; then
if [ -x /etc/rc.d/init.d/slurm ]; then
[ -x /sbin/chkconfig ] && /sbin/chkconfig --del slurm
if /etc/rc.d/init.d/slurm status | grep -q running; then
Expand All @@ -224,6 +223,11 @@ fi


%changelog
* Fri Oct 01 2004 Mark Grondona <mgrondona@llnl.gov>
- don't delete and add service in %post
* Wed Aug 04 2004 Mark Grondona <mgrondona@llnl.gov>
- don't allow rpm to strip binaries since this breaks interface to parallel
debuggers.
* Thu Jul 29 2004 Morris Jette <jette1@llnl.gov>
- added checkpoint_none.so and jobcomp_script.so plugins
* Fri Mar 07 2004 Mark Grondona <mgrondona@llnl.gov>
Expand Down
3 changes: 3 additions & 0 deletions src/common/slurm_protocol_api.c
Expand Up @@ -902,6 +902,9 @@ int slurm_send_rc_msg(slurm_msg_t *msg, int rc)
{
slurm_msg_t resp_msg;
return_code_msg_t rc_msg;

if (msg->conn_fd < 0)
return (ENOTCONN);

rc_msg.return_code = rc;

Expand Down
2 changes: 1 addition & 1 deletion src/plugins/auth/munge/auth_munge.c
Expand Up @@ -452,7 +452,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c)
error ("Munge decode failed: %s %s",
munge_ctx_strerror(ctx), retry ? "(retrying ...)": "");

if ((e = EMUNGE_SOCKET) && retry--)
if ((e == EMUNGE_SOCKET) && retry--)
goto again;

/*
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/jobcomp/filetxt/jobcomp_filetxt.c
Expand Up @@ -162,7 +162,7 @@ _get_user_name(uint32_t user_id, char *user_name, int buf_size)
else
snprintf(user_name, buf_size, "Unknown");
cache_uid = user_id;
snprintf(cache_name, sizeof(cache_name), user_info->pw_name);
snprintf(cache_name, sizeof(cache_name), user_name);
}
}

Expand Down Expand Up @@ -218,7 +218,7 @@ int slurm_jobcomp_log_record ( struct job_record *job_ptr )
job_state_string(job_state),
job_ptr->partition, lim_str, start_str,
end_str, job_ptr->nodes);
tot_size = (strlen(job_rec) + 1);
tot_size = strlen(job_rec);

while ( offset < tot_size ) {
wrote = write(job_comp_fd, job_rec + offset,
Expand Down
72 changes: 71 additions & 1 deletion src/plugins/switch/elan/qsw.c
Expand Up @@ -35,6 +35,8 @@
#include <sys/param.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <syslog.h>
#include <errno.h>
#include <string.h>
Expand All @@ -56,8 +58,17 @@
* want to include here since we are using the new
* version-nonspecific libelanctrl.
* (XXX: What is the equivalent in libelanctrl?)
*
* slurm/482: the elan USER context range is now split
* into two segments, regular user context and RMS
* context ranges. Do not allow a context range
* (lowcontext -- highcontext) to span these two segments,
* as this will generate an elan initialization error
* when MPI tries to attach to the capability. For now,
* restrict SLURM's range to the RMS one (starting at 0x400)
*
*/
# define ELAN_USER_BASE_CONTEXT_NUM 0x020
# define ELAN_USER_BASE_CONTEXT_NUM 0x400 /* act. RMS_BASE_CONTEXT_NUM */
# define ELAN_USER_TOP_CONTEXT_NUM 0x7ff

# define Version cap_version
Expand Down Expand Up @@ -108,6 +119,7 @@
#define QSW_CTX_END ELAN_USER_TOP_CONTEXT_NUM - 1
#define QSW_CTX_INVAL (-1)


/*
* We are going to some trouble to keep these defs private so slurm
* hackers not interested in the interconnect details can just pass around
Expand Down Expand Up @@ -152,6 +164,8 @@ static qsw_libstate_t qsw_internal_state = NULL;
static pthread_mutex_t qsw_lock = PTHREAD_MUTEX_INITIALIZER;
static elanhost_config_t elanconf = NULL;

static int shmid = -1;


/*
* Allocate a qsw_libstate_t.
Expand Down Expand Up @@ -664,6 +678,10 @@ qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, bitstr_t *nodeset,
int
qsw_prgdestroy(qsw_jobinfo_t jobinfo)
{

if (shmid >= 0)
shmctl (shmid, IPC_RMID, NULL);

if (rms_prgdestroy(jobinfo->j_prognum) < 0) {
/* translate errno values to more descriptive ones */
switch (errno) {
Expand Down Expand Up @@ -698,6 +716,50 @@ qsw_prog_fini(qsw_jobinfo_t jobinfo)
#endif
}

/*
 * Compute the key used for the Elan stats shared memory segment.
 *
 * The key is the rms.o program description number shifted left by
 * 9 bits, minus 1, which keeps it from conflicting with the keys
 * used for MPI shared memory.
 */
static int elan_statkey (int prgid)
{
	int shifted = prgid << 9;

	return (shifted - 1);
}

/*
 * Return the Elan stats shared memory key for this job to the caller,
 * or -1 if no shared memory segment was created.
 *
 * shmid holds -1 until a segment exists. shmget(2) returns a
 * non-negative identifier on success and 0 is a valid identifier, so
 * the "segment exists" test is shmid >= 0, the same test used by
 * qsw_prgdestroy() before it removes the segment.
 */
int qsw_statkey (qsw_jobinfo_t jobinfo)
{
	if (shmid < 0)
		return (-1);

	return (elan_statkey (jobinfo->j_prognum));
}

/*
 * Create shared memory segment for Elan stats use
 * (ELAN_STATKEY env var is set in switch_elan.c)
 *
 * The segment key is elan_statkey() of the job's program number and the
 * size is one page for each of (HighContext - LowContext + 3) slots.
 * On success the identifier is left in the file-scope `shmid' and the
 * segment is owned by `uid' with mode 0600.
 *
 * Returns 0 on success. On failure returns the value of error(), and
 * any segment obtained here is removed again with shmid reset to -1 so
 * that qsw_statkey() does not hand out a key for an unusable segment.
 */
static int
_qsw_shmem_create (qsw_jobinfo_t jobinfo, uid_t uid)
{
	struct shmid_ds shm;
	ELAN_CAPABILITY *cap = &jobinfo->j_cap;
	key_t key = elan_statkey (jobinfo->j_prognum);
	int maxLocal = cap->HighContext - cap->LowContext + 2;
	int pgsize = getpagesize ();
	int rc;

	/* Create with owner read/write so the IPC_STAT below is permitted
	 * for the creating process.
	 */
	shmid = shmget (key, pgsize * (maxLocal + 1), IPC_CREAT | 0600);
	if (shmid < 0)
		return (error ("Failed to create Elan state shmem: %m"));

	/* IPC_SET copies uid, gid and mode out of shm_perm. Start from the
	 * segment's current values so that the fields we do not change
	 * (notably gid) are not taken from uninitialized stack memory.
	 */
	if (shmctl (shmid, IPC_STAT, &shm) < 0) {
		rc = error ("Failed to stat Elan state shm: %m");
		goto fail;
	}

	/* Ensure permissions on segment allow user read/write access
	 */
	shm.shm_perm.uid = uid;
	shm.shm_perm.mode = 0600;

	if (shmctl (shmid, IPC_SET, &shm) < 0) {
		rc = error ("Failed to set perms on Elan state shm: %m");
		goto fail;
	}

	return (0);

fail:
	/* The user cannot use a segment in this state; remove it. */
	shmctl (shmid, IPC_RMID, NULL);
	shmid = -1;
	return (rc);
}

/*
* Process 2: Create the context and make capability available to children.
*/
Expand Down Expand Up @@ -785,6 +847,14 @@ qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid)
goto fail;
}


/*
* Create shared memory for libelan state
* Failure to create shared memory is not a fatal error.
*/
_qsw_shmem_create (jobinfo, uid);


/* note: _elan3_fini() destroys context and makes capability unavail */
/* do it in qsw_prog_fini() after app terminates */
return 0;
Expand Down

0 comments on commit 62cd8e7

Please sign in to comment.