Permalink
Browse files

This commit was manufactured by cvs2svn to create tag

'slurm-0-2-24-1'.
  • Loading branch information...
1 parent e5cea5e commit b371864c12c669e16272d1cceaf85a798f4d0a46 no author committed Feb 28, 2004
View
4 META
@@ -7,6 +7,6 @@
Name: slurm
Major: 0
Minor: 2
- Micro: 20
- Version: 0.2.20
+ Micro: 24
+ Version: 0.2.24
Release: 1
View
37 NEWS
@@ -1,6 +1,38 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.2.24
+=========================
+ -- Fixes for reported problems:
+ - slurm/387: Jobs lost and nodes DOWN on slurmctld restart
+ -- Nodes no longer transition from COMPLETING to DOWN when not responding.
+ Instead, slurmctld either waits for positive verification of job
+ completion, or an administrator must explicitly set the node DOWN.
+ -- New srun option `-q, --quit-on-interrupt' enables single Ctrl-C to
+ terminate running job.
+ -- Improved error message from srun when jobid set in environment is
+ no longer running.
+ -- Added job and node state descriptions to the squeue(1) and sinfo(1)
+ man pages.
+
+* Changes in SLURM 0.2.23
+=========================
+ -- Fixes for reported problems:
+ - slurm/381: Hold jobs requesting more resources than partition limit.
+
+* Changes in SLURM 0.2.22
+=========================
+ -- Fixes for reported problems:
+ - slurm/326: Node stays in completing state indefinitely.
+ - slurm/328: slurmd uses different shared memory key on restart.
+ - slurm/329: Job step processes may be left running when one task dies.
+ - slurm/334: slurmd segv with multiple simultaneous job steps.
+ -- Allow more digits for priority values in scontrol.
+ -- Applied various fixes for memory leaks.
+ -- Remove logic preventing DPCS from allocating jobs with more than
+ eight node segments. Fix for DPCS should now be in production.
+ -- Changed compact string for DRAINING state to "drng" from "drain."
+
* Changes in SLURM 0.2.21
=========================
-- Fixes for reported problems:
@@ -9,14 +41,17 @@ documents those changes that are of interest to users and admins.
- slurm/300: Possibly killing wrong job on slurmd restart
- slurm/312: Freeing non-allocated memory and killing slurmd
-- Assorted changes to support RedHat Enterprise Linux 3.0 and IA64
+ -- Initial Elan4 and libelanctrl support (--with-elan).
-- Slurmctld was sometimes inappropriately setting a job's priority
to 1 when a node was down (even if up nodes could be used for the
job when a running job completes)
-- Convert all user commands from use of popt library to getopt_long()
-- If TotalView support is requested, srun exports "totalview_jobid"
variable for `%J' expansion in TV bulk launch string.
+ -- Fix several locking bugs in slurmd IO layer.
-- Throttle back repetitious error messages in slurmd to avoid filling
- slurm logfiles.
+ log files.
+
* Changes in SLURM 0.2.20
=========================
View
@@ -1,4 +1,4 @@
-.TH SINFO "1" "October 2003" "sinfo 0.2" "Slurm components"
+.TH SINFO "1" "February 2004" "sinfo 0.2" "Slurm components"
.SH "NAME"
sinfo \- Used to view information about Slurm nodes and partitions.
@@ -209,6 +209,52 @@ that are presently not responding.
\fBTMP_DISK\fR
Size of temporary disk space in megabytes on these nodes.
+.SH "NODE STATE CODES"
+.PP
+Node state codes are shortened as required for the field size.
+If the node state code is followed by "*", this indicates the node
+is presently not responding and will not be allocated any new work.
+If the node remains non-responsive, it will be placed in the \fBDOWN\fR
+state.
+.TP 12
+ALLOCATED
+The node has been allocated to one or more jobs.
+.TP
+COMPLETING
+One or more jobs have been allocated this node and are in the process
+of COMPLETING. This node state will be left when all of the job's
+processes have terminated and the SLURM epilog program (if any) has
+terminated. See the \fBEpilog\fR parameter description in the
+\fBslurm.conf\fR man page for more information.
+.TP
+DOWN
+The node is unavailable for use. SLURM can automatically place nodes
+in this state if some failure occurs. System administrators may also
+explicitly place nodes in this state. If a node resumes normal operation,
+SLURM can automatically return it to service. See the \fBReturnToService\fR
+and \fBSlurmdTimeout\fR parameter descriptions in the \fBslurm.conf\fR(5)
+man page for more information.
+.TP
+DRAINED
+The node is unavailable for use per system administrator request.
+See the \fBupdate node\fR command in the \fBscontrol\fR(1) man page
+or the \fBslurm.conf\fR(5) man page for more information.
+.TP
+DRAINING
+The node is currently executing a job, but will not be allocated to
+additional jobs. The node state will be changed to state \fBDRAINED\fR
+when the last job on it completes. Nodes enter this state per system
+administrator request. See the \fBupdate node\fR command in the
+\fBscontrol\fR(1) man page or the \fBslurm.conf\fR(5) man page for
+more information.
+.TP
+IDLE
+The node is not allocated to any jobs and is available for use.
+.TP
+UNKNOWN
+The SLURM controller has just started and the node's state has not
+yet been determined.
+
.SH "ENVIRONMENT VARIABLES"
.PP
Some \fBsinfo\fR options may be set via environment variables. These
View
@@ -1,4 +1,4 @@
-.TH SQUEUE "1" "October 2003" "squeue 0.2" "Slurm components"
+.TH SQUEUE "1" "February 2004" "squeue 0.2" "Slurm components"
.SH "NAME"
squeue \- Used to view information of jobs located in the scheduling queue.
@@ -148,6 +148,26 @@ Report details of squeue's actions.
\fB\-V\fR , \fB\-\-version\fR
Print version information and exit.
+.SH "JOB STATE CODES"
+.TP 17
+CD COMPLETED
+Job has terminated all processes on all nodes.
+.TP
+CG COMPLETING
+Job is in the process of completing. Some processes on some nodes may still be active.
+.TP
+F FAILED
+Job terminated with non-zero exit code or other failure condition.
+.TP
+NF NODE_FAIL
+Job terminated due to failure of one or more allocated nodes.
+.TP
+PD PENDING
+Job is awaiting resource allocation.
+.TP
+TO TIMEOUT
+Job terminated upon reaching its time limit.
+
.SH "ENVIRONMENT VARIABLES"
.PP
Some \fBsqueue\fR options may be set via environment variables. These
View
@@ -205,10 +205,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and \fBSlurmdPort\fR
must be different.
.TP
\fBSlurmdSpoolDir\fR
-Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state
-information is written. This must be a common pathname for all nodes, but
-should represent a file which is local to each node (reference a local file
-system). The default value is "/tmp/slurmd".
+Fully qualified pathname of a directory into which the \fBslurmd\fR
+daemon's state information and batch job script information are written. This
+must be a common pathname for all nodes, but should represent a directory which
+is local to each node (reference a local file system). The default value
+is "/var/spool/slurmd". \fBNOTE\fR: This directory is also used to store \fBslurmd\fR's
+shared memory lockfile, and \fBshould not be changed\fR unless the system
+is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR is
+changed and \fBslurmd\fR is restarted, the new daemon will attach to a
+different shared memory region and lose track of any running jobs.
.TP
\fBSlurmdTimeout\fR
The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR
@@ -228,7 +233,7 @@ Fully qualified pathname of the file system available to user jobs for
temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space.
The default value is "/tmp".
.TP
-\fBWaitTimefR
+\fBWaitTime\fR
Specifies how many seconds the srun command should by default wait after
the first task terminates before terminating all remaining tasks. The
"--wait" option on the srun command line overrides this value.
View
@@ -796,7 +796,8 @@ extern int slurm_update_node PARAMS(( update_node_msg_t * node_msg ));
* default values
* OUT job_desc_msg - user defined partition descriptor
*/
-void slurm_init_part_desc_msg PARAMS((update_part_msg_t * update_part_msg ));
+extern void slurm_init_part_desc_msg PARAMS((
+ update_part_msg_t * update_part_msg ));
/*
* slurm_load_partitions - issue RPC to get slurm all partition configuration
View
@@ -43,19 +43,22 @@ libslurm_la_SOURCES = \
common_dir = $(top_builddir)/src/common
-libslurm_la_LIBADD = $(common_dir)/libcommon.la -lpthread
+libslurm_la_LIBADD = \
+ $(common_dir)/libcommon.la -lpthread
-libslurm_la_LDFLAGS = -export-symbols libslurm.sym \
- -version-info $(current):$(rev):$(age)
+libslurm_la_LDFLAGS = \
+ -export-symbols libslurm.sym \
+ -version-info $(current):$(rev):$(age)
-libslurm_la_DEPENDENCIES = libslurm.sym $(common_dir)/libcommon.la
+libslurm_la_DEPENDENCIES = \
+ libslurm.sym \
+ $(common_dir)/libcommon.la
-
-libslurm.sym : $(top_builddir)/slurm/slurm.h
- sed -n 's/^extern.* \(slurm[^ ]*\).*$$/\1/p' $< >libslurm.sym
+libslurm.sym : $(top_builddir)/slurm/slurm.h
+ -sed -n 's/^extern .* \([a-zA-Z0-9_]*\) PARAMS.*$$/\1/p' $< $* >libslurm.sym
distclean-local:
- -rm libslurm.sym
+ -rm -rf libslurm.map libslurm.sym
force:
$(libslurm_la_LIBADD) : force
View
@@ -89,6 +89,7 @@ slurm_allocate_resources (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -127,6 +128,7 @@ int slurm_job_will_run (job_desc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -183,7 +185,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
-
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -222,6 +224,7 @@ slurm_job_step_create (job_step_create_request_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -259,6 +262,7 @@ slurm_confirm_allocation (old_job_alloc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch(resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
View
@@ -73,7 +73,8 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
return SLURM_FAILURE;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_SUCCESS;
}
View
@@ -85,7 +85,8 @@ slurm_complete_job_step ( uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
View
@@ -151,10 +151,10 @@ slurm_load_ctl_conf (time_t update_time, slurm_ctl_conf_t **confp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_BUILD_INFO:
*confp = (slurm_ctl_conf_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
@@ -219,10 +219,10 @@ slurm_load_jobs (time_t update_time, job_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_INFO:
*resp = (job_info_msg_t *)resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -267,6 +267,7 @@ slurm_pid2jobid (pid_t job_pid, uint32_t *jobid)
if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_ID:
*jobid = ((job_id_response_msg_t *) resp_msg.data)->job_id;
View
@@ -125,10 +125,10 @@ slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_STEP_INFO:
*resp = (job_step_info_response_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
@@ -129,10 +129,10 @@ slurm_load_node (time_t update_time, node_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_NODE_INFO:
*resp = (node_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
@@ -161,10 +161,10 @@ slurm_load_partitions (time_t update_time, partition_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_PARTITION_INFO:
*resp = (partition_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
@@ -58,7 +58,8 @@ slurm_reconfigure ( void )
if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
@@ -129,6 +130,7 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR);
+ slurm_free_cred(resp_msg.cred);
if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR);
View
@@ -87,6 +87,7 @@ slurm_submit_batch_job (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
Oops, something went wrong.

0 comments on commit b371864

Please sign in to comment.