Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

This commit was manufactured by cvs2svn to create tag

'slurm-0-2-27-1'.
  • Loading branch information...
commit 05331e620ca21f5bfda5df723274a6952b2a6859 1 parent e5cea5e
no author authored
Showing with 793 additions and 934 deletions.
  1. +2 −2 META
  2. +55 −1 NEWS
  3. +211 −121 doc/man/man1/sinfo.1
  4. +21 −1 doc/man/man1/squeue.1
  5. +6 −0 doc/man/man1/srun.1
  6. +10 −5 doc/man/man5/slurm.conf.5
  7. +2 −1  slurm/slurm.h.in
  8. +11 −8 src/api/Makefile.am
  9. +5 −1 src/api/allocate.c
  10. +2 −1  src/api/cancel.c
  11. +2 −1  src/api/complete.c
  12. +1 −1  src/api/config_info.c
  13. +2 −1  src/api/job_info.c
  14. +1 −1  src/api/job_step_info.c
  15. +1 −1  src/api/node_info.c
  16. +1 −1  src/api/partition_info.c
  17. +3 −1 src/api/reconfigure.c
  18. +1 −0  src/api/submit.c
  19. +0 −387 src/common/elanhosts.c
  20. +0 −121 src/common/elanhosts.h
  21. +6 −7 src/common/log.c
  22. +8 −3 src/common/slurm_auth.c
  23. +1 −1  src/common/slurm_errno.c
  24. +1 −1  src/common/slurm_protocol_defs.c
  25. +0 −4 src/common/slurm_protocol_pack.c
  26. +1 −0  src/common/xsignal.c
  27. +1 −0  src/plugins/auth/auth_munge.c
  28. +1 −1  src/scontrol/scontrol.c
  29. +135 −101 src/sinfo/opts.c
  30. +8 −0 src/sinfo/sinfo.c
  31. +3 −0  src/sinfo/sinfo.h
  32. +3 −4 src/slurmctld/agent.c
  33. +28 −9 src/slurmctld/controller.c
  34. +12 −32 src/slurmctld/job_mgr.c
  35. +20 −11 src/slurmctld/node_mgr.c
  36. +8 −2 src/slurmctld/node_scheduler.c
  37. +3 −0  src/slurmctld/partition_mgr.c
  38. +23 −6 src/slurmctld/ping_nodes.c
  39. +0 −1  src/slurmd/interconnect.h
  40. +32 −21 src/slurmd/io.c
  41. +8 −2 src/slurmd/mgr.c
  42. +5 −0 src/slurmd/req.c
  43. +30 −13 src/slurmd/shm.c
  44. +50 −43 src/slurmd/slurmd.c
  45. +3 −1 src/slurmd/ulimits.c
  46. +8 −4 src/srun/allocate.c
  47. +21 −3 src/srun/io.c
  48. +15 −5 src/srun/job.c
  49. +2 −1  src/srun/job.h
  50. +9 −2 src/srun/opt.c
  51. +1 −0  src/srun/opt.h
  52. +4 −0 src/srun/signals.c
  53. +6 −0 src/srun/srun.c
View
4 META
@@ -7,6 +7,6 @@
Name: slurm
Major: 0
Minor: 2
- Micro: 20
- Version: 0.2.20
+ Micro: 27
+ Version: 0.2.27
Release: 1
View
56 NEWS
@@ -1,6 +1,57 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.2.27
+=========================
+ -- Fix for slurmd spinning when stdin buffers are full (gnats:434)
+ -- Forward stacksize limit to jobs.
+
+* Changes in SLURM 0.2.26
+=========================
+ -- Add missing documentation for srun -q,--quit-on-interrupt flag.
+ -- Add new sinfo option -R,--list-reasons for listing down nodes along
+ with their REASON field.
+ -- Add new sinfo options -r,--responding and -d,--dead for filtering
+ responding or non-responding nodes in output.
+
+* Changes in SLURM 0.2.25
+=========================
+ -- Fixes for reported problems:
+ - slurm/381: Hold jobs requesting more resources than partition limit.
+ (patch to fix in version 0.2.23 for certain combination of srun options).
+
+* Changes in SLURM 0.2.24
+=========================
+ -- Fixes for reported problems:
+ - slurm/387: Jobs lost and nodes DOWN on slurmctld restart
+ -- Nodes no longer transition from COMPLETING to DOWN when not responding.
+ Instead, slurmctld either waits for positive verification of job
+ completion, or an administrator must explicitly set the node DOWN.
+ -- New srun option `-q, --quit-on-interrupt' enables single Ctrl-C to
+ terminate running job.
+ -- Improved error message from srun when jobid set in environment is
+ no longer running.
+ -- Added job and node state descriptions to the squeue(1) and sinfo(1)
+ man pages.
+
+* Changes in SLURM 0.2.23
+========================
+ -- Fixes for reported problems:
+ - slurm/381: Hold jobs requesting more resources than partition limit.
+
+* Changes in SLURM 0.2.22
+=========================
+ -- Fixes for reported problems:
+ - slurm/326: Node stays in completing state indefinitely.
+ - slurm/328: slurmd uses different shared memory key on restart.
+ - slurm/329: Job step processes may be left running when one task dies.
+ - slurm/334: slurmd segv with multiple simultaneous job steps.
+ -- Allow more digits for priority values in scontrol.
+ -- Applied various fixes for memory leaks.
+ -- Remove logic preventing DPCS from allocating jobs with more than
+ eight node segments. Fix for DPCS should now be in production.
+ -- Changed compact string for DRAINING state to "drng" from "drain."
+
* Changes in SLURM 0.2.21
=========================
-- Fixes for reported problems:
@@ -9,14 +60,17 @@ documents those changes that are of interest to users and admins.
- slurm/300: Possibly killing wrong job on slurmd restart
- slurm/312: Freeing non-allocated memory and killing slurmd
-- Assorted changes to support RedHat Enterprise Linux 3.0 and IA64
+ -- Initial Elan4 and libelanctrl support (--with-elan).
-- Slurmctld was sometimes inappropriately setting a job's priority
to 1 when a node was down (even if up nodes could be used for the
job when a running job completes)
-- Convert all user commands from use of popt library to getopt_long()
-- If TotalView support is requested, srun exports "totalview_jobid"
variable for `%J' expansion in TV bulk launch string.
+ -- Fix several locking bugs in slurmd IO layer.
-- Throttle back repetitious error messages in slurmd to avoid filling
- slurm logfiles.
+ log files.
+
* Changes in SLURM 0.2.20
=========================
View
332 doc/man/man1/sinfo.1
@@ -1,7 +1,7 @@
-.TH SINFO "1" "October 2003" "sinfo 0.2" "Slurm components"
+.TH SINFO "1" "February 2004" "sinfo 0.2" "Slurm components"
.SH "NAME"
-sinfo \- Used to view information about Slurm nodes and partitions.
+sinfo \- view information about SLURM nodes and partitions.
.SH "SYNOPSIS"
\fBsinfo\fR [\fIOPTIONS\fR...]
@@ -17,12 +17,15 @@ Print a message describing all \fBsinfo\fR options.
\fB\-\-usage\fR
Print a brief message listing the \fBsinfo\fR options.
.TP
+\fB\-d\fR, \fB\-\-dead\fR
+If set only report state information for non-responding (dead) nodes.
+.TP
\fB\-e\fR, \fB\-\-exact\fR
-If set, do not group node information on multiple nodes unless their
-configurations to be reported are identical. Otherwise cpu count,
-memory size, and disk space for nodes will be listed with the minimum
-value followed by a "+" for nodes with the same partition and state
-(e.g., "250+").
+If set, do not group node information on multiple nodes unless
+their configurations to be reported are identical. Otherwise
+cpu count, memory size, and disk space for nodes will be listed
+with the minimum value followed by a "+" for nodes with the
+same partition and state (e.g., "250+").
.TP
\fB\-h\fR, \fB\-\-noheader\fR
Do not print a header on the output.
@@ -38,9 +41,8 @@ This is ignored if the \fB\-\-format\fR option is specified.
\fB\-n <nodes>\fR , \fB\-\-nodes=<nodes>\fR
Print information only about the specified node(s).
Multiple nodes may be comma separated or expressed using a
-node range expression.
-For example "linux[00-07]" would indicate eight nodes, "linux00"
-through "linux07".
+node range expression. For example "linux[00-07]" would
+indicate eight nodes, "linux00" through "linux07."
.TP
\fB\-N\fR , \fB\-\-Node\fR
Print information in a node-oriented format.
@@ -48,66 +50,118 @@ The default is to print information in a partition-oriented format.
This is ignored if the \fB\-\-format\fR option is specified.
.TP
\fB\-o <output_format>\fR, \fB\-\-format=<output_format>\fR
-Specify the information to be displayed.
-The default format is "%9P %5a %.9l %.5D %6t %N".
-If the \fB\-\-summarize\fR option is specified, the default
-format is "%9P %5a %.9l %15F %N".
-If the \fB\-\-long\fR option is specified, the default format is
-"%9P %5a %.9l %.8s %4r %5h %10g %.5D %11T %N".
-If the \fB\-\-Node\fR option is specified, the default format is
-"%#N %.5D %9P %6t".
-If the \fB\-\-Node\fR and \fB\-\-long\fR options are both specified,
-the default format is "%#N %.5D %9P %11T %.4c %.6m %.8d %.6w %8f %R".
-In the above two formats the value of "#" represents the maximum
-length of an node list to be printed.
+Specify the information to be displayed using an \fBsinfo\fR
+format string. Format strings transparently used by \fBsinfo\fR
+when running with various options are
+.RS
+.TP 6
+.I "default"
+"%9P %5a %.9l %.5D %6t %N"
+.TP
+.I "--summarize"
+"%9P %5a %.9l %15F %N"
+.TP
+.I "--long"
+"%9P %5a %.9l %.8s %4r %5h %10g %.5D %11T %N"
+.TP
+.I "--Node"
+"%#N %.5D %9P %6t"
+.TP
+.I "--long --Node"
+"%#N %.5D %9P %11T %.4c %.6m %.8d %.6w %8f %R"
+.TP
+.I "--list-reasons"
+"%35R %N"
+.RE
+
+In the above format strings the use of "#" represents the
+maximum length of a node list to be printed.
The field specifications available include:
-.br
-\fB%a\fR State/availability of a partition
-.br
-\fB%A\fR Number of nodes by state in the format "allocated/idle".
-Do not use this with a node state option ("%t" or "%T") or
-the different node states will be placed on separate lines.
-.br
-\fB%c\fR Number of CPUs per node
-.br
-\fB%d\fR Size of temporary disk space per node in megabytes
-.br
-\fB%D\fR Number of nodes
-.br
-\fB%f\fR Features associated with the nodes
-.br
-\fB%F\fR Number of nodes by state in the format "allocated/idle/other/total".
-Do not use this with a node state option ("%t" or "%T") or
+.RS
+.TP 4
+\fB%a\fR
+State/availability of a partition
+.TP
+\fB%A\fR
+Number of nodes by state in the format "allocated/idle".
+Do not use this with a node state option ("%t" or "%T") or
the different node states will be placed on separate lines.
-.br
-\fB%g\fR Groups which may use the nodes
-.br
-\fB%h\fR Jobs may share nodes, "yes", "no", or "force"
-.br
-\fB%l\fR Maximum time for any job in the format "days:hours:minutes:seconds"
-.br
-\fB%m\fR Size of memory per node in megabytes
-.br
-\fB%N\fR List of node names
-.br
-\fB%P\fR Partition name
-.br
-\fB%r\fR Only user root may initiate jobs, "yes" or "no"
-.br
-\fB%R\fR The reason a node is unavailable (down, drained, or draining states)
-.br
-\fB%s\fR Maximum job size in nodes
-.br
-\fB%t\fR State of nodes, compact form
-.br
-\fB%T\fR State of nodes, extended form
-.br
-\fB%w\fR Scheduling weight of the nodes
-.br
-\fB%.<*>\fR right justification of the field
-.br
-\fB%<Number><*>\fR size of field
+.TP
+\fB%c\fR
+Number of CPUs per node
+.TP
+\fB%d\fR
+Size of temporary disk space per node in megabytes
+.TP
+\fB%D\fR
+Number of nodes
+.TP
+\fB%f\fR
+Features associated with the nodes
+.TP
+\fB%F\fR
+Number of nodes by state in the format
+"allocated/idle/other/total". Do not use this with a node
+state option ("%t" or "%T") or the different node states will
+be placed on separate lines.
+.TP
+\fB%g\fR
+Groups which may use the nodes
+.TP
+\fB%h\fR
+Jobs may share nodes, "yes", "no", or "force"
+.TP
+\fB%l\fR
+Maximum time for any job in the format "days:hours:minutes:seconds"
+.TP
+\fB%m\fR
+Size of memory per node in megabytes
+.TP
+\fB%N\fR
+List of node names
+.TP
+\fB%P\fR
+Partition name
+.TP
+\fB%r\fR
+Only user root may initiate jobs, "yes" or "no"
+.TP
+\fB%R\fR
+The reason a node is unavailable (down, drained, or draining states)
+.TP
+\fB%s\fR
+Maximum job size in nodes
+.TP
+\fB%t\fR
+State of nodes, compact form
+.TP
+\fB%T\fR
+State of nodes, extended form
+.TP
+\fB%w\fR
+Scheduling weight of the nodes
+.TP
+\fB%.<*>\fR
+right justification of the field
+.TP
+\fB%<Number><*>\fR
+size of field
+.RE
+.TP
+\fB\-r\fR, \fB\-\-responding\fR
+If set only report state information for responding nodes.
+.TP
+\fB\-R\fR, \fB--list-reasons\fR
+List reasons nodes are down or drained. When nodes are in
+these states SLURM supports optional inclusion of a "reason"
+string by an administrator. This option will display the first
+35 characters of the reason field and list of nodes with that
+reason for all nodes that are, by default, down, drained, or
+draining. This option may be used with other node filtering
+options (e.g. \fB\-r\fR, \fB\-d\fR, \fB\-t\fR, \fB\-n\fR),
+however, combinations of these options that result in a list of
+nodes that are not down or drained will not produce any output.
.TP
\fB\-s\fR , \fB\-\-summarize\fR
List only a partition state summary with no node state details.
@@ -135,9 +189,11 @@ insensitive.
Possible values include: ALLOC, ALLOCATED, COMP,
COMPLETING, DOWN, DRAIN, DRAINED, DRAINING, IDLE, UNK, and UNKNOWN.
A "*" suffix may be appended to any of these states to search for
-nodes in the specified state which are not responding.
+nodes in the specified state which are not responding.
By default nodes in the specified state are reported whether they are
responding or not.
+Also note the \fB\-\-dead\fR and \fB\-\-responding\fR options for
+filtering nodes by state.
.TP
\fB\-p <partition>\fR, \fB\-\-partition=<partition>\fR
Print information only about the specified partition.
@@ -148,27 +204,29 @@ Provide detailed event logging through program execution.
\fB\-V\fR , \fB\-\-version\fR
Print version information and exit.
-.SH "OUTPUT"
+.SH "OUTPUT FIELD DESCRIPTIONS"
.TP
\fBAVAIL\fR
-Partition state, \fBup\fR or \fBdown\fR.
+Partition state: \fBup\fR or \fBdown\fR.
.TP
\fBCPUS\fR
Count of CPUs (processors) on these nodes.
.TP
\fBGROUPS\fR
-Resource allocations in this partition are restricted to the named groups.
-\fBall\fR indicates that all groups may use this partition.
+Resource allocations in this partition are restricted to the
+named groups. \fBall\fR indicates that all groups may use
+this partition.
.TP
\fBJOB_SIZE\fR
-Minimum and maximum node count that can be allocated to any user job.
-A single number indicates the minimum and maximum node count are the
-same.
-\fBinfinite\fR is used to identify partitions without a maximum node count.
+Minimum and maximum node count that can be allocated to any
+user job. A single number indicates the minimum and maximum
+node count are the same. \fBinfinite\fR is used to identify
+partitions without a maximum node count.
.TP
\fBTIMELIMIT\fR
-Maximum time limit for any user job in days:hours:minutes:seconds.
-\fBinfinite\fR is used to identify partitions without a job time limit.
+Maximum time limit for any user job in
+days:hours:minutes:seconds. \fBinfinite\fR is used to identify
+partitions without a job time limit.
.TP
\fBMEMORY\fR
Size of real memory in megabytes on these nodes.
@@ -180,7 +238,7 @@ Names of nodes associated with this configuration/partition.
Count of nodes with this particular configuration.
.TP
\fBNODES(A/I)\fR
-Count of nodes with this particular configuration by node
+Count of nodes with this particular configuration by node
state in the form "available/idle".
.TP
\fBNODES(A/I/O/T)\fR
@@ -209,6 +267,52 @@ that are presently not responding.
\fBTMP_DISK\fR
Size of temporary disk space in megabytes on these nodes.
+.SH "NODE STATE CODES"
+.PP
+Node state codes are shortened as required for the field size.
+If the node state code is followed by "*", this indicates the node
+is presently not responding and will not be allocated any new work.
+If the node remains non-responsive, it will be placed in the \fBDOWN\fR
+state.
+.TP 12
+ALLOCATED
+The node has been allocated to one or more jobs.
+.TP
+COMPLETING
+One or more jobs have been allocated this node and are in the process
+of COMPLETING. This node state will be left when all of the job's
+processes have terminated and the SLURM epilog program (if any) has
+terminated. See the \fBEpilog\fR parameter description in the
+\fBslurm.conf\fR man page for more information.
+.TP
+DOWN
+The node is unavailable for use. SLURM can automatically place nodes
+in this state if some failure occurs. System administrators may also
+explicitly place nodes in this state. If a node resumes normal operation,
+SLURM can automatically return it to service. See the \fBReturnToService\fR
+and \fBSlurmdTimeout\fR parameter descriptions in the \fBslurm.conf\fR(5)
+man page for more information.
+.TP
+DRAINED
+The node is unavailable for use per system administrator request.
+See the \fBupdate node\fR command in the \fBscontrol\fR(1) man page
+or the \fBslurm.conf\fR(5) man page for more information.
+.TP
+DRAINING
+The node is currently executing a job, but will not be allocated to
+additional jobs. The node state will be changed to state \fBDRAINED\fR
+when the last job on it completes. Nodes enter this state per system
+administrator request. See the \fBupdate node\fR command in the
+\fBscontrol\fR(1) man page or the \fBslurm.conf\fR(5) man page for
+more information.
+.TP
+IDLE
+The node is not allocated to any jobs and is available for use.
+.TP
+UNKNOWN
+The SLURM controller has just started and the node's state has not
+yet been determined.
+
.SH "ENVIRONMENT VARIABLES"
.PP
Some \fBsinfo\fR options may be set via environment variables. These
@@ -227,80 +331,66 @@ SINFO_SORT
.SH "EXAMPLES"
.eo
Report basic node and partition configurations:
-.br
+
+.nf
+
> sinfo
-.br
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
-.br
batch up infinite 2 alloc adev[8-9]
-.br
batch up infinite 6 idle adev[10-15]
-.br
debug* up 30:00 8 idle adev[0-7]
-.br
+
+.fi
-.br
Report partition summary information:
-.br
+.nf
+
> sinfo -s
-.br
PARTITION AVAIL TIMELIMIT NODES(A/I/O/T) NODELIST
-.br
batch up infinite 2/6/0/8 adev[8-15]
-.br
debug* up 30:00 0/8/0/8 adev[0-7]
-.br
+
+.fi
-.br
Report more complete information about the partition debug:
-.br
+.nf
+
> sinfo --long --partition=debug
-.br
PARTITION AVAIL TIMELIMIT JOB_SIZE ROOT SHARE GROUPS NODES STATE NODELIST
-.br
debug* up 30:00 8 no no all 8 idle dev[0-7]
-.br
+.fi
-.br
Report only those nodes that are in state DRAINED:
-.br
+.nf
+
> sinfo --states=drained
-.br
PARTITION AVAIL NODES TIMELIMIT STATE NODELIST
-.br
debug* up 2 30:00 drain adev[6-7]
-.br
-.br
+.fi
+
Report node-oriented information with details and exact matches:
-.br
+.nf
+
> sinfo -Nel
-.br
NODELIST NODES PARTITION STATE CPUS MEMORY TMP_DISK WEIGHT FEATURES REASON
-.br
adev[0-1] 2 debug* idle 2 3448 38536 16 (null) (null)
-.br
adev[2,4-7] 5 debug* idle 2 3384 38536 16 (null) (null)
-.br
adev3 1 debug* idle 2 3394 38536 16 (null) (null)
-.br
adev[8-9] 2 batch allocated 2 246 82306 16 (null) (null)
-.br
adev[10-15] 6 batch idle 2 246 82306 16 (null) (null)
-.br
-.br
+.fi
+
Report only down, drained and draining nodes and their reason field:
-.br
-> sinfo --states=down,drained,draining --sort=N --format="%12N %9T %R"
-.br
-NODELIST STATE REASON
-.br
-dev[12,18] DRAINED power supply, ETA Dec25
-.br
-dev45 DOWN* (null)
-.br
-dev123 DRAINED memory, ETA Nov24
+.nf
+
+> sinfo -R
+REASON NODELIST
+Memory errors dev[0,5]
+Not Responding dev8
+
+.fi
.ec
.SH "COPYING"
View
22 doc/man/man1/squeue.1
@@ -1,4 +1,4 @@
-.TH SQUEUE "1" "October 2003" "squeue 0.2" "Slurm components"
+.TH SQUEUE "1" "February 2004" "squeue 0.2" "Slurm components"
.SH "NAME"
squeue \- Used to view information of jobs located in the scheduling queue.
@@ -148,6 +148,26 @@ Report details of squeues actions.
\fB\-V\fR , \fB\-\-version\fR
Print version information and exit.
+.SH "JOB STATE CODES"
+.TP 17
+CD COMPLETED
+Job has terminated all processes on all nodes.
+.TP
+CG COMPLETING
+Job is in the process of completing. Some processes on some nodes may still be active.
+.TP
+F FAILED
+Job terminated with non-zero exit code or other failure condition.
+.TP
+NF NODE_FAIL
+Job terminated due to failure of one or more allocated nodes.
+.TP
+PD PENDING
+Job is awaiting resource allocation.
+.TP
+TO TIMEOUT
+Job terminated upon reaching its time limit.
+
.SH "ENVIRONMENT VARIABLES"
.PP
Some \fBsqueue\fR options may be set via environment variables. These
View
6 doc/man/man1/srun.1
@@ -208,6 +208,12 @@ Specify how long to wait after the first task terminates before terminating
all remaining tasks. The default value is unlimited. This can be useful to
insure that a job is terminated in a timely fashion in the event that one
or more tasks terminate prematurely.
+.TP
+\fB\-q\fR, \fB\-\-quit\-on\-interrupt\fR
+Quit immediately on single SIGINT (Ctrl-C). Use of this option
+disables the status feature normally available when \fBsrun\fR receives
+a single Ctrl-C and causes \fBsrun\fR to instead immediately terminate the
+running job.
.PP
Allocate options:
.TP
View
15 doc/man/man5/slurm.conf.5
@@ -205,10 +205,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and \fBSlurmdPort\fR
must be different.
.TP
\fBSlurmdSpoolDir\fR
-Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state
-information is written. This must be a common pathname for all nodes, but
-should represent a file which is local to each node (reference a local file
-system). The default value is "/tmp/slurmd".
+Fully qualified pathname of a directory into which the \fBslurmd\fR
+daemon's state information and batch job script information are written. This
+must be a common pathname for all nodes, but should represent a directory which
+is local to each node (reference a local file system). The default value
+is "/var/spool/slurmd." \fBNOTE\fR: This directory is also used to store \fBslurmd\fR's
+shared memory lockfile, and \fBshould not be changed\fR unless the system
+is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR is
+changed and \fBslurmd\fR is restarted, the new daemon will attach to a
+different shared memory region and lose track of any running jobs.
.TP
\fBSlurmdTimeout\fR
The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR
@@ -228,7 +233,7 @@ Fully qualified pathname of the file system available to user jobs for
temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space.
The default value is "/tmp".
.TP
-\fBWaitTimefR
+\fBWaitTime\fR
Specifies how many seconds the srun command should by default wait after
the first task terminates before terminating all remaining tasks. The
"--wait" option on the srun command line overrides this value.
View
3  slurm/slurm.h.in
@@ -796,7 +796,8 @@ extern int slurm_update_node PARAMS(( update_node_msg_t * node_msg ));
* default values
* OUT job_desc_msg - user defined partition descriptor
*/
-void slurm_init_part_desc_msg PARAMS((update_part_msg_t * update_part_msg ));
+extern void slurm_init_part_desc_msg PARAMS((
+ update_part_msg_t * update_part_msg ));
/*
* slurm_load_partitions - issue RPC to get slurm all partition configuration
View
19 src/api/Makefile.am
@@ -43,19 +43,22 @@ libslurm_la_SOURCES = \
common_dir = $(top_builddir)/src/common
-libslurm_la_LIBADD = $(common_dir)/libcommon.la -lpthread
+libslurm_la_LIBADD = \
+ $(common_dir)/libcommon.la -lpthread
-libslurm_la_LDFLAGS = -export-symbols libslurm.sym \
- -version-info $(current):$(rev):$(age)
+libslurm_la_LDFLAGS = \
+ -export-symbols libslurm.sym \
+ -version-info $(current):$(rev):$(age)
-libslurm_la_DEPENDENCIES = libslurm.sym $(common_dir)/libcommon.la
+libslurm_la_DEPENDENCIES = \
+ libslurm.sym \
+ $(common_dir)/libcommon.la
-
-libslurm.sym : $(top_builddir)/slurm/slurm.h
- sed -n 's/^extern.* \(slurm[^ ]*\).*$$/\1/p' $< >libslurm.sym
+libslurm.sym : $(top_builddir)/slurm/slurm.h
+ -sed -n 's/^extern .* \([a-zA-Z0-9_]*\) PARAMS.*$$/\1/p' $< $* >libslurm.sym
distclean-local:
- -rm libslurm.sym
+ -rm -rf libslurm.map libslurm.sym
force:
$(libslurm_la_LIBADD) : force
View
6 src/api/allocate.c
@@ -89,6 +89,7 @@ slurm_allocate_resources (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -127,6 +128,7 @@ int slurm_job_will_run (job_desc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -183,7 +185,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
-
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -222,6 +224,7 @@ slurm_job_step_create (job_step_create_request_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -259,6 +262,7 @@ slurm_confirm_allocation (old_job_alloc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch(resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
View
3  src/api/cancel.c
@@ -73,7 +73,8 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
return SLURM_FAILURE;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_SUCCESS;
}
View
3  src/api/complete.c
@@ -85,7 +85,8 @@ slurm_complete_job_step ( uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
View
2  src/api/config_info.c
@@ -151,10 +151,10 @@ slurm_load_ctl_conf (time_t update_time, slurm_ctl_conf_t **confp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_BUILD_INFO:
*confp = (slurm_ctl_conf_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
3  src/api/job_info.c
@@ -219,10 +219,10 @@ slurm_load_jobs (time_t update_time, job_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_INFO:
*resp = (job_info_msg_t *)resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -267,6 +267,7 @@ slurm_pid2jobid (pid_t job_pid, uint32_t *jobid)
if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_ID:
*jobid = ((job_id_response_msg_t *) resp_msg.data)->job_id;
View
2  src/api/job_step_info.c
@@ -125,10 +125,10 @@ slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_STEP_INFO:
*resp = (job_step_info_response_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
2  src/api/node_info.c
@@ -129,10 +129,10 @@ slurm_load_node (time_t update_time, node_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_NODE_INFO:
*resp = (node_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
2  src/api/partition_info.c
@@ -161,10 +161,10 @@ slurm_load_partitions (time_t update_time, partition_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_PARTITION_INFO:
*resp = (partition_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
4 src/api/reconfigure.c
@@ -58,7 +58,8 @@ slurm_reconfigure ( void )
if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
@@ -129,6 +130,7 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR);
+ slurm_free_cred(resp_msg.cred);
if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR);
View
1  src/api/submit.c
@@ -87,6 +87,7 @@ slurm_submit_batch_job (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
View
387 src/common/elanhosts.c
@@ -1,387 +0,0 @@
-/*****************************************************************************\
- * $Id$
- *****************************************************************************
- * Copyright (C) 2001-2002 The Regents of the University of California.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Mark Grondona <mgrondona@llnl.gov>.
- * UCRL-CODE-2003-005.
- *
- * This file is part of Pdsh, a parallel remote shell program.
- * For details, see <http://www.llnl.gov/linux/pdsh/>.
- *
- * Pdsh is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License along
- * with Pdsh; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
-\*****************************************************************************/
-
-#if HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <assert.h>
-#include <stdarg.h>
-#include <stdlib.h>
-
-#include "src/common/list.h"
-#include "src/common/hostlist.h"
-#include "elanhosts.h"
-
-/* Default ElanId config file */
-#define ELANID_CONFIG_FILE "/etc/elanhosts"
-
-/*
- * Error strings for error codes returned by parse_elanid_config()
- */
-static char *errstr[] =
-{ "No error",
- "Out of memory!",
- "Parse error",
- "Number of ElanIds specified != number of hosts",
- "Type must be \"eip\" \"eth\" or \"other\"",
- NULL
-};
-
-/*
- * Container for converting hostnames to ElanIDs
- */
-struct elan_info {
- elanhost_type_t type; /* type of entry */
- int elanid; /* ElanID corresponding to this hostname */
- char *hostname; /* Resolveable hostname */
-};
-
-struct elanhost_config {
-#ifndef NDEBUG
- int magic;
-# define ELANHOST_CONFIG_MAGIC 0xe100e100
-#endif
- int maxid; /* Storage for max ElanID in config */
- List elanid_list; /* List of elan_info objects describing configuration */
- char errstr[1024]; /* String describing last error from this object */
-};
-
-
-/*
- * Static Prototypes:
- */
-static elanhost_config_t _elanhost_config_alloc(void);
-static void _elanhost_err(elanhost_config_t ec, const char *fmt, ...);
-static int _find_host(struct elan_info *ei, char *key);
-static int _parse_elanid_config(elanhost_config_t ec, const char *path);
-static int _parse_elanid_line(elanhost_config_t ec, char *buf);
-static struct elan_info * _elan_info_create(elanhost_type_t type,
- int elanid, char *hostname);
-static void _elan_info_destroy(struct elan_info *ei);
-
-
-elanhost_config_t elanhost_config_create()
-{
- return _elanhost_config_alloc();
-}
-
-
-int elanhost_config_read(elanhost_config_t ec, const char *filename)
-{
- assert(ec != NULL);
- assert(ec->magic == ELANHOST_CONFIG_MAGIC);
- assert(ec->elanid_list != NULL);
-
- if (filename == NULL)
- filename = ELANID_CONFIG_FILE;
-
- if (_parse_elanid_config(ec, filename) < 0)
- return(-1);
-
- return(0);
-}
-
-void elanhost_config_destroy(elanhost_config_t ec)
-{
- assert(ec != NULL);
- assert(ec->magic == ELANHOST_CONFIG_MAGIC);
- list_destroy(ec->elanid_list);
- assert(ec->magic = ~ELANHOST_CONFIG_MAGIC);
- free(ec);
-}
-
-int elanhost_config_maxid(elanhost_config_t ec)
-{
- assert(ec != NULL);
- assert(ec->magic == ELANHOST_CONFIG_MAGIC);
-
- return ec->maxid;
-}
-
-int elanhost_host2elanid(elanhost_config_t ec, char *host)
-{
- struct elan_info *ei;
-
- assert(ec != NULL);
- assert(host != NULL);
- assert(ec->magic == ELANHOST_CONFIG_MAGIC);
-
- ei = list_find_first(ec->elanid_list, (ListFindF) _find_host, host);
-
- if (!ei) {
- _elanhost_err(ec, "Unable to find host \"%s\" in configuration", host);
- return -1;
- }
-
- return ei->elanid;
-}
-
-const char *elanhost_config_err(elanhost_config_t ec)
-{
- return ec->errstr;
-}
-
-
-struct elanid_find_arg {
- elanhost_type_t type;
- int elanid;
-};
-
-static int _find_elanid(struct elan_info *ei, struct elanid_find_arg *arg)
-{
- if (ei->type != arg->type)
- return 0;
-
- if (ei->elanid != arg->elanid)
- return 0;
-
- return 1;
-}
-
-char *elanhost_elanid2host(elanhost_config_t ec, elanhost_type_t type, int eid)
-{
- struct elan_info *ei;
- struct elanid_find_arg arg;
-
- assert(ec != NULL);
- assert(eid >= 0);
- assert(ec->magic == ELANHOST_CONFIG_MAGIC);
-
- arg.type = type;
- arg.elanid = eid;
-
- ei = list_find_first(ec->elanid_list, (ListFindF) _find_elanid, &arg);
-
- if (!ei) {
- _elanhost_err(ec, "Unable to find host with type=%d elanid=%d",
- type, eid);
- return(NULL);
- }
-
- return ei->hostname;
-}
-
-static elanhost_config_t _elanhost_config_alloc(void)
-{
- elanhost_config_t new = malloc(sizeof(*new));
-
- new->maxid = -1;
- new->elanid_list = list_create((ListDelF) _elan_info_destroy);
-
- assert(new->magic = ELANHOST_CONFIG_MAGIC);
-
- return new;
-}
-
-static void _elanhost_err(elanhost_config_t ec, const char *fmt, ...)
-{
- va_list ap;
-
- assert(ec != NULL);
- assert(fmt != NULL);
-
- va_start(ap, fmt);
- vsnprintf(ec->errstr, 1024, fmt, ap);
- va_end(ap);
-
- return;
-}
-
-/*
- * Parse the "elanhosts" config file which has the form
- *
- * ElanIds Hostnames
- * [n-m] host_n,...,host_m
- * [n-m] host[n-m]
- * etc.
- *
- * and which maps ElanIds to hostnames on the cluster.
- * The results are stored in the config object's elanid_list member.
- *
- * Returns 0 on Success, and an error code < 0 on failure.
- */
-static int _parse_elanid_config(elanhost_config_t ec, const char *path)
-{
- char buf[4096];
- int line;
- FILE *fp;
-
- if (!(fp = fopen(path, "r"))) {
- _elanhost_err(ec, "failed to open %s\n", path);
- return -1;
- }
-
- line = 1;
- while (fgets(buf, 4096, fp)) {
- int rc;
- if ((rc = _parse_elanid_line(ec, buf)) < 0) {
- _elanhost_err(ec, "%s: line %d: %s", path, line, errstr[-rc]);
- return -1;
- }
- line++;
- }
-
- if (fclose(fp) < 0)
- _elanhost_err(ec, "close(%s): %m", path);
-
- return 0;
-}
-
-
-/*
- * Translate type strings "eip," "eth," or "other" into their
- * corresponding elanhost_type_t number
- */
-static elanhost_type_t _get_type_num(char *type)
-{
- if (strcasecmp(type, "eip") == 0)
- return ELANHOST_EIP;
- else if (strcasecmp(type, "eth") == 0)
- return ELANHOST_ETH;
- else if (strcasecmp(type, "other") == 0)
- return ELANHOST_OTHER;
- else
- return -1;
-}
-
-/*
- * Parse one line of elanId list appending results to list "eil"
- *
- * Returns -1 for parse error, -2 if the number of elanids specified
- * doesn't equal the number of hosts.
- *
- * Returns 0 on success
- */
-static int
-_parse_elanid_line(elanhost_config_t ec, char *buf)
-{
- hostlist_t el, hl;
- const char *separators = " \t\n";
- char *type;
- char *elanids;
- char *hosts;
- char *sp, *s;
- int rc = 0;
- int typenum;
-
- /*
- * Nullify any comments
- */
- if ((s = strchr(buf, '#')))
- *s = '\0';
-
- if (!(type = strtok_r(buf, separators, &sp)))
- return 0;
-
- if (!(elanids = strtok_r(NULL, separators, &sp)))
- return -1;
-
- if (!(hosts = strtok_r(NULL, separators, &sp)))
- return -2;
-
- el = hostlist_create(NULL);
- hl = hostlist_create(NULL);
-
- if (!el || !hl) {
- rc = -1;
- goto done;
- }
-
- if (hostlist_push(el, elanids) != hostlist_push(hl, hosts)) {
- rc = -3;
- goto done;
- }
-
- if ((typenum = _get_type_num(type)) < 0)
- return -4;
-
- while ((s = hostlist_shift(el))) {
- char *eptr;
- int elanid = (int) strtoul(s, &eptr, 10);
-
- if (*eptr != '\0') {
- rc = -2;
- goto done;
- }
-
- free(s);
- if (!(s = hostlist_shift(hl))) {
- rc = -1;
- goto done;
- }
-
- if (elanid > ec->maxid)
- ec->maxid = elanid;
-
- list_append(ec->elanid_list, _elan_info_create(typenum, elanid, s));
- }
-
- done:
- hostlist_destroy(el);
- hostlist_destroy(hl);
-
- return rc;
-}
-
-static struct elan_info *
-_elan_info_create(elanhost_type_t type, int elanid, char *hostname)
-{
- struct elan_info *ei = (struct elan_info *) malloc(sizeof(*ei));
- ei->type = type;
- ei->elanid = elanid;
- ei->hostname = hostname;
- return ei;
-}
-
-static void
-_elan_info_destroy(struct elan_info *ei)
-{
- if (ei->hostname)
- free(ei->hostname);
- free(ei);
-}
-
-
-/*
- * List Find function for mapping hostname to an ElanId
- */
-static int _find_host(struct elan_info *ei, char *key)
-{
- if (strcmp(ei->hostname, key) != 0)
- return 0;
- else
- return 1;
-}
-
-
-/*
- * vi:tabstop=4 shiftwidth=4 expandtab
- */
-
View
121 src/common/elanhosts.h
@@ -1,121 +0,0 @@
-/*****************************************************************************\
- * $Id$
- *****************************************************************************
- * Copyright (C) 2001-2002 The Regents of the University of California.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Mark Grondona <mgrondona@llnl.gov>.
- * UCRL-CODE-2003-005.
- *
- * This file is part of Pdsh, a parallel remote shell program.
- * For details, see <http://www.llnl.gov/linux/pdsh/>.
- *
- * Pdsh is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License along
- * with Pdsh; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
-\*****************************************************************************/
-
-#ifndef _ELANHOSTS_H
-#define _ELANHOSTS_H
-
-/*
- * Type of Elan "hostname"
- * Hostname corresponds to the eip adapter, an ethernet adapter, or "other"
- */
-typedef enum {
- ELANHOST_EIP,
- ELANHOST_ETH,
- ELANHOST_OTHER
-} elanhost_type_t;
-
-/* Opaque type which holds the elanhost configuration
- */
-typedef struct elanhost_config * elanhost_config_t;
-
-
-/*
- * Functions
- */
-
-/*
- * Create an empty Elanhost config object
- */
-elanhost_config_t elanhost_config_create(void);
-
-/*
- * Read elanhosts configuration from `file'
- * (Default /etc/elanhosts)
- *
- * Config file format is as follows:
- *
- * Type ElanIDs Hostnames
- *
- * The "type" field may be "eip" for eip interface, "eth" for an
- * ethernet interface, or "other" for anything else. ("eth" and
- * "other" are equivalent at this time)
- *
- * The "ElanIDs" field consists of a list of one or more ElanIDs in
- * the form "[i-j,n-m,..]" or just "N" for a single ElanID.
- *
- * The "Hostname" field consists of the hostnames which correspond
- * to the ElanIDs. If the hostnames have a numeric suffix a bracketed
- * hostlist is allowed (see hostlist.[ch])
- *
- * For Example:
- *
- * Type ElanIDs Hostnames
- * eip [0-10] host[0-10]
- * eth [0-10] ehost[0-10]
- * eth [0,1] host0-eth1,host1-eth1
- *
- * Returns 0 on succes, -1 for failure.
- *
- */
-int elanhost_config_read(elanhost_config_t ec, const char *filename);
-
-
-/*
- * Destroy an elanhost configuration object.
- */
-void elanhost_config_destroy(elanhost_config_t conf);
-
-
-/*
- * Given a hostname, return the corresponding ElanID
- *
- * Returns the ElanId on success, -1 if no host matching "hostname"
- * was found in the configuration.
- *
- */
-int elanhost_host2elanid(elanhost_config_t ec, char *host);
-
-
-/*
- * Given an ElanId and adapter type, return the first matching hostname
- * from the configuration.
- */
-char *elanhost_elanid2host(elanhost_config_t ec,
- elanhost_type_t type, int elanid);
-
-
-/*
- * Returns the max ElanID from the configuration
- */
-int elanhost_config_maxid(elanhost_config_t ec);
-
-
-/*
- * Returns the last error string generated for the elan config obj `ec'
- */
-const char *elanhost_config_err(elanhost_config_t ec);
-
-#endif
View
13 src/common/log.c
@@ -227,13 +227,13 @@ int log_init(char *prog, log_options_t opt, log_facility_t fac, char *logfile)
void log_fini()
{
- if (!log) return;
+ if (!log)
+ return;
+
log_flush();
slurm_mutex_lock(&log_lock);
- if (log->argv0)
- xfree(log->argv0);
- if (log->fpfx)
- xfree(log->fpfx);
+ xfree(log->argv0);
+ xfree(log->fpfx);
if (log->buf)
cbuf_destroy(log->buf);
if (log->fbuf)
@@ -254,8 +254,7 @@ void log_reinit()
void log_set_fpfx(char *prefix)
{
slurm_mutex_lock(&log_lock);
- if (log->fpfx)
- xfree(log->fpfx);
+ xfree(log->fpfx);
if (!prefix)
log->fpfx = xstrdup("");
else {
View
11 src/common/slurm_auth.c
@@ -243,7 +243,7 @@ slurm_auth_context_create( const char *auth_type )
c->auth_errno = SLURM_SUCCESS;
/* Copy the authentication type. */
- c->auth_type = strdup( auth_type );
+ c->auth_type = xstrdup( auth_type );
if ( c->auth_type == NULL ) {
debug3( "can't make local copy of authentication type" );
xfree( c );
@@ -299,7 +299,7 @@ slurm_auth_context_destroy( slurm_auth_context_t c )
}
}
- free( c->auth_type );
+ xfree( c->auth_type );
xfree( c );
return SLURM_SUCCESS;
@@ -340,7 +340,12 @@ slurm_auth_fini( void )
if ( g_context )
slurm_auth_context_destroy( g_context );
- free_slurm_conf( &conf );
+ slurm_mutex_lock( &config_lock );
+ if ( conf.slurmd_port ) {
+ free_slurm_conf( &conf );
+ conf.slurmd_port = 0;
+ }
+ slurm_mutex_unlock( &config_lock );
}
/*
View
2  src/common/slurm_errno.c
@@ -92,7 +92,7 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_INVALID_PARTITION_NAME,
"Invalid partition name specified" },
{ ESLURM_DEFAULT_PARTITION_NOT_SET,
- "System default partition not set" },
+ "No partition specified or system default partition" },
{ ESLURM_ACCESS_DENIED,
"Access denied" },
{ ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP,
View
2  src/common/slurm_protocol_defs.c
@@ -372,7 +372,7 @@ char *node_state_string_compact(enum node_states inx)
"IDLE",
"ALLOC",
"DRAIN",
- "DRAIN",
+ "DRNG",
"COMP",
"END"
};
View
4 src/common/slurm_protocol_pack.c
@@ -796,10 +796,6 @@ _unpack_resource_allocation_response_msg(resource_allocation_response_msg_t
safe_unpack16(&tmp_ptr->num_cpu_groups, buffer);
if (tmp_ptr->num_cpu_groups > 0) {
- tmp_ptr->cpus_per_node = (uint32_t *)
- xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups);
- tmp_ptr->cpu_count_reps = (uint32_t *)
- xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups);
safe_unpack32_array((uint32_t **) &
(tmp_ptr->cpus_per_node), &uint32_tmp,
buffer);
View
1  src/common/xsignal.c
@@ -91,6 +91,7 @@ xsignal_sigset_create(int sigarray[], sigset_t *setp)
int
xsignal_save_mask(sigset_t *set)
{
+ sigemptyset(set);
return _sigmask(SIG_SETMASK, NULL, set);
}
View
1  src/plugins/auth/auth_munge.c
@@ -462,6 +462,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c)
* Block all signals to allow munge_decode() to proceed
* uninterrupted. (Testing for gnats slurm/223)
*/
+ sigemptyset(&oset);
sigfillset(&set);
sigdelset(&set, SIGABRT);
sigdelset(&set, SIGSEGV);
View
2  src/scontrol/scontrol.c
@@ -1268,7 +1268,7 @@ _update_job (int argc, char *argv[])
}
else if (strncasecmp(argv[i], "Priority=", 9) == 0)
job_msg.priority =
- (uint32_t) strtol(&argv[i][9],
+ (uint32_t) strtoll(&argv[i][9],
(char **) NULL, 10);
else if (strncasecmp(argv[i], "ReqProcs=", 9) == 0)
job_msg.num_procs =
View
236 src/sinfo/opts.c
@@ -66,6 +66,7 @@ extern void parse_command_line(int argc, char *argv[])
int opt_char;
int option_index;
static struct option long_options[] = {
+ {"dead", no_argument, 0, 'd'},
{"exact", no_argument, 0, 'e'},
{"noheader", no_argument, 0, 'h'},
{"iterate", required_argument, 0, 'i'},
@@ -74,6 +75,8 @@ extern void parse_command_line(int argc, char *argv[])
{"Node", no_argument, 0, 'N'},
{"format", required_argument, 0, 'o'},
{"partition", required_argument, 0, 'p'},
+ {"responding",no_argument, 0, 'r'},
+ {"list-reasons", no_argument, 0, 'R'},
{"summarize", no_argument, 0, 's'},
{"sort", required_argument, 0, 'S'},
{"states", required_argument, 0, 't'},
@@ -83,103 +86,119 @@ extern void parse_command_line(int argc, char *argv[])
{"usage", no_argument, 0, OPT_LONG_USAGE}
};
- while((opt_char = getopt_long(argc, argv, "ehi:ln:No:p:sS:t:vV",
+ while((opt_char = getopt_long(argc, argv, "dehi:ln:No:p:rRsS:t:vV",
long_options, &option_index)) != -1) {
switch (opt_char) {
- case (int)'?':
- fprintf(stderr, "Try \"sinfo --help\" for more information\n");
+ case (int)'?':
+ fprintf(stderr, "Try \"sinfo --help\" for more information\n");
+ exit(1);
+ break;
+ case (int)'d':
+ params.dead_nodes = true;
+ break;
+ case (int)'e':
+ params.exact_match = true;
+ break;
+ case (int)'h':
+ params.no_header = true;
+ break;
+ case (int) 'i':
+ params.iterate= atoi(optarg);
+ if (params.iterate <= 0) {
+ error ("Error: --iterate=%s");
exit(1);
- break;
- case (int)'e':
- params.exact_match = true;
- break;
- case (int)'h':
- params.no_header = true;
- break;
- case (int) 'i':
- params.iterate= atoi(optarg);
- if (params.iterate <= 0) {
- fprintf(stderr,
- "Error: --iterate=%s",
- optarg);
- exit(1);
- }
- break;
- case (int) 'l':
- params.long_output = true;
- break;
- case (int) 'n':
- params.nodes= xstrdup(optarg);
- break;
- case (int) 'N':
- params.node_flag = true;
- break;
- case (int) 'o':
- params.format = xstrdup(optarg);
- break;
- case (int) 'p':
- params.partition = xstrdup(optarg);
- break;
- case (int) 's':
- params.summarize = true;
- break;
- case (int) 'S':
- params.sort = xstrdup(optarg);
- break;
- case (int) 't':
- params.states = xstrdup(optarg);
- params.state_list =
- _build_state_list(params.states);
- break;
- case (int) 'v':
- params.verbose++;
- break;
- case (int) 'V':
- _print_version();
- exit(0);
- case (int) OPT_LONG_HELP:
- _help();
- exit(0);
- case (int) OPT_LONG_USAGE:
- _usage();
- exit(0);
+ }
+ break;
+ case (int) 'l':
+ params.long_output = true;
+ break;
+ case (int) 'n':
+ params.nodes= xstrdup(optarg);
+ break;
+ case (int) 'N':
+ params.node_flag = true;
+ break;
+ case (int) 'o':
+ params.format = xstrdup(optarg);
+ break;
+ case (int) 'p':
+ params.partition = xstrdup(optarg);
+ break;
+ case (int) 'r':
+ params.responding_nodes = true;
+ break;
+ case (int) 'R':
+ params.list_reasons = true;
+ break;
+ case (int) 's':
+ params.summarize = true;
+ break;
+ case (int) 'S':
+ params.sort = xstrdup(optarg);
+ break;
+ case (int) 't':
+ params.states = xstrdup(optarg);
+ params.state_list =
+ _build_state_list(params.states);
+ break;
+ case (int) 'v':
+ params.verbose++;
+ break;
+ case (int) 'V':
+ _print_version();
+ exit(0);
+ case (int) OPT_LONG_HELP:
+ _help();
+ exit(0);
+ case (int) OPT_LONG_USAGE:
+ _usage();
+ exit(0);
}
}
- if ( ( params.format == NULL ) &&
- ( env_val = getenv("SINFO_FORMAT") ) )
- params.format = xstrdup(env_val);
if ( ( params.partition == NULL ) &&
- ( env_val = getenv("SINFO_PARTITION") ) )
+ ( env_val = getenv("SINFO_PARTITION") ) )
params.partition = xstrdup(env_val);
if ( ( params.partition == NULL ) &&
- ( env_val = getenv("SINFO_SORT") ) )
+ ( env_val = getenv("SINFO_SORT") ) )
params.sort = xstrdup(env_val);
if ( params.format == NULL ) {
- if ( params.summarize )
+ if ( params.summarize ) {
params.format = "%9P %.5a %.9l %.15F %N";
- else if ( params.node_flag ) {
+
+ } else if ( params.node_flag ) {
params.node_field_flag = true; /* compute size later */
- if ( params.long_output ) {
- params.format = "%N %.5D %.9P %.11T %.4c "
- "%.6m %.8d %.6w %.8f %20R";
- } else {
- params.format = "%N %.5D %.9P %6t";
- }
+ params.format = params.long_output ?
+ "%N %.5D %.9P %.11T %.4c %.6m %.8d %.6w %.8f %20R" :
+ "%N %.5D %.9P %6t";
+
+ } else if (params.list_reasons) {
+ params.format = params.long_output ?
+ "%35R %6t %N" :
+ "%35R %N";
+
+ } else if ((env_val = getenv ("SINFO_FORMAT"))) {
+ params.format = xstrdup(env_val);
+
} else {
- if ( params.long_output )
- params.format = "%9P %.5a %.9l %.8s %.4r %.5h "
- "%.10g %.5D %.11T %N";
- else
- params.format = "%9P %.5a %.9l %.5D %.6t %N";
+
+ params.format = params.long_output ?
+ "%9P %.5a %.9l %.8s %.4r %.5h %.10g %.5D %.11T %N" :
+ "%9P %.5a %.9l %.5D %.6t %N";
}
}
_parse_format( params.format );
- if (params.nodes || params.partition || params.state_list)
+ if (params.list_reasons && (params.state_list == NULL)) {
+ params.states = xstrdup ("down,drain");
+ params.state_list = _build_state_list (params.states);
+ }
+
+ if (params.dead_nodes || params.nodes || params.partition ||
+ params.responding_nodes ||params.state_list)
params.filtering = true;
if (params.verbose)
@@ -247,16 +266,24 @@ _build_all_states_list( void )
static int
_parse_state( char* str, uint16_t* states )
{
- int i;
+ int i, len;
+ uint16_t no_resp;
char *state_names;
+ len = strlen(str);
+ if (str[len - 1] == '*') {
+ no_resp = NODE_STATE_NO_RESPOND;
+ len--;
+ } else
+ no_resp = 0;
+
for (i = 0; i<NODE_STATE_END; i++) {
- if (strcasecmp (node_state_string(i), str) == 0) {
- *states = i;
+ if (strncasecmp (node_state_string(i), str, len) == 0) {
+ *states = (i | no_resp);
return SLURM_SUCCESS;
}
- if (strcasecmp (node_state_string_compact(i), str) == 0) {
- *states = i;
+ if (strncasecmp (node_state_string_compact(i), str, len) == 0) {
+ *states = (i | no_resp);
return SLURM_SUCCESS;
}
}
@@ -473,6 +500,7 @@ _parse_token( char *token, char *field, int *field_size, bool *right_justify,
void _print_options( void )
{
printf("-----------------------------\n");
+ printf("dead = %s\n", params.dead_nodes ? "true" : "false");
printf("exact = %d\n", params.exact_match);
printf("filtering = %s\n", params.filtering ? "true" : "false");
printf("format = %s\n", params.format);
@@ -485,6 +513,8 @@ void _print_options( void )
printf("nodes = %s\n", params.nodes ? params.nodes : "n/a");
printf("partition = %s\n", params.partition ?
params.partition: "n/a");
+ printf("responding = %s\n", params.responding_nodes ?
+ "true" : "false");
printf("states = %s\n", params.states);
printf("sort = %s\n", params.sort);
printf("summarize = %s\n", params.summarize ? "true" : "false");
@@ -521,28 +551,32 @@ static void _print_version(void)
static void _usage( void )
{
- printf("Usage: sinfo [-i seconds] [-t node_state] [-p PARTITION] [-n NODES]\n");
- printf(" [-S fields] [-o format] [--usage] [-elNsv]\n");
+ printf("\
+Usage: sinfo [-delNRrsv] [-i seconds] [-t states] [-p partition] [-n nodes]\n\
+ [-S fields] [-o format] \n");
}
static void _help( void )
{
- printf("Usage: sinfo [OPTIONS]\n");
- printf(" -e, --exact group nodes only on exact match of\n");
- printf(" configuration\n");
- printf(" -h, --noheader no headers on output\n");
- printf(" -i, --iterate=seconds specify an interation period\n");
- printf(" -l, --long long output - displays more information\n");
- printf(" -n, --nodes=NODES report on specific node(s)\n");
- printf(" -N, --Node Node-centric format\n");
- printf(" -o, --format=format format specification\n");
- printf(" -p, --partition=PARTITION report on specific partition\n");
- printf(" -s, --summarize report state summary only\n");
- printf(" -S, --sort=fields comma seperated list of fields to sort on\n");
- printf(" -t, --states=node_state specify the what states of nodes to view\n");
- printf(" -v, --verbose verbosity level\n");
- printf(" -V, --version output version information and exit\n");
- printf("\nHelp options:\n");
- printf(" --help show this help message\n");
- printf(" --usage display brief usage message\n");
+ printf ("\
+Usage: sinfo [OPTIONS]\n\
+ -d, --dead show only non-responding nodes\n\
+ -e, --exact group nodes only on exact match of configuration\n\
+ -h, --noheader no headers on output\n\
+ -i, --iterate=seconds specify an interation period\n\
+ -l, --long long output - displays more information\n\
+ -n, --nodes=NODES report on specific node(s)\n\
+ -N, --Node Node-centric format\n\
+ -o, --format=format format specification\n\
+ -p, --partition=PARTITION report on specific partition\n\
+ -r, --responding report only responding nodes\n\
+ -R, --list-reasons list reason nodes are down or drained\n\
+ -s, --summarize report state summary only\n\
+ -S, --sort=fields comma seperated list of fields to sort on\n\
+ -t, --states=node_state specify the what states of nodes to view\n\
+ -v, --verbose verbosity level\n\
+ -V, --version output version information and exit\n\
+\nHelp options:\n\
+ --help show this help message\n\
+ --usage display brief usage message\n");
}
View
8 src/sinfo/sinfo.c
@@ -230,6 +230,14 @@ static bool _filter_out(node_info_t *node_ptr)
return true;
}
+ if ( (params.dead_nodes) &&
+ (!(node_ptr->node_state & NODE_STATE_NO_RESPOND)) )
+ return true;
+
+ if ( (params.responding_nodes) &&
+ (node_ptr->node_state & NODE_STATE_NO_RESPOND) )
+ return true;
+
if (params.state_list) {
int *node_state;
bool match = false;
View
3  src/sinfo/sinfo.h
@@ -99,12 +99,15 @@ struct sinfo_match_flags {
/* Input parameters */
struct sinfo_parameters {
+ bool dead_nodes;
bool exact_match;
bool filtering;
bool long_output;
bool no_header;
bool node_field_flag;
bool node_flag;
+ bool responding_nodes;
+ bool list_reasons;
bool summarize;
struct sinfo_match_flags match_flags;
View
7 src/slurmctld/agent.c
@@ -364,7 +364,7 @@ static void *_wdog(void *args)
(unsigned long) thread_ptr[i].thread);
if (pthread_kill(thread_ptr[i].thread,
SIGALRM) == ESRCH)
- thread_ptr[i].state = DSH_FAILED;
+ thread_ptr[i].state = DSH_NO_RESP;
}
break;
case DSH_NEW:
@@ -522,7 +522,7 @@ static void *_thread_per_node_rpc(void *args)
msg.msg_type = msg_type;
msg.data = task_ptr->msg_args_ptr;
- thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
+ thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
if (task_ptr->get_reply) {
if (slurm_send_recv_rc_msg(&msg, &rc, timeout) < 0) {
_comm_err(thread_ptr->node_name);
@@ -613,7 +613,7 @@ static void *_thread_per_node_rpc(void *args)
}
/*
- * SIGALRM handler. We are really interested in interrupting hung communictions
+ * SIGALRM handler. We are really interested in interrupting hung communictions
* and causing them to return EINTR. Multiple interupts might be required.
*/
static void _alarm_handler(int dummy)
@@ -621,7 +621,6 @@ static void _alarm_handler(int dummy)
xsignal(SIGALRM, _alarm_handler);
}
-
/*
* _queue_agent_retry - Queue any failed RPCs for later replay
* IN agent_info_ptr - pointer to info on completed agent requests
View
37 src/slurmctld/controller.c
@@ -75,7 +75,7 @@
#define MIN_CHECKIN_TIME 3 /* Nodes have this number of seconds to
* check-in before we ping them */
#define MEM_LEAK_TEST 0 /* Running memory leak test if set */
-
+#define SHUTDOWN_WAIT 2 /* Time to wait for backup server shutdown */
/* Log to stderr and syslog until becomes a daemon */
log_options_t log_opts = LOG_OPTS_INITIALIZER;
@@ -90,6 +90,7 @@ static int debug_level = 0;
static char *debug_logfile = NULL;
static bool dump_core = false;
static int recover = DEFAULT_RECOVER;
+static char node_name[MAX_NAME_LEN];
static pthread_cond_t server_thread_cond = PTHREAD_COND_INITIALIZER;
static pid_t slurmctld_pid;
/*
@@ -111,7 +112,7 @@ static void _parse_commandline(int argc, char *argv[],
inline static int _report_locks_set(void);
static void * _service_connection(void *arg);
static int _set_slurmctld_state_loc(void);
-static int _shutdown_backup_controller(void);
+static int _shutdown_backup_controller(int wait_time);
static void * _slurmctld_background(void *no_data);
static void * _slurmctld_rpc_mgr(void *no_data);
static void * _slurmctld_signal_hand(void *no_data);
@@ -127,7 +128,6 @@ typedef struct connection_arg {
int main(int argc, char *argv[])
{
int error_code;
- char node_name[MAX_NAME_LEN];
pthread_attr_t thread_attr_sig, thread_attr_rpc;
/*
@@ -209,14 +209,13 @@ int main(int argc, char *argv[])
} else if (slurmctld_conf.control_machine &&
(strcmp(node_name, slurmctld_conf.control_machine)
== 0)) {
- (void) _shutdown_backup_controller();
+ (void) _shutdown_backup_controller(SHUTDOWN_WAIT);
/* Now recover the remaining state information */
if ((error_code = read_slurm_conf(recover))) {
error("read_slurm_conf reading %s: %m",
SLURM_CONFIG_FILE);
abort();
}
- info("Running primary controller");
} else {
error
("this host (%s) not valid controller (%s or %s)",
@@ -224,6 +223,7 @@ int main(int argc, char *argv[])
slurmctld_conf.backup_controller);
exit(0);
}
+ info("Running primary controller");
if (switch_state_begin(recover)) {
error("switch_state_begin: %m");
@@ -550,7 +550,9 @@ static void *_slurmctld_background(void *no_data)
static time_t last_group_time;
static time_t last_ping_time;
static time_t last_timelimit_time;
+ static time_t last_assert_primary_time;
time_t now;
+
/* Locks: Write job, write node, read partition */
slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
WRITE_LOCK, READ_LOCK
@@ -566,7 +568,7 @@ static void *_slurmctld_background(void *no_data)
/* Let the dust settle before doing work */
now = time(NULL);
last_sched_time = last_checkpoint_time = last_group_time = now;
- last_timelimit_time = now;
+ last_timelimit_time = last_assert_primary_time = now;
last_ping_time = now + (time_t)MIN_CHECKIN_TIME -
(time_t)slurmctld_conf.heartbeat_interval;
(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
@@ -640,6 +642,21 @@ static void *_slurmctld_background(void *no_data)
save_all_state();
}
+ /* Reassert this machine as the primary controller.
+ * A network or security problem could result in
+ * the backup controller assuming control even
+ * while the real primary controller is running */
+ if (slurmctld_conf.slurmctld_timeout &&
+ slurmctld_conf.backup_addr &&
+ slurmctld_conf.backup_addr[0] &&
+ (difftime(now, last_assert_primary_time) >=
+ slurmctld_conf.slurmctld_timeout) &&
+ node_name && slurmctld_conf.backup_controller &&
+ strcmp(node_name, slurmctld_conf.backup_controller)) {
+ last_assert_primary_time = now;
+ (void) _shutdown_backup_controller(0);
+ }
+
}
debug3("_slurmctld_background shutting down");
@@ -834,9 +851,10 @@ static void _usage(char *prog_name)
/*
* Tell the backup_controller to relinquish control, primary control_machine
* has resumed operation
+ * wait_time - How long to wait for backup controller to write state, seconds
* RET 0 or an error code
*/
-static int _shutdown_backup_controller(void)
+static int _shutdown_backup_controller(int wait_time)
{
int rc;
slurm_msg_t req;
@@ -856,7 +874,7 @@ static int _shutdown_backup_controller(void)
if (slurm_send_recv_rc_msg(&req, &rc, CONTROL_TIMEOUT) < 0) {
error("shutdown_backup:send/recv: %m");
- return SLURM_SOCKET_ERROR;
+ return SLURM_ERROR;
}
if (rc) {
@@ -870,7 +888,8 @@ static int _shutdown_backup_controller(void)
* not presently the case (it returns when no other work is pending,
* so the state save should occur right away). We sleep for a while
* here and give the backup controller time to shutdown */
- sleep(2);
+ if (wait_time)
+ sleep(wait_time);
return SLURM_PROTOCOL_SUCCESS;
}
View
44 src/slurmctld/job_mgr.c
@@ -122,7 +122,6 @@ static void _reset_step_bitmaps(struct job_record *job_ptr);
static void _set_job_id(struct job_record *job_ptr);
static void _set_job_prio(struct job_record *job_ptr);
static bool _slurm_picks_nodes(job_desc_msg_t * job_specs);
-static bool _too_many_fragments(bitstr_t *req_bitmap);
static bool _top_priority(struct job_record *job_ptr);
static int _validate_job_create_req(job_desc_msg_t * job_desc);
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
@@ -1339,7 +1338,7 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
* IN allocate - resource allocation request if set rather than job submit
* IN will_run - job is not to be created, test of validity only
* OUT new_job_id - the job's ID
- * OUT job_rec_ptr - pointer to the job (NULL on error)
+ * OUT job_pptr - pointer to the job (NULL on error)
* RET 0 on success, otherwise ESLURM error code
* globals: job_list - pointer to global job list
* list_part - global list of partition info
@@ -1349,13 +1348,13 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
int allocate, int will_run,
- struct job_record **job_rec_ptr, uid_t submit_uid)
+ struct job_record **job_pptr, uid_t submit_uid)
{
int error_code = SLURM_SUCCESS, i;
struct part_record *part_ptr;
bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
- *job_rec_ptr = (struct job_record *) NULL;
+ *job_pptr = (struct job_record *) NULL;
if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid)))
return error_code;
@@ -1415,10 +1414,6 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
error_code = ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
goto cleanup;
}
- if (_too_many_fragments(req_bitmap)) {
- error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
- goto cleanup;
- }
i = count_cpus(req_bitmap);
if (i > job_desc->num_procs)
job_desc->num_procs = i;
@@ -1483,7 +1478,7 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
}
if ((error_code = _copy_job_desc_to_job_record(job_desc,
- job_rec_ptr,
+ job_pptr,
part_ptr,
&req_bitmap,
&exc_bitmap))) {
@@ -1493,16 +1488,16 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
if (job_desc->script) {
if ((error_code = _copy_job_desc_to_file(job_desc,
- (*job_rec_ptr)->
+ (*job_pptr)->
job_id))) {
- (*job_rec_ptr)->job_state = JOB_FAILED;
+ (*job_pptr)->job_state = JOB_FAILED;
error_code = ESLURM_WRITING_TO_FILE;
goto cleanup;
}
- (*job_rec_ptr)->batch_flag = 1;
+ (*job_pptr)->batch_flag = 1;
} else
- (*job_rec_ptr)->batch_flag = 0;
- *new_job_id = (*job_rec_ptr)->job_id;
+ (*job_pptr)->batch_flag = 0;
+ *new_job_id = (*job_pptr)->job_id;
/* Insure that requested partition is valid right now,
* otherwise leave job queued and provide warning code */
@@ -1522,6 +1517,8 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
*new_job_id, part_ptr->name);
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
}
+ if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)
+ (*job_pptr)->priority = 1; /* Move to end of queue */
cleanup:
FREE_NULL_BITMAP(req_bitmap);
@@ -2835,7 +2832,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
job_ptr->time_last_active = now;
} else {
error
- ("Registered job %u.u on wrong node %s ",
+ ("Registered job %u.%u on wrong node %s ",
job_id_ptr[i], step_id_ptr[i], node_name);
_kill_job_on_node(job_id_ptr[i], node_ptr);
}
@@ -3196,20 +3193,3 @@ void job_fini (void)
xfree(job_hash_over);
}
-static bool _too_many_fragments(bitstr_t *req_bitmap)
-{
-#ifdef MAX_NODE_FRAGMENTS
- int i, frags=0;
- int last_bit = 0, next_bit;
-
- for (i = 0; i < node_record_count; i++) {
- next_bit = bit_test(req_bitmap, i);
- if (next_bit == last_bit)
- continue;
- last_bit = next_bit;
- if (next_bit && (++frags > MAX_NODE_FRAGMENTS))
- return true;
- }
-#endif
- return false;
-}
View
31 src/slurmctld/node_mgr.c
@@ -332,16 +332,21 @@ int dump_all_node_state ( void )
error ("Can't save state, error creating file %s %m",
new_file);
error_code = errno;
- }
- else {
- if (write (log_fd, get_buf_data(buffer),
- get_buf_offset(buffer)) !=
- get_buf_offset(buffer)) {
- error ("Can't save state, error writing file %s %m",
- new_file);
- error_code = errno;
+ } else {
+ int pos = 0, nwrite = get_buf_offset(buffer), amount;
+ char *data = (char *)get_buf_data(buffer);
+
+ while (nwrite > 0) {
+ amount = write(log_fd, &data[pos], nwrite);
+ if ((amount < 0) && (errno != EINTR)) {
+ error("Error writing file %s, %m", new_file);
+ error_code = errno;
+ break;
+ }
+ nwrite -= amount;
+ pos += amount;
}
- close (log_fd);
+ close(log_fd);
}
if (error_code)
(void) unlink (new_file);
@@ -1153,7 +1158,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
#endif
if (node_ptr->node_state == NODE_STATE_UNKNOWN) {
reset_job_priority();
- info ("validate_node_specs: node %s has registered",
+ debug("validate_node_specs: node %s has registered",
node_name);
if (job_count)
node_ptr->node_state = NODE_STATE_ALLOCATED;
@@ -1491,9 +1496,13 @@ void make_node_comp(struct node_record *node_ptr)
node_ptr->name,
node_state_string((enum node_states)
node_ptr->node_state));
- } else {
+ } else if (!no_resp_flag) {
node_ptr->node_state = NODE_STATE_COMPLETING | no_resp_flag;