Permalink
Browse files

Tag of v1.1.18

  • Loading branch information...
2 parents 20ac2ec + dbcfe18 commit 97cb396aa102e76e838b78a7daca2b921f928099 @jette jette committed Nov 7, 2006
View
@@ -9,7 +9,7 @@
Name: slurm
Major: 1
Minor: 1
- Micro: 18
+ Micro: 18
Version: 1.1.18
Release: 1
API_CURRENT: 10
View
@@ -12,8 +12,13 @@ documents those changes that are of interest to users and admins.
passthrough nodes to the allocation when creating a block.
- BLUEGENE - Fix deadlock issue with starting and failing jobs at the same
time
- - Make connect() non-blocking and poll to avoid possibly very long default
- timeout.
+ - Make connect() non-blocking and poll() with timeout to avoid huge
+ waits under some conditions.
+ - Set "ENVIRONMENT=BATCH" environment variable for "srun --batch" jobs only.
+ - Add logic to save/restore select/cons_res state information.
+ - BLUEGENE - make all sprintf's into snprintf's
+ - Fix timeout calculation to work correctly for fan out.
+ - Fix for "srun -A" segfault on a node failure.
* Changes in SLURM 1.1.17
=========================
View
@@ -41,21 +41,29 @@ are not responding even if they are not in any partition?</a></li>
<h2>For Users</h2>
<p><a name="comp"><b>1. Why is my job/node in COMPLETING state?</b></a><br>
-When a job is terminating, both the job and its nodes enter the state &quot;completing.&quot;
+When a job is terminating, both the job and its nodes enter the COMPLETING state.
As the SLURM daemon on each node determines that all processes associated with
-the job have terminated, that node changes state to &quot;idle&quot; or some other
-appropriate state. When every node allocated to a job has determined that all
-processes associated with it have terminated, the job changes state to &quot;completed&quot;
-or some other appropriate state. Normally, this happens within a fraction of one
-second. However, if the job has processes that cannot be terminated with a SIGKILL,
-the job and one or more nodes can remain in the completing state for an extended
-period of time. This may be indicative of processes hung waiting for a core file
-to complete I/O or operating system failure. If this state persists, the system
-administrator should use the <span class="commandline">scontrol</span> command
-to change the node's state to DOWN (e.g. &quot;scontrol update
-NodeName=<i>name</i> State=DOWN Reason=hung_completing&quot;), reboot the node,
-then reset the node's state to IDLE (e.g. &quot;scontrol update
-NodeName=<i>name</i> State=RESUME&quot;).</p>
+the job have terminated, that node changes state to IDLE or some other appropriate
+state.
+When every node allocated to a job has determined that all processes associated
+with it have terminated, the job changes state to COMPLETED or some other
+appropriate state (e.g. FAILED).
+Normally, this happens within a second.
+However, if the job has processes that cannot be terminated with a SIGKILL
+signal, the job and one or more nodes can remain in the COMPLETING state
+for an extended period of time.
+This may be indicative of processes hung waiting for a core file
+to complete I/O or operating system failure.
+If this state persists, the system administrator should check for processes
+associated with the job that cannot be terminated, then use the
+<span class="commandline">scontrol</span> command to change the node's
+state to DOWN (e.g. &quot;scontrol update NodeName=<i>name</i> State=DOWN Reason=hung_completing&quot;),
+reboot the node, then reset the node's state to IDLE
+(e.g. &quot;scontrol update NodeName=<i>name</i> State=RESUME&quot;).
+Note that setting the node DOWN will terminate all running or suspended
+jobs associated with that node.
+An alternative is to set the node's state to DRAIN until all jobs
+associated with it terminate before setting it DOWN and re-booting.</p>
<p><a name="rlimit"><b>2. Why do I see the error &quot;Can't propagate RLIMIT_...&quot;?</b></a><br>
When the <span class="commandline">srun</span> command executes, it captures the
@@ -342,6 +350,6 @@ partition. You can control the frequency of this ping with the
<p class="footer"><a href="#top">top</a></p>
-<p style="text-align:center;">Last modified 24 August 2006</p>
+<p style="text-align:center;">Last modified 31 October 2006</p>
<!--#include virtual="footer.txt"-->
View
@@ -83,11 +83,12 @@ void *_forward_thread(void *arg)
/* figure out where we are in the tree and set the timeout
to wait for our children correctly (timeout+5 sec per step)
to let the child timeout */
- steps = (fwd_msg->header.forward.cnt+1)/slurm_get_tree_width();
- fwd_msg->timeout = (1000*steps);
- steps++;
- fwd_msg->timeout += (start_timeout*steps);
-
+ if(fwd_msg->header.forward.cnt>0) {
+ steps = (fwd_msg->header.forward.cnt+1)/slurm_get_tree_width();
+ fwd_msg->timeout = (5000*steps);
+ steps++;
+ fwd_msg->timeout += (start_timeout*steps);
+ }
/* info("sending to %s with %d forwards", */
/* fwd_msg->node_name, fwd_msg->header.forward.cnt); */
if ((fd = slurm_open_msg_conn(&fwd_msg->addr)) < 0) {
@@ -594,6 +595,7 @@ extern int forward_set(forward_t *forward,
/* info("forwarding to %s",name); */
if(span > 0) {
+ span++;
forward->addr = xmalloc(sizeof(slurm_addr) * span);
forward->name = xmalloc(sizeof(char)
* (MAX_SLURM_NAME * span));
@@ -666,6 +668,7 @@ extern int forward_set_launch(forward_t *forward,
/* info("forwarding to %s",name); */
if(span > 0) {
+ span++;
forward->addr = xmalloc(sizeof(slurm_addr) * span);
forward->name =
xmalloc(sizeof(char) * (MAX_SLURM_NAME * span));
@@ -132,7 +132,7 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_PATHNAME_TOO_LONG,
"Pathname of a file or directory too long" },
{ ESLURM_NOT_TOP_PRIORITY,
- "Immediate execution impossible, higher priority jobs pending" },
+ "Immediate execution impossible, insufficient priority" },
{ ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE,
"Requested node configuration is not available" },
{ ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE,
@@ -748,7 +748,7 @@ List slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout)
timeout);
}
-
+ //info("The timeout is %d", timeout);
/*
* Receive a msg. slurm_msg_recvfrom() will read the message
* length and allocate space on the heap for a buffer containing
@@ -1363,16 +1363,22 @@ _send_and_recv_msg(slurm_fd fd, slurm_msg_t *req,
List ret_list = NULL;
int steps = 0;
resp->auth_cred = NULL;
+ if (!req->forward.timeout) {
+ if(!timeout)
+ timeout = SLURM_MESSAGE_TIMEOUT_MSEC_STATIC;
+ req->forward.timeout = timeout;
+ }
+
if(slurm_send_node_msg(fd, req) >= 0) {
if(req->forward.cnt>0) {
/* figure out where we are in the tree and set
the timeout to wait for our children
correctly (timeout+5 sec per step)
to let the child timeout */
-
+
steps = req->forward.cnt/slurm_get_tree_width();
- timeout = (1000*steps);
+ timeout = (5000*steps);
steps++;
timeout += (req->forward.timeout*steps);
}
@@ -1660,11 +1666,11 @@ List slurm_send_recv_rc_packed_msg(slurm_msg_t *msg, int timeout)
if(msg->forward.cnt>0) {
/* figure out where we are in the tree and set
the timeout to wait for our children
- correctly (timeout+1 sec per step)
+ correctly (timeout+5 sec per step)
to let the child timeout */
steps = msg->forward.cnt/slurm_get_tree_width();
- timeout = (1000*steps);
+ timeout = (5000*steps);
steps++;
timeout += (msg->forward.timeout*steps);
}
@@ -564,6 +564,9 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr,
extern int _slurm_connect (int __fd, struct sockaddr const * __addr,
socklen_t __len)
{
+#if 0
+ return connect ( __fd , __addr , __len ) ;
+#else
/* From "man connect": Note that for IP sockets the timeout
* may be very long when syncookies are enabled on the server.
*
@@ -574,18 +577,22 @@ extern int _slurm_connect (int __fd, struct sockaddr const * __addr,
flags = fcntl(__fd, F_GETFL);
fcntl(__fd, F_SETFL, flags | O_NONBLOCK);
- rc = connect ( __fd , __addr , __len ) ;
- if ((rc == -1)
- && ((errno == EINPROGRESS) || (errno == EALREADY))) {
+ rc = connect(__fd , __addr , __len);
+ if ((rc == -1) && (errno == EINPROGRESS)) {
+ int poll_rc;
struct pollfd ufds;
ufds.fd = __fd;
- ufds.events = POLLOUT;
- ufds. revents = 0;
- poll(&ufds, 1, 5000); /* 5 sec max wait */
- rc = connect ( __fd , __addr , __len ) ;
+ ufds.events = POLLIN | POLLOUT;
+ ufds.revents = 0;
+ poll_rc = poll(&ufds, 1, 5000);
+ if (poll_rc == 0)
+ errno = ETIMEDOUT;
+ else if (poll_rc == 1)
+ rc = 0;
}
fcntl(__fd, F_SETFL, flags);
return rc;
+#endif
}
/* Put the address of the peer connected to socket FD into *ADDR
Oops, something went wrong.

0 comments on commit 97cb396

Please sign in to comment.