Permalink
Browse files

re-tag v1.1.21

  • Loading branch information...
2 parents b319b7a + 911beef commit 508c27935f525c98a6f2e8ded79c49587d09cb38 @jette jette committed Dec 4, 2006
View
7 NEWS
@@ -1,6 +1,9 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 1.1.22
+=========================
+
* Changes in SLURM 1.1.21
=========================
- BLUEGENE - Wait on a fini to make sure all threads are finished before
@@ -23,6 +26,8 @@ documents those changes that are of interest to users and admins.
have a working curses lib and header.
- Fixed an init issue with forward_struct_init not being set correctly in
a few locations in the slurmd.
+  - Fix to let users start jobs using the NodeHostname (when specified in
+    the slurm.conf file).
* Changes in SLURM 1.1.20
=========================
@@ -407,8 +412,6 @@ documents those changes that are of interest to users and admins.
* Changes in SLURM 1.0.17
=========================
-- Set correct user groups for task epilogs.
- -- Make it so we only run 1 task pro/epilog either the user specified or the
- slurm.conf one if either.
* Changes in SLURM 1.0.16
=========================
View
@@ -587,7 +587,9 @@ The format of the output is identical to that of the \f3end\fP field.
.TP
\f3systemcpu\fP
-The amount of system CPU time.
+The amount of system CPU time. (If the job ran on multiple CPUs, this is
+the sum of the times from all of them, so this number can be much larger
+than the elapsed time.)
The format of the output is identical to that of the
\f3elapsed\fP
field.
@@ -608,7 +610,9 @@ The user name of the user who ran the job.
.TP
\f3usercpu\fP
-The amount of user CPU time.
+The amount of user CPU time. (If the job ran on multiple CPUs, this is
+the sum of the times from all of them, so this number can be much larger
+than the elapsed time.)
The format of the output is identical to that of the
\f3elapsed\fP field.
@@ -44,28 +44,12 @@ static slurm_addr moab_event_addr, moab_event_addr_bu;
static int event_addr_set = 0;
static slurm_fd event_fd = (slurm_fd) -1;
-/*
- * event_notify - Notify Moab of some event
- * msg IN - event type, NULL to close connection
- * RET 0 on success, -1 on failure
- */
-extern int event_notify(char *msg)
+/* Open event_fd as needed
+ * RET 0 on success, -1 on failure */
+static int _open_fd(time_t now)
{
- time_t now = time(NULL);
- int rc;
-
- if (e_port == 0) {
- /* Event notification disabled */
- return 0;
- }
-
- if (job_aggregation_time
- && (difftime(now, last_notify_time) < job_aggregation_time)) {
- info("wiki event notification already sent recently");
+ if (event_fd != -1)
return 0;
- }
-
- pthread_mutex_lock(&event_mutex);
/* Identify address for socket connection.
* Done only on first call, then cached. */
@@ -96,33 +80,86 @@ extern int event_notify(char *msg)
e_host_bu, e_port);
}
}
- if (event_fd == -1) {
- pthread_mutex_unlock(&event_mutex);
- /* Don't retry again for a while (10 mins)
- * to avoid long delays from ETIMEDOUT */
- last_notify_time = now + 600;
+ if (event_fd == -1)
return -1;
- }
/* We can't have the controller block on the following write() */
fd_set_nonblocking(event_fd);
+ return 0;
+}
+
+/* Close the cached Moab event socket (event_fd), if open, and mark it
+ * closed so a later call will reconnect */
+static void _close_fd(void)
+{
+	if (event_fd == -1)	/* already closed, nothing to do */
+		return;
+
+	(void) slurm_shutdown_msg_engine(event_fd);
+	event_fd = -1;
+}
+
+/*
+ * event_notify - Notify Moab of some event
+ * msg IN - event type, NULL to close connection
+ * RET 0 on success, -1 on failure
+ */
+extern int event_notify(char *msg)
+{
+ time_t now = time(NULL);
+ int rc = 0, retry = 2;
+
+ if (e_port == 0) {
+ /* Event notification disabled */
+ return 0;
+ }
+
+ if (job_aggregation_time
+ && (difftime(now, last_notify_time) < job_aggregation_time)) {
+ debug("wiki event notification already sent recently");
+ return 0;
+ }
+
+ pthread_mutex_lock(&event_mutex);
+ while (retry) {
+ if ((event_fd == -1) && ((rc = _open_fd(now)) == -1)) {
+ /* Can't even open socket.
+ * Don't retry again for a while (2 mins)
+ * to avoid long delays from ETIMEDOUT */
+ last_notify_time = now + 120;
+ break;
+ }
+
+ /* Always send "1234\0" as the message
+ * (we do not care if all of the message is sent,
+ * just that some of it went through to wake up Moab)
+ */
+ if (write(event_fd, "1234", 5) > 0) {
+ verbose("wiki event_notification sent: %s", msg);
+ last_notify_time = now;
+ rc = 0;
+ /* Dave Jackson says to leave the connection
+ * open, but Moab isn't. Without the _close_fd()
+ * here, the next write() generates a broken pipe
+ * error. Just remove the _close_fd() and this
+ * comment when Moab maintains the connection. */
+ _close_fd();
+ break; /* success */
+ }
- /* Always send "1234\0" as the message
- * (we do not care if all of the message is sent, just that
- * some of it went through to wake up Moab)
- */
- if (write(event_fd, "1234", 5) > 0) {
- info("wiki event_notification sent: %s", msg);
- last_notify_time = now;
- rc = 0;
- } else {
error("wiki event notification failure: %m");
- /* close socket, re-open later */
- (void) slurm_shutdown_msg_engine(event_fd);
- event_fd = -1;
rc = -1;
- }
+ retry--;
+ if ((errno == EAGAIN) || (errno == EINTR))
+ continue;
+ _close_fd();
+ if (errno == EPIPE) {
+ /* If Moab closed the socket we get an EPIPE,
+ * retry once */
+ continue;
+ } else {
+ break;
+ }
+ }
pthread_mutex_unlock(&event_mutex);
return rc;
View
@@ -17,7 +17,7 @@
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
- * to link the code of portions of this program with the OpenSSL library under
+ * to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
View
@@ -61,6 +61,7 @@
#include "src/common/xassert.h"
#include "src/common/xstring.h"
#include "src/common/node_select.h"
+#include "src/common/read_config.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
@@ -86,6 +87,7 @@ bitstr_t *share_node_bitmap = NULL; /* bitmap of sharable nodes */
static int _delete_config_record (void);
static void _dump_node_state (struct node_record *dump_node_ptr,
Buf buffer);
+static struct node_record * _find_alias_node_record (char *name);
static int _hash_index (char *name);
static void _list_delete_config (void *config_entry);
static int _list_find_config (void *config_entry, void *key);
@@ -312,6 +314,65 @@ _dump_node_state (struct node_record *dump_node_ptr, Buf buffer)
pack32 (dump_node_ptr->tmp_disk, buffer);
}
+/*
+ * _find_alias_node_record - find the node record whose alias (NodeHostname)
+ *	matches the specified name
+ * input: name - alias (hostname) of the desired node
+ * output: return pointer to node record or NULL if not found
+ * global: node_record_table_ptr - pointer to global node table
+ *	node_hash_table - table of hash indices
+ */
+static struct node_record *
+_find_alias_node_record (char *name)
+{
+	int i;
+	char *alias = NULL;
+
+	if ((name == NULL)
+	    || (name[0] == '\0')) {
+		info("_find_alias_node_record: passed NULL name");
+		return NULL;
+	}
+	/* Resolve the alias to the configured node name, so a user
+	 * cannot use the real hostname to reach a node that has
+	 * been aliased in slurm.conf.
+	 */
+	alias = slurm_conf_get_nodename(name);
+
+	if(!alias)
+		return NULL;
+
+	/* try to find via hash table, if it exists */
+	if (node_hash_table) {
+		struct node_record *node_ptr;
+
+		i = _hash_index (alias);
+		node_ptr = node_hash_table[i];
+		while (node_ptr) {
+			xassert(node_ptr->magic == NODE_MAGIC);
+			if (!strcmp(node_ptr->name, alias)) {
+				xfree(alias);	/* caller gets record, not alias string */
+				return node_ptr;
+			}
+			node_ptr = node_ptr->node_next;
+		}
+		error ("_find_alias_node_record: lookup failure for %s", name);
+	}
+
+	/* revert to sequential search */
+	else {
+		for (i = 0; i < node_record_count; i++) {
+			if (!strcmp (alias, node_record_table_ptr[i].name)) {
+				xfree(alias);
+				return (&node_record_table_ptr[i]);
+			}
+		}
+	}
+
+	xfree(alias);	/* slurm_conf_get_nodename() allocated this */
+	return (struct node_record *) NULL;
+}
+
/*
* load_all_node_state - Load the node state from file, recover on slurmctld
* restart. Execute this after loading the configuration file data.
@@ -344,7 +405,7 @@ extern int load_all_node_state ( bool state_only )
data_allocated = BUF_SIZE;
data = xmalloc(data_allocated);
while (1) {
- data_read = read (state_fd, &data[data_size], BUF_SIZE);
+ data_read = read(state_fd, &data[data_size], BUF_SIZE);
if (data_read < 0) {
if (errno == EINTR)
continue;
@@ -399,7 +460,8 @@ extern int load_all_node_state ( bool state_only )
node_cnt++;
if (node_ptr->node_state == NODE_STATE_UNKNOWN) {
if (node_state & NODE_STATE_DRAIN)
- node_ptr->node_state = NODE_STATE_DRAIN;
+ node_ptr->node_state =
+ NODE_STATE_DRAIN;
else if (base_state == NODE_STATE_DOWN)
node_ptr->node_state = NODE_STATE_DOWN;
}
@@ -444,23 +506,24 @@ struct node_record *
find_node_record (char *name)
{
int i;
-
+
if ((name == NULL)
|| (name[0] == '\0')) {
info("find_node_record passed NULL name");
return NULL;
}
-
+
/* try to find via hash table, if it exists */
if (node_hash_table) {
struct node_record *node_ptr;
-
+
i = _hash_index (name);
node_ptr = node_hash_table[i];
while (node_ptr) {
xassert(node_ptr->magic == NODE_MAGIC);
- if (strncmp(node_ptr->name, name, MAX_SLURM_NAME) == 0)
+ if (!strcmp(node_ptr->name, name)) {
return node_ptr;
+ }
node_ptr = node_ptr->node_next;
}
error ("find_node_record: lookup failure for %s", name);
@@ -469,12 +532,15 @@ find_node_record (char *name)
/* revert to sequential search */
else {
for (i = 0; i < node_record_count; i++) {
- if (strcmp (name, node_record_table_ptr[i].name) == 0)
+ if (!strcmp (name, node_record_table_ptr[i].name)) {
return (&node_record_table_ptr[i]);
+ }
}
}
-
- return (struct node_record *) NULL;
+
+	/* Fall back to an alias lookup in case the caller supplied the
+	   NodeHostname rather than the NodeName that slurm uses */
+ return _find_alias_node_record (name);
}
@@ -414,10 +414,13 @@ static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
hostname = hostlist_shift(hostname_list);
address = hostlist_shift(address_list);
#endif
-
- if (strcmp(alias, highest_node_name) <= 0)
+ if (strcmp(alias, highest_node_name) <= 0) {
+			/* find_node_record() takes the slurm conf lock
+			   to resolve the alias, so release it first */
+ slurm_conf_unlock();
node_rec = find_node_record(alias);
- else {
+ slurm_conf_lock();
+ } else {
strncpy(highest_node_name, alias, MAX_SLURM_NAME);
node_rec = NULL;
}
@@ -1120,7 +1120,8 @@ _wait_for_any_task(slurmd_job_t *job, bool waitflag)
job->task_epilog,
job,
2, job->env);
- } else if (conf->task_epilog) {
+ }
+ if (conf->task_epilog) {
char *my_epilog;
slurm_mutex_lock(&conf->config_mutex);
my_epilog = xstrdup(conf->task_epilog);
@@ -319,11 +319,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
}
pre_launch(job);
-
- if (job->task_prolog) {
- _run_script_and_set_env("user task_prolog",
- job->task_prolog, job);
- } else if (conf->task_prolog) {
+ if (conf->task_prolog) {
char *my_prolog;
slurm_mutex_lock(&conf->config_mutex);
my_prolog = xstrdup(conf->task_prolog);
@@ -332,7 +328,10 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
my_prolog, job);
xfree(my_prolog);
}
-
+ if (job->task_prolog) {
+ _run_script_and_set_env("user task_prolog",
+ job->task_prolog, job);
+ }
if (job->multi_prog)
task_exec(job->argv[1], job->env,

0 comments on commit 508c279

Please sign in to comment.