Skip to content

Commit

Permalink
re-tag v1.1.21
Browse files Browse the repository at this point in the history
  • Loading branch information
Moe Jette committed Dec 4, 2006
2 parents b319b7a + 911beef commit 508c279
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 63 deletions.
7 changes: 5 additions & 2 deletions NEWS
@@ -1,6 +1,9 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.

* Changes in SLURM 1.1.22
=========================

* Changes in SLURM 1.1.21
=========================
- BLUEGENE - Wait on a fini to make sure all threads are finished before
Expand All @@ -23,6 +26,8 @@ documents those changes that are of interest to users and admins.
have a working curses lib and header.
- Fixed an init issue with forward_struct_init not being set correctly in
a few locations in the slurmd.
- Fix to let a user specify a node by its NodeHostname (when one is given in
the slurm.conf file) when starting jobs.

* Changes in SLURM 1.1.20
=========================
Expand Down Expand Up @@ -407,8 +412,6 @@ documents those changes that are of interest to users and admins.
* Changes in SLURM 1.0.17
=========================
-- Set correct user groups for task epilogs.
-- Make it so we only run 1 task pro/epilog either the user specified or the
slurm.conf one if either.

* Changes in SLURM 1.0.16
=========================
Expand Down
8 changes: 6 additions & 2 deletions doc/man/man1/sacct.1
Expand Up @@ -587,7 +587,9 @@ The format of the output is identical to that of the \f3end\fP field.

.TP
\f3systemcpu\fP
The amount of system CPU time.
The amount of system CPU time. (If the job was running on multiple CPUs, this
is the combined total of all their times, so this number could be much larger
than the elapsed time.)
The format of the output is identical to that of the
\f3elapsed\fP
field.
Expand All @@ -608,7 +610,9 @@ The user name of the user who ran the job.

.TP
\f3usercpu\fP
The amount of user CPU time.
The amount of user CPU time. (If the job was running on multiple CPUs, this
is the combined total of all their times, so this number could be much larger
than the elapsed time.)
The format of the output is identical to that of the
\f3elapsed\fP field.

Expand Down
115 changes: 76 additions & 39 deletions src/plugins/sched/wiki2/event.c
Expand Up @@ -44,28 +44,12 @@ static slurm_addr moab_event_addr, moab_event_addr_bu;
static int event_addr_set = 0;
static slurm_fd event_fd = (slurm_fd) -1;

/*
* event_notify - Notify Moab of some event
* msg IN - event type, NULL to close connection
* RET 0 on success, -1 on failure
*/
extern int event_notify(char *msg)
/* Open event_fd as needed
* RET 0 on success, -1 on failure */
static int _open_fd(time_t now)
{
time_t now = time(NULL);
int rc;

if (e_port == 0) {
/* Event notification disabled */
return 0;
}

if (job_aggregation_time
&& (difftime(now, last_notify_time) < job_aggregation_time)) {
info("wiki event notification already sent recently");
if (event_fd != -1)
return 0;
}

pthread_mutex_lock(&event_mutex);

/* Identify address for socket connection.
* Done only on first call, then cached. */
Expand Down Expand Up @@ -96,33 +80,86 @@ extern int event_notify(char *msg)
e_host_bu, e_port);
}
}
if (event_fd == -1) {
pthread_mutex_unlock(&event_mutex);
/* Don't retry again for a while (10 mins)
* to avoid long delays from ETIMEDOUT */
last_notify_time = now + 600;
if (event_fd == -1)
return -1;
}

/* We can't have the controller block on the following write() */
fd_set_nonblocking(event_fd);
return 0;
}

/* Shut down event_fd if it is currently open and mark it as closed (-1).
 * Does nothing when event_fd is already -1. */
static void _close_fd(void)
{
	if (event_fd != -1) {
		(void) slurm_shutdown_msg_engine(event_fd);
		event_fd = -1;
	}
}

/*
* event_notify - Notify Moab of some event
* msg IN - event type, NULL to close connection
* RET 0 on success, -1 on failure
*/
extern int event_notify(char *msg)
{
time_t now = time(NULL);
int rc = 0, retry = 2;

if (e_port == 0) {
/* Event notification disabled */
return 0;
}

if (job_aggregation_time
&& (difftime(now, last_notify_time) < job_aggregation_time)) {
debug("wiki event notification already sent recently");
return 0;
}

pthread_mutex_lock(&event_mutex);
while (retry) {
if ((event_fd == -1) && ((rc = _open_fd(now)) == -1)) {
/* Can't even open socket.
* Don't retry again for a while (2 mins)
* to avoid long delays from ETIMEDOUT */
last_notify_time = now + 120;
break;
}

/* Always send "1234\0" as the message
* (we do not care if all of the message is sent,
* just that some of it went through to wake up Moab)
*/
if (write(event_fd, "1234", 5) > 0) {
verbose("wiki event_notification sent: %s", msg);
last_notify_time = now;
rc = 0;
/* Dave Jackson says to leave the connection
* open, but Moab isn't. Without the _close_fd()
* here, the next write() generates a broken pipe
* error. Just remove the _close_fd() and this
* comment when Moab maintains the connection. */
_close_fd();
break; /* success */
}

/* Always send "1234\0" as the message
* (we do not care if all of the message is sent, just that
* some of it went through to wake up Moab)
*/
if (write(event_fd, "1234", 5) > 0) {
info("wiki event_notification sent: %s", msg);
last_notify_time = now;
rc = 0;
} else {
error("wiki event notification failure: %m");
/* close socket, re-open later */
(void) slurm_shutdown_msg_engine(event_fd);
event_fd = -1;
rc = -1;
}
retry--;
if ((errno == EAGAIN) || (errno == EINTR))
continue;

_close_fd();
if (errno == EPIPE) {
/* If Moab closed the socket we get an EPIPE,
* retry once */
continue;
} else {
break;
}
}
pthread_mutex_unlock(&event_mutex);

return rc;
Expand Down
2 changes: 1 addition & 1 deletion src/sacct/options.c
Expand Up @@ -17,7 +17,7 @@
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
Expand Down
84 changes: 75 additions & 9 deletions src/slurmctld/node_mgr.c
Expand Up @@ -61,6 +61,7 @@
#include "src/common/xassert.h"
#include "src/common/xstring.h"
#include "src/common/node_select.h"
#include "src/common/read_config.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
Expand All @@ -86,6 +87,7 @@ bitstr_t *share_node_bitmap = NULL; /* bitmap of sharable nodes */
static int _delete_config_record (void);
static void _dump_node_state (struct node_record *dump_node_ptr,
Buf buffer);
static struct node_record * _find_alias_node_record (char *name);
static int _hash_index (char *name);
static void _list_delete_config (void *config_entry);
static int _list_find_config (void *config_entry, void *key);
Expand Down Expand Up @@ -312,6 +314,65 @@ _dump_node_state (struct node_record *dump_node_ptr, Buf buffer)
pack32 (dump_node_ptr->tmp_disk, buffer);
}

/*
 * _find_alias_node_record - locate the node record whose name is the
 *	alias that slurm_conf_get_nodename() returns for the supplied name
 * input:  name - name to translate into an alias and look up
 * output: return pointer to node record or NULL if not found
 * global: node_record_table_ptr - pointer to global node table
 *         node_hash_table - table of hash indices
 */
static struct node_record *
_find_alias_node_record (char *name)
{
	struct node_record *found = NULL;
	char *alias;
	int inx;

	if (!name || !name[0]) {
		info("_find_alias_node_record: passed NULL name");
		return NULL;
	}

	/* Translate the supplied name into its alias, in case the user
	 * gave the real hostname of a node that has been aliased. */
	alias = slurm_conf_get_nodename(name);
	if (alias == NULL)
		return NULL;

	if (node_hash_table) {
		/* walk the hash chain selected by the alias */
		for (found = node_hash_table[_hash_index (alias)];
		     found != NULL; found = found->node_next) {
			xassert(found->magic == NODE_MAGIC);
			if (strcmp(found->name, alias) == 0)
				break;
		}
		if (found == NULL)
			error ("_find_alias_node_record: lookup failure for %s", name);
	} else {
		/* no hash table, fall back to a linear scan of the node table */
		for (inx = 0; inx < node_record_count; inx++) {
			if (strcmp (alias, node_record_table_ptr[inx].name) == 0) {
				found = &node_record_table_ptr[inx];
				break;
			}
		}
	}

	/* alias is released with xfree() on every path past this point */
	xfree(alias);
	return found;
}

/*
* load_all_node_state - Load the node state from file, recover on slurmctld
* restart. Execute this after loading the configuration file data.
Expand Down Expand Up @@ -344,7 +405,7 @@ extern int load_all_node_state ( bool state_only )
data_allocated = BUF_SIZE;
data = xmalloc(data_allocated);
while (1) {
data_read = read (state_fd, &data[data_size], BUF_SIZE);
data_read = read(state_fd, &data[data_size], BUF_SIZE);
if (data_read < 0) {
if (errno == EINTR)
continue;
Expand Down Expand Up @@ -399,7 +460,8 @@ extern int load_all_node_state ( bool state_only )
node_cnt++;
if (node_ptr->node_state == NODE_STATE_UNKNOWN) {
if (node_state & NODE_STATE_DRAIN)
node_ptr->node_state = NODE_STATE_DRAIN;
node_ptr->node_state =
NODE_STATE_DRAIN;
else if (base_state == NODE_STATE_DOWN)
node_ptr->node_state = NODE_STATE_DOWN;
}
Expand Down Expand Up @@ -444,23 +506,24 @@ struct node_record *
find_node_record (char *name)
{
int i;

if ((name == NULL)
|| (name[0] == '\0')) {
info("find_node_record passed NULL name");
return NULL;
}

/* try to find via hash table, if it exists */
if (node_hash_table) {
struct node_record *node_ptr;

i = _hash_index (name);
node_ptr = node_hash_table[i];
while (node_ptr) {
xassert(node_ptr->magic == NODE_MAGIC);
if (strncmp(node_ptr->name, name, MAX_SLURM_NAME) == 0)
if (!strcmp(node_ptr->name, name)) {
return node_ptr;
}
node_ptr = node_ptr->node_next;
}
error ("find_node_record: lookup failure for %s", name);
Expand All @@ -469,12 +532,15 @@ find_node_record (char *name)
/* revert to sequential search */
else {
for (i = 0; i < node_record_count; i++) {
if (strcmp (name, node_record_table_ptr[i].name) == 0)
if (!strcmp (name, node_record_table_ptr[i].name)) {
return (&node_record_table_ptr[i]);
}
}
}

return (struct node_record *) NULL;

/* look for the alias node record if the user put this in
instead of what slurm sees the node name as */
return _find_alias_node_record (name);
}


Expand Down
9 changes: 6 additions & 3 deletions src/slurmctld/read_config.c
Expand Up @@ -414,10 +414,13 @@ static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
hostname = hostlist_shift(hostname_list);
address = hostlist_shift(address_list);
#endif

if (strcmp(alias, highest_node_name) <= 0)
if (strcmp(alias, highest_node_name) <= 0) {
/* find_node_record locks this to get the
alias so we need to unlock */
slurm_conf_unlock();
node_rec = find_node_record(alias);
else {
slurm_conf_lock();
} else {
strncpy(highest_node_name, alias, MAX_SLURM_NAME);
node_rec = NULL;
}
Expand Down
3 changes: 2 additions & 1 deletion src/slurmd/slurmstepd/mgr.c
Expand Up @@ -1120,7 +1120,8 @@ _wait_for_any_task(slurmd_job_t *job, bool waitflag)
job->task_epilog,
job,
2, job->env);
} else if (conf->task_epilog) {
}
if (conf->task_epilog) {
char *my_epilog;
slurm_mutex_lock(&conf->config_mutex);
my_epilog = xstrdup(conf->task_epilog);
Expand Down
11 changes: 5 additions & 6 deletions src/slurmd/slurmstepd/task.c
Expand Up @@ -319,11 +319,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
}

pre_launch(job);

if (job->task_prolog) {
_run_script_and_set_env("user task_prolog",
job->task_prolog, job);
} else if (conf->task_prolog) {
if (conf->task_prolog) {
char *my_prolog;
slurm_mutex_lock(&conf->config_mutex);
my_prolog = xstrdup(conf->task_prolog);
Expand All @@ -332,7 +328,10 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
my_prolog, job);
xfree(my_prolog);
}

if (job->task_prolog) {
_run_script_and_set_env("user task_prolog",
job->task_prolog, job);
}

if (job->multi_prog)
task_exec(job->argv[1], job->env,
Expand Down

0 comments on commit 508c279

Please sign in to comment.