This commit was manufactured by cvs2svn to create tag 'slurm-0-2-22-1'.
no author committed Nov 21, 2003
1 parent 447bb06 commit 12687c8c4068ad4315e64e76167fc8220823b359
META
@@ -7,6 +7,6 @@
Name: slurm
Major: 0
Minor: 2
- Micro: 21
- Version: 0.2.21
+ Micro: 22
+ Version: 0.2.22
Release: 1
NEWS
@@ -1,6 +1,19 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.2.22
+=========================
+ -- Fixes for reported problems:
+ - slurm/326: Node stays in completing state indefinitely.
+ - slurm/328: slurmd uses different shared memory key on restart.
+ - slurm/329: Job step processes may be left running when one task dies.
+ - slurm/334: slurmd segv with multiple simultaneous job steps.
+ -- Allow more digits for priority values in scontrol.
+ -- Applied various fixes for memory leaks.
+ -- Remove logic preventing DPCS from allocating jobs with more than
+ eight node segments. Fix for DPCS should now be in production.
+ -- Changed compact string for DRAINING state to "drng" from "drain."
+
* Changes in SLURM 0.2.21
=========================
-- Fixes for reported problems:
@@ -9,6 +22,7 @@ documents those changes that are of interest to users and admins.
- slurm/300: Possibly killing wrong job on slurmd restart
- slurm/312: Freeing non-allocated memory and killing slurmd
-- Assorted changes to support RedHat Enterprise Linux 3.0 and IA64
+ -- Initial Elan4 and libelanctrl support (--with-elan).
-- Slurmctld was sometimes inappropriately setting a job's priority
to 1 when a node was down (even if up nodes could be used for the
job when a running job completes)
@@ -17,7 +31,8 @@ documents those changes that are of interest to users and admins.
variable for `%J' expansion in TV bulk launch string.
-- Fix several locking bugs in slurmd IO layer.
-- Throttle back repetitious error messages in slurmd to avoid filling
- slurm logfiles.
+ log files.
+
* Changes in SLURM 0.2.20
=========================
@@ -205,10 +205,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and \fBSlurmdPort\fR
must be different.
.TP
\fBSlurmdSpoolDir\fR
-Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state
-information is written. This must be a common pathname for all nodes, but
-should represent a file which is local to each node (reference a local file
-system). The default value is "/tmp/slurmd".
+Fully qualified pathname of a directory into which the \fBslurmd\fR
+daemon's state information and batch job script information are written. This
+must be a common pathname for all nodes, but should represent a directory which
+is local to each node (reference a local file system). The default value
+is "/var/spool/slurmd." \fBNOTE\fR: This directory is also used to store \fBslurmd\fR's
+shared memory lockfile, and \fBshould not be changed\fR unless the system
+is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR is
+changed and \fBslurmd\fR is restarted, the new daemon will attach to a
+different shared memory region and lose track of any running jobs.
.TP
\fBSlurmdTimeout\fR
The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR
@@ -228,7 +233,7 @@ Fully qualified pathname of the file system available to user jobs for
temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space.
The default value is "/tmp".
.TP
-\fBWaitTimefR
+\fBWaitTime\fR
Specifies how many seconds the srun command should by default wait after
the first task terminates before terminating all remaining tasks. The
"--wait" option on the srun command line overrides this value.
@@ -89,6 +89,7 @@ slurm_allocate_resources (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -127,6 +128,7 @@ int slurm_job_will_run (job_desc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_SOCKET_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -183,7 +185,7 @@ slurm_allocate_resources_and_run (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_SOCKET_ERROR;
-
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -222,6 +224,7 @@ slurm_job_step_create (job_step_create_request_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
@@ -259,6 +262,7 @@ slurm_confirm_allocation (old_job_alloc_msg_t *req,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch(resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
if (_handle_rc_msg(&resp_msg) < 0)
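
Every api/ hunk above applies the same leak fix: the credential attached to a received response is now freed once, immediately after the receive succeeds, rather than only inside whichever switch branch remembered to do it. The following is a self-contained illustration of that pattern; the struct and names are stand-ins, not SLURM's types.

/* Illustration of the fix: free the response credential exactly once,
 * right after receipt, so no msg_type branch can leak it. */
#include <stdlib.h>

struct response {
        int   msg_type;
        void *cred;     /* heap-allocated by the receive path */
        void *data;
};

enum { RESPONSE_RC, RESPONSE_INFO };

static int handle_response(struct response *resp, void **out)
{
        free(resp->cred);       /* covers every branch below */
        resp->cred = NULL;

        switch (resp->msg_type) {
        case RESPONSE_INFO:
                *out = resp->data;      /* caller takes ownership */
                return 0;
        case RESPONSE_RC:
        default:
                free(resp->data);
                return -1;
        }
}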
@@ -73,7 +73,8 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
return SLURM_FAILURE;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_SUCCESS;
}
@@ -85,7 +85,8 @@ slurm_complete_job_step ( uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
@@ -151,10 +151,10 @@ slurm_load_ctl_conf (time_t update_time, slurm_ctl_conf_t **confp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_BUILD_INFO:
*confp = (slurm_ctl_conf_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -219,10 +219,10 @@ slurm_load_jobs (time_t update_time, job_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_INFO:
*resp = (job_info_msg_t *)resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -267,6 +267,7 @@ slurm_pid2jobid (pid_t job_pid, uint32_t *jobid)
if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_ID:
*jobid = ((job_id_response_msg_t *) resp_msg.data)->job_id;
@@ -125,10 +125,10 @@ slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id,
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_JOB_STEP_INFO:
*resp = (job_step_info_response_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -129,10 +129,10 @@ slurm_load_node (time_t update_time, node_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_NODE_INFO:
*resp = (node_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -161,10 +161,10 @@ slurm_load_partitions (time_t update_time, partition_info_msg_t **resp)
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_PARTITION_INFO:
*resp = (partition_info_msg_t *) resp_msg.data;
- slurm_free_cred(resp_msg.cred);
break;
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -58,7 +58,8 @@ slurm_reconfigure ( void )
if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
return SLURM_ERROR;
- if (rc) slurm_seterrno_ret(rc);
+ if (rc)
+ slurm_seterrno_ret(rc);
return SLURM_PROTOCOL_SUCCESS;
}
@@ -129,6 +130,7 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
if ((rc = slurm_receive_msg(fd, &resp_msg, 0)) < 0)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR);
+ slurm_free_cred(resp_msg.cred);
if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR);
@@ -87,6 +87,7 @@ slurm_submit_batch_job (job_desc_msg_t *req,
if (rc == SLURM_SOCKET_ERROR)
return SLURM_ERROR;
+ slurm_free_cred(resp_msg.cred);
switch (resp_msg.msg_type) {
case RESPONSE_SLURM_RC:
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -227,13 +227,13 @@ int log_init(char *prog, log_options_t opt, log_facility_t fac, char *logfile)
void log_fini()
{
- if (!log) return;
+ if (!log)
+ return;
+
log_flush();
slurm_mutex_lock(&log_lock);
- if (log->argv0)
- xfree(log->argv0);
- if (log->fpfx)
- xfree(log->fpfx);
+ xfree(log->argv0);
+ xfree(log->fpfx);
if (log->buf)
cbuf_destroy(log->buf);
if (log->fbuf)
@@ -254,8 +254,7 @@ void log_reinit()
void log_set_fpfx(char *prefix)
{
slurm_mutex_lock(&log_lock);
- if (log->fpfx)
- xfree(log->fpfx);
+ xfree(log->fpfx);
if (!prefix)
log->fpfx = xstrdup("");
else {
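
Dropping the "if (ptr) xfree(ptr)" guards in the log.c hunks only works because xfree() tolerates a NULL argument and clears the caller's pointer. A minimal sketch of a helper with those properties follows, shown only to motivate the simplification; it is not SLURM's actual xfree.

/* Hypothetical xfree-style helper: ignores NULL and zeroes the caller's
 * pointer, so "if (p) free_it(p)" guards and repeated calls are redundant. */
#include <stdlib.h>

static void do_xfree(void **pp)
{
        if (pp && *pp) {
                free(*pp);
                *pp = NULL;
        }
}

#define my_xfree(p)     do_xfree((void **) &(p))

With that contract, my_xfree(log->fpfx) is safe whether or not fpfx was ever allocated, which is all the simplified code above relies on.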
@@ -243,7 +243,7 @@ slurm_auth_context_create( const char *auth_type )
c->auth_errno = SLURM_SUCCESS;
/* Copy the authentication type. */
- c->auth_type = strdup( auth_type );
+ c->auth_type = xstrdup( auth_type );
if ( c->auth_type == NULL ) {
debug3( "can't make local copy of authentication type" );
xfree( c );
@@ -299,7 +299,7 @@ slurm_auth_context_destroy( slurm_auth_context_t c )
}
}
- free( c->auth_type );
+ xfree( c->auth_type );
xfree( c );
return SLURM_SUCCESS;
@@ -340,7 +340,12 @@ slurm_auth_fini( void )
if ( g_context )
slurm_auth_context_destroy( g_context );
- free_slurm_conf( &conf );
+ slurm_mutex_lock( &config_lock );
+ if ( conf.slurmd_port ) {
+ free_slurm_conf( &conf );
+ conf.slurmd_port = 0;
+ }
+ slurm_mutex_unlock( &config_lock );
}
/*
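
The slurm_auth_fini() hunk replaces an unconditional free_slurm_conf() with a lock-protected, free-once teardown keyed off a sentinel field. Here is a sketch of that idempotent-cleanup pattern, using pthread primitives directly and illustrative field names; SLURM wraps the lock in its slurm_mutex_* macros.

/* Idempotent teardown: take the lock, check a sentinel that says the
 * config is populated, free it once, and clear the sentinel so a second
 * call is a no-op. */
#include <pthread.h>
#include <stdlib.h>

struct conf {
        int   slurmd_port;      /* nonzero only when the config is loaded */
        char *node_name;
};

static struct conf conf;
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;

static void conf_fini(void)
{
        pthread_mutex_lock(&config_lock);
        if (conf.slurmd_port) {
                free(conf.node_name);
                conf.node_name = NULL;
                conf.slurmd_port = 0;   /* mark as freed */
        }
        pthread_mutex_unlock(&config_lock);
}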
@@ -92,7 +92,7 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_INVALID_PARTITION_NAME,
"Invalid partition name specified" },
{ ESLURM_DEFAULT_PARTITION_NOT_SET,
- "System default partition not set" },
+ "No partition specified or system default partition" },
{ ESLURM_ACCESS_DENIED,
"Access denied" },
{ ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP,
@@ -372,7 +372,7 @@ char *node_state_string_compact(enum node_states inx)
"IDLE",
"ALLOC",
"DRAIN",
- "DRAIN",
+ "DRNG",
"COMP",
"END"
};
@@ -796,10 +796,6 @@ _unpack_resource_allocation_response_msg(resource_allocation_response_msg_t
safe_unpack16(&tmp_ptr->num_cpu_groups, buffer);
if (tmp_ptr->num_cpu_groups > 0) {
- tmp_ptr->cpus_per_node = (uint32_t *)
- xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups);
- tmp_ptr->cpu_count_reps = (uint32_t *)
- xmalloc(sizeof(uint32_t) * tmp_ptr->num_cpu_groups);
safe_unpack32_array((uint32_t **) &
(tmp_ptr->cpus_per_node), &uint32_tmp,
buffer);
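
The _unpack_resource_allocation_response_msg() hunk deletes the xmalloc() calls in front of safe_unpack32_array(). That only makes sense if the unpack helper allocates the output array itself, as the fix implies: the pre-allocated buffers were immediately overwritten by the helper's own allocation and leaked. A simplified stand-in for such a helper, under that assumption, is shown below (the real safe_unpack32_array also reads the element count out of the packed buffer).

/* Stand-in for an unpack helper that allocates its own output array.
 * Any buffer the caller malloc'd beforehand is overwritten by the new
 * pointer and leaked, which is why the xmalloc() calls were removed. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int unpack32_array(uint32_t **out, uint32_t count, const uint32_t *buf)
{
        *out = malloc(count * sizeof(uint32_t));        /* helper owns the allocation */
        if (*out == NULL)
                return -1;
        memcpy(*out, buf, count * sizeof(uint32_t));
        return 0;
}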
@@ -91,6 +91,7 @@ xsignal_sigset_create(int sigarray[], sigset_t *setp)
int
xsignal_save_mask(sigset_t *set)
{
+ sigemptyset(set);
return _sigmask(SIG_SETMASK, NULL, set);
}
@@ -462,6 +462,7 @@ _decode_cred(char *m, slurm_auth_credential_t *c)
* Block all signals to allow munge_decode() to proceed
* uninterrupted. (Testing for gnats slurm/223)
*/
+ sigemptyset(&oset);
sigfillset(&set);
sigdelset(&set, SIGABRT);
sigdelset(&set, SIGSEGV);
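
Both hunks add sigemptyset() so the signal-set variables start from a defined state before the mask is saved and before the munge_decode() critical section. A compact sketch of the block-signals-around-a-call pattern the second hunk belongs to; this is a standalone illustration, not the _decode_cred() code itself.

/* Block everything except the fatal signals around a sensitive call,
 * then restore the previous mask. sigemptyset(&oset) guarantees the
 * saved set is initialized even if the save were to fail. */
#include <signal.h>

static int call_with_signals_blocked(int (*fn)(void))
{
        sigset_t set, oset;
        int rc;

        sigemptyset(&oset);
        sigfillset(&set);
        sigdelset(&set, SIGABRT);       /* leave fatal signals deliverable */
        sigdelset(&set, SIGSEGV);

        sigprocmask(SIG_SETMASK, &set, &oset);
        rc = fn();
        sigprocmask(SIG_SETMASK, &oset, NULL);
        return rc;
}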
@@ -1268,7 +1268,7 @@ _update_job (int argc, char *argv[])
}
else if (strncasecmp(argv[i], "Priority=", 9) == 0)
job_msg.priority =
- (uint32_t) strtol(&argv[i][9],
+ (uint32_t) strtoll(&argv[i][9],
(char **) NULL, 10);
else if (strncasecmp(argv[i], "ReqProcs=", 9) == 0)
job_msg.num_procs =
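
The Priority= change from strtol() to strtoll() is the "allow more digits for priority values in scontrol" item from the NEWS: on a host with a 32-bit long, strtol() saturates at LONG_MAX (2147483647), so a larger priority is silently clamped before the cast to uint32_t, while strtoll() parses into a long long and preserves the full 32-bit range. A standalone illustration under that ILP32 assumption (not scontrol's code):

/* With a 32-bit long, strtol("4000000000", ...) clamps to LONG_MAX and the
 * resulting uint32_t priority is wrong; strtoll() keeps the intended value. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *arg = "Priority=4000000000";
        uint32_t a = (uint32_t) strtol(&arg[9], (char **) NULL, 10);
        uint32_t b = (uint32_t) strtoll(&arg[9], (char **) NULL, 10);

        /* On an ILP32 host: a == 2147483647, b == 4000000000. */
        printf("strtol: %u  strtoll: %u\n", (unsigned) a, (unsigned) b);
        return 0;
}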
@@ -122,7 +122,6 @@ static void _reset_step_bitmaps(struct job_record *job_ptr);
static void _set_job_id(struct job_record *job_ptr);
static void _set_job_prio(struct job_record *job_ptr);
static bool _slurm_picks_nodes(job_desc_msg_t * job_specs);
-static bool _too_many_fragments(bitstr_t *req_bitmap);
static bool _top_priority(struct job_record *job_ptr);
static int _validate_job_create_req(job_desc_msg_t * job_desc);
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
@@ -1415,10 +1414,6 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
error_code = ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
goto cleanup;
}
- if (_too_many_fragments(req_bitmap)) {
- error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
- goto cleanup;
- }
i = count_cpus(req_bitmap);
if (i > job_desc->num_procs)
job_desc->num_procs = i;
@@ -3196,20 +3191,3 @@ void job_fini (void)
xfree(job_hash_over);
}
-static bool _too_many_fragments(bitstr_t *req_bitmap)
-{
-#ifdef MAX_NODE_FRAGMENTS
- int i, frags=0;
- int last_bit = 0, next_bit;
-
- for (i = 0; i < node_record_count; i++) {
- next_bit = bit_test(req_bitmap, i);
- if (next_bit == last_bit)
- continue;
- last_bit = next_bit;
- if (next_bit && (++frags > MAX_NODE_FRAGMENTS))
- return true;
- }
-#endif
- return false;
-}