Browse files

create v1.2.16 tag

  • Loading branch information...
2 parents de93b68 + e25abc2 commit 01833b87ced69a66e4b37839ab7066adbaa495df Moe Jette committed Sep 11, 2007
View
8 NEWS
@@ -10,6 +10,14 @@ documents those changes that are of interest to users and admins.
-- In sched/wiki2, fix bug processing condensed hostlist expressions.
-- Release contribs/mpich1.slurm.patch without GPL license.
-- Fix bug in mvapich plugin for read/write calls that return EAGAIN.
+ -- Don't start MVAPICH timeout logic until we know that srun is starting
+ an MVAPICH program.
+ -- Fix srun to allocate only the number of nodes needed for the requested
+ task count when combining allocation and step creation in srun.
+ -- Execute task-prolog within proctrack container to ensure that all
+ child processes get terminated.
+ -- Fixed job accounting to work with sgi_job proctrack plugin.
+
* Changes in SLURM 1.2.15
=========================
View
6 src/api/job_info.c
@@ -128,7 +128,7 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
char tmp1[128], tmp2[128];
char tmp_line[128];
char *ionodes = NULL;
- uint16_t term_sig = 0;
+ uint16_t exit_status = 0, term_sig = 0;
char *out = NULL;
#ifdef HAVE_BG
@@ -193,9 +193,11 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
xstrcat(out, tmp_line);
if (WIFSIGNALED(job_ptr->exit_code))
term_sig = WTERMSIG(job_ptr->exit_code);
+ else
+ exit_status = WEXITSTATUS(job_ptr->exit_code);
snprintf(tmp_line, sizeof(tmp_line),
"ExitCode=%u:%u",
- WEXITSTATUS(job_ptr->exit_code), term_sig);
+ exit_status, term_sig);
xstrcat(out, tmp_line);
if (one_liner)
xstrcat(out, " ");
View
2 src/common/env.c
@@ -294,7 +294,7 @@ int setup_env(env_t *env)
if (env->cpus_on_node
&& setenvf(&env->env, "SLURM_CPUS_ON_NODE", "%d",
env->cpus_on_node) ) {
- error("Unable to set SLURM_CPUS_PER_TASK");
+ error("Unable to set SLURM_CPUS_ON_NODE");
rc = SLURM_FAILURE;
}
View
2 src/plugins/jobacct/common/common_slurmstepd.c
@@ -62,7 +62,7 @@ extern int common_set_proctrack_container_id(uint32_t id)
info("Warning: jobacct: set_proctrack_container_id: "
"cont_id is already set to %d you are setting it to %d",
cont_id, id);
- if(id <= 0) {
+ if((int)id <= 0) {
error("jobacct: set_proctrack_container_id: "
"I was given most likely an unset cont_id %d",
id);
View
37 src/plugins/mpi/mvapich/mvapich.c
@@ -230,7 +230,7 @@ static int startup_timeout (mvapich_state_t *st)
now = time (NULL);
if (!st->start_time)
- st->start_time = now;
+ return (-1);
remaining = st->timeout - (now - st->start_time);
@@ -282,41 +282,52 @@ static int mvapich_write_n (mvapich_state_t *st, struct mvapich_info *mvi,
{
int nleft = len;
int n = 0;
+ unsigned char * p = buf;
- while (nleft) {
+ while (nleft > 0) {
/* Poll for write-activity */
if (mvapich_poll (st, mvi, 1) < 0)
return (-1);
- if ((n = fd_write_n (mvi->fd, buf, len)) < 0 &&
- (errno != EAGAIN))
+ if ((n = write (mvi->fd, p, nleft)) < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ continue;
return (-1);
+ }
nleft -= n;
+ p += n;
}
- return (n);
+ return (len - nleft);
}
static int mvapich_read_n (mvapich_state_t *st, struct mvapich_info *mvi,
void *buf, size_t len)
{
int nleft = len;
int n = 0;
+ unsigned char * p = buf;
- while (nleft) {
+ while (nleft > 0) {
/* Poll for write-activity */
if (mvapich_poll (st, mvi, 0) < 0)
return (-1);
- if ((n = fd_read_n (mvi->fd, buf, len)) < 0 &&
- (errno != EAGAIN))
+ if ((n = read (mvi->fd, p, nleft)) < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ continue;
+ return (-1);
+ }
+
+ if (n == 0) /* unexpected EOF */
return (-1);
nleft -= n;
+ p += n;
}
- return (n);
+ return (len - nleft);
}
@@ -342,6 +353,8 @@ static int mvapich_abort_sends_rank (mvapich_state_t *st)
static int mvapich_get_task_info (mvapich_state_t *st,
struct mvapich_info *mvi)
{
+ mvi->do_poll = 0;
+
if (mvapich_read_n (st, mvi, &mvi->addrlen, sizeof (int)) <= 0)
return error ("mvapich: Unable to read addrlen for rank %d: %m",
mvi->rank);
@@ -367,8 +380,6 @@ static int mvapich_get_task_info (mvapich_state_t *st,
mvi->rank);
}
- mvi->do_poll = 0;
-
return (0);
}
@@ -1135,6 +1146,10 @@ static void *mvapich_thr(void *arg)
if (first) {
mvapich_debug ("first task checked in");
do_timings (st);
+ /*
+ * Officially start timeout timer now.
+ */
+ st->start_time = time(NULL);
first = 0;
}
View
6 src/slurmctld/job_mgr.c
@@ -1661,12 +1661,12 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
if (job_return_code == NO_VAL) {
job_ptr->job_state = JOB_CANCELLED| job_comp_flag;
job_ptr->requid = uid;
- } else if (WEXITSTATUS(job_return_code)) {
+ } else if (WIFEXITED(job_return_code) &&
+ WEXITSTATUS(job_return_code)) {
job_ptr->job_state = JOB_FAILED | job_comp_flag;
job_ptr->exit_code = job_return_code;
job_ptr->state_reason = FAIL_EXIT_CODE;
- }
- else if (job_comp_flag && /* job was running */
+ } else if (job_comp_flag && /* job was running */
(job_ptr->end_time < now)) { /* over time limit */
job_ptr->job_state = JOB_TIMEOUT | job_comp_flag;
job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
View
22 src/slurmd/slurmstepd/mgr.c
@@ -951,8 +951,6 @@ _fork_all_tasks(slurmd_job_t *job)
error ("Unable to return to working directory");
}
- jobacct_g_set_proctrack_container_id(job->cont_id);
-
for (i = 0; i < job->ntasks; i++) {
/*
* Put this task in the step process group
@@ -975,6 +973,7 @@ _fork_all_tasks(slurmd_job_t *job)
return SLURM_ERROR;
}
}
+ jobacct_g_set_proctrack_container_id(job->cont_id);
/*
* Now it's ok to unblock the tasks, so they may call exec.
@@ -1690,6 +1689,9 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
return -1;
}
+ if (slurm_container_create(job) != SLURM_SUCCESS)
+ error("slurm_container_create: %m");
+
if ((cpid = fork()) < 0) {
error ("executing %s: fork: %m", name);
return -1;
@@ -1720,6 +1722,8 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
exit(127);
}
+ if (slurm_container_add(job, cpid) != SLURM_SUCCESS)
+ error("slurm_container_add: %m");
if (max_wait < 0)
opt = 0;
else
@@ -1731,18 +1735,22 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
if (errno == EINTR)
continue;
error("waidpid: %m");
- return 0;
+ status = 0;
+ break;
} else if (rc == 0) {
sleep(1);
if ((--max_wait) == 0) {
killpg(cpid, SIGKILL);
opt = 0;
}
} else {
- killpg(cpid, SIGKILL); /* kill children too */
- return status;
+ /* spawned process exited */
+ break;
}
}
-
- /* NOTREACHED */
+ /* Insure that all child processes get killed */
+ killpg(cpid, SIGKILL);
+ slurm_container_signal(job->cont_id, SIGKILL);
+
+ return status;
}
View
9 src/srun/opt.c
@@ -2357,8 +2357,15 @@ static bool _opt_verify(void)
}
} else if (opt.nodes_set && opt.nprocs_set) {
+ /*
+ * Make sure in a non allocate situation that
+ * the number of max_nodes is <= number of tasks
+ */
+ if (!opt.allocate && opt.nprocs < opt.max_nodes)
+ opt.max_nodes = opt.nprocs;
+
/*
- * make sure # of procs >= min_nodes
+ * make sure # of procs >= min_nodes || max_nodes
*/
if (opt.nprocs < opt.min_nodes) {
View
8 src/srun/srun.c
@@ -250,6 +250,14 @@ int srun(int ac, char **av)
if (msg_thr_create(job) < 0)
job_fatal(job, "Unable to create msg thread");
exitcode = _run_job_script(job, env);
+
+ /* close up the msg thread cleanly */
+ close(job->forked_msg->msg_par->msg_pipe[1]);
+ debug2("Waiting for message thread");
+ if (pthread_join(job->jtid, NULL) < 0)
+ error ("Waiting on message thread: %m");
+ debug2("done");
+
srun_job_destroy(job,exitcode);
debug ("Spawned srun shell terminated");
View
6 testsuite/expect/globals
@@ -350,6 +350,12 @@ proc wait_for_file { file_name } {
for {set my_delay 0} {$my_delay <= $max_file_delay} {incr my_delay} {
if [file exists $file_name] {
# Add small delay for I/O buffering
+ for {} {$my_delay <= $max_file_delay} {incr my_delay} {
+ if {[file size $file_name] != 0} {
+ break
+ }
+ exec $bin_sleep 1
+ }
exec $bin_sleep 2
return 0
}
View
4 testsuite/expect/test1.88
@@ -57,6 +57,10 @@ if {[test_front_end] != 0} {
send_user "\nWARNING: This test is incompatable with front-end systems\n"
exit 0
}
+if {[test_aix] == 1} {
+ send_user "WARNING: Test is incompatible with AIX\n"
+ exit 0
+}
#
# Delete left-over program and rebuild it
View
7 testsuite/expect/test11.5
@@ -159,6 +159,13 @@ if {$matches != 1} {
set exit_code 1
}
+# Actual checkpoint on AIX only works for tasks launched using POE
+if {[test_aix] == 1} {
+ send_user "WARNING: Further testing is incompatible with AIX\n"
+ cancel_job $job_id
+ exit $exit_code
+}
+
#
# Create a checkpoint, continue execution
#
View
5 testsuite/expect/test14.7
@@ -130,6 +130,11 @@ if {[wait_for_file $file_err] == 0} {
incr matches
exp_continue
}
+ -re "not found" {
+ send_user "These errors are expected, no worries\n"
+ incr matches
+ exp_continue
+ }
eof {
wait
}
View
2 testsuite/expect/test18.17
@@ -76,7 +76,7 @@ set spawn_id $slaunch_spawn_id
set sum 0
expect {
-re "exit code ($number)" {
- send_user "This error is expected, no worries\n"
+ send_user "\nThis error is expected, no worries\n"
incr sum $expect_out(1,string)
exp_continue
}
View
20 testsuite/expect/test7.5
@@ -51,14 +51,6 @@ set test_prog "test$test_id.prog"
print_header $test_id
#
-# Put desired SLURM install directory at head of search path for bulk launch
-# command to work (runs "slaunch" without path)
-#
-global env
-set env(PATH) "$slurm_dir/bin:$env(PATH)"
-send_user "\n $env(PATH)\n"
-
-#
# Test for existence of mpi compiler and totalview
#
if {[info exists mpicc] == 0} {
@@ -81,6 +73,18 @@ if {[test_front_end] != 0} {
send_user "\nWARNING: This test is incompatable with front-end systems\n"
exit 0
}
+if {[test_aix] == 1} {
+ send_user "WARNING: Test is incompatible with AIX\n"
+ exit 0
+}
+
+#
+# Put desired SLURM install directory at head of search path for bulk launch
+# command to work (runs "slaunch" without path)
+#
+global env
+set env(PATH) "$slurm_dir/bin:$env(PATH)"
+send_user "\n $env(PATH)\n"
#
# Delete left-over program and rebuild it
View
18 testsuite/expect/test7.6
@@ -55,13 +55,6 @@ set no_bulk "set issue_dgo false; dset TV::bulk_launch_enabled false"
print_header $test_id
#
-# Put desired SLURM install directory at head of search path for bulk launch
-# command to work (runs "srun" without path)
-#
-set env(PATH) "$slurm_dir/bin:$env(PATH)"
-send_user "\n $env(PATH)\n"
-
-#
# Test for existence of mpi compiler and totalview
#
if {[info exists mpicc] == 0} {
@@ -84,6 +77,17 @@ if {[test_front_end] != 0} {
send_user "\nWARNING: This test is incompatable with front-end systems\n"
exit 0
}
+if {[test_aix] == 1} {
+ send_user "WARNING: Test is incompatible with AIX\n"
+ exit 0
+}
+
+#
+# Put desired SLURM install directory at head of search path for bulk launch
+# command to work (runs "srun" without path)
+#
+set env(PATH) "$slurm_dir/bin:$env(PATH)"
+send_user "\n $env(PATH)\n"
#
# Delete left-over program and rebuild it

0 comments on commit 01833b8

Please sign in to comment.