Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

retag with wiki2 mods.

  • Loading branch information...
commit a12ec8a29a311e8772ed8ea6fd329afc8bc6c6d3 2 parents e13dc04 + c38d3d4
@jette jette authored
Showing with 24 additions and 4 deletions.
  1. +1 −0  NEWS
  2. +23 −4 src/plugins/sched/wiki2/start_job.c
View
1  NEWS
@@ -9,6 +9,7 @@ documents those changes that are of interest to users and admins.
- BLUEGENE - add libsched_if.so so mpirun doesn't try to create a block
by itself.
- Enable specification of srun --jobid=# option with --batch (for user root).
+ - Verify that job actually starts when requested by sched/wiki2.
* Changes in SLURM 1.1.12
=========================
View
27 src/plugins/sched/wiki2/start_job.c
@@ -109,6 +109,7 @@ static int _start_job(uint32_t jobid, char *hostlist,
slurmctld_lock_t job_write_lock = {
NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
char *new_node_list;
+ static char tmp_msg[128];
bitstr_t *new_bitmap;
lock_slurmctld(job_write_lock);
@@ -168,11 +169,29 @@ static int _start_job(uint32_t jobid, char *hostlist,
job_ptr->priority = 1000000;
fini: unlock_slurmctld(job_write_lock);
- /* functions below provide their own locking */
if (rc == 0) { /* New job to start ASAP */
- (void) schedule();
- schedule_node_save();
- schedule_job_save();
+ (void) schedule(); /* provides own locking */
+ /* Check to ensure the job was actually started */
+ lock_slurmctld(job_write_lock);
+ /* job_ptr = find_job_record(jobid); don't bother */
+ if ((job_ptr->job_id == jobid)
+ && (job_ptr->job_state != JOB_RUNNING)) {
+ uint16_t wait_reason = 0;
+ error("wiki: failed to start job %u", jobid);
+ job_ptr->priority = 0;
+ if (job_ptr->details)
+ wait_reason = job_ptr->details->wait_reason;
+ *err_code = 910 + wait_reason;
+ snprintf(tmp_msg, sizeof(tmp_msg),
+ "Could not start job %u: %s",
+ jobid, job_reason_string(wait_reason));
+ *err_msg = tmp_msg;
+ error("wiki: %s", tmp_msg);
+ rc = -1;
+ }
+ unlock_slurmctld(job_write_lock);
+ schedule_node_save(); /* provides own locking */
+ schedule_job_save(); /* provides own locking */
}
return rc;
}
Please sign in to comment.
Something went wrong with that request. Please try again.