Permalink
Browse files

new tag

  • Loading branch information...
2 parents e7ac440 + 6a8f838 commit 89abcb6e3e01560efe05ac0457be3f8e57fa4bce @dannyauble dannyauble committed Apr 14, 2009
View
@@ -56,11 +56,25 @@ documents those changes that are of interest to users and admins.
count from a range rather than minimum (e.g. "sbatch -N1-4 my.sh").
-- In accounting_storage/filetxt and accounting_storage/pgsql fix
possible invalid memory reference when a job lacks a name.
+ -- Give srun command an exit code of 1 if the prolog fails.
+ -- BLUEGENE - allow checking of nodecard states in the system instead
+ of midplane state, so that an entire midplane is not marked down
+ unnecessarily.
+ -- BLUEGENE - fix creation of MESH blocks
+ -- BLUEGENE - on job cancellation we call jm_cancel_job and then wait until
+ the system cleans up the job. Previously we would send a SIGKILL right
+ at the beginning.
+ -- BLUEGENE - if a user specifies a node count that can not be met, the job
+ will be refused. Previously the plugin would search for the next
+ larger size that could be created. This prevents users from asking for
+ things that can't be created and then getting back something they might
+ not be expecting.
* Changes in SLURM 1.3.14
=========================
-- SECURITY BUG: Fix in sbcast logic that permits users to write files based
- upon supplimental groups of the slurmd daemon.
+ upon supplemental groups of the slurmd daemon. Similar logic for event
+ triggers if slurmctld is run as user root (not typical).
* Changes in SLURM 1.3.13
=========================
@@ -248,7 +248,13 @@ List of accounts to use for the report Default is all. The SizesByAccount
report only displays 1 hierarchical level. If accounts are specified
the next layer of accounts under those specified will be displayed,
not the accounts specified. In the SizesByAccount reports the default
-for accounts is root.
+for accounts is root. This explanation does not apply when run with
+the FlatView option.
+.TP
+.B FlatView
+When used with the SizesByAccount report, accounts are not grouped
+hierarchically; each account in which jobs ran is printed on a
+separate line without any hierarchy.
.TP
.B GID=<OPT>
List of group ids to include in report. Default is all.
View
@@ -111,6 +111,24 @@ uid_to_string (uid_t uid)
}
+/*
+ * Return the primary group id (pw_gid) of the password database entry
+ * for the given user id, or (gid_t) -1 if the lookup fails or no entry
+ * exists for that uid.
+ */
gid_t
+gid_from_uid (uid_t uid)
+{
+	struct passwd pwd, *result = NULL;
+	char buffer[PW_BUF_SIZE];
+
+	/* getpwuid_r() returns non-zero on error and leaves result NULL
+	 * when no matching entry is found; both cases are failures here.
+	 * result is pre-set to NULL so it is never read uninitialized. */
+	if (getpwuid_r(uid, &pwd, buffer, sizeof(buffer), &result)
+	    || (result == NULL))
+		return (gid_t) -1;
+
+	return result->pw_gid;
+}
+
+gid_t
gid_from_string (char *name)
{
struct group grp, *result;
View
@@ -60,6 +60,12 @@
uid_t uid_from_string (char *name);
/*
+ * Return the primary group id for a given user id, or
+ * (gid_t) -1 on failure.
+ */
+gid_t gid_from_uid (uid_t uid);
+
+/*
* Same as uid_from_string(), but for group name/id.
*/
gid_t gid_from_string (char *name);
@@ -597,7 +597,7 @@ extern int new_ba_request(ba_request_t* ba_request)
ba_request->elongate_geos,
ba_request->rotate);
}
- startagain:
+// startagain:
picked=0;
for(i=0;i<8;i++)
checked[i]=0;
@@ -648,9 +648,21 @@ extern int new_ba_request(ba_request_t* ba_request)
break;
}
}
+ /* This size can not be made into a
+ block return. If you want to try
+ until we find the next largest block
+ uncomment the code below and the goto
+ above. If a user specifies a max
+ node count the job will never
+ run.
+ */
if(i2==1) {
- ba_request->size +=1;
- goto startagain;
+ error("Can't make a block of "
+ "%d into a cube.",
+ ba_request->size);
+ return 0;
+/* ba_request->size +=1; */
+/* goto startagain; */
}
} else {
@@ -2894,7 +2906,7 @@ static int _append_geo(int *geometry, List geos, int rotate)
/*
* Fill in the paths and extra midplanes we need for the block.
- * Basically copy the x path sent in with the start_list in each Y anx
+ * Basically copy the x path sent in with the start_list in each Y and
* Z dimension filling in every midplane for the block and then
* completing the Y and Z wiring, tying the whole block together.
*
@@ -2929,14 +2941,12 @@ static int _fill_in_coords(List results, List start_list,
curr_switch = &check_node->axis_switch[X];
for(y=0; y<geometry[Y]; y++) {
- if((check_node->coord[Y]+y)
- >= DIM_SIZE[Y]) {
+ if((check_node->coord[Y]+y) >= DIM_SIZE[Y]) {
rc = 0;
goto failed;
}
for(z=0; z<geometry[Z]; z++) {
- if((check_node->coord[Z]+z)
- >= DIM_SIZE[Z]) {
+ if((check_node->coord[Z]+z) >= DIM_SIZE[Z]) {
rc = 0;
goto failed;
}
@@ -3304,7 +3314,7 @@ static int _find_yz_path(ba_node_t *ba_node, int *first,
geometry[i2], i2, count);
return 0;
}
- } else if(geometry[i2] == 1) {
+ } else if((geometry[i2] == 1) && (conn_type == SELECT_TORUS)) {
/* FIX ME: This is put here because we got
into a state where the Y dim was not being
processed correctly. This will set up the
@@ -4247,10 +4257,16 @@ static int _find_x_path(List results, ba_node_t *ba_node,
/* we don't need to go any further */
if(x_size == 1) {
- curr_switch->int_wire[source_port].used = 1;
- curr_switch->int_wire[source_port].port_tar = target_port;
- curr_switch->int_wire[target_port].used = 1;
- curr_switch->int_wire[target_port].port_tar = source_port;
+ /* Only set this if Torus since mesh doesn't have any
+ * connections in this path */
+ if(conn_type == SELECT_TORUS) {
+ curr_switch->int_wire[source_port].used = 1;
+ curr_switch->int_wire[source_port].port_tar =
+ target_port;
+ curr_switch->int_wire[target_port].used = 1;
+ curr_switch->int_wire[target_port].port_tar =
+ source_port;
+ }
return 1;
}
@@ -197,6 +197,8 @@ extern int block_ready(struct job_record *job_ptr)
xfree(block_id);
} else
rc = READY_JOB_ERROR;
+/* info("returning %d for job %u %d %d", */
+/* rc, job_ptr->job_id, READY_JOB_ERROR, READY_JOB_FATAL); */
return rc;
}
@@ -211,8 +213,8 @@ extern void pack_block(bg_record_t *bg_record, Buf buffer)
pack16((uint16_t)bg_record->conn_type, buffer);
#ifdef HAVE_BGL
pack16((uint16_t)bg_record->node_use, buffer);
- pack16((uint16_t)bg_record->quarter, buffer);
- pack16((uint16_t)bg_record->nodecard, buffer);
+ pack16((uint16_t)0, buffer);
+ pack16((uint16_t)0, buffer);
#endif
pack32((uint32_t)bg_record->node_cnt, buffer);
pack_bit_fmt(bg_record->bitmap, buffer);
@@ -624,6 +626,7 @@ extern int update_freeing_block_list()
state);
bg_record->state = state;
+ updated = 1;
}
next_block:
if ((rc = bridge_free_block(block_ptr))
@@ -137,10 +137,16 @@ static void _rotate_geo(uint16_t *req_geometry, int rot_cnt)
*/
static int _bg_record_sort_aval_inc(bg_record_t* rec_a, bg_record_t* rec_b)
{
- if(rec_a->job_ptr && !rec_b->job_ptr)
+ if((rec_a->job_running == BLOCK_ERROR_STATE)
+ && (rec_b->job_running != BLOCK_ERROR_STATE))
+ return 1;
+ else if((rec_a->job_running != BLOCK_ERROR_STATE)
+ && (rec_b->job_running == BLOCK_ERROR_STATE))
return -1;
else if(!rec_a->job_ptr && rec_b->job_ptr)
return 1;
+ else if(rec_a->job_ptr && !rec_b->job_ptr)
+ return -1;
else if(rec_a->job_ptr && rec_b->job_ptr) {
if(rec_a->job_ptr->start_time > rec_b->job_ptr->start_time)
return 1;
@@ -159,10 +165,16 @@ static int _bg_record_sort_aval_inc(bg_record_t* rec_a, bg_record_t* rec_b)
*/
static int _bg_record_sort_aval_dec(bg_record_t* rec_a, bg_record_t* rec_b)
{
- if(rec_a->job_ptr && !rec_b->job_ptr)
+ if((rec_a->job_running == BLOCK_ERROR_STATE)
+ && (rec_b->job_running != BLOCK_ERROR_STATE))
+ return -1;
+ else if((rec_a->job_running != BLOCK_ERROR_STATE)
+ && (rec_b->job_running == BLOCK_ERROR_STATE))
return 1;
else if(!rec_a->job_ptr && rec_b->job_ptr)
return -1;
+ else if(rec_a->job_ptr && !rec_b->job_ptr)
+ return 1;
else if(rec_a->job_ptr && rec_b->job_ptr) {
if(rec_a->job_ptr->start_time > rec_b->job_ptr->start_time)
return -1;
@@ -647,12 +659,26 @@ static int _check_for_booted_overlapping_blocks(
}
destroy_bg_record(bg_record);
if(!found_record) {
- debug2("This record wasn't "
- "found in the bg_list, "
- "no big deal, it "
- "probably wasn't added");
+ /* There may be a bug
+ here where on a real
+ system we don't go
+ destroy this block
+ in the real system.
+ If that is the case we
+ need to add the
+ bg_record to the
+ free_block_list
+ instead of destroying
+ it like above.
+ */
+ debug("This record wasn't "
+ "found in the bg_list, "
+ "no big deal, it "
+ "probably wasn't added");
//rc = SLURM_ERROR;
} else {
+ debug("removing the block "
+ "from the system");
List temp_list =
list_create(NULL);
list_push(temp_list,
@@ -1022,10 +1048,8 @@ static int _find_best_block_match(List block_list,
"block %s in an error state "
"because of bad bps.",
bg_record->bg_block_id);
- bg_record->job_running =
- BLOCK_ERROR_STATE;
- bg_record->state = RM_PARTITION_ERROR;
- trigger_block_error();
+ put_block_in_error_state(
+ bg_record, BLOCK_ERROR_STATE);
continue;
}
}
@@ -1094,14 +1118,29 @@ static int _find_best_block_match(List block_list,
request.geometry[i] = req_geometry[i];
bg_record = list_pop(job_list);
- if(bg_record)
- debug2("taking off %d(%s) started "
- "at %d ends at %d",
- bg_record->job_running,
- bg_record->bg_block_id,
- bg_record->job_ptr->start_time,
- bg_record->job_ptr->end_time);
- else
+ if(bg_record) {
+ if(bg_record->job_ptr)
+ debug2("taking off %d(%s) "
+ "started at %d "
+ "ends at %d",
+ bg_record->job_running,
+ bg_record->bg_block_id,
+ bg_record->job_ptr->
+ start_time,
+ bg_record->job_ptr->
+ end_time);
+ else if(bg_record->job_running
+ == BLOCK_ERROR_STATE)
+ debug2("taking off (%s) "
+ "which is in an error "
+ "state",
+ bg_record->job_running,
+ bg_record->bg_block_id,
+ bg_record->job_ptr->
+ start_time,
+ bg_record->job_ptr->
+ end_time);
+ } else
/* This means we didn't have
any jobs to take off
anymore so we are making
@@ -1349,7 +1388,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
else
starttime =
bg_record->job_ptr->end_time;
- }
+ } else if(bg_record->job_running == BLOCK_ERROR_STATE)
+ starttime = INFINITE;
job_ptr->start_time = starttime;
Oops, something went wrong.

0 comments on commit 89abcb6

Please sign in to comment.