Skip to content
Browse files

This commit was manufactured by cvs2svn to create tag

'slurm-0-4-15-1'.
  • Loading branch information...
1 parent 5ab7088 commit 0ad0e3d0710b51bd024bff7ff2b70fc76a76ea30 no author committed Mar 25, 2005
View
4 META
@@ -9,8 +9,8 @@
Name: slurm
Major: 0
Minor: 4
- Micro: 6
- Version: 0.4.6
+ Micro: 15
+ Version: 0.4.15
Release: 1
API_CURRENT: 6
API_AGE: 4
View
40 NEWS
@@ -1,9 +1,47 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.4.15
+========================
+ -- Added error checking to a bunch of Bridge API calls and handle
+ failure modes more gracefully.
+
+* Changes in SLURM 0.4.14
+========================
+ -- job state is kept on warm start of slurm
+
+* Changes in SLURM 0.4.13
+========================
+ -- epilog fix for bgl plugin
+
+* Changes in SLURM 0.4.12
+========================
+ -- bug fixes for new api calls.
+ -- added BridgeAPILogFile as an option for bluegene.conf file
+
+* Changes in SLURM 0.4.11
+========================
+ -- changed as many rm_get_partition() to rm_get_partitions_info as we could
+ for time saving.
+
+* Changes in SLURM 0.4.10
+========================
+ -- redesign for BGL external wiring.
+ -- smap display bug fix for smaller systems.
+
+* Changes in SLURM 0.4.9
+========================
+ -- setpnum works now, have to include this in bluegene.conf
+
+* Changes in SLURM 0.4.8
+========================
+ -- Changed the prolog and the epilog to use the env var MPIRUN_PARTITION
+ instead of BGL_PARTITION_ID
+
* Changes in SLURM 0.4.7
========================
- -- Remove some BGL specific headers that IBM now distributes.
+ -- Remove some BGL specific headers that IBM now distributes. NOTE:
+ BGL driver 080 or greater is required.
-- Change autogen.sh to deal with problems running autoconf on one
system and configure on another with different software versions.
View
2 auxdir/x_ac_bgl.m4
@@ -33,7 +33,7 @@ AC_DEFUN([X_AC_BGL],
fi
have_bgl_ar=yes
- bgl_ldflags="$bgl_ldflags -Wl,-rpath $bgl_dir/lib -Wl,-L$bgl_dir/lib -Wl,-whole-archive -Wl,-lbglbridge -Wl,-no-whole-archive $bgl_dir/lib/bglbootload.a $bgl_dir/lib/bglsp440supt.a -lbgldb -lbglmachine -ltableapi -lexpat -lbglsp"
+ bgl_ldflags="$bgl_ldflags -Wl,-rpath $bgl_dir/lib -Wl,-L$bgl_dir/lib -Wl,-whole-archive -Wl,-lbglbridge -Wl,-no-whole-archive $bgl_dir/lib/bglbootload.a $bgl_dir/lib/bglsp440supt.a -lsaymessage -lbgldb -lbglmachine -ltableapi -lexpat -lbglsp"
fi
# Search for required DB2 library in the directory
View
6 doc/html/bluegene.html
@@ -99,7 +99,7 @@
The script that you submit to SLURM can contain multiple invocations of mpirun as
well as any desired commands for pre- and post-processing.
The mpirun command will get its <i>bglblock</i> or BGL partition information from the
-<i>BGL_PARTITION_ID</i> as set by SLURM. A sample script is shown below.
+<i>MPIRUN_PARTITION</i> environment variable as set by SLURM. A sample script is shown below.
<pre>
#!/bin/bash
# pre-processing
@@ -185,11 +185,11 @@
and interfaces.
The value of <i>SchedulerType</i> should be set to "sched/builtin".
The value of <i>Prolog</i> should be set to a program that will delay
-execution until the bglblock identified by the BGL_PARTITION_ID environment
+execution until the bglblock identified by the MPIRUN_PARTITION environment
variable is ready for use. It is recommended that you construct a script
that serves this function and calls the supplied program <i>slurm_prolog</i>.
The value of <i>Epilog</i> should be set to a program that will wait
-until the bglblock identified by the BGL_PARTITION_ID environment
+until the bglblock identified by the MPIRUN_PARTITION environment
variable has been freed. It is recommended that you construct a script
that serves this function and calls the supplied program <i>slurm_epilog</i>.
The prolog and epilog programs are used to ensure proper synchronization
View
21 etc/bluegene.conf.example
@@ -5,17 +5,30 @@
# LinuxImage: LinuxImage used for creation of all bglblocks.
# MloaderImage: MloaderImage used for creation of all bglblocks.
# RamDiskImage: RamDiskImage used for creation of all bglblocks.
-# ChangeNumpsets: Script to reset a created bglblock's Numpsets value.
-# Will be removed when an API is available for this.
-# Default value uses half of available I/O nodes.
+# Numpsets: The Numpsets used for creation of all bglblocks
+# equals this value multiplied by the number of
+# base partitions in the bglblock.
+#
+# BridgeAPILogFile: Pathname of file in which to write the BGL
+# Bridge API logs.
+# BridgeAPIVerbose: How verbose the BGL Bridge API logs should be
+# 0: Log only error and warning messages
+# 1: Log level 0 and informational messages
+# 2: Log level 1 and basic debug messages
+# 3: Log level 2 and more debug messages
+# 4: Log all messages
+#
# NOTE: The bgl_serial value is set at configuration time using the
# "--with-bgl-serial=" option. Its default value is "BGL".
#
BlrtsImage=/bgl/BlueLight/ppcfloor/bglsys/bin/rts_hw.rts
LinuxImage=/bgl/BlueLight/ppcfloor/bglsys/bin/zImage.elf
MloaderImage=/bgl/BlueLight/ppcfloor/bglsys/bin/mmcs-mloader.rts
RamDiskImage=/bgl/BlueLight/ppcfloor/bglsys/bin/ramdisk.elf
-#ChangeNumpsets=/etc/slurm/change_numpsets
+Numpsets=8
+#
+BridgeAPILogFile=/var/log/slurm/bridgeapi.log
+BridgeAPIVerbose=0
#
# Define the static partitions (bglblocks)
View
2 src/partition_allocator/Makefile.am
@@ -16,7 +16,7 @@ INCLUDES = -I$(top_srcdir) $(BGL_INCLUDES)
# $(top_builddir)/src/api/libslurm.la
# partition_allocator_LDFLAGS = -export-dynamic -lm $(CMD_LDFLAGS) $(BGL_LDFLAGS)
-# CPPFLAGS = -DBUILD_EXE
+# CPPFLAGS = -DBUILD_EXE
# making a .la
noinst_LTLIBRARIES = libpartition_allocator.la
View
121 src/partition_allocator/partition_allocator.c
@@ -92,8 +92,7 @@ static void _switch_config(pa_node_t* source, pa_node_t* target, int dim,
int port_src, int port_tar);
/* */
static void _set_external_wires(int dim, int count, pa_node_t* source,
- pa_node_t* target_1, pa_node_t* target_2,
- pa_node_t* target_first, pa_node_t* target_second);
+ pa_node_t* target_1, pa_node_t* target_2);
/* */
static char *_set_internal_wires(List nodes, int size, int conn_type);
@@ -474,7 +473,7 @@ void pa_init(node_info_msg_t *node_info_ptr)
_create_pa_system();
pa_system_ptr->fill_in_value = (pa_node_t *)
- xmalloc(sizeof(pa_node_t) * pa_system_ptr->num_of_proc);
+ xmalloc(sizeof(pa_node_t) * 128);
init_grid(node_info_ptr);
@@ -985,7 +984,7 @@ static int _create_config_even(pa_node_t *grid)
#if HAVE_BGL
int y,z;
- pa_node_t *target_2, *target_first, *target_second;
+ pa_node_t *target_2;
for(x=0;x<DIM_SIZE[X];x++) {
for(y=0;y<DIM_SIZE[Y];y++) {
for(z=0;z<DIM_SIZE[Z];z++) {
@@ -999,31 +998,23 @@ static int _create_config_even(pa_node_t *grid)
target_2 = &grid[x+2][y][z];
else
target_2 = target_1;
- target_first = &grid[0][y][z];
- if (DIM_SIZE[X] > 1)
- target_second = &grid[1][y][z];
- else
- target_second = target_first;
_set_external_wires(X, x, source,
- target_1, target_2,
- target_first, target_second);
+ target_1, target_2);
if(y<(DIM_SIZE[Y]-1))
target_1 = &grid[x][y+1][z];
else
target_1 = &grid[x][0][z];
_set_external_wires(Y, y, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
if(z<(DIM_SIZE[Z]-1))
target_1 = &grid[x][y][z+1];
else
target_1 = &grid[x][y][0];
_set_external_wires(Z, z, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
}
}
}
@@ -1051,8 +1042,7 @@ static int _create_config_even(pa_node_t *grid)
target_1 = &grid[x+1];
_set_external_wires(X, x, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
}
#endif
return 1;
@@ -1434,8 +1424,7 @@ static void _switch_config(pa_node_t* source, pa_node_t* target, int dim,
}
static void _set_external_wires(int dim, int count, pa_node_t* source,
- pa_node_t* target_1, pa_node_t* target_2,
- pa_node_t* target_first, pa_node_t* target_second)
+ pa_node_t* target_1, pa_node_t* target_2)
{
_switch_config(source, source, dim, 0, 0);
_switch_config(source, source, dim, 1, 1);
@@ -1445,47 +1434,46 @@ static void _set_external_wires(int dim, int count, pa_node_t* source,
_switch_config(source, source, dim, 4, 4);
return;
}
+
if(count==0) {
- /* First Node */
+ /* First Even Node */
/* 4->3 of next */
_switch_config(source, target_1, dim, 4, 3);
- /* 2->5 of next */
- _switch_config(source, target_1, dim, 2, 5);
- /* 3->4 of next even */
- _switch_config(source, target_2, dim, 3, 4);
- if(DIM_SIZE[dim]<4) {
- /* 5->2 of next even */
- _switch_config(source, target_2, dim, 5, 2);
-
- }
-
+ /* 5->2 of next */
+ _switch_config(source, target_1, dim, 5, 2);
+ /* 2->5 of next even */
+ _switch_config(source, target_2, dim, 2, 5);
+
} else if(!(count%2)) {
if(count<DIM_SIZE[dim]-2) {
/* Not Last Even Node */
- /* 3->4 of next even */
- _switch_config(source, target_2, dim, 3, 4);
- /* 2->5 of next */
- _switch_config(source, target_1, dim, 2, 5);
- /* 5->2 of next */
- _switch_config(source, target_1, dim, 5, 2);
+ /* 3->4 of next */
+ _switch_config(source, target_1, dim, 3, 4);
+ /* 4->3 of next */
+ _switch_config(source, target_1, dim, 4, 3);
+ /* 2->5 of next even */
+ _switch_config(source, target_2, dim, 2, 5);
+ /* 5->2 of next even */
+ _switch_config(source, target_2, dim, 5, 2);
+
} else {
/* Last Even Node */
/* 3->4 of next */
_switch_config(source, target_1, dim, 3, 4);
- /* 5->2 of next */
- _switch_config(source, target_1, dim, 5, 2);
- /* 2->5 of first */
- _switch_config(source, target_first, dim, 2, 5);
+ /* 2->5 of previous */
+ /********** fix me: on the full system this is needed ******/
+ //_switch_config(source, target_1, dim, 2, 5);
+ /********** fix me: not this ******/
+ _switch_config(source, target_1, dim, 4, 3);
}
} else {
if(count<DIM_SIZE[dim]-2) {
/* Not Last Odd Node */
- /* 4->3 of next odd */
- _switch_config(source, target_2, dim, 4, 3);
+ /* 5->2 of next odd */
+ _switch_config(source, target_2, dim, 5, 2);
} else {
/* Last Odd Node */
- /* 5->2 of second */
- _switch_config(source, target_second, dim, 5, 2);
+ /* nothing */
}
}
}
@@ -1891,42 +1879,33 @@ int main(int argc, char** argv)
List results;
// List results2;
// int i,j;
- DIM_SIZE[X]=8;
+ DIM_SIZE[X]=4;
DIM_SIZE[Y]=1;
DIM_SIZE[Z]=1;
pa_init(NULL);
- loc = find_bp_loc("R171");
- printf("The loc is %d%d%d\n",loc[X],loc[Y],loc[Z]);
- if((loc = find_bp_loc("R178")))
- printf("The loc is %d%d%d\n",loc[X],loc[Y],loc[Z]);
- else
- printf("This doesn't exsist!\n");
- exit(0);
-/* request->rotate = true; */
-/* request->elongate = true; */
-/* request->force_contig = true; */
-/* request->co_proc = true; */
-/* request->geometry[0]=-1; */
-
- results = list_create(NULL);
- request->geometry[0] = -1;
- request->size = 1; //atoi(argv[1]);
- request->conn_type = TORUS;
- new_pa_request(request);
- print_pa_request(request);
- allocate_part(request, results);
+
results = list_create(NULL);
- request->geometry[0] = 5;
+ request->geometry[0] = 4;
request->geometry[1] = 1;
request->geometry[2] = 1;
- request->size = -1; //atoi(argv[1]);
- request->conn_type = MESH;
+ request->size = 4;
+ request->conn_type = TORUS;
new_pa_request(request);
print_pa_request(request);
allocate_part(request, results);
+ /* results = list_create(NULL); */
+/* request->geometry[0] = 5; */
+/* request->geometry[1] = 1; */
+/* request->geometry[2] = 1; */
+/* request->size = -1; //atoi(argv[1]); */
+/* request->conn_type = MESH; */
+/* new_pa_request(request); */
+/* print_pa_request(request); */
+/* allocate_part(request, results); */
+
int dim,j;
int x,y,z;
int startx=0;
@@ -1955,11 +1934,11 @@ int main(int argc, char** argv)
}
}
}
- list_destroy(results);
+ /* list_destroy(results); */
- pa_fini();
+/* pa_fini(); */
- delete_pa_request(request);
+/* delete_pa_request(request); */
return 0;
}
View
219 src/plugins/select/bluegene/bgl_job_run.c
@@ -56,8 +56,6 @@
#define MAX_PTHREAD_RETRIES 1
#define POLL_INTERVAL 2
-#define KILL_PARTS_ON_REBOOT 1 /* FIXME: Temporaroy */
-
enum update_op {START_OP, TERM_OP, SYNC_OP};
typedef struct bgl_update {
@@ -98,22 +96,22 @@ static void _bgl_list_del(void *x)
}
}
-/* Kill a job and remove its record from DB2 */
+/* Kill a job and remove its record from MMCS */
static int _remove_job(db_job_id_t job_id)
{
int i, rc;
rm_job_t *job_rec;
rm_job_state_t job_state;
- debug("removing job %d from DB2", job_id);
+ debug("removing job %d from MMCS", job_id);
for (i=0; i<MAX_POLL_RETRIES; i++) {
if (i > 0)
sleep(POLL_INTERVAL);
/* Find the job */
if ((rc = rm_get_job(job_id, &job_rec)) != STATUS_OK) {
if (rc == JOB_NOT_FOUND) {
- debug("job %d removed from DB2", job_id);
+ debug("job %d removed from MMCS", job_id);
rc = STATUS_OK;
} else
error("rm_get_job(%d): %s", job_id,
@@ -124,7 +122,7 @@ static int _remove_job(db_job_id_t job_id)
if ((rc = rm_get_data(job_rec, RM_JobState, &job_state)) !=
STATUS_OK) {
if (rc == JOB_NOT_FOUND) {
- debug("job %d not found in DB2", job_id);
+ debug("job %d not found in MMCS", job_id);
rc = STATUS_OK;
} else
error("rm_get_data(RM_JobState) for jobid=%d "
@@ -143,7 +141,7 @@ static int _remove_job(db_job_id_t job_id)
rc = rm_remove_job(job_id);
if (rc != STATUS_OK) {
if (rc == JOB_NOT_FOUND) {
- debug("job %d removed from DB2", job_id);
+ debug("job %d removed from MMCS", job_id);
rc = STATUS_OK;
} else if (job_state == RM_JOB_RUNNING)
error("jm_cancel_job(%d): %s", job_id,
@@ -156,38 +154,78 @@ static int _remove_job(db_job_id_t job_id)
}
/* try once more... */
(void) rm_remove_job(job_id);
- error("Failed to remove job %d from DB2", job_id);
+ error("Failed to remove job %d from MMCS", job_id);
return INTERNAL_ERROR;
}
/* Get the owner of an existing partition. Caller must xfree() return value. */
static char *_get_part_owner(pm_partition_id_t bgl_part_id)
{
- int rc;
- char *owner, *cur_owner;
- rm_partition_t * part_elem;
-
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) != STATUS_OK) {
- error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc));
+ int rc, j, num_parts;
+ char *name, *owner = NULL, *cur_owner = NULL;
+ rm_partition_t *part_ptr;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t part_state = PARTITION_ALL_FLAG;
+
+ if ((rc = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions_info(): %s", bgl_err_str(rc));
return NULL;
}
- if ((rc = rm_get_data(part_elem, RM_PartitionUserName, &owner)) !=
- STATUS_OK) {
- error("rm_get_data(RM_PartitionUserName): %s", bgl_err_str(rc));
- (void) rm_free_partition(part_elem);
- return NULL;
+
+ if ((rc = rm_get_data(part_list, RM_PartListSize, &num_parts))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListSize): %s", bgl_err_str(rc));
+ num_parts = 0;
}
- cur_owner = xstrdup(owner);
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
- error("rm_free_partition(): %s", bgl_err_str(rc));
+
+ for (j=0; j<num_parts; j++) {
+ if (j) {
+ if ((rc = rm_get_data(part_list, RM_PartListNextPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartListNextPart): %s",
+ bgl_err_str(rc));
+ break;
+ }
+ } else {
+ if ((rc = rm_get_data(part_list, RM_PartListFirstPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartListFirstPart: %s",
+ bgl_err_str(rc));
+ break;
+ }
+ }
+
+ if ((rc = rm_get_data(part_ptr, RM_PartitionID, &name))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionID): %s",
+ bgl_err_str(rc));
+ continue;
+ }
+ if (strcmp(bgl_part_id, name) != 0)
+ continue;
+
+ if ((rc = rm_get_data(part_ptr, RM_PartitionUserName, &owner))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionUserName) errno=%s\n",
+ bgl_err_str(rc));
+ owner = NULL;
+ }
+ break;
+ }
+
+ if (owner)
+ cur_owner = xstrdup(owner);
+ if ((rc = rm_free_partition_list(part_list)) != STATUS_OK)
+ error("rm_free_partition_list(): %s", bgl_err_str(rc));
return cur_owner;
}
/* Set the owner of an existing partition */
static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
{
int rc;
- rm_partition_t * part_elem;
+ rm_partition_t * part_ptr;
if (user && user[0])
info("Setting partition %s owner to %s", bgl_part_id, user);
@@ -198,78 +236,99 @@ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
#ifdef USE_BGL_FILES
/* Logic shown below is the type of code we want to use to change the
- * owner of an existing bglblock - without rebooting it. This logic
- * does not work as of driver 040 2/17/2005.
+ * owner of an existing bglblock - without rebooting it. Logic of this
+ * type should be available in driver 140. 3/24/2005.
*/
int err_ret = SLURM_SUCCESS;
-/* find the partition */
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) != STATUS_OK) {
- error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc));
- return SLURM_ERROR;
- }
-
-/* /\* set its owner *\/ */
+ /* set its owner */
if ((rc = rm_set_part_owner(bgl_part_id, user)) != STATUS_OK) {
error("rm_set_part_owner(%s,%s): %s", bgl_part_id, user,
bgl_err_str(rc));
return SLURM_ERROR;
}
-
-/* if ((rc = rm_set_data(part_elem, RM_PartitionUserName, &user)) */
-/* != STATUS_OK) { */
-/* error("rm_set_date(%s, RM_PartitionUserName): %s", bgl_part_id, */
-/* bgl_err_str(rc)); */
-/* err_ret = SLURM_ERROR; */
-/* } */
-
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
- error("rm_free_partition(): %s", bgl_err_str(rc));
-
return err_ret;
#else
- int i=0;
- rm_partition_state_t part_state;
+ int i = 0, j, num_parts;
+ rm_partition_list_t *part_list;
+ rm_partition_state_t state;
+ rm_partition_state_flag_t part_state = PARTITION_ALL_FLAG;
+ char *name;
+ int is_ready = 0, destroyed = 0;
+
/* Wait for partition state to be FREE */
for (i=0; i<MAX_POLL_RETRIES; i++) {
if (i > 0)
sleep(POLL_INTERVAL);
- /* find the partition */
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) !=
- STATUS_OK) {
- error("rm_get_partition(%s): %s", bgl_part_id,
- bgl_err_str(rc));
- return SLURM_ERROR;
+ if ((rc = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s", bgl_err_str(rc));
+ continue;
}
- /* find its state */
- rc = rm_get_data(part_elem, RM_PartitionState, &part_state);
- if (rc != STATUS_OK) {
- error("rm_get_data(RM_PartitionState): %s",
+ if ((rc = rm_get_data(part_list, RM_PartListSize, &num_parts))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListSize): %s",
bgl_err_str(rc));
- (void) rm_free_partition(part_elem);
- return SLURM_ERROR;
+ num_parts = 0;
}
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
- error("rm_free_partition(): %s", bgl_err_str(rc));
+ for (j=0; j<num_parts; j++) {
+ if (j) {
+ if ((rc = rm_get_data(part_list,
+ RM_PartListNextPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartListNextPart)"
+ ": %s", bgl_err_str(rc));
+ break;
+ }
+ } else {
+ if ((rc = rm_get_data(part_list,
+ RM_PartListFirstPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartList"
+ "FirstPart: %s",
+ bgl_err_str(rc));
+ break;
+ }
+ }
- if (part_state == RM_PARTITION_FREE)
- break; /* partition is now free */
+ if ((rc = rm_get_data(part_ptr, RM_PartitionID, &name))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionID): %s",
+ bgl_err_str(rc));
+ continue;
+ }
+ if (strcmp(bgl_part_id, name))
+ continue;
- /* Destroy the partition, only on first pass */
- if ((i == 0)
- && ((rc = pm_destroy_partition(bgl_part_id)) != STATUS_OK)) {
- error("pm_destroy_partition(%s): %s", bgl_part_id,
- bgl_err_str(rc));
- return SLURM_ERROR;
+ if ((rc = rm_get_data(part_ptr, RM_PartitionState,
+ &state)) != STATUS_OK) {
+ error("rm_get_data(RM_PartitionState) :%s",
+ bgl_err_str(rc));
+ continue;
+ }
+ if (state == RM_PARTITION_FREE) {
+ is_ready = 1;
+ break;
+ }
+ if (destroyed)
+ continue;
+ if ((rc = pm_destroy_partition(bgl_part_id))
+ != STATUS_OK) {
+ error("pm_destroy_partition(%s): %s",
+ bgl_part_id, bgl_err_str(rc));
+ } else
+ destroyed = 1;
}
+ if ((rc = rm_free_partition_list(part_list)) != STATUS_OK)
+ error("rm_free_partition_list(): %s", bgl_err_str(rc));
}
- if (part_state != RM_PARTITION_FREE) {
+ if (!is_ready) {
error("Could not free partition %s", bgl_part_id);
return SLURM_ERROR;
}
@@ -280,6 +339,7 @@ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
return SLURM_ERROR;
}
+ verbose("rm_set_part_owner(%s,%s) completed", bgl_part_id, user);
return SLURM_SUCCESS;
#endif
}
@@ -314,7 +374,8 @@ static void _sync_agent(bgl_update_t *bgl_update_ptr)
cur_part_owner = _get_part_owner(bgl_update_ptr->bgl_part_id);
new_part_owner = uid_to_string(bgl_update_ptr->uid);
- if (strcmp(cur_part_owner, new_part_owner)) {
+ if ((cur_part_owner == NULL)
+ || strcmp(cur_part_owner, new_part_owner)) {
error("changing owner of bgl_part %s from %s to %s",
bgl_update_ptr->bgl_part_id, cur_part_owner,
new_part_owner);
@@ -359,7 +420,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr)
error("rm_get_data(RM_JobListSize): %s", bgl_err_str(rc));
jobs = 0;
} else if (jobs > 128)
- fatal("Active job count (%d) invalid, restart DB2", jobs);
+ fatal("Active job count (%d) invalid, restart MMCS", jobs);
for (i=0; i<jobs; i++) {
rm_element_t *job_elem;
@@ -381,6 +442,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr)
continue;
}
}
+
if(!job_elem) {
error("No Job Elem breaking out job count = %d\n", jobs);
break;
@@ -636,21 +698,12 @@ int term_job(struct job_record *job_ptr)
extern int sync_jobs(List job_list)
{
#ifdef HAVE_BGL_FILES
-#if KILL_PARTS_ON_REBOOT
- static int have_run = 0;
-#endif
ListIterator job_iterator, block_iterator;
struct job_record *job_ptr;
pm_partition_id_t bgl_part_id;
bgl_update_t *bgl_update_ptr;
List block_list = _get_all_blocks();
-#if KILL_PARTS_ON_REBOOT
- if (have_run)
- return SLURM_SUCCESS;
- have_run = 1;
-#endif
-
/* Insure that all running jobs own the specified partition */
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
@@ -660,7 +713,8 @@ extern int sync_jobs(List job_list)
bgl_update_ptr = xmalloc(sizeof(bgl_update_t));
select_g_get_jobinfo(job_ptr->select_jobinfo,
- SELECT_DATA_PART_ID, &(bgl_update_ptr->bgl_part_id));
+ SELECT_DATA_PART_ID,
+ &(bgl_update_ptr->bgl_part_id));
if (bgl_update_ptr->bgl_part_id == NULL) {
error("Running job %u has bglblock==NULL",
@@ -670,11 +724,6 @@ extern int sync_jobs(List job_list)
error("Running job %u has nodes==NULL",
job_ptr->job_id);
good_block = false;
-#if KILL_PARTS_ON_REBOOT
- } else if (1) {
- info("Running job %u being killed", job_ptr->job_id);
- good_block = false;
-#endif
} else if (_excise_block(block_list, bgl_update_ptr->
bgl_part_id, job_ptr->nodes) != SLURM_SUCCESS) {
error("Kill job %u belongs to defunct bglblock %s",
View
21 src/plugins/select/bluegene/bgl_switch_connections.c
@@ -108,7 +108,7 @@ static int _add_switch_conns(rm_switch_t* curr_switch, bgl_switch_t *bgl_switch)
}
}
conn.part_state = RM_PARTITION_READY;
-
+
if(firstconnect) {
rm_set_data(curr_switch, RM_SwitchFirstConnection, &conn);
firstconnect=0;
@@ -239,6 +239,7 @@ extern int configure_partition_switches(bgl_record_t * bgl_record)
char *name2;
rm_BP_t *curr_bp;
rm_switch_t *coord_switch[PA_SYSTEM_DIMENSIONS];
+ rm_switch_t *curr_switch;
pa_switch_t *pa_switch;
char *bpid, *curr_bpid;
int found_bpid = 0;
@@ -324,23 +325,23 @@ extern int configure_partition_switches(bgl_record_t * bgl_record)
rm_get_data(curr_bp, RM_BPID, &bpid);
rm_get_data(bgl, RM_SwitchNum, &switch_count);
- rm_get_data(bgl, RM_FirstSwitch,&coord_switch[X]);
+ rm_get_data(bgl, RM_FirstSwitch,&curr_switch);
found_bpid = 0;
for (i=0; i<switch_count; i++) {
- rm_get_data(coord_switch[X], RM_SwitchBPID, &curr_bpid);
+ rm_get_data(curr_switch, RM_SwitchBPID, &curr_bpid);
if (!strcasecmp((char *)bpid, (char *)curr_bpid)) {
- found_bpid = 1;
- break;
+ coord_switch[found_bpid] = curr_switch;
+ found_bpid++;
+ if(found_bpid==PA_SYSTEM_DIMENSIONS)
+ break;
}
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[X]);
+ rm_get_data(bgl,RM_NextSwitch,&curr_switch);
}
- if(found_bpid) {
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[Y]);
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[Z]);
-
+ if(found_bpid==PA_SYSTEM_DIMENSIONS) {
+
switch_itr = list_iterator_create(bgl_bp->switch_list);
while((bgl_switch = list_next(switch_itr)) != NULL) {
rm_get_data(coord_switch[bgl_switch->dim],
View
431 src/plugins/select/bluegene/bluegene.c
@@ -24,19 +24,10 @@
\*****************************************************************************/
#include "bluegene.h"
+#include <stdio.h>
#define BUFSIZE 4096
#define BITSIZE 128
-
-/*
- * The BGL bridge APIs are *not* thread safe. This means we can not
- * presently test for down nodes and switches in a separate pthread.
- * We could do so from within bgl_job_run.c:_part_agent(), but these
- * APIs are so slow (10-15 seconds for rm_get_BGL) that we do not
- * want to slow down job launch or termination by that much. When
- * the APIs are thread safe, revert to the code marked by
- * "#ifdef BGL_THREAD_SAFE". - Jette 2/17/2005
- */
#define MMCS_POLL_TIME 120 /* poll MMCS for down switches and nodes
* every 120 secs */
@@ -50,10 +41,12 @@ List bgl_list = NULL; /* list of bgl_record entries */
List bgl_curr_part_list = NULL; /* current bgl partitions */
List bgl_found_part_list = NULL; /* found bgl partitions */
char *bluegene_blrts = NULL, *bluegene_linux = NULL, *bluegene_mloader = NULL;
-char *bluegene_ramdisk = NULL;
+char *bluegene_ramdisk = NULL, *bridge_api_file = NULL;
char *change_numpsets = NULL;
+int numpsets;
bool agent_fini = false;
-
+int bridge_api_verb = 0;
+
/* some local functions */
#ifdef HAVE_BGL
static int _addto_node_list(bgl_record_t *bgl_record, int *start, int *end);
@@ -63,15 +56,16 @@ static int _validate_config_nodes(void);
static int _bgl_record_cmpf_inc(bgl_record_t* rec_a, bgl_record_t* rec_b);
static int _parse_bgl_spec(char *in_line);
static void _process_nodes(bgl_record_t *bgl_record);
+static int _reopen_bridge_log(int api_verb);
+static void _strip_13_10(char *line);
/* Initialize all plugin variables */
extern int init_bgl(void)
{
#ifdef HAVE_BGL_FILES
int rc;
-
rm_size3D_t bp_size;
-
+
info("Attempting to contact MMCS");
if ((rc = rm_set_serial(BGL_SERIAL)) != STATUS_OK) {
fatal("init_bgl: rm_set_serial(): %s", bgl_err_str(rc));
@@ -103,24 +97,6 @@ extern int init_bgl(void)
/* Purge all plugin variables */
extern void fini_bgl(void)
{
- /* pm_partition_id_t part_id; */
-/* bgl_record_t *record; */
-/* ListIterator itr; */
-
-/* itr = list_iterator_create(bgl_list); */
-/* while ((record = (bgl_record_t*) list_next(itr))) { */
-/* part_id=record->bgl_part_id; */
-/* debug("removing the jobs on partition %s\n", */
-/* (char *)part_id); */
-/* term_jobs_on_part(part_id); */
-
-/* debug("destroying %s\n",(char *)part_id); */
-/* bgl_free_partition(part_id); */
-
-/* rm_remove_partition(part_id); */
-/* debug("done\n"); */
-/* } */
-
_set_bgl_lists();
if (bgl_list) {
@@ -142,6 +118,7 @@ extern void fini_bgl(void)
xfree(bluegene_linux);
xfree(bluegene_mloader);
xfree(bluegene_ramdisk);
+ xfree(bridge_api_file);
#ifdef HAVE_BGL_FILES
if(bgl)
@@ -188,19 +165,20 @@ extern void destroy_bgl_record(void* object)
bgl_record_t* bgl_record = (bgl_record_t*) object;
if (bgl_record) {
- if(bgl_record->nodes) {
+ if(bgl_record->nodes)
xfree(bgl_record->nodes);
+ if(bgl_record->owner_name)
xfree(bgl_record->owner_name);
- if (bgl_record->bgl_part_list)
- list_destroy(bgl_record->bgl_part_list);
- if (bgl_record->hostlist)
- hostlist_destroy(bgl_record->hostlist);
- if (bgl_record->bitmap)
- bit_free(bgl_record->bitmap);
+ if(bgl_record->bgl_part_list)
+ list_destroy(bgl_record->bgl_part_list);
+ if(bgl_record->hostlist)
+ hostlist_destroy(bgl_record->hostlist);
+ if(bgl_record->bitmap)
+ bit_free(bgl_record->bitmap);
+ if(bgl_record->bgl_part_id)
xfree(bgl_record->bgl_part_id);
- xfree(bgl_record);
- }
+ xfree(bgl_record);
}
}
@@ -262,7 +240,6 @@ extern void *bluegene_agent(void *args)
last_mmcs_test = time(NULL) + MMCS_POLL_TIME;
while (!agent_fini) {
-#ifdef BGL_THREAD_SAFE
time_t now = time(NULL);
if (difftime(now, last_mmcs_test) >= MMCS_POLL_TIME) {
@@ -271,7 +248,6 @@ extern void *bluegene_agent(void *args)
last_mmcs_test = now;
test_mmcs_failures(); /* can run for a while */
}
-#endif
sleep(1);
}
return NULL;
@@ -367,25 +343,6 @@ extern int create_static_partitions(List part_list)
configure_partition(bgl_record);
print_bgl_record(bgl_record);
-/* found_record = (bgl_record_t*) xmalloc(sizeof(bgl_record_t)); */
-/* list_push(bgl_list, found_record); */
-
-/* found_record->bgl_part_list = bgl_record->bgl_part_list; */
-/* found_record->hostlist = bgl_record->hostlist; */
-/* found_record->nodes = xstrdup(bgl_record->nodes); */
-
-/* found_record->bp_count = bgl_record->bp_count; */
-/* found_record->switch_count = bgl_record->switch_count; */
-/* found_record->geo[X] = bgl_record->geo[X]; */
-/* found_record->geo[Y] = bgl_record->geo[Y]; */
-/* found_record->geo[Z] = bgl_record->geo[Z]; */
-
-/* found_record->conn_type = bgl_record->conn_type; */
-/* found_record->bitmap = bgl_record->bitmap; */
-/* found_record->node_use = SELECT_COPROCESSOR_MODE; */
-/* configure_partition(found_record); */
-/* /\*********************************************************\/ */
-/* print_bgl_record(found_record); */
#endif
}
}
@@ -407,7 +364,8 @@ extern int create_static_partitions(List part_list)
if(DIM_SIZE[X]==1 && DIM_SIZE[Y]==1 && DIM_SIZE[Z]==1)
sprintf(bgl_record->nodes, "bgl000");
else
- sprintf(bgl_record->nodes, "bgl[000x%d%d%d]", DIM_SIZE[X]-1, DIM_SIZE[Y]-1, DIM_SIZE[Z]-1);
+ sprintf(bgl_record->nodes, "bgl[000x%d%d%d]", DIM_SIZE[X]-1,
+ DIM_SIZE[Y]-1, DIM_SIZE[Z]-1);
itr = list_iterator_create(bgl_list);
while ((found_record = (bgl_record_t *) list_next(itr)) != NULL) {
if (!strcmp(bgl_record->nodes, found_record->nodes)) {
@@ -451,60 +409,105 @@ extern int create_static_partitions(List part_list)
no_total:
rc = SLURM_SUCCESS;
-/* itr = list_iterator_create(bgl_list); */
- /* printf("\n\n"); */
-/* while ((found_record = (bgl_record_t *) list_next(itr)) != NULL) { */
-
-/* print_bgl_record(found_record); */
-/* } */
-/* list_iterator_destroy(itr); */
-/* exit(0); */
- /*********************************************************/
-#endif
+#ifdef _PRINT_PARTS_AND_EXIT
+ itr = list_iterator_create(bgl_list);
+ printf("\n\n");
+ while ((found_record = (bgl_record_t *) list_next(itr)) != NULL) {
+ print_bgl_record(found_record);
+ }
+ list_iterator_destroy(itr);
+ exit(0);
+#endif /* _PRINT_PARTS_AND_EXIT */
+#endif /* HAVE_BGL_FILES */
return rc;
}
+#ifdef HAVE_BGL_FILES
+static rm_partition_state_t _get_state_partition(pm_partition_id_t part_id)
+{
+ rm_partition_state_t state = RM_PARTITION_NAV;
+ rm_partition_t *part_ptr;
+ int rc, j, num_parts;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t part_state = PARTITION_ALL_FLAG;
+ char *name;
+
+ if ((rc = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s\n", bgl_err_str(rc));
+ return state;
+ }
+
+ if ((rc = rm_get_data(part_list, RM_PartListSize, &num_parts))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListSize): %s\n", bgl_err_str(rc));
+ num_parts = 0;
+ }
+
+ for (j=0; j<num_parts; j++) {
+ if (j) {
+ if ((rc = rm_get_data(part_list,
+ RM_PartListNextPart, &part_ptr))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListNextPart): %s",
+ bgl_err_str(rc));
+ break;
+ }
+ } else {
+ if ((rc = rm_get_data(part_list,
+ RM_PartListFirstPart, &part_ptr))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListFirstPart): %s",
+ bgl_err_str(rc));
+ break;
+ }
+ }
+
+ if ((rc = rm_get_data(part_ptr, RM_PartitionID, &name))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionID): %s",
+ bgl_err_str(rc));
+ continue;
+ }
+
+ if (strcmp(part_id, name))
+ continue;
+
+ if ((rc = rm_get_data(part_ptr, RM_PartitionState, &state))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionState): %s",
+ bgl_err_str(rc));
+ state = RM_PARTITION_NAV;
+ }
+ break;
+ }
+ if ((rc = rm_free_partition_list(part_list)) != STATUS_OK)
+ error("rm_free_partition_list(): %s", bgl_err_str(rc));
+
+ return state;
+}
+#endif
+
extern int bgl_free_partition(pm_partition_id_t part_id)
{
#ifdef HAVE_BGL_FILES
rm_partition_state_t state;
- rm_partition_t *my_part;
int rc;
- if ((rc = rm_get_partition(part_id, &my_part))
- != STATUS_OK) {
- error("couldn't get the partition in bgl_free_partition");
- } else {
- rm_get_data(my_part, RM_PartitionState, &state);
- if(state != RM_PARTITION_FREE)
- pm_destroy_partition(part_id);
-
- rm_get_data(my_part, RM_PartitionState, &state);
- while ((state != RM_PARTITION_FREE)
- && (state != RM_PARTITION_ERROR)){
- debug(".");
- rc=rm_free_partition(my_part);
- if(rc!=STATUS_OK){
- error("Error freeing partition\n");
- return(-1);
- }
- sleep(3);
- rc=rm_get_partition(part_id,&my_part);
- if(rc!=STATUS_OK) {
- error("Error in GetPartition\n");
- return(-1);
- }
- rm_get_data(my_part, RM_PartitionState,
- &state);
- }
- //Free memory allocated to mypart
- rc=rm_free_partition(my_part);
- if(rc!=STATUS_OK){
- error("Error freeing partition\n");
- return(-1);
+ while (1) {
+ state = _get_state_partition(part_id);
+
+ if (state != RM_PARTITION_FREE) {
+ if ((rc = pm_destroy_partition(part_id)) != STATUS_OK)
+ error("pm_destroy_partition(%s): %s",
+ part_id, bgl_err_str(rc));
}
-
+
+ if ((state == RM_PARTITION_FREE)
+ || (state == RM_PARTITION_ERROR))
+ break;
+ sleep(3);
}
#endif
return SLURM_SUCCESS;
@@ -584,19 +587,22 @@ static int _validate_config_nodes(void)
/* read current bgl partition info into bgl_curr_part_list */
if (read_bgl_partitions() == SLURM_ERROR)
return SLURM_ERROR;
-
+
+ if(!bgl_recover)
+ return SLURM_SUCCESS;
+
itr_conf = list_iterator_create(bgl_list);
while ((record = (bgl_record_t*) list_next(itr_conf))) {
/* translate hostlist to ranged string for consistent format */
/* search here */
node_use = SELECT_COPROCESSOR_MODE;
-
+
itr_curr = list_iterator_create(bgl_curr_part_list);
while ((init_record = (bgl_record_t*) list_next(itr_curr))
!= NULL) {
- if (strcasecmp(record->nodes, init_record->nodes)) {
+
+ if (strcasecmp(record->nodes, init_record->nodes))
continue; /* wrong nodes */
- }
if (record->conn_type != init_record->conn_type)
continue; /* must reconfig this part */
if(record->node_use != init_record->node_use)
@@ -648,99 +654,47 @@ static int _delete_old_partitions(void)
int rc;
ListIterator itr_curr, itr_found;
bgl_record_t *found_record, *init_record;
- pm_partition_id_t part_id;
- rm_partition_t *my_part;
- int part_number, lowest_part=300;
- char part_name[7];
+
+ if(!bgl_recover) {
+ itr_curr = list_iterator_create(bgl_curr_part_list);
+ while ((init_record = (bgl_record_t*) list_next(itr_curr))) {
+ debug("removing the jobs on partition %s\n",
+ init_record->bgl_part_id);
+ term_jobs_on_part(init_record->bgl_part_id);
- /******************************************************************/
- itr_curr = list_iterator_create(bgl_curr_part_list);
- while ((init_record = (bgl_record_t*) list_next(itr_curr))) {
- part_id=init_record->bgl_part_id;
- part_number = atoi(init_record->bgl_part_id+3);
- if(part_number<lowest_part)
- lowest_part = part_number;
- }
- list_iterator_destroy(itr_curr);
-// if(lowest_part != 101) {
- /* rm_get_partitions(RM_PARTITION_FREE, &part_list); */
-/* rm_get_data(part_list, RM_PartListSize, &size); */
-/* printf("This is the size %d\n",size); */
-/* for(i=0;i<size;i++) { */
-/* if(!i) */
-/* rm_get_data(part_list, RM_PartListFirstPart, &my_part); */
-/* else */
-/* rm_get_data(part_list, RM_PartListNextPart, &my_part); */
-/* rm_get_data(my_part, RM_PartListNextPart, &part_id); */
-/* printf("this is the name %s\n",part_id); */
-/* if(!strncasecmp("RMP",part_id,3)) { */
-/* init_record = xmalloc(sizeof(bgl_record_t)); */
-/* list_push(bgl_curr_part_list, init_record); */
-/* init_record->bgl_part_id = xstrdup(part_id); */
-/* } */
-/* xfree(part_id); */
-/* rm_free_partition(my_part); */
-/* } */
-/* exit(0); */
-
- /* Here is where we clear all the partitions that exist. This will need to
- be taken out when we get better code from IBM.
- */
- for(part_number=101; part_number<lowest_part; part_number++) {
- memset(part_name,0,7);
- sprintf(part_name, "RMP%d", part_number);
- //debug("Checking if Partition %s is free",part_name);
- if ((rc = rm_get_partition(part_name, &my_part))
- != STATUS_OK) {
- debug("Above error is ok. "
- "Partition %s doesn't exist.",
- part_name);
- continue;
+ debug("destroying %s\n",(char *)init_record->bgl_part_id);
+ rc = bgl_free_partition(init_record->bgl_part_id);
+
+ rm_remove_partition(init_record->bgl_part_id);
+ debug("done\n");
}
- debug("removing the jobs on partition %s\n",
- (char *)part_name);
- term_jobs_on_part(part_name);
-
- debug("destroying %s\n",(char *)part_name);
- rc = bgl_free_partition(part_name);
-
- rm_remove_partition(part_name);
- debug("done\n");
-
- //sleep(3);
- //debug("Removed Freed Partition %s",part_name);
- }
-
- /*************************************************/
-// }
-
- itr_curr = list_iterator_create(bgl_curr_part_list);
- while ((init_record = (bgl_record_t*) list_next(itr_curr))) {
- part_id=init_record->bgl_part_id;
- itr_found = list_iterator_create(bgl_found_part_list);
- while ((found_record = (bgl_record_t*) list_next(itr_found))
- != NULL) {
- if (!strcmp(init_record->bgl_part_id,
- found_record->bgl_part_id)) {
- break; /* don't reboot this one */
+ list_iterator_destroy(itr_curr);
+ } else {
+ itr_curr = list_iterator_create(bgl_curr_part_list);
+ while ((init_record = (bgl_record_t*) list_next(itr_curr))) {
+ itr_found = list_iterator_create(bgl_found_part_list);
+ while ((found_record = (bgl_record_t*) list_next(itr_found))
+ != NULL) {
+ if (!strcmp(init_record->bgl_part_id,
+ found_record->bgl_part_id)) {
+ break; /* don't reboot this one */
+ }
}
- }
- list_iterator_destroy(itr_found);
- if(found_record == NULL) {
+ list_iterator_destroy(itr_found);
+ if(found_record == NULL) {
+ debug("removing the jobs on partition %s\n",
+ init_record->bgl_part_id);
+ term_jobs_on_part(init_record->bgl_part_id);
- debug("removing the jobs on partition %s\n",
- (char *)part_id);
- term_jobs_on_part(part_id);
+ debug("destroying %s\n",(char *)init_record->bgl_part_id);
+ rc = bgl_free_partition(init_record->bgl_part_id);
- debug("destroying %s\n",(char *)part_id);
- rc = bgl_free_partition(part_id);
-
- rm_remove_partition(part_id);
- debug("done\n");
- }
+ rm_remove_partition(init_record->bgl_part_id);
+ debug("done\n");
+ }
+ }
+ list_iterator_destroy(itr_curr);
}
- //exit(0);
- list_iterator_destroy(itr_curr);
#endif
return 1;
}
@@ -768,6 +722,7 @@ extern int read_bgl_conf(void)
if (last_config_update
&& (last_config_update == config_stat.st_mtime)) {
debug("bluegene.conf unchanged");
+ _reopen_bridge_log(0);
return SLURM_SUCCESS;
}
last_config_update = config_stat.st_mtime;
@@ -785,6 +740,7 @@ extern int read_bgl_conf(void)
line_num = 0;
while (fgets(in_line, BUFSIZE, bgl_spec_file) != NULL) {
line_num++;
+ _strip_13_10(in_line);
if (strlen(in_line) >= (BUFSIZE - 1)) {
error("_read_bgl_config line %d, of input file %s "
"too long", line_num, bgl_conf);
@@ -828,16 +784,19 @@ extern int read_bgl_conf(void)
fatal("MloaderImage not configured in bluegene.conf");
if (!bluegene_ramdisk)
fatal("RamDiskImage not configured in bluegene.conf");
- if (!change_numpsets)
- info("Warning: ChangeNumpsets not configured in bluegene.conf");
-
+ if (!bridge_api_file)
+ info("BridgeAPILogFile not configured in bluegene.conf");
+ if (!numpsets)
+ info("Warning: Numpsets not configured in bluegene.conf");
+
/* Check to see if the configs we have are correct */
if (!_validate_config_nodes()) {
_delete_old_partitions();
/* FIXME: Wait for MMCS to actually complete the
* partition deletions */
sleep(3);
}
+
/* looking for partitions only I created */
if (create_static_partitions(NULL)) {
/* error in creating the static partitions, so
@@ -851,18 +810,20 @@ extern int read_bgl_conf(void)
return error_code;
}
-static void _strip_13_10(char *word)
+/*
+ * Truncate line, in place, at its first carriage-return (13) or
+ * new-line (10) character; that character and everything after it
+ * are dropped. A line containing neither is left unchanged.
+ */
+static void _strip_13_10(char *line)
{
- int len = strlen(word);
+ int len = strlen(line);
int i;
for(i=0;i<len;i++) {
- if(word[i]==13 || word[i]==10) {
- word[i] = '\0';
+ if(line[i]==13 || line[i]==10) {
+ line[i] = '\0';
return;
}
}
}
+
/*
*
* _parse_bgl_spec - parse the partition specification, build table and
@@ -880,14 +841,17 @@ static int _parse_bgl_spec(char *in_line)
char *nodes = NULL, *conn_type = NULL, *node_use = NULL;
char *blrts_image = NULL, *linux_image = NULL;
char *mloader_image = NULL, *ramdisk_image = NULL;
- char *change = NULL;
+ char *api_file = NULL;
+ int pset_num=-1, api_verb=-1;
bgl_record_t *bgl_record, *found_record;
error_code = slurm_parser(in_line,
"BlrtsImage=", 's', &blrts_image,
"LinuxImage=", 's', &linux_image,
"MloaderImage=", 's', &mloader_image,
- "ChangeNumpsets=", 's', &change,
+ "Numpsets=", 'd', &pset_num,
+ "BridgeAPIVerbose=", 'd', &api_verb,
+ "BridgeAPILogFile=", 's', &api_file,
"Nodes=", 's', &nodes,
"RamDiskImage=", 's', &ramdisk_image,
"Type=", 's', &conn_type,
@@ -918,11 +882,18 @@ static int _parse_bgl_spec(char *in_line)
bluegene_ramdisk = ramdisk_image;
ramdisk_image = NULL; /* nothing left to xfree */
}
- if (change) {
- xfree(change_numpsets);
- _strip_13_10(change);
- change_numpsets = change;
- change = NULL; /* nothing left to xfree */
+ if (api_file) {
+ xfree(bridge_api_file);
+ bridge_api_file = api_file;
+ api_file = NULL; /* nothing left to xfree */
+ _reopen_bridge_log(bridge_api_verb);
+ }
+ if (pset_num > 0) {
+ numpsets = pset_num;
+ }
+ if (api_verb >= 0) {
+ bridge_api_verb = api_verb;
+ _reopen_bridge_log(bridge_api_verb);
}
/* Process node information */
@@ -941,14 +912,11 @@ static int _parse_bgl_spec(char *in_line)
bgl_record->bgl_part_list = list_create(NULL);
bgl_record->hostlist = hostlist_create(NULL);
- _strip_13_10(nodes);
bgl_record->nodes = xstrdup(nodes);
xfree(nodes); /* pointer moved, nothing left to xfree */
_process_nodes(bgl_record);
- if (conn_type)
- _strip_13_10(conn_type);
if (!conn_type || !strcasecmp(conn_type,"TORUS"))
bgl_record->conn_type = SELECT_TORUS;
else
@@ -963,16 +931,21 @@ static int _parse_bgl_spec(char *in_line)
bgl_record->node_use = SELECT_COPROCESSOR_MODE;
else
bgl_record->node_use = SELECT_VIRTUAL_NODE_MODE;
+ bgl_record->partner = NULL;
} else {
/* If not then we will make both. */
- /* this is here to make a co_proc and virtual partition just like each other */
+ /* this is here to make a co_proc and virtual partition
+ * just like each other */
bgl_record->node_use = SELECT_VIRTUAL_NODE_MODE;
found_record = (bgl_record_t*) xmalloc(sizeof(bgl_record_t));
list_push(bgl_list, found_record);
+ bgl_record->partner = found_record;
+ found_record->partner = bgl_record;
+
found_record->bgl_part_list = bgl_record->bgl_part_list;
found_record->hostlist = bgl_record->hostlist;
found_record->nodes = xstrdup(bgl_record->nodes);
@@ -1103,3 +1076,31 @@ static void _process_nodes(bgl_record_t *bgl_record)
return;
}
+/*
+ * _reopen_bridge_log - (re)open the Bridge API log file
+ *
+ * Opens bridge_api_file in append mode and, when built with
+ * HAVE_BGL_FILES, hands the stream and verbosity to
+ * setSayMessageParams(). Without HAVE_BGL_FILES a marker line is
+ * written to the file instead.
+ *
+ * IN api_verb - verbosity level for the Bridge API log
+ * RET SLURM_SUCCESS (also when no log file is configured), or
+ *	SLURM_ERROR if the file cannot be opened or written
+ */
+static int _reopen_bridge_log(int api_verb)
+{
+	static FILE *fp = NULL;	/* stream currently in use, if any */
+	FILE *new_fp;
+
+	if (bridge_api_file == NULL)
+		return SLURM_SUCCESS;
+
+	/*
+	 * Open the new stream before closing the old one, so a failed
+	 * fopen() leaves the previous log stream intact.
+	 * NOTE(review): presumably the Bridge API keeps writing to the
+	 * stream last given to setSayMessageParams() - confirm.
+	 */
+	new_fp = fopen(bridge_api_file, "a");
+	if (new_fp == NULL) {
+		error("can't open file for bridgeapi.log at %s: %m",
+		      bridge_api_file);
+		return SLURM_ERROR;
+	}
+
+#ifdef HAVE_BGL_FILES
+	setSayMessageParams(new_fp, api_verb);
+#else
+	/* fprintf() returns a negative value on output error */
+	if (fprintf(new_fp, "bridgeapi.log to write here at level %d\n",
+		    api_verb) < 0) {
+		error("can't write to bridgeapi.log: %m");
+		fclose(new_fp);
+		return SLURM_ERROR;
+	}
+#endif
+
+	if (fp)
+		fclose(fp);
+	fp = new_fp;
+
+	return SLURM_SUCCESS;
+}
View
10 src/plugins/select/bluegene/bluegene.h
@@ -60,16 +60,16 @@
#include "bgl_job_run.h"
#include "state_test.h"
-#define PSETS_PER_BP 8
-#define USER_NAME "nobody"
+#define USER_NAME "slurm"
/* Global variables */
extern rm_BGL_t *bgl;
extern char *bluegene_blrts;
extern char *bluegene_linux;
extern char *bluegene_mloader;
extern char *bluegene_ramdisk;
-extern char *change_numpsets;
+extern char *bridge_api_file;
+extern int numpsets;
extern pa_system_t *pa_system_ptr;
extern int DIM_SIZE[PA_SYSTEM_DIMENSIONS];
@@ -80,7 +80,7 @@ extern bool agent_fini;
typedef int lifecycle_type_t;
enum part_lifecycle {DYNAMIC, STATIC};
-typedef struct {
+typedef struct bgl_record {
char *nodes; /* String of nodes in partition */
char *owner_name; /* Owner of partition */
pm_partition_id_t bgl_part_id; /* ID returned from MMCS */
@@ -95,6 +95,7 @@ typedef struct {
int bp_count; /* size */
int switch_count;
bitstr_t *bitmap;
+ struct bgl_record *partner;
} bgl_record_t;
typedef struct {
@@ -180,3 +181,4 @@ extern int configure_partition_switches(bgl_record_t * bgl_conf_record);
extern int bgl_free_partition(pm_partition_id_t part_id);
#endif /* _BLUEGENE_H_ */
+
View
471 src/plugins/select/bluegene/partition_sys.c
@@ -48,7 +48,6 @@ List bgl_sys_allocated = NULL;
#ifdef HAVE_BGL_FILES
static void _pre_allocate(bgl_record_t *bgl_record);
static int _post_allocate(bgl_record_t *bgl_record);
-static int _part_list_find(void *object, void *key);
static int _post_bgl_init_read(void *object, void *arg);
#if 0
@@ -89,77 +88,76 @@ static void _print_list(List list)
*/
static void _pre_allocate(bgl_record_t *bgl_record)
{
- int psets = (PSETS_PER_BP * bgl_record->bp_count);
-
- rm_set_data(bgl_record->bgl_part, RM_PartitionBlrtsImg,
- bluegene_blrts);
- rm_set_data(bgl_record->bgl_part, RM_PartitionLinuxImg,
- bluegene_linux);
- rm_set_data(bgl_record->bgl_part, RM_PartitionMloaderImg,
- bluegene_mloader);
- rm_set_data(bgl_record->bgl_part, RM_PartitionRamdiskImg,
- bluegene_ramdisk);
- rm_set_data(bgl_record->bgl_part, RM_PartitionConnection,
- &bgl_record->conn_type);
- rm_set_data(bgl_record->bgl_part, RM_PartitionMode,
- &bgl_record->node_use);
- rm_set_data(bgl_record->bgl_part, RM_PartitionPsetNum, &psets);
- rm_set_data(bgl_record->bgl_part, RM_PartitionUserName, USER_NAME);
+	int rc;
+
+	/*
+	 * Set the boot images, connection type, node use mode, psets
+	 * per base partition and user name on bgl_record->bgl_part.
+	 * A failed rm_set_data() is logged with its Bridge API error
+	 * text and the remaining attributes are still set.
+	 */
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionBlrtsImg,
+			bluegene_blrts)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionBlrtsImg): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionLinuxImg,
+			bluegene_linux)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionLinuxImg): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionMloaderImg,
+			bluegene_mloader)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionMloaderImg): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionRamdiskImg,
+			bluegene_ramdisk)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionRamdiskImg): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionConnection,
+			&bgl_record->conn_type)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionConnection): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionMode,
+			&bgl_record->node_use)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionMode): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionPsetsPerBP,
+			&numpsets)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionPsetsPerBP): %s",
+		      bgl_err_str(rc));
+
+	if ((rc = rm_set_data(bgl_record->bgl_part, RM_PartitionUserName,
+			USER_NAME)) != STATUS_OK)
+		error("rm_set_data(RM_PartitionUserName): %s",
+		      bgl_err_str(rc));
}
/**
- * add the partition record to the DB and boot it up!
+ * add the partition record to the DB
+ *
+ * Calls rm_add_partition() on bgl_record->bgl_part, stores a copy of
+ * the resulting partition ID in bgl_record->bgl_part_id ("UNKNOWN" if
+ * the ID cannot be read back), then releases bgl_part with
+ * rm_free_partition(). The partition is not booted here.
+ *
+ * Returns 0 on success and -1 if rm_add_partition() fails; failures of
+ * the later calls are only logged.
*/
static int _post_allocate(bgl_record_t *bgl_record)
{
int rc;
pm_partition_id_t part_id;
- char command[255];
/* Add partition record to the DB */
debug("adding partition\n");
- rc = rm_add_partition(bgl_record->bgl_part);
- if (rc != STATUS_OK) {
- error("Error adding partition");
+
+ if ((rc = rm_add_partition(bgl_record->bgl_part)) != STATUS_OK) {
+ error("rm_add_partition(): %s", bgl_err_str(rc));
return(-1);
}
debug("done adding\n");
/* Get back the new partition id */
- rm_get_data(bgl_record->bgl_part, RM_PartitionID, &part_id);
- bgl_record->bgl_part_id = xstrdup(part_id);
- if (change_numpsets) {
- memset(command,0,255);
- sprintf(command,"%s %s", change_numpsets, part_id);
- info("%s",command);
- system(command);
- }
- /* We are done with the partition */
- rm_free_partition(bgl_record->bgl_part);
-
- /* Initiate boot of the partition */
- /* debug("Booting Partition %s", bgl_record->bgl_part_id); */
-/* rc = pm_create_partition(bgl_record->bgl_part_id); */
-/* if (rc != STATUS_OK) { */
-/* error("Error booting_partition partition"); */
-/* return(-1); */
-/* } */
+ if ((rc = rm_get_data(bgl_record->bgl_part, RM_PartitionID, &part_id))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionID): %s", bgl_err_str(rc));
+ bgl_record->bgl_part_id = xstrdup("UNKNOWN");
+ } else
+ bgl_record->bgl_part_id = xstrdup(part_id);
-/* /\* Wait for Partition to be booted *\/ */
-/* rc = rm_get_partition(bgl_record->bgl_part_id, &bgl_record->bgl_part); */
-/* if (rc != STATUS_OK) { */
-/* error("Error in GetPartition"); */
-/* return(-1); */
-/* } */
-/* rm_free_partition(bgl_record->bgl_part); */
-
- fflush(stdout);
+ /* We are done with the partition */
+ if ((rc = rm_free_partition(bgl_record->bgl_part)) != STATUS_OK)
+ error("rm_free_partition(): %s", bgl_err_str(rc));
return 0;
}
extern int configure_partition(bgl_record_t *bgl_record)
{
+
rm_new_partition(&bgl_record->bgl_part); /* new partition to be added */
_pre_allocate(bgl_record);
@@ -178,66 +176,94 @@ int read_bgl_partitions()
int bp_cnt, i, rm_rc;
rm_element_t *bp_ptr;
- rm_location_t bp_loc;
pm_partition_id_t part_id;
rm_partition_t *part_ptr;
char node_name_tmp[7], *owner_name;
bgl_record_t *bgl_record;
-#ifndef USE_BGL_FILE
+
int *coord;
- char *bp_id;
- int part_number, lowest_part=300;
- char part_name[7];
-#endif
+ int part_number, part_count;
+ char *part_name;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t state = PARTITION_ALL_FLAG;
+
- /* This code is here to blow add partitions after we get the
- system to return correct location information
- */
- return 1;
-#ifndef USE_BGL_FILES
if ((rc = rm_set_serial(BGL_SERIAL)) != STATUS_OK) {
- error("rm_set_serial(): %d\n", rc);
+ error("rm_set_serial(): %s\n", bgl_err_str(rc));
return SLURM_ERROR;
}
- for(part_number=101; part_number<lowest_part; part_number++) {
- memset(part_name,0,7);
- sprintf(part_name, "RMP%d", part_number);
- //debug("Checking if Partition %s is free",part_name);
- if ((rc = rm_get_partition(part_name, &part_ptr))
- != STATUS_OK) {
- debug("Above error is ok. "
- "Partition %s doesn't exist.",
- part_name);
- rc = SLURM_SUCCESS;
- break;
- /* FIX ME: This will need to continue not break
- after testing is done.
- */
- //continue;
+ if ((rc = rm_get_partitions_info(state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s", bgl_err_str(rc));
+ return SLURM_ERROR;
+
+ }
+
+ if ((rc = rm_get_data(part_list, RM_PartListSize, &part_count))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartListSize): %s", bgl_err_str(rc));
+ part_count = 0;
+ }
+
+ for(part_number=0; part_number<part_count; part_number++) {
+ if (part_number) {
+ if ((rc = rm_get_data(part_list, RM_PartListNextPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartListNextPart): %s",
+ bgl_err_str(rc));
+ break;
+ }
+ } else {
+ if ((rc = rm_get_data(part_list, RM_PartListFirstPart,
+ &part_ptr)) != STATUS_OK) {
+ error("rm_get_data(RM_PartListFirstPart): %s",
+ bgl_err_str(rc));
+ break;
+ }
+ }
+
+ if ((rc = rm_get_data(part_ptr, RM_PartitionID, &part_name))
+ != STATUS_OK) {
+ error("rm_get_data(RM_PartitionID): %s",
+ bgl_err_str(rc));
+ continue;
}
- /* New BGL partition record */
+ if(strncmp("RMP",part_name,3))
+ continue;
+ if(bgl_recover)
+ if ((rc = rm_get_partition(part_name, &part_ptr))
+ != STATUS_OK) {
+ error("Partition %s doesn't exist.",
+ part_name);
+ rc = SLURM_SUCCESS;
+ break;
+ }
+ /* New BGL partition record */
+ bgl_record = xmalloc(sizeof(bgl_record_t));
+ list_push(bgl_curr_part_list, bgl_record);
+
+ bgl_record->bgl_part_id = xstrdup(part_name);
+ //rm_BP_id_t *bp_id;
if ((rm_rc = rm_get_data(part_ptr, RM_PartitionBPNum, &bp_cnt)) != STATUS_OK) {
error("rm_get_data(RM_BPNum): %s", bgl_err_str(rm_rc));
bp_cnt = 0;
}
if(bp_cnt==0)
continue;
+
+ bgl_record->bgl_part_list = list_create(NULL);
+ bgl_record->hostlist = hostlist_create(NULL);
+
if ((rm_rc = rm_get_data(part_ptr, RM_PartitionFirstBP, &bp_ptr))
!= STATUS_OK) {
error("rm_get_data(RM_FirstBP): %s",
bgl_err_str(rm_rc));
rc = SLURM_ERROR;
return rc;
}
- bgl_record = xmalloc(sizeof(bgl_record_t));
- list_push(bgl_curr_part_list, bgl_record);
-
- bgl_record->bgl_part_list = list_create(NULL);
- bgl_record->hostlist = hostlist_create(NULL);
- bgl_record->bgl_part_id = xstrdup(part_name);
- //rm_BP_id_t *bp_id;
+
for (i=0; i<bp_cnt; i++) {
if ((rm_rc = rm_get_data(bp_ptr, RM_BPID, &part_id))
!= STATUS_OK) {
@@ -246,18 +272,17 @@ int read_bgl_partitions()
rc = SLURM_ERROR;
break;
}
- debug("bp_id is %s\n",part_id);
-
- coord = find_bp_loc(bp_id);
+
+ coord = find_bp_loc(part_id);
sprintf(node_name_tmp, "bgl%d%d%d",
coord[X], coord[Y], coord[Z]);
- debug("adding %s to partition %s\n",node_name_tmp,part_name);
+ //debug("adding %s to partition %s\n",node_name_tmp, bgl_record->bgl_part_id);
hostlist_push(bgl_record->hostlist, node_name_tmp);
list_append(bgl_record->bgl_part_list,
- &pa_system_ptr->grid[bp_loc.X][bp_loc.Y][bp_loc.Z]);
+ &pa_system_ptr->grid[coord[X]][coord[Y]][coord[Z]]);
if ((rm_rc = rm_get_data(part_ptr, RM_PartitionNextBP, &bp_ptr))
!= STATUS_OK) {
error("rm_get_data(RM_NextBP): %s",
@@ -266,6 +291,7 @@ int read_bgl_partitions()
break;
}
}
+
// need to get the 000x000 range for nodes
// also need to get coords
@@ -309,161 +335,156 @@ int read_bgl_partitions()
bgl_record->part_lifecycle = STATIC;
-
if ((rm_rc = rm_free_partition(part_ptr))
!= STATUS_OK) {
error("rm_free_partition(): %s",
bgl_err_str(rm_rc));
}
-
- //sleep(3);
- //debug("Removed Freed Partition %s",part_name);
- }
-//#endif
-#else
- if ((rc = rm_get_BGL(&bgl)) != STATUS_OK) {
- fatal("init_bgl: rm_get_BGL(): %s", bgl_err_str(rc));
- return SLURM_ERROR;
}
+ rm_free_partition_list(part_list);
+
+/* if ((rc = rm_get_BGL(&bgl)) != STATUS_OK) { */
+/* fatal("init_bgl: rm_get_BGL(): %s", bgl_err_str(rc)); */
+/* return SLURM_ERROR; */
+/* } */
- if ((rm_rc = rm_get_data(bgl, RM_BPNum, &bp_cnt)) != STATUS_OK) {
- error("rm_get_data(RM_BPNum): %s", bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- bp_cnt = 0;
- }
+/* if ((rm_rc = rm_get_data(bgl, RM_BPNum, &bp_cnt)) != STATUS_OK) { */
+/* error("rm_get_data(RM_BPNum): %s", bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* bp_cnt = 0; */
+/* } */
- if ((rm_rc = rm_get_data(bgl, RM_FirstBP, &bp_ptr))
- != STATUS_OK) {
- error("rm_get_data(RM_FirstBP): %s",
- bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- return rc;
- }
-
- for (i=0; i<bp_cnt; i++) {
-
- if ((rm_rc = rm_get_data(bp_ptr, RM_BPLoc, &bp_loc))
- != STATUS_OK) {
- error("rm_get_data(RM_BPLoc): %s",
- bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- break;
- }
-
- sprintf(node_name_tmp, "bgl%d%d%d",
- bp_loc.X, bp_loc.Y, bp_loc.Z);
+/* if ((rm_rc = rm_get_data(bgl, RM_FirstBP, &bp_ptr)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_FirstBP): %s", */
+/* bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* return rc; */
+/* } */
+
+/* for (i=0; i<bp_cnt; i++) { */
+
+/* if ((rm_rc = rm_get_data(bp_ptr, RM_BPLoc, &bp_loc)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_BPLoc): %s", */
+/* bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* break; */
+/* } */
+
+/* sprintf(node_name_tmp, "bgl%d%d%d", */
+/* bp_loc.X, bp_loc.Y, bp_loc.Z); */
- if ((rm_rc = rm_get_data(bp_ptr, RM_BPPartID, &part_id))
- != STATUS_OK) {
- error("rm_get_data(RM_BPPartID: %s",
- bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- break;
- }
-
- if (!part_id || (part_id[0] == '\0')) {
- error("no part_id exiting");
- rc = SLURM_ERROR;
- break;
- }
- //info("Node:%s in BglBlock:%s", node_name_tmp, part_id);
- if(strncmp("RMP",part_id,3))
- goto noadd;
- bgl_record = list_find_first(bgl_curr_part_list,
- _part_list_find, part_id);
- if (!bgl_record) {
- /* New BGL partition record */
- if ((rm_rc = rm_get_partition(part_id, &part_ptr))
- != STATUS_OK) {
- error("rm_get_partition(%s): %s",
- part_id, bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- continue;
- }
- bgl_record = xmalloc(sizeof(bgl_record_t));
- list_push(bgl_curr_part_list, bgl_record);
+/* if ((rm_rc = rm_get_data(bp_ptr, RM_BPPartID, &part_id)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_BPPartID: %s", */
+/* bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* break; */
+/* } */
+
+/* if (!part_id || (part_id[0] == '\0')) { */
+/* error("no part_id exiting"); */
+/* rc = SLURM_ERROR; */
+/* break; */
+/* } */
+/* //info("Node:%s in BglBlock:%s", node_name_tmp, part_id); */
+/* if(strncmp("RMP",part_id,3)) */
+/* goto noadd; */
+/* bgl_record = list_find_first(bgl_curr_part_list, */
+/* _part_list_find, part_id); */
+/* if (!bgl_record) { */
+/* /\* New BGL partition record *\/ */
+/* if ((rm_rc = rm_get_partition(part_id, &part_ptr)) */
+/* != STATUS_OK) { */
+/* error("rm_get_partition(%s): %s", */
+/* part_id, bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* continue; */
+/* } */
+/* bgl_record = xmalloc(sizeof(bgl_record_t)); */
+/* list_push(bgl_curr_part_list, bgl_record); */
- bgl_record->bgl_part_list = list_create(NULL);
- list_append(bgl_record->bgl_part_list, &pa_system_ptr->grid[bp_loc.X][bp_loc.Y][bp_loc.Z]);
- bgl_record->hostlist = hostlist_create(node_name_tmp);
- bgl_record->bgl_part_id = xstrdup(part_id);
+/* bgl_record->bgl_part_list = list_create(NULL); */
+/* list_append(bgl_record->bgl_part_list, &pa_system_ptr->grid[bp_loc.X][bp_loc.Y][bp_loc.Z]); */
+/* bgl_record->hostlist = hostlist_create(node_name_tmp); */
+/* bgl_record->bgl_part_id = xstrdup(part_id); */
- // need to get the 000x000 range for nodes
- // also need to get coords
+/* // need to get the 000x000 range for nodes */
+/* // also need to get coords */
- if ((rm_rc = rm_get_data(part_ptr,
- RM_PartitionConnection,
- &bgl_record->conn_type))
- != STATUS_OK) {
- error("rm_get_data(RM_PartitionConnection): %s",
- bgl_err_str(rm_rc));
- }
- if ((rm_rc = rm_get_data(part_ptr, RM_PartitionMode,
- &bgl_record->node_use))
- != STATUS_OK) {
- error("rm_get_data(RM_PartitionMode): %s",
- bgl_err_str(rm_rc));
- }
+/* if ((rm_rc = rm_get_data(part_ptr, */
+/* RM_PartitionConnection, */
+/* &bgl_record->conn_type)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionConnection): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
+/* if ((rm_rc = rm_get_data(part_ptr, RM_PartitionMode, */
+/* &bgl_record->node_use)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionMode): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
- if ((rm_rc = rm_get_data(part_ptr,
- RM_PartitionUserName,
- &owner_name)) != STATUS_OK) {
- error("rm_get_data(RM_PartitionUserName): %s",
- bgl_err_str(rm_rc));
- } else
- bgl_record->owner_name = xstrdup(owner_name);
+/* if ((rm_rc = rm_get_data(part_ptr, */
+/* RM_PartitionUserName, */
+/* &owner_name)) != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionUserName): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } else */
+/* bgl_record->owner_name = xstrdup(owner_name); */
- if ((rm_rc = rm_get_data(part_ptr,
- RM_PartitionBPNum,
- &bgl_record->bp_count))
- != STATUS_OK) {
- error("rm_get_data(RM_PartitionUserName): %s",
- bgl_err_str(rm_rc));
- }
+/* if ((rm_rc = rm_get_data(part_ptr, */
+/* RM_PartitionBPNum, */
+/* &bgl_record->bp_count)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionUserName): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
- if ((rm_rc = rm_get_data(part_ptr,
- RM_PartitionSwitchNum,
- &bgl_record->switch_count))
- != STATUS_OK) {
- error("rm_get_data(RM_PartitionUserName): %s",
- bgl_err_str(rm_rc));
- }
+/* if ((rm_rc = rm_get_data(part_ptr, */
+/* RM_PartitionSwitchNum, */
+/* &bgl_record->switch_count)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionUserName): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
- bgl_record->part_lifecycle = STATIC;
+/* bgl_record->part_lifecycle = STATIC; */
- if ((rm_rc = rm_free_partition(part_ptr))
- != STATUS_OK) {
- error("rm_free_partition(): %s",
- bgl_err_str(rm_rc));
- }
+/* if ((rm_rc = rm_free_partition(part_ptr)) */
+/* != STATUS_OK) { */
+/* error("rm_free_partition(): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
- } else {
- hostlist_push(bgl_record->hostlist, node_name_tmp);
- list_append(bgl_record->bgl_part_list,
- &pa_system_ptr->grid[bp_loc.X][bp_loc.Y][bp_loc.Z]);
- }
- noadd:
- if ((rm_rc = rm_get_data(bgl, RM_NextBP, &bp_ptr))
- != STATUS_OK) {
- error("rm_get_data(RM_NextBP): %s",
- bgl_err_str(rm_rc));
- rc = SLURM_ERROR;
- break;
- }
- }
-#endif
+/* } else { */
+/* hostlist_push(bgl_record->hostlist, node_name_tmp); */
+/* list_append(bgl_record->bgl_part_list, */
+/* &pa_system_ptr->grid[bp_loc.X][bp_loc.Y][bp_loc.Z]); */
+/* } */
+/* noadd: */
+/* if ((rm_rc = rm_get_data(bgl, RM_NextBP, &bp_ptr)) */
+/* != STATUS_OK) { */
+/* error("rm_get_data(RM_NextBP): %s", */
+/* bgl_err_str(rm_rc)); */
+/* rc = SLURM_ERROR; */
+/* break; */
+/* } */
+/* } */
/* perform post-processing for each bluegene partition */
- list_for_each(bgl_curr_part_list, _post_bgl_init_read, NULL);
+ if(bgl_recover)
+ list_for_each(bgl_curr_part_list, _post_bgl_init_read, NULL);
return rc;
}
static int _post_bgl_init_read(void *object, void *arg)
{
bgl_record_t *bgl_record = (bgl_record_t *) object;
int i = 1024;
-
bgl_record->nodes = xmalloc(i);
while (hostlist_ranged_string(bgl_record->hostlist, i,
bgl_record->nodes) < 0) {
@@ -482,23 +503,5 @@ static int _post_bgl_init_read(void *object, void *arg)
return SLURM_SUCCESS;
}
-static int _part_list_find(void *object, void *key)
-{
- bgl_record_t *part_ptr = (bgl_record_t *) object;
- pm_partition_id_t part_id = (pm_partition_id_t) key;
-
- if (!part_ptr->bgl_part_id) {
- error("_part_list_find: bgl_part_id == NULL");
- return -1;
- }
- if (!part_id) {
- error("_part_list_find: part_id == NULL");
- return -1;
- }
-
- if (strcmp(part_ptr->bgl_part_id, part_id) == 0)
- return 1;
- return 0;
-}
#endif
View
140 src/plugins/select/bluegene/slurm_epilog.c
@@ -48,13 +48,15 @@ int main(int argc, char *argv[])
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
+#include <string.h>
#include "src/plugins/select/bluegene/wrap_rm_api.h"
#define _DEBUG 0
-#define MAX_RETRIES 20 /* max retry count in polling */
+#define MAX_RETRIES 40 /* max retry count in polling */
#define POLL_SLEEP 3 /* retry interval in seconds */
#define MAX_DELAY (MAX_RETRIES * POLL_SLEEP) /* time in seconds */
+static char *bgl_err_str(status_t inx);
static void _wait_part_owner(char *part_name, char *user_id);
int main(int argc, char *argv[])
@@ -65,9 +67,9 @@ int main(int argc, char *argv[])
if (!job_id)
fprintf(stderr, "SLURM_JOBID not set\n");
- part_name = getenv("BGL_PARTITION_ID"); /* get partition ID */
+ part_name = getenv("MPIRUN_PARTITION"); /* get partition ID */
if (!part_name) {
- fprintf(stderr, "BGL_PARTITION_ID not set for job %s\n",
+ fprintf(stderr, "MPIRUN_PARTITION not set for job %s\n",
job_id);
exit(0);
}
@@ -83,10 +85,13 @@ int main(int argc, char *argv[])
static void _wait_part_owner(char *part_name, char *user_id)
{
uid_t target_uid;
- int i, rc1, rc2;
+ int i, j, rc, num_parts;
rm_partition_t *part_ptr;
char *name;
struct passwd *pw_ent;
+ int is_ready = 0;
+ rm_partition_state_flag_t part_state = PARTITION_ALL_FLAG;
+ rm_partition_list_t *part_list;
target_uid = atoi(user_id);
@@ -102,35 +107,82 @@ static void _wait_part_owner(char *part_name, char *user_id)
printf(".");
#endif
}
- if ((rc1 = rm_get_partition(part_name, &part_ptr)) !=
- STATUS_OK) {
- fprintf(stderr, "rm_get_partition(%s) errno=%d\n",
- part_name, rc1);
- return;
- }
- rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name);
- rc2 = rm_free_partition(part_ptr);
- if (rc1 != STATUS_OK) {
- fprintf(stderr,
- "rm_get_data(%s, RM_PartitionUserName) "
- "errno=%d\n", part_name, rc1);
- return;
+
+ if ((rc = rm_get_partitions_info(part_state, &a