Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

This commit was manufactured by cvs2svn to create tag 'slurm-0-4-11-1'.
  • Loading branch information...
commit 5ad19da07b33039753f1dccaceaff1db71ba0cdb 1 parent 5ab7088
no author authored
4 META
View
@@ -9,8 +9,8 @@
Name: slurm
Major: 0
Minor: 4
- Micro: 6
- Version: 0.4.6
+ Micro: 11
+ Version: 0.4.11
Release: 1
API_CURRENT: 6
API_AGE: 4
22 NEWS
View
@@ -1,9 +1,29 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.4.11
+========================
+ -- Changed as many rm_get_partition() calls as we could to
+ rm_get_partitions_info() to save time.
+
+* Changes in SLURM 0.4.10
+========================
+ -- redesign for BGL external wiring.
+ -- smap display bug fix for smaller systems.
+
+* Changes in SLURM 0.4.9
+========================
+ -- setpnum works now; this has to be included in bluegene.conf
+
+* Changes in SLURM 0.4.8
+========================
+ -- Changed the prolog and the epilog to use the env var MPIRUN_PARTITION
+ instead of BGL_PARTITION_ID
+
* Changes in SLURM 0.4.7
========================
- -- Remove some BGL specific headers that IBM now distributes.
+ -- Remove some BGL specific headers that IBM now distributes. NOTE:
+ BGL driver 080 or greater is required.
-- Change autogen.sh to deal with problems running autoconf on one
system and configure on another with different software versions.
2  auxdir/x_ac_bgl.m4
View
@@ -33,7 +33,7 @@ AC_DEFUN([X_AC_BGL],
fi
have_bgl_ar=yes
- bgl_ldflags="$bgl_ldflags -Wl,-rpath $bgl_dir/lib -Wl,-L$bgl_dir/lib -Wl,-whole-archive -Wl,-lbglbridge -Wl,-no-whole-archive $bgl_dir/lib/bglbootload.a $bgl_dir/lib/bglsp440supt.a -lbgldb -lbglmachine -ltableapi -lexpat -lbglsp"
+ bgl_ldflags="$bgl_ldflags -Wl,-rpath $bgl_dir/lib -Wl,-L$bgl_dir/lib -Wl,-whole-archive -Wl,-lbglbridge -Wl,-no-whole-archive $bgl_dir/lib/bglbootload.a $bgl_dir/lib/bglsp440supt.a -lsaymessage -lbgldb -lbglmachine -ltableapi -lexpat -lbglsp"
fi
# Search for required DB2 library in the directory
6 doc/html/bluegene.html
View
@@ -99,7 +99,7 @@
The script that you submit to SLURM can contain multiple invocations of mpirun as
well as any desired commands for pre- and post-processing.
The mpirun command will get its <i>bglblock</i> or BGL partition information from the
-<i>BGL_PARTITION_ID</i> as set by SLURM. A sample script is shown below.
+<i>MPIRUN_PARTITION</i> environment variable as set by SLURM. A sample script is shown below.
<pre>
#!/bin/bash
# pre-processing
@@ -185,11 +185,11 @@
and interfaces.
The value of <i>SchedulerType</i> should be set to "sched/builtin".
The value of <i>Prolog</i> should be set to a program that will delay
-execution until the bglblock identified by the BGL_PARTITION_ID environment
+execution until the bglblock identified by the MPIRUN_PARTITION environment
variable is ready for use. It is recommended that you construct a script
that serves this function and calls the supplied program <i>slurm_prolog</i>.
The value of <i>Epilog</i> should be set to a program that will wait
-until the bglblock identified by the BGL_PARTITION_ID environment
+until the bglblock identified by the MPIRUN_PARTITION environment
variable has been freed. It is recommended that you construct a script
that serves this function and calls the supplied program <i>slurm_epilog</i>.
The prolog and epilog programs are used to ensure proper synchronization
3  etc/bluegene.conf.example
View
@@ -15,7 +15,8 @@ BlrtsImage=/bgl/BlueLight/ppcfloor/bglsys/bin/rts_hw.rts
LinuxImage=/bgl/BlueLight/ppcfloor/bglsys/bin/zImage.elf
MloaderImage=/bgl/BlueLight/ppcfloor/bglsys/bin/mmcs-mloader.rts
RamDiskImage=/bgl/BlueLight/ppcfloor/bglsys/bin/ramdisk.elf
-#ChangeNumpsets=/etc/slurm/change_numpsets
+Numpsets=8
+BridgeAPIVerbose=0
#
# Define the static partitions (bglblocks)
2  src/partition_allocator/Makefile.am
View
@@ -16,7 +16,7 @@ INCLUDES = -I$(top_srcdir) $(BGL_INCLUDES)
# $(top_builddir)/src/api/libslurm.la
# partition_allocator_LDFLAGS = -export-dynamic -lm $(CMD_LDFLAGS) $(BGL_LDFLAGS)
-# CPPFLAGS = -DBUILD_EXE
+# CPPFLAGS = -DBUILD_EXE
# making a .la
noinst_LTLIBRARIES = libpartition_allocator.la
121 src/partition_allocator/partition_allocator.c
View
@@ -92,8 +92,7 @@ static void _switch_config(pa_node_t* source, pa_node_t* target, int dim,
int port_src, int port_tar);
/* */
static void _set_external_wires(int dim, int count, pa_node_t* source,
- pa_node_t* target_1, pa_node_t* target_2,
- pa_node_t* target_first, pa_node_t* target_second);
+ pa_node_t* target_1, pa_node_t* target_2);
/* */
static char *_set_internal_wires(List nodes, int size, int conn_type);
@@ -474,7 +473,7 @@ void pa_init(node_info_msg_t *node_info_ptr)
_create_pa_system();
pa_system_ptr->fill_in_value = (pa_node_t *)
- xmalloc(sizeof(pa_node_t) * pa_system_ptr->num_of_proc);
+ xmalloc(sizeof(pa_node_t) * 128);
init_grid(node_info_ptr);
@@ -985,7 +984,7 @@ static int _create_config_even(pa_node_t *grid)
#if HAVE_BGL
int y,z;
- pa_node_t *target_2, *target_first, *target_second;
+ pa_node_t *target_2;
for(x=0;x<DIM_SIZE[X];x++) {
for(y=0;y<DIM_SIZE[Y];y++) {
for(z=0;z<DIM_SIZE[Z];z++) {
@@ -999,14 +998,8 @@ static int _create_config_even(pa_node_t *grid)
target_2 = &grid[x+2][y][z];
else
target_2 = target_1;
- target_first = &grid[0][y][z];
- if (DIM_SIZE[X] > 1)
- target_second = &grid[1][y][z];
- else
- target_second = target_first;
_set_external_wires(X, x, source,
- target_1, target_2,
- target_first, target_second);
+ target_1, target_2);
if(y<(DIM_SIZE[Y]-1))
target_1 = &grid[x][y+1][z];
@@ -1014,16 +1007,14 @@ static int _create_config_even(pa_node_t *grid)
target_1 = &grid[x][0][z];
_set_external_wires(Y, y, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
if(z<(DIM_SIZE[Z]-1))
target_1 = &grid[x][y][z+1];
else
target_1 = &grid[x][y][0];
_set_external_wires(Z, z, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
}
}
}
@@ -1051,8 +1042,7 @@ static int _create_config_even(pa_node_t *grid)
target_1 = &grid[x+1];
_set_external_wires(X, x, source,
- target_1, NULL,
- NULL, NULL);
+ target_1, NULL);
}
#endif
return 1;
@@ -1434,8 +1424,7 @@ static void _switch_config(pa_node_t* source, pa_node_t* target, int dim,
}
static void _set_external_wires(int dim, int count, pa_node_t* source,
- pa_node_t* target_1, pa_node_t* target_2,
- pa_node_t* target_first, pa_node_t* target_second)
+ pa_node_t* target_1, pa_node_t* target_2)
{
_switch_config(source, source, dim, 0, 0);
_switch_config(source, source, dim, 1, 1);
@@ -1445,47 +1434,46 @@ static void _set_external_wires(int dim, int count, pa_node_t* source,
_switch_config(source, source, dim, 4, 4);
return;
}
+
if(count==0) {
- /* First Node */
+ /* First Even Node */
/* 4->3 of next */
_switch_config(source, target_1, dim, 4, 3);
- /* 2->5 of next */
- _switch_config(source, target_1, dim, 2, 5);
- /* 3->4 of next even */
- _switch_config(source, target_2, dim, 3, 4);
- if(DIM_SIZE[dim]<4) {
- /* 5->2 of next even */
- _switch_config(source, target_2, dim, 5, 2);
-
- }
-
+ /* 5->2 of next */
+ _switch_config(source, target_1, dim, 5, 2);
+ /* 2->5 of next even */
+ _switch_config(source, target_2, dim, 2, 5);
+
} else if(!(count%2)) {
if(count<DIM_SIZE[dim]-2) {
/* Not Last Even Node */
- /* 3->4 of next even */
- _switch_config(source, target_2, dim, 3, 4);
- /* 2->5 of next */
- _switch_config(source, target_1, dim, 2, 5);
- /* 5->2 of next */
- _switch_config(source, target_1, dim, 5, 2);
+ /* 3->4 of next */
+ _switch_config(source, target_1, dim, 3, 4);
+ /* 4->3 of next */
+ _switch_config(source, target_1, dim, 4, 3);
+ /* 2->5 of next even */
+ _switch_config(source, target_2, dim, 2, 5);
+ /* 5->2 of next even */
+ _switch_config(source, target_2, dim, 5, 2);
+
} else {
/* Last Even Node */
/* 3->4 of next */
_switch_config(source, target_1, dim, 3, 4);
- /* 5->2 of next */
- _switch_config(source, target_1, dim, 5, 2);
- /* 2->5 of first */
- _switch_config(source, target_first, dim, 2, 5);
+ /* 2->5 of previous */
+ /********** fix me: on the full system this is needed ******/
+ //_switch_config(source, target_1, dim, 2, 5);
+ /********** fix me: not this ******/
+ _switch_config(source, target_1, dim, 4, 3);
}
} else {
if(count<DIM_SIZE[dim]-2) {
/* Not Last Odd Node */
- /* 4->3 of next odd */
- _switch_config(source, target_2, dim, 4, 3);
+ /* 5->2 of next odd */
+ _switch_config(source, target_2, dim, 5, 2);
} else {
/* Last Odd Node */
- /* 5->2 of second */
- _switch_config(source, target_second, dim, 5, 2);
+ /* nothing */
}
}
}
@@ -1891,42 +1879,33 @@ int main(int argc, char** argv)
List results;
// List results2;
// int i,j;
- DIM_SIZE[X]=8;
+ DIM_SIZE[X]=4;
DIM_SIZE[Y]=1;
DIM_SIZE[Z]=1;
pa_init(NULL);
- loc = find_bp_loc("R171");
- printf("The loc is %d%d%d\n",loc[X],loc[Y],loc[Z]);
- if((loc = find_bp_loc("R178")))
- printf("The loc is %d%d%d\n",loc[X],loc[Y],loc[Z]);
- else
- printf("This doesn't exsist!\n");
- exit(0);
-/* request->rotate = true; */
-/* request->elongate = true; */
-/* request->force_contig = true; */
-/* request->co_proc = true; */
-/* request->geometry[0]=-1; */
-
- results = list_create(NULL);
- request->geometry[0] = -1;
- request->size = 1; //atoi(argv[1]);
- request->conn_type = TORUS;
- new_pa_request(request);
- print_pa_request(request);
- allocate_part(request, results);
+
results = list_create(NULL);
- request->geometry[0] = 5;
+ request->geometry[0] = 4;
request->geometry[1] = 1;
request->geometry[2] = 1;
- request->size = -1; //atoi(argv[1]);
- request->conn_type = MESH;
+ request->size = 4;
+ request->conn_type = TORUS;
new_pa_request(request);
print_pa_request(request);
allocate_part(request, results);
+ /* results = list_create(NULL); */
+/* request->geometry[0] = 5; */
+/* request->geometry[1] = 1; */
+/* request->geometry[2] = 1; */
+/* request->size = -1; //atoi(argv[1]); */
+/* request->conn_type = MESH; */
+/* new_pa_request(request); */
+/* print_pa_request(request); */
+/* allocate_part(request, results); */
+
int dim,j;
int x,y,z;
int startx=0;
@@ -1955,11 +1934,11 @@ int main(int argc, char** argv)
}
}
}
- list_destroy(results);
+ /* list_destroy(results); */
- pa_fini();
+/* pa_fini(); */
- delete_pa_request(request);
+/* delete_pa_request(request); */
return 0;
}
134 src/plugins/select/bluegene/bgl_job_run.c
View
@@ -163,23 +163,47 @@ static int _remove_job(db_job_id_t job_id)
/* Get the owner of an existing partition. Caller must xfree() return value. */
static char *_get_part_owner(pm_partition_id_t bgl_part_id)
{
- int rc;
+ int rc, i, j, num_parts;
char *owner, *cur_owner;
- rm_partition_t * part_elem;
-
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) != STATUS_OK) {
- error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc));
- return NULL;
- }
- if ((rc = rm_get_data(part_elem, RM_PartitionUserName, &owner)) !=
- STATUS_OK) {
- error("rm_get_data(RM_PartitionUserName): %s", bgl_err_str(rc));
- (void) rm_free_partition(part_elem);
- return NULL;
+ rm_partition_t *part_ptr;
+ rm_partition_list_t *part_list;
+
+ for(i=2;i<6;i++) {
+ if ((rc = rm_get_partitions_info(i, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions() errno=%s\n",
+ bgl_err_str(rc));
+
+ }
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &owner);
+ if(!strcasecmp(bgl_part_id, owner)) {
+ rc = rm_get_data(part_ptr, RM_PartitionUserName, &owner);
+ break;
+ }
+ }
+ rm_free_partition_list(part_list);
+ if(j<num_parts)
+ break;
}
+ /* if ((rc = rm_get_partition(bgl_part_id, &part_ptr)) != STATUS_OK) { */
+/* error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc)); */
+/* return NULL; */
+/* } */
+/* if ((rc = rm_get_data(part_ptr, RM_PartitionUserName, &owner)) != */
+/* STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionUserName): %s", bgl_err_str(rc)); */
+/* (void) rm_free_partition(part_ptr); */
+/* return NULL; */
+/* } */
cur_owner = xstrdup(owner);
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
- error("rm_free_partition(): %s", bgl_err_str(rc));
+ /* if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) */
+/* error("rm_free_partition(): %s", bgl_err_str(rc)); */
return cur_owner;
}
@@ -187,7 +211,7 @@ static char *_get_part_owner(pm_partition_id_t bgl_part_id)
static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
{
int rc;
- rm_partition_t * part_elem;
+ rm_partition_t * part_ptr;
if (user && user[0])
info("Setting partition %s owner to %s", bgl_part_id, user);
@@ -205,7 +229,7 @@ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
int err_ret = SLURM_SUCCESS;
/* find the partition */
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) != STATUS_OK) {
+ if ((rc = rm_get_partition(bgl_part_id, &part_ptr)) != STATUS_OK) {
error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc));
return SLURM_ERROR;
}
@@ -218,47 +242,72 @@ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
}
-/* if ((rc = rm_set_data(part_elem, RM_PartitionUserName, &user)) */
+/* if ((rc = rm_set_data(part_ptr, RM_PartitionUserName, &user)) */
/* != STATUS_OK) { */
/* error("rm_set_date(%s, RM_PartitionUserName): %s", bgl_part_id, */
/* bgl_err_str(rc)); */
/* err_ret = SLURM_ERROR; */
/* } */
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
+ if ((rc = rm_free_partition(part_ptr)) != STATUS_OK)
error("rm_free_partition(): %s", bgl_err_str(rc));
return err_ret;
#else
- int i=0;
- rm_partition_state_t part_state;
+ int i=0, j, num_parts;
+ rm_partition_list_t *part_list;
+ //rm_partition_state_t part_state;
+ rm_partition_state_flag_t part_state = RM_PARTITION_FREE+2;
+ char *name;
+ int is_ready=0;
/* Wait for partition state to be FREE */
for (i=0; i<MAX_POLL_RETRIES; i++) {
if (i > 0)
sleep(POLL_INTERVAL);
- /* find the partition */
- if ((rc = rm_get_partition(bgl_part_id, &part_elem)) !=
- STATUS_OK) {
- error("rm_get_partition(%s): %s", bgl_part_id,
+ if ((rc = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions() errno=%s\n",
bgl_err_str(rc));
- return SLURM_ERROR;
+
}
-
- /* find its state */
- rc = rm_get_data(part_elem, RM_PartitionState, &part_state);
- if (rc != STATUS_OK) {
- error("rm_get_data(RM_PartitionState): %s",
- bgl_err_str(rc));
- (void) rm_free_partition(part_elem);
- return SLURM_ERROR;
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &name);
+ if(!strcasecmp(bgl_part_id, name)) {
+ is_ready = 1;
+ break;
+ }
}
-
- if ((rc = rm_free_partition(part_elem)) != STATUS_OK)
- error("rm_free_partition(): %s", bgl_err_str(rc));
-
- if (part_state == RM_PARTITION_FREE)
- break; /* partition is now free */
+ rm_free_partition_list(part_list);
+ if(is_ready)
+ break;
+ /* /\* find the partition *\/ */
+/* if ((rc = rm_get_partition(bgl_part_id, &part_ptr)) != */
+/* STATUS_OK) { */
+/* error("rm_get_partition(%s): %s", bgl_part_id, */
+/* bgl_err_str(rc)); */
+/* return SLURM_ERROR; */
+/* } */
+
+/* /\* find its state *\/ */
+/* rc = rm_get_data(part_ptr, RM_PartitionState, &part_state); */
+/* if (rc != STATUS_OK) { */
+/* error("rm_get_data(RM_PartitionState): %s", */
+/* bgl_err_str(rc)); */
+/* (void) rm_free_partition(part_ptr); */
+/* return SLURM_ERROR; */
+/* } */
+
+/* if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) */
+/* error("rm_free_partition(): %s", bgl_err_str(rc)); */
+
+/* if (part_state == RM_PARTITION_FREE) */
+/* break; /\* partition is now free *\/ */
/* Destroy the partition, only on first pass */
if ((i == 0)
@@ -269,7 +318,12 @@ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user)
}
}
- if (part_state != RM_PARTITION_FREE) {
+ /* if (part_state != RM_PARTITION_FREE) { */
+/* error("Could not free partition %s", bgl_part_id); */
+/* return SLURM_ERROR; */
+/* } */
+
+ if (!is_ready) {
error("Could not free partition %s", bgl_part_id);
return SLURM_ERROR;
}
21 src/plugins/select/bluegene/bgl_switch_connections.c
View
@@ -108,7 +108,7 @@ static int _add_switch_conns(rm_switch_t* curr_switch, bgl_switch_t *bgl_switch)
}
}
conn.part_state = RM_PARTITION_READY;
-
+
if(firstconnect) {
rm_set_data(curr_switch, RM_SwitchFirstConnection, &conn);
firstconnect=0;
@@ -239,6 +239,7 @@ extern int configure_partition_switches(bgl_record_t * bgl_record)
char *name2;
rm_BP_t *curr_bp;
rm_switch_t *coord_switch[PA_SYSTEM_DIMENSIONS];
+ rm_switch_t *curr_switch;
pa_switch_t *pa_switch;
char *bpid, *curr_bpid;
int found_bpid = 0;
@@ -324,23 +325,23 @@ extern int configure_partition_switches(bgl_record_t * bgl_record)
rm_get_data(curr_bp, RM_BPID, &bpid);
rm_get_data(bgl, RM_SwitchNum, &switch_count);
- rm_get_data(bgl, RM_FirstSwitch,&coord_switch[X]);
+ rm_get_data(bgl, RM_FirstSwitch,&curr_switch);
found_bpid = 0;
for (i=0; i<switch_count; i++) {
- rm_get_data(coord_switch[X], RM_SwitchBPID, &curr_bpid);
+ rm_get_data(curr_switch, RM_SwitchBPID, &curr_bpid);
if (!strcasecmp((char *)bpid, (char *)curr_bpid)) {
- found_bpid = 1;
- break;
+ coord_switch[found_bpid] = curr_switch;
+ found_bpid++;
+ if(found_bpid==PA_SYSTEM_DIMENSIONS)
+ break;
}
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[X]);
+ rm_get_data(bgl,RM_NextSwitch,&curr_switch);
}
- if(found_bpid) {
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[Y]);
- rm_get_data(bgl,RM_NextSwitch,&coord_switch[Z]);
-
+ if(found_bpid==PA_SYSTEM_DIMENSIONS) {
+
switch_itr = list_iterator_create(bgl_bp->switch_list);
while((bgl_switch = list_next(switch_itr)) != NULL) {
rm_get_data(coord_switch[bgl_switch->dim],
243 src/plugins/select/bluegene/bluegene.c
View
@@ -52,8 +52,10 @@ List bgl_found_part_list = NULL; /* found bgl partitions */
char *bluegene_blrts = NULL, *bluegene_linux = NULL, *bluegene_mloader = NULL;
char *bluegene_ramdisk = NULL;
char *change_numpsets = NULL;
+int numpsets;
bool agent_fini = false;
-
+FILE *fp = NULL;
+
/* some local functions */
#ifdef HAVE_BGL
static int _addto_node_list(bgl_record_t *bgl_record, int *start, int *end);
@@ -69,9 +71,8 @@ extern int init_bgl(void)
{
#ifdef HAVE_BGL_FILES
int rc;
-
rm_size3D_t bp_size;
-
+
info("Attempting to contact MMCS");
if ((rc = rm_set_serial(BGL_SERIAL)) != STATUS_OK) {
fatal("init_bgl: rm_set_serial(): %s", bgl_err_str(rc));
@@ -367,25 +368,6 @@ extern int create_static_partitions(List part_list)
configure_partition(bgl_record);
print_bgl_record(bgl_record);
-/* found_record = (bgl_record_t*) xmalloc(sizeof(bgl_record_t)); */
-/* list_push(bgl_list, found_record); */
-
-/* found_record->bgl_part_list = bgl_record->bgl_part_list; */
-/* found_record->hostlist = bgl_record->hostlist; */
-/* found_record->nodes = xstrdup(bgl_record->nodes); */
-
-/* found_record->bp_count = bgl_record->bp_count; */
-/* found_record->switch_count = bgl_record->switch_count; */
-/* found_record->geo[X] = bgl_record->geo[X]; */
-/* found_record->geo[Y] = bgl_record->geo[Y]; */
-/* found_record->geo[Z] = bgl_record->geo[Z]; */
-
-/* found_record->conn_type = bgl_record->conn_type; */
-/* found_record->bitmap = bgl_record->bitmap; */
-/* found_record->node_use = SELECT_COPROCESSOR_MODE; */
-/* configure_partition(found_record); */
-/* /\*********************************************************\/ */
-/* print_bgl_record(found_record); */
#endif
}
}
@@ -469,43 +451,89 @@ extern int bgl_free_partition(pm_partition_id_t part_id)
{
#ifdef HAVE_BGL_FILES
rm_partition_state_t state;
- rm_partition_t *my_part;
- int rc;
-
- if ((rc = rm_get_partition(part_id, &my_part))
- != STATUS_OK) {
- error("couldn't get the partition in bgl_free_partition");
- } else {
- rm_get_data(my_part, RM_PartitionState, &state);
- if(state != RM_PARTITION_FREE)
- pm_destroy_partition(part_id);
+ rm_partition_t *part_ptr;
+ int rc, i, j, num_parts;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t part_state = RM_PARTITION_FREE+2;
+ char *name;
+ int is_ready=0;
+
+
+ rm_get_data(part_ptr, RM_PartitionState, &state);
+ if(state != RM_PARTITION_FREE)
+ pm_destroy_partition(part_id);
+ else
+ return SLURM_SUCCESS;
+ i=0;
+ while(1) {
+ if ((rc = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions() errno=%s\n",
+ bgl_err_str(rc));
- rm_get_data(my_part, RM_PartitionState, &state);
- while ((state != RM_PARTITION_FREE)
- && (state != RM_PARTITION_ERROR)){
- debug(".");
- rc=rm_free_partition(my_part);
- if(rc!=STATUS_OK){
- error("Error freeing partition\n");
- return(-1);
- }
- sleep(3);
- rc=rm_get_partition(part_id,&my_part);
- if(rc!=STATUS_OK) {
- error("Error in GetPartition\n");
- return(-1);
+ }
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &name);
+ if(!strcasecmp(part_id, name)) {
+ is_ready = 1;
+ break;
}
- rm_get_data(my_part, RM_PartitionState,
- &state);
}
- //Free memory allocated to mypart
- rc=rm_free_partition(my_part);
- if(rc!=STATUS_OK){
- error("Error freeing partition\n");
- return(-1);
+ rm_free_partition_list(part_list);
+ if(is_ready)
+ break;
+ sleep(3);
+ if(i) {
+ part_state = RM_PARTITION_FREE+2;
+ i=0;
+ } else {
+ part_state = RM_PARTITION_ERROR+2;
+ i=1;
}
-
}
+ if(part_state == (RM_PARTITION_ERROR+2)) {
+ error("Partition is in an error state\n");
+ return(-1);
+ }
+ /* if ((rc = rm_get_partition(part_id, &part_ptr)) */
+/* != STATUS_OK) { */
+/* error("couldn't get the partition in bgl_free_partition"); */
+/* } else { */
+/* rm_get_data(part_ptr, RM_PartitionState, &state); */
+/* if(state != RM_PARTITION_FREE) */
+/* pm_destroy_partition(part_id); */
+
+/* rm_get_data(part_ptr, RM_PartitionState, &state); */
+/* while ((state != RM_PARTITION_FREE) */
+/* && (state != RM_PARTITION_ERROR)){ */
+/* debug("."); */
+/* rc=rm_free_partition(part_ptr); */
+/* if(rc!=STATUS_OK){ */
+/* error("Error freeing partition\n"); */
+/* return(-1); */
+/* } */
+/* sleep(3); */
+/* rc=rm_get_partition(part_id,&part_ptr); */
+/* if(rc!=STATUS_OK) { */
+/* error("Error in GetPartition\n"); */
+/* return(-1); */
+/* } */
+/* rm_get_data(part_ptr, RM_PartitionState, */
+/* &state); */
+/* } */
+/* //Free memory allocated to mypart */
+/* rc=rm_free_partition(part_ptr); */
+/* if(rc!=STATUS_OK){ */
+/* error("Error freeing partition\n"); */
+/* return(-1); */
+/* } */
+
+/* } */
#endif
return SLURM_SUCCESS;
}
@@ -649,54 +677,31 @@ static int _delete_old_partitions(void)
ListIterator itr_curr, itr_found;
bgl_record_t *found_record, *init_record;
pm_partition_id_t part_id;
- rm_partition_t *my_part;
- int part_number, lowest_part=300;
- char part_name[7];
-
+ rm_partition_t *part_ptr;
+ int part_number, part_count;
+ char *part_name;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t state = 5;
+
/******************************************************************/
- itr_curr = list_iterator_create(bgl_curr_part_list);
- while ((init_record = (bgl_record_t*) list_next(itr_curr))) {
- part_id=init_record->bgl_part_id;
- part_number = atoi(init_record->bgl_part_id+3);
- if(part_number<lowest_part)
- lowest_part = part_number;
+ if ((rc = rm_get_partitions_info(state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s",
+ bgl_err_str(rc));
+ return SLURM_ERROR;
+
}
- list_iterator_destroy(itr_curr);
-// if(lowest_part != 101) {
- /* rm_get_partitions(RM_PARTITION_FREE, &part_list); */
-/* rm_get_data(part_list, RM_PartListSize, &size); */
-/* printf("This is the size %d\n",size); */
-/* for(i=0;i<size;i++) { */
-/* if(!i) */
-/* rm_get_data(part_list, RM_PartListFirstPart, &my_part); */
-/* else */
-/* rm_get_data(part_list, RM_PartListNextPart, &my_part); */
-/* rm_get_data(my_part, RM_PartListNextPart, &part_id); */
-/* printf("this is the name %s\n",part_id); */
-/* if(!strncasecmp("RMP",part_id,3)) { */
-/* init_record = xmalloc(sizeof(bgl_record_t)); */
-/* list_push(bgl_curr_part_list, init_record); */
-/* init_record->bgl_part_id = xstrdup(part_id); */
-/* } */
-/* xfree(part_id); */
-/* rm_free_partition(my_part); */
-/* } */
-/* exit(0); */
-
- /* Here is where we clear all the partitions that exist. This will need to
- be taken out when we get better code from IBM.
- */
- for(part_number=101; part_number<lowest_part; part_number++) {
- memset(part_name,0,7);
- sprintf(part_name, "RMP%d", part_number);
+
+ rm_get_data(part_list, RM_PartListSize, &part_count);
+
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+
+ for(part_number=0; part_number<part_count; part_number++) {
+ rm_get_data(part_ptr, RM_PartitionID, &part_name);
+ if(strncmp("RMP",part_name,3))
+ goto next_partition;
//debug("Checking if Partition %s is free",part_name);
- if ((rc = rm_get_partition(part_name, &my_part))
- != STATUS_OK) {
- debug("Above error is ok. "
- "Partition %s doesn't exist.",
- part_name);
- continue;
- }
+
debug("removing the jobs on partition %s\n",
(char *)part_name);
term_jobs_on_part(part_name);
@@ -706,11 +711,16 @@ static int _delete_old_partitions(void)
rm_remove_partition(part_name);
debug("done\n");
+ next_partition:
+ /* if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) { */
+/* } */
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
//sleep(3);
//debug("Removed Freed Partition %s",part_name);
}
-
+ rm_free_partition_list(part_list);
+
/*************************************************/
// }
@@ -828,9 +838,9 @@ extern int read_bgl_conf(void)
fatal("MloaderImage not configured in bluegene.conf");
if (!bluegene_ramdisk)
fatal("RamDiskImage not configured in bluegene.conf");
- if (!change_numpsets)
- info("Warning: ChangeNumpsets not configured in bluegene.conf");
-
+ if (!numpsets)
+ info("Warning: Numpsets not configured in bluegene.conf");
+
/* Check to see if the configs we have are correct */
if (!_validate_config_nodes()) {
_delete_old_partitions();
@@ -838,6 +848,7 @@ extern int read_bgl_conf(void)
* partition deletions */
sleep(3);
}
+
/* looking for partitions only I created */
if (create_static_partitions(NULL)) {
/* error in creating the static partitions, so
@@ -880,14 +891,15 @@ static int _parse_bgl_spec(char *in_line)
char *nodes = NULL, *conn_type = NULL, *node_use = NULL;
char *blrts_image = NULL, *linux_image = NULL;
char *mloader_image = NULL, *ramdisk_image = NULL;
- char *change = NULL;
+ int pset_num=8, api_verb=0;
bgl_record_t *bgl_record, *found_record;
error_code = slurm_parser(in_line,
"BlrtsImage=", 's', &blrts_image,
"LinuxImage=", 's', &linux_image,
"MloaderImage=", 's', &mloader_image,
- "ChangeNumpsets=", 's', &change,
+ "Numpsets=", 'd', pset_num,
+ "BridgeAPIVerbose=", 'd', api_verb,
"Nodes=", 's', &nodes,
"RamDiskImage=", 's', &ramdisk_image,
"Type=", 's', &conn_type,
@@ -918,13 +930,18 @@ static int _parse_bgl_spec(char *in_line)
bluegene_ramdisk = ramdisk_image;
ramdisk_image = NULL; /* nothing left to xfree */
}
- if (change) {
- xfree(change_numpsets);
- _strip_13_10(change);
- change_numpsets = change;
- change = NULL; /* nothing left to xfree */
+ if (pset_num!=8) {
+ numpsets = pset_num;
+
+ }
+ if(api_verb) {
+ if(fp)
+ fclose(fp);
+ fp = fopen("/var/log/slurm/bridgeapi.log","a");
+
+ setSayMessageParams(fp, api_verb);
+
}
-
/* Process node information */
if (!nodes && !conn_type)
goto cleanup; /* no data */
@@ -963,6 +980,7 @@ static int _parse_bgl_spec(char *in_line)
bgl_record->node_use = SELECT_COPROCESSOR_MODE;
else
bgl_record->node_use = SELECT_VIRTUAL_NODE_MODE;
+ bgl_record->partner = NULL;
} else {
/* If not then we will make both. */
@@ -973,6 +991,9 @@ static int _parse_bgl_spec(char *in_line)
found_record = (bgl_record_t*) xmalloc(sizeof(bgl_record_t));
list_push(bgl_list, found_record);
+ bgl_record->partner = found_record;
+ found_record->partner = bgl_record;
+
found_record->bgl_part_list = bgl_record->bgl_part_list;
found_record->hostlist = bgl_record->hostlist;
found_record->nodes = xstrdup(bgl_record->nodes);
9 src/plugins/select/bluegene/bluegene.h
View
@@ -60,8 +60,7 @@
#include "bgl_job_run.h"
#include "state_test.h"
-#define PSETS_PER_BP 8
-#define USER_NAME "nobody"
+#define USER_NAME "slurm"
/* Global variables */
extern rm_BGL_t *bgl;
@@ -69,7 +68,7 @@ extern char *bluegene_blrts;
extern char *bluegene_linux;
extern char *bluegene_mloader;
extern char *bluegene_ramdisk;
-extern char *change_numpsets;
+extern int numpsets;
extern pa_system_t *pa_system_ptr;
extern int DIM_SIZE[PA_SYSTEM_DIMENSIONS];
@@ -80,7 +79,7 @@ extern bool agent_fini;
typedef int lifecycle_type_t;
enum part_lifecycle {DYNAMIC, STATIC};
-typedef struct {
+typedef struct bgl_record {
char *nodes; /* String of nodes in partition */
char *owner_name; /* Owner of partition */
pm_partition_id_t bgl_part_id; /* ID returned from MMCS */
@@ -95,6 +94,7 @@ typedef struct {
int bp_count; /* size */
int switch_count;
bitstr_t *bitmap;
+ struct bgl_record *partner;
} bgl_record_t;
typedef struct {
@@ -180,3 +180,4 @@ extern int configure_partition_switches(bgl_record_t * bgl_conf_record);
extern int bgl_free_partition(pm_partition_id_t part_id);
#endif /* _BLUEGENE_H_ */
+
63 src/plugins/select/bluegene/partition_sys.c
View
@@ -89,8 +89,6 @@ static void _print_list(List list)
*/
static void _pre_allocate(bgl_record_t *bgl_record)
{
- int psets = (PSETS_PER_BP * bgl_record->bp_count);
-
rm_set_data(bgl_record->bgl_part, RM_PartitionBlrtsImg,
bluegene_blrts);
rm_set_data(bgl_record->bgl_part, RM_PartitionLinuxImg,
@@ -103,7 +101,7 @@ static void _pre_allocate(bgl_record_t *bgl_record)
&bgl_record->conn_type);
rm_set_data(bgl_record->bgl_part, RM_PartitionMode,
&bgl_record->node_use);
- rm_set_data(bgl_record->bgl_part, RM_PartitionPsetNum, &psets);
+ rm_set_data(bgl_record->bgl_part, RM_PartitionPsetsPerBP, &numpsets);
rm_set_data(bgl_record->bgl_part, RM_PartitionUserName, USER_NAME);
}
@@ -114,9 +112,10 @@ static int _post_allocate(bgl_record_t *bgl_record)
{
int rc;
pm_partition_id_t part_id;
- char command[255];
+ //char command[255];
/* Add partition record to the DB */
debug("adding partition\n");
+
rc = rm_add_partition(bgl_record->bgl_part);
if (rc != STATUS_OK) {
error("Error adding partition");
@@ -127,12 +126,12 @@ static int _post_allocate(bgl_record_t *bgl_record)
/* Get back the new partition id */
rm_get_data(bgl_record->bgl_part, RM_PartitionID, &part_id);
bgl_record->bgl_part_id = xstrdup(part_id);
- if (change_numpsets) {
- memset(command,0,255);
- sprintf(command,"%s %s", change_numpsets, part_id);
- info("%s",command);
- system(command);
- }
+ /* if (change_numpsets) { */
+/* memset(command,0,255); */
+/* sprintf(command,"%s %s", change_numpsets, part_id); */
+/* info("%s",command); */
+/* system(command); */
+/* } */
/* We are done with the partition */
rm_free_partition(bgl_record->bgl_part);
@@ -160,6 +159,7 @@ static int _post_allocate(bgl_record_t *bgl_record)
extern int configure_partition(bgl_record_t *bgl_record)
{
+
rm_new_partition(&bgl_record->bgl_part); /* new partition to be added */
_pre_allocate(bgl_record);
@@ -186,8 +186,11 @@ int read_bgl_partitions()
#ifndef USE_BGL_FILE
int *coord;
char *bp_id;
- int part_number, lowest_part=300;
- char part_name[7];
+ int part_number, part_count;
+ char *part_name;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t state = 5;
+
#endif
/* This code is here to blow add partitions after we get the
@@ -199,9 +202,23 @@ int read_bgl_partitions()
error("rm_set_serial(): %d\n", rc);
return SLURM_ERROR;
}
- for(part_number=101; part_number<lowest_part; part_number++) {
- memset(part_name,0,7);
- sprintf(part_name, "RMP%d", part_number);
+ if ((rc = rm_get_partitions_info(state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s",
+ bgl_err_str(rc));
+ return SLURM_ERROR;
+
+ }
+
+ rm_get_data(part_list, RM_PartListSize, &part_count);
+
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+
+ for(part_number=0; part_number<part_count; part_number++) {
+ rm_get_data(part_ptr, RM_PartitionID, &part_name);
+ if(strncmp("RMP",part_name,3))
+ goto next_partition;
+
//debug("Checking if Partition %s is free",part_name);
if ((rc = rm_get_partition(part_name, &part_ptr))
!= STATUS_OK) {
@@ -309,16 +326,22 @@ int read_bgl_partitions()
bgl_record->part_lifecycle = STATIC;
+ next_partition:
+ /* if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) { */
+/* } */
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
- if ((rm_rc = rm_free_partition(part_ptr))
- != STATUS_OK) {
- error("rm_free_partition(): %s",
- bgl_err_str(rm_rc));
- }
+ /* if ((rm_rc = rm_free_partition(part_ptr)) */
+/* != STATUS_OK) { */
+/* error("rm_free_partition(): %s", */
+/* bgl_err_str(rm_rc)); */
+/* } */
//sleep(3);
//debug("Removed Freed Partition %s",part_name);
}
+ rm_free_partition_list(part_list);
+
//#endif
#else
if ((rc = rm_get_BGL(&bgl)) != STATUS_OK) {
98 src/plugins/select/bluegene/slurm_epilog.c
View
@@ -48,6 +48,7 @@ int main(int argc, char *argv[])
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
+#include <strings.h>
#include "src/plugins/select/bluegene/wrap_rm_api.h"
#define _DEBUG 0
@@ -65,9 +66,9 @@ int main(int argc, char *argv[])
if (!job_id)
fprintf(stderr, "SLURM_JOBID not set\n");
- part_name = getenv("BGL_PARTITION_ID"); /* get partition ID */
+ part_name = getenv("MPIRUN_PARTITION"); /* get partition ID */
if (!part_name) {
- fprintf(stderr, "BGL_PARTITION_ID not set for job %s\n",
+ fprintf(stderr, "MPIRUN_PARTITION not set for job %s\n",
job_id);
exit(0);
}
@@ -83,11 +84,14 @@ int main(int argc, char *argv[])
static void _wait_part_owner(char *part_name, char *user_id)
{
uid_t target_uid;
- int i, rc1, rc2;
+ int i, j, rc1, num_parts;
rm_partition_t *part_ptr;
char *name;
struct passwd *pw_ent;
-
+ int is_ready = 0;
+ rm_partition_state_flag_t part_state = RM_PARTITION_READY+2;
+ rm_partition_list_t *part_list;
+
target_uid = atoi(user_id);
#if _DEBUG
@@ -102,36 +106,72 @@ static void _wait_part_owner(char *part_name, char *user_id)
printf(".");
#endif
}
- if ((rc1 = rm_get_partition(part_name, &part_ptr)) !=
- STATUS_OK) {
- fprintf(stderr, "rm_get_partition(%s) errno=%d\n",
- part_name, rc1);
- return;
- }
- rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name);
- rc2 = rm_free_partition(part_ptr);
- if (rc1 != STATUS_OK) {
- fprintf(stderr,
- "rm_get_data(%s, RM_PartitionUserName) "
- "errno=%d\n", part_name, rc1);
- return;
- }
- if (rc2 != STATUS_OK)
- fprintf(stderr, "rm_free_partition() errno=%d\n", rc2);
- /* Now test this owner */
- if (name[0] == '\0')
- break;
- if ((pw_ent = getpwnam(name)) == NULL) {
- fprintf(stderr, "getpwnam(%s) errno=%d\n", part_name,
- errno);
- continue;
+ if ((rc1 = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ fprintf(stderr, "rm_get_partitions() errno=%d\n",
+ rc1);
+
}
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &name);
+ if(!strcasecmp(part_name, name)) {
+ rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name);
+ if (name[0] == '\0')
+ continue;
+ if ((pw_ent = getpwnam(name)) == NULL) {
+ fprintf(stderr, "getpwnam(%s) errno=%d\n", name,
+ errno);
+ continue;
+ }
#if (_DEBUG > 1)
- printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid);
+ printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid);
#endif
- if (pw_ent->pw_uid != target_uid)
+ if (pw_ent->pw_uid == target_uid) {
+ is_ready = 1;
+ break;
+ }
+ }
+ }
+ rm_free_partition_list(part_list);
+ if(is_ready)
break;
+
+ /* if ((rc1 = rm_get_partition(part_name, &part_ptr)) != */
+/* STATUS_OK) { */
+/* fprintf(stderr, "rm_get_partition(%s) errno=%d\n", */
+/* part_name, rc1); */
+/* return; */
+/* } */
+/* rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name); */
+/* rc2 = rm_free_partition(part_ptr); */
+/* if (rc1 != STATUS_OK) { */
+/* fprintf(stderr, */
+/* "rm_get_data(%s, RM_PartitionUserName) " */
+/* "errno=%d\n", part_name, rc1); */
+/* return; */
+/* } */
+/* if (rc2 != STATUS_OK) */
+/* fprintf(stderr, "rm_free_partition() errno=%d\n", rc2); */
+
+/* /\* Now test this owner *\/ */
+/* if (name[0] == '\0') */
+/* break; */
+/* if ((pw_ent = getpwnam(name)) == NULL) { */
+/* fprintf(stderr, "getpwnam(%s) errno=%d\n", part_name, */
+/* errno); */
+/* continue; */
+/* } */
+/* #if (_DEBUG > 1) */
+/* printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid); */
+/* #endif */
+/* if (pw_ent->pw_uid != target_uid) */
+/* break; */
}
#if _DEBUG
207 src/plugins/select/bluegene/slurm_prolog.c
View
@@ -48,6 +48,7 @@ int main(int argc, char *argv[])
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
+#include <strings.h>
#include "src/plugins/select/bluegene/wrap_rm_api.h"
#define _DEBUG 0
@@ -62,7 +63,7 @@ int main(int argc, char *argv[])
#define MIN_DELAY 300 /* time in seconds */
#define INCR_DELAY 20 /* time in seconds per BP */
int max_delay=MIN_DELAY;
-int cur_delay=0;
+int cur_delay=0;
static char *_part_state_str(rm_partition_state_t state);
static void _wait_part_ready(char *part_name);
@@ -76,9 +77,9 @@ int main(int argc, char *argv[])
if (!job_id)
fprintf(stderr, "SLURM_JOBID not set\n");
- part_name = getenv("BGL_PARTITION_ID"); /* get partition ID */
+ part_name = getenv("MPIRUN_PARTITION"); /* get partition ID */
if (!part_name) {
- fprintf(stderr, "BGL_PARTITION_ID not set for job %s\n",
+ fprintf(stderr, "MPIRUN_PARTITION not set for job %s\n",
job_id);
exit(0);
}
@@ -95,11 +96,14 @@ int main(int argc, char *argv[])
static void _wait_part_ready(char *part_name)
{
- int i, rc1, rc2;
+ int i, j, rc1, rc2, num_parts;
rm_partition_t *part_ptr;
rm_partition_state_t state;
+ rm_partition_state_flag_t part_state = RM_PARTITION_READY+2;
int is_ready = 0;
-
+ char *name;
+ rm_partition_list_t *part_list;
+
#if _DEBUG
printf("Waiting for partition %s to become ready.", part_name);
#endif
@@ -110,46 +114,70 @@ static void _wait_part_ready(char *part_name)
#if _DEBUG
printf(".");
#endif
- }
- rc1 = rm_get_partition(part_name, &part_ptr);
- if (rc1 == PARTITION_NOT_FOUND)
- continue; /* wait for creation */
- if (rc1 != STATUS_OK) {
- fprintf(stderr, "rm_get_partition(%s) errno=%d\n",
- part_name, rc1);
- return;
- }
-
- if (max_delay == MIN_DELAY) {
- int bp;
- rc1 = rm_get_data(part_ptr, RM_PartitionBPNum, &bp);
- if (rc1 != STATUS_OK)
- fprintf(stderr, "rm_get_data(%s, "
- "RM_PartitionBPNum) errno=%d\n",
+
+ if ((rc1 = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ fprintf(stderr, "rm_get_partitions() errno=%d\n",
+ rc1);
+
+ }
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &name);
+ if(!strcasecmp(part_name, name)) {
+ is_ready = 1;
+ break;
+ }
+ }
+ rm_free_partition_list(part_list);
+ if(is_ready)
+ break;
+ } else {
+ rc1 = rm_get_partition(part_name, &part_ptr);
+ if (rc1 == PARTITION_NOT_FOUND)
+ continue; /* wait for creation */
+ if (rc1 != STATUS_OK) {
+ fprintf(stderr, "rm_get_partition(%s) errno=%d\n",
part_name, rc1);
- else {
- max_delay += (INCR_DELAY * bp);
- if (max_delay == MIN_DELAY)
- max_delay++; /* avoid re-test */
+ return;
}
- }
- rc1 = rm_get_data(part_ptr, RM_PartitionState, &state);
- rc2 = rm_free_partition(part_ptr);
- if (rc1 != STATUS_OK) {
- fprintf(stderr,
- "rm_get_data(%s, RM_PartitionState) errno=%d\n",
- part_name, rc1);
- return;
- }
- if (rc2 != STATUS_OK)
- fprintf(stderr, "rm_free_partition() errno=%d\n", rc2);
+
+ if (max_delay == MIN_DELAY) {
+ int bp;
+ rc1 = rm_get_data(part_ptr, RM_PartitionBPNum, &bp);
+ if (rc1 != STATUS_OK)
+ fprintf(stderr, "rm_get_data(%s, "
+ "RM_PartitionBPNum) errno=%d\n",
+ part_name, rc1);
+ else {
+ max_delay += (INCR_DELAY * bp);
+ if (max_delay == MIN_DELAY)
+ max_delay++; /* avoid re-test */
+ }
+ }
+ rc1 = rm_get_data(part_ptr, RM_PartitionState, &state);
+ rc2 = rm_free_partition(part_ptr);
+ if (rc1 != STATUS_OK) {
+ fprintf(stderr,
+ "rm_get_data(%s, RM_PartitionState) errno=%d\n",
+ part_name, rc1);
+ return;
+ }
+ if (rc2 != STATUS_OK)
+ fprintf(stderr, "rm_free_partition() errno=%d\n", rc2);
+
#if (_DEBUG > 1)
- printf("\nstate=%s\n",_part_state_str(state));
+ printf("\nstate=%s\n",_part_state_str(state));
#endif
- if ((state == RM_PARTITION_READY)
- || (state == RM_PARTITION_ERROR)) {
- is_ready = 1;
- break;
+ if ((state == RM_PARTITION_READY)
+ || (state == RM_PARTITION_ERROR)) {
+ is_ready = 1;
+ break;
+ }
}
}
#if _DEBUG
@@ -194,12 +222,14 @@ static char *_part_state_str(rm_partition_state_t state)
static void _wait_part_owner(char *part_name, char *user_id)
{
uid_t target_uid;
- int i, rc1, rc2;
+ int i, j, rc1, num_parts;
rm_partition_t *part_ptr;
char *name;
struct passwd *pw_ent;
int is_ready = 0;
-
+ rm_partition_state_flag_t part_state = RM_PARTITION_READY+2;
+ rm_partition_list_t *part_list;
+
target_uid = atoi(user_id);
#if _DEBUG
@@ -215,38 +245,73 @@ static void _wait_part_owner(char *part_name, char *user_id)
printf(".");
#endif
}
- if ((rc1 = rm_get_partition(part_name, &part_ptr)) !=
- STATUS_OK) {
- fprintf(stderr, "rm_get_partition(%s) errno=%d\n",
- part_name, rc1);
- return;
- }
- rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name);
- rc2 = rm_free_partition(part_ptr);
- if (rc1 != STATUS_OK) {
- fprintf(stderr,
- "rm_get_data(%s, RM_PartitionUserName) "
- "errno=%d\n", part_name, rc1);
- return;
- }
- if (rc2 != STATUS_OK)
- fprintf(stderr, "rm_free_partition() errno=%d\n", rc2);
-
- /* Now test this owner */
- if (name[0] == '\0')
- continue;
- if ((pw_ent = getpwnam(name)) == NULL) {
- fprintf(stderr, "getpwnam(%s) errno=%d\n", part_name,
- errno);
- continue;
+ if ((rc1 = rm_get_partitions_info(part_state, &part_list))
+ != STATUS_OK) {
+ fprintf(stderr, "rm_get_partitions() errno=%d\n",
+ rc1);
+
}
+ rm_get_data(part_list, RM_PartListSize, &num_parts);
+ for(j=0; j<num_parts; j++) {
+ if(j)
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
+ else
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+ rm_get_data(part_ptr, RM_PartitionID, &name);
+ if(!strcasecmp(part_name, name)) {
+ rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name);
+ if (name[0] == '\0')
+ continue;
+ if ((pw_ent = getpwnam(name)) == NULL) {
+ fprintf(stderr, "getpwnam(%s) errno=%d\n", name,
+ errno);
+ continue;
+ }
#if (_DEBUG > 1)
- printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid);
+ printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid);
#endif
- if (pw_ent->pw_uid == target_uid) {
- is_ready = 1;
- break;
+ if (pw_ent->pw_uid == target_uid) {
+ is_ready = 1;
+ break;
+ }
+ }
}
+ rm_free_partition_list(part_list);
+ if(is_ready)
+ break;
+
+ /* if ((rc1 = rm_get_partition(part_name, &part_ptr)) != */
+/* STATUS_OK) { */
+/* fprintf(stderr, "rm_get_partition(%s) errno=%d\n", */
+/* part_name, rc1); */
+/* return; */
+/* } */
+/* rc1 = rm_get_data(part_ptr, RM_PartitionUserName, &name); */
+/* rc2 = rm_free_partition(part_ptr); */
+/* if (rc1 != STATUS_OK) { */
+/* fprintf(stderr, */
+/* "rm_get_data(%s, RM_PartitionUserName) " */
+/* "errno=%d\n", part_name, rc1); */
+/* return; */
+/* } */
+/* if (rc2 != STATUS_OK) */
+/* fprintf(stderr, "rm_free_partition() errno=%d\n", rc2); */
+
+ /* Now test this owner */
+ /* if (name[0] == '\0') */
+/* continue; */
+/* if ((pw_ent = getpwnam(name)) == NULL) { */
+/* fprintf(stderr, "getpwnam(%s) errno=%d\n", part_name, */
+/* errno); */
+/* continue; */
+/* } */
+/* #if (_DEBUG > 1) */
+/* printf("\nowner = %s(%d)\n", name, pw_ent->pw_uid); */
+/* #endif */
+/* if (pw_ent->pw_uid == target_uid) { */
+/* is_ready = 1; */
+/* break; */
+/* } */
}
#if _DEBUG
6 src/plugins/select/bluegene/state_test.c
View
@@ -115,6 +115,12 @@ static char *_convert_bp_state(rm_BP_state_t state)
case RM_BP_DOWN:
return "RM_BP_DOWN";
break;
+ case RM_BP_MISSING:
+ return "RM_BP_MISSING";
+ break;
+ case RM_BP_ERROR:
+ return "RM_BP_ERROR";
+ break;
case RM_BP_NAV:
return "RM_BP_NAV";
}
6 src/slurmd/mgr.c
View
@@ -258,7 +258,7 @@ mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli,
/*
* Run a prolog or epilog script. Sets environment variables:
* SLURM_JOBID = jobid, SLURM_UID=uid, and
- * BGL_PARTITION_ID=bgl_part_id (if not NULL)
+ * MPIRUN_PARTITION=bgl_part_id (if not NULL)
* Returns -1 on failure.
*/
extern int
@@ -296,7 +296,7 @@ run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid,
setenvpf(&env, "SLURM_JOBID", "%u", jobid);
setenvpf(&env, "SLURM_UID", "%u", uid);
if (bgl_part_id)
- setenvpf(&env, "BGL_PARTITION_ID", "%s", bgl_part_id);
+ setenvpf(&env, "MPIRUN_PARTITION", "%s", bgl_part_id);
execve(path, argv, env);
error("help! %m");
@@ -989,7 +989,7 @@ _setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg)
select_g_get_jobinfo(msg->select_jobinfo,
SELECT_DATA_PART_ID, &bgl_part_id);
if (bgl_part_id) {
- setenvpf(&job->env, "BGL_PARTITION_ID", "%s", bgl_part_id);
+ setenvpf(&job->env, "MPIRUN_PARTITION", "%s", bgl_part_id);
xfree(bgl_part_id);
}
2  src/slurmd/mgr.h
View
@@ -52,7 +52,7 @@ int mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *client);
/*
* Run a prolog or epilog script. Sets environment variables:
* SLURM_JOBID = jobid, SLURM_UID=uid, and
- * BGL_PARTITION_ID=bgl_part_id (if not NULL)
+ * MPIRUN_PARTITION=bgl_part_id (if not NULL)
* Returns -1 on failure.
*/
extern int run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid,
65 src/smap/partition_functions.c
View
@@ -736,13 +736,15 @@ static void _read_part_db2(void)
db2_block_info_t *block_ptr;
//block_t *block_ptr;
char *user_name;
- int part_number;
- char part_name[7];
+ int part_number, part_count;
+ char *part_name;
char node_name_tmp[7];
rm_element_t *bp_ptr;
int *coord;
rm_connection_type_t conn_type;
rm_partition_mode_t node_use;
+ rm_partition_list_t *part_list;
+ rm_partition_state_flag_t state = 5;
if ((rc = rm_set_serial(BGL_SERIAL)) != STATUS_OK) {
error("rm_set_serial(): %d\n", rc);
@@ -759,22 +761,36 @@ static void _read_part_db2(void)
return;
}
}
-
- for(part_number=101; part_number<300; part_number++) {
- memset(part_name,0,7);
- sprintf(part_name, "RMP%d", part_number);
+
+ if ((rc = rm_get_partitions_info(state, &part_list))
+ != STATUS_OK) {
+ error("rm_get_partitions(): %s",
+ bgl_err_str(rc));
+ return;
+
+ }
+
+ rm_get_data(part_list, RM_PartListSize, &part_count);
+
+ rm_get_data(part_list, RM_PartListFirstPart, &part_ptr);
+
+ for(part_number=0; part_number<part_count; part_number++) {
+ rm_get_data(part_ptr, RM_PartitionID, &part_name);
+ if(strncmp("RMP",part_name,3))
+ goto next_partition;
+
//printf("Checking if Partition %s is free",part_name);
if ((rc = rm_get_partition(part_name, &part_ptr))
- != STATUS_OK) {
- break;
- }
-
+ != STATUS_OK) {
+ break;
+ }
+
if ((rc = rm_get_data(part_ptr, RM_PartitionBPNum, &bp_num)) != STATUS_OK) {
error("rm_get_data(RM_BPNum): %s", bgl_err_str(rc));
bp_num = 0;
}
- if(bp_num==0)
- continue;
+ /* if(bp_num==0) */
+/* continue; */
if ((rc = rm_get_data(part_ptr, RM_PartitionFirstBP, &bp_ptr))
!= STATUS_OK) {
error("rm_get_data(RM_FirstBP): %s",
@@ -783,11 +799,11 @@ static void _read_part_db2(void)
return;
}
block_ptr = xmalloc(sizeof(db2_block_info_t));
- list_append(block_list, block_ptr);
+ list_push(block_list, block_ptr);
block_ptr->bgl_block_name = xstrdup(part_name);
-
+
block_ptr->hostlist = hostlist_create(NULL);
-
+
for (i=0; i<bp_num; i++) {
if ((rc = rm_get_data(bp_ptr, RM_BPID, &bp_id))
!= STATUS_OK) {
@@ -797,10 +813,10 @@ static void _read_part_db2(void)
break;
}
coord = find_bp_loc(bp_id);
-
+
sprintf(node_name_tmp, "bgl%d%d%d",
coord[X], coord[Y], coord[Z]);
-
+
hostlist_push(block_ptr->hostlist, node_name_tmp);
if ((rc = rm_get_data(part_ptr, RM_PartitionNextBP, &bp_ptr))
!= STATUS_OK) {
@@ -814,13 +830,13 @@ static void _read_part_db2(void)
RM_PartitionState,
&block_ptr->state)) != STATUS_OK) {
}
-
+
if ((rc = rm_get_data(part_ptr,
RM_PartitionUserName,
&user_name)) != STATUS_OK) {
} else
block_ptr->bgl_user_name = xstrdup(user_name);
-
+
if ((rc = rm_get_data(part_ptr,
RM_PartitionConnection,
&conn_type)) != STATUS_OK) {
@@ -833,11 +849,14 @@ static void _read_part_db2(void)
block_ptr->bgl_node_use = SELECT_NAV_MODE;
} else
block_ptr->bgl_node_use = node_use;
-
- if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) {
- }
+
+ next_partition:
+ /* if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) { */
+/* } */
+ rm_get_data(part_list, RM_PartListNextPart, &part_ptr);
}
-
+ rm_free_partition_list(part_list);
+
/* perform post-processing for each bluegene partition */
list_for_each(block_list, _post_block_read, NULL);
5 src/smap/smap.c
View
@@ -97,7 +97,6 @@ int main(int argc, char *argv[])
} else {
pa_init(new_node_ptr);
}
-
if(params.partition) {
if(params.partition[0] == 'r')
params.partition[0] = 'R';
@@ -129,7 +128,6 @@ int main(int argc, char *argv[])
exit(0);
}
if(!params.commandline) {
- _set_pairs();
signal(SIGWINCH, (sighandler_t) _resize_handler);
initscr();
@@ -157,6 +155,7 @@ int main(int argc, char *argv[])
curs_set(1);
nodelay(stdscr, TRUE);
start_color();
+ _set_pairs();
pa_system_ptr->grid_win = newwin(height, width, starty, startx);
max_display = pa_system_ptr->grid_win->_maxy*pa_system_ptr->grid_win->_maxx;
@@ -411,7 +410,7 @@ static int _set_pairs()
z = 0;
y = 65;
- for (x = 0; x < pa_system_ptr->num_of_proc; x++) {
+ for (x = 0; x < 128; x++) {
if (y == 91)
y = 97;
else if(y == 123)
Please sign in to comment.
Something went wrong with that request. Please try again.