Permalink
Browse files

retag of 2.3.0-pre4

  • Loading branch information...
2 parents e42dad4 + 95a1c0c commit 1280d6d9017134098e8b33de8ba2d37d49e7af1e @jette jette committed Mar 30, 2011
@@ -88,6 +88,10 @@ static int _post_block_free(bg_record_t *bg_record, bool restore)
bg_record->bg_block_id);
xassert(0);
return SLURM_SUCCESS;
+ } else if (bg_record->modifying) {
+ info("%d other are modifing this block %s",
+ bg_record->free_cnt, bg_record->bg_block_id);
+ return SLURM_SUCCESS;
} else if (bg_record->free_cnt) {
if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
info("%d other are trying to destroy this block %s",
@@ -107,7 +111,7 @@ static int _post_block_free(bg_record_t *bg_record, bool restore)
bg_block_state_string(bg_record->state));
slurm_init_update_block_msg(&block_msg);
block_msg.bg_block_id = bg_record->bg_block_id;
- block_msg.state = (bg_record->state | BG_BLOCK_ERROR_FLAG);
+ block_msg.state = BG_BLOCK_ERROR_FLAG;
block_msg.reason = "Block would not deallocate";
slurm_mutex_unlock(&block_state_mutex);
select_g_update_block(&block_msg);
@@ -141,19 +145,18 @@ static int _post_block_free(bg_record_t *bg_record, bool restore)
rc = bridge_block_remove(bg_record);
if (rc != SLURM_SUCCESS) {
- /* if (rc == PARTITION_NOT_FOUND) { */
- /* debug("_post_block_free: block %s is not found", */
- /* bg_record->bg_block_id); */
- /* } else { */
+ if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
+ debug("_post_block_free: block %s is not found",
+ bg_record->bg_block_id);
+ } else {
error("_post_block_free: "
"bridge_block_remove(%s): %s",
bg_record->bg_block_id,
bg_err_str(rc));
- /* } */
- } else
- if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
- info("_post_block_free: done %s(%p)",
- bg_record->bg_block_id, bg_record);
+ }
+ } else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
+ info("_post_block_free: done %s(%p)",
+ bg_record->bg_block_id, bg_record);
destroy_bg_record(bg_record);
if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
@@ -364,8 +367,9 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
debug("block %s is not found",
bg_record->bg_block_id);
+ bg_record->state = BG_BLOCK_FREE;
break;
- } else if (rc == BG_ERROR_PENDING_ACTION) {
+ } else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
/* If the state is error and
we get an incompatible
@@ -443,7 +447,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
bg_block_state_string(bg_record->state));
slurm_init_update_block_msg(&block_msg);
block_msg.bg_block_id = bg_record->bg_block_id;
- block_msg.state = (bg_record->state | BG_BLOCK_ERROR_FLAG);
+ block_msg.state = BG_BLOCK_ERROR_FLAG;
block_msg.reason = "Block would not deallocate";
slurm_mutex_unlock(&block_state_mutex);
select_g_update_block(&block_msg);
@@ -501,8 +505,6 @@ extern int free_block_list(uint32_t job_id, List track_list,
if (remove_from_bg_list(bg_lists->job_running, bg_record)
== SLURM_SUCCESS)
num_unused_cpus += bg_record->cpu_cnt;
-
- bg_free_block(bg_record, 0, 1);
}
list_iterator_destroy(itr);
slurm_mutex_unlock(&block_state_mutex);
@@ -876,8 +878,6 @@ extern const char *bg_err_str(int inx)
switch (inx) {
case SLURM_SUCCESS:
return "Status OK";
- case BG_ERROR_PENDING_ACTION:
- return "Action already pending";
case BG_ERROR_BLOCK_NOT_FOUND:
return "Block not found";
case BG_ERROR_BOOT_ERROR:
@@ -154,8 +154,7 @@ typedef enum {
#define REMOVE_USER_FOUND 2
typedef enum {
- BG_ERROR_PENDING_ACTION = 100,
- BG_ERROR_INVALID_STATE,
+ BG_ERROR_INVALID_STATE = 100,
BG_ERROR_BLOCK_NOT_FOUND,
BG_ERROR_BOOT_ERROR,
BG_ERROR_JOB_NOT_FOUND,
@@ -229,17 +229,15 @@ static void _start_agent(bg_action_t *bg_action_ptr)
}
if (bg_record->state == BG_BLOCK_TERM) {
debug("Block is in Deallocating state, waiting for free.");
+ /* Increment free_cnt to make sure we don't lose this
+ * block since bg_free_block will unlock block_state_mutex.
+ */
+ bg_record->free_cnt++;
bg_free_block(bg_record, 1, 1);
+ bg_record->free_cnt--;
/* no reason to reboot here since we are already
deallocating */
bg_action_ptr->reboot = 0;
- /* Since bg_free_block will unlock block_state_mutex
- we need to make sure the block we want is still
- around. Failure will unlock this so no need to
- unlock before return.
- */
- if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
- return;
}
delete_list = list_create(NULL);
@@ -301,7 +299,9 @@ static void _start_agent(bg_action_t *bg_action_ptr)
slurm_mutex_lock(&block_state_mutex);
/* Failure will unlock block_state_mutex so no need to unlock before
- return. */
+ return.
+ */
if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
return;
@@ -381,15 +381,12 @@ static void _start_agent(bg_action_t *bg_action_ptr)
if (rc) {
bg_record->modifying = 1;
+ /* Increment free_cnt to make sure we don't lose this
+ * block since bg_free_block will unlock block_state_mutex.
+ */
+ bg_record->free_cnt++;
bg_free_block(bg_record, 1, 1);
-
- /* Since bg_free_block will unlock block_state_mutex
- we need to make sure the block we want is still
- around. Failure will unlock block_state_mutex so
- no need to unlock before return.
- */
- if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
- return;
+ bg_record->free_cnt--;
#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
@@ -469,15 +466,12 @@ static void _start_agent(bg_action_t *bg_action_ptr)
} else if (bg_action_ptr->reboot) {
bg_record->modifying = 1;
+ /* Increment free_cnt to make sure we don't lose this
+ * block since bg_free_block will unlock block_state_mutex.
+ */
+ bg_record->free_cnt++;
bg_free_block(bg_record, 1, 1);
-
- /* Since bg_free_block will unlock block_state_mutex
- we need to make sure the block we want is still
- around. Failure will unlock block_state_mutex so
- no need to unlock before return.
- */
- if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
- return;
+ bg_record->free_cnt--;
bg_record->modifying = 0;
}
@@ -952,22 +946,22 @@ extern int boot_block(bg_record_t *bg_record)
info("Booting block %s", bg_record->bg_block_id);
if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
- /* error("bridge_create_block(%s): %s", */
- /* bg_record->bg_block_id, bg_err_str(rc)); */
- /* if (rc == INCOMPATIBLE_STATE) { */
- /* char reason[200]; */
- /* snprintf(reason, sizeof(reason), */
- /* "boot_block: " */
- /* "Block %s is in an incompatible state. " */
- /* "This usually means hardware is allocated " */
- /* "by another block (maybe outside of SLURM).", */
- /* bg_record->bg_block_id); */
- /* bg_record->boot_state = 0; */
- /* bg_record->boot_count = 0; */
- /* slurm_mutex_unlock(&block_state_mutex); */
- /* requeue_and_error(bg_record, reason); */
- /* slurm_mutex_lock(&block_state_mutex); */
- /* } */
+ error("bridge_create_block(%s): %s",
+ bg_record->bg_block_id, bg_err_str(rc));
+ if (rc == BG_ERROR_BOOT_ERROR) {
+ char reason[200];
+ snprintf(reason, sizeof(reason),
+ "boot_block: "
+ "Block %s is in an incompatible state. "
+ "This usually means hardware is allocated "
+ "by another block (maybe outside of SLURM).",
+ bg_record->bg_block_id);
+ bg_record->boot_state = 0;
+ bg_record->boot_count = 0;
+ slurm_mutex_unlock(&block_state_mutex);
+ requeue_and_error(bg_record, reason);
+ slurm_mutex_lock(&block_state_mutex);
+ }
return SLURM_ERROR;
}
Oops, something went wrong.

0 comments on commit 1280d6d

Please sign in to comment.