
Tag 0.6.4-1

2 parents 751f7e6 + 2211e73, commit 634d4844480276eb291ec2a6a1489dfdd3ed65b5, morrone committed Oct 6, 2005
META (4 changes)
@@ -9,8 +9,8 @@
Name: slurm
Major: 0
Minor: 6
- Micro: 2
- Version: 0.6.3
+ Micro: 4
+ Version: 0.6.4
Release: 1
API_CURRENT: 7
API_AGE: 4
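
With these values Version.Release reads 0.6.4-1, matching the tag; Micro jumps from 2 to 4 so that it again tracks the last component of Version (it had fallen behind at 0.6.3).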
NEWS (5 changes)
@@ -1,6 +1,11 @@
This file describes changes in recent versions of SLURM. It primarily
documents those changes that are of interest to users and admins.
+* Changes in SLURM 0.6.4
+========================
+ -- Bluegene plugin drains an entire bglblock on repeated boot failures
+ only if it has not identified a specific node as being bad.
+
* Changes in SLURM 0.6.3
========================
-- Fix slurmctld mem leaks (step name and hostlist struct).
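
The hunks below implement that change: the bluegene plugin gains a _drain_as_needed() helper which drains a bglblock's nodes only when none of them is already down or draining, built on a node_already_down() predicate that state_test.c now exports; the srun hunks at the end separately drop the trailing-zero sentinel from per-node task-id lists.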
@@ -45,6 +45,7 @@
#include <pwd.h>
#include <sys/types.h>
+#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/node_select.h"
@@ -58,7 +59,8 @@
#define RETRY_BOOT_COUNT 3
#ifdef HAVE_BGL_FILES
-static int _partition_is_deallocating(bgl_record_t *bgl_record);
+static int _partition_is_deallocating(bgl_record_t *bgl_record);
+static void _drain_as_needed(char *node_list, char *reason);
static int _partition_is_deallocating(bgl_record_t *bgl_record)
{
@@ -110,6 +112,35 @@ static int _partition_is_deallocating(bgl_record_t *bgl_record)
}
return SLURM_SUCCESS;
}
+
+/* If any node in node_list is already drained, draining, or down,
+ * just return; otherwise drain all of the nodes.
+ * This function lets us drain an entire bglblock only if
+ * we have not already identified a specific node as bad. */
+static void _drain_as_needed(char *node_list, char *reason)
+{
+	bool needed = true;
+	hostlist_t hl;
+	char *host;
+
+	/* scan node list */
+	hl = hostlist_create(node_list);
+	if (!hl) {
+		slurm_drain_nodes(node_list, reason);
+		return;
+	}
+	while ((host = hostlist_shift(hl))) {
+		if (node_already_down(host)) {
+			needed = false;
+			free(host);	/* hostlist_shift() returns a malloc'd copy */
+			break;
+		}
+		free(host);
+	}
+	hostlist_destroy(hl);
+
+	if (needed)
+		slurm_drain_nodes(node_list, reason);
+}
#endif
/*
@@ -344,12 +375,12 @@ extern int update_partition_list()
now = time(NULL);
time_ptr = localtime(&now);
strftime(reason, sizeof(reason),
- "update_partition_list: "
- "Boot fails "
- "[SLURM@%b %d %H:%M]",
- time_ptr);
- slurm_drain_nodes(bgl_record->nodes,
- reason);
+ "update_partition_list: "
+ "Boot fails "
+ "[SLURM@%b %d %H:%M]",
+ time_ptr);
+ _drain_as_needed(bgl_record->nodes,
+ reason);
bgl_record->boot_state = 0;
bgl_record->boot_count = 0;
}
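
The hostlist.h include added above is what lets _drain_as_needed() expand a node expression into individual names. As a minimal standalone sketch of that iteration pattern (print_hosts is a hypothetical helper, not part of this commit; the include path assumes a SLURM source tree):

#include <stdio.h>
#include <stdlib.h>
#include "src/common/hostlist.h"

/* Expand a SLURM node expression such as "bgl[000x001]" and print
 * each host; hostlist_shift() returns a malloc'd string that the
 * caller must free. */
static void print_hosts(const char *expr)
{
	hostlist_t hl = hostlist_create(expr);
	char *host;

	if (!hl)
		return;		/* invalid expression */
	while ((host = hostlist_shift(hl))) {
		printf("%s\n", host);
		free(host);
	}
	hostlist_destroy(hl);
}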
@@ -44,7 +44,7 @@
#ifdef HAVE_BGL_FILES
/* Determine if specific slurm node is already in DOWN or DRAIN state */
-static bool _node_already_down(char *node_name)
+extern bool node_already_down(char *node_name)
{
uint16_t base_state;
struct node_record *node_ptr = find_node_record(node_name);
@@ -128,7 +128,7 @@ static void _configure_node_down(rm_bp_id_t bp_id, rm_BGL_t *bgl)
}
snprintf(bgl_down_node, sizeof(bgl_down_node), "bgl%d%d%d",
bp_loc.X, bp_loc.Y, bp_loc.Z);
- if (_node_already_down(bgl_down_node))
+ if (node_already_down(bgl_down_node))
break;
error("switch for node %s is bad", bgl_down_node);
@@ -217,7 +217,7 @@ static void _test_down_nodes(rm_BGL_t *bgl)
snprintf(bgl_down_node, sizeof(bgl_down_node), "bgl%d%d%d",
bp_loc.X, bp_loc.Y, bp_loc.Z);
- if (_node_already_down(bgl_down_node))
+ if (node_already_down(bgl_down_node))
continue;
debug("_test_down_nodes: %s in state %s",
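
For reference, the body of the now-exported predicate (begun above) masks out the no-respond flag and compares the node's base state against the down and drain states. A hedged reconstruction, where the NODE_STATE_* macro names are assumptions about the node-state enum of this era rather than text from this commit:

extern bool node_already_down(char *node_name)
{
	struct node_record *node_ptr = find_node_record(node_name);
	uint16_t base_state;

	if (node_ptr == NULL)
		return false;	/* unknown node: treat as not down */
	/* Assumed macros; only the comparison structure is inferred
	 * from the declarations shown in the diff. */
	base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
	return ((base_state == NODE_STATE_DOWN)
		|| (base_state == NODE_STATE_DRAINED)
		|| (base_state == NODE_STATE_DRAINING));
}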
@@ -1,5 +1,6 @@
/*****************************************************************************\
* state_test.h - header for Blue Gene node and switch state test.
+ * $Id$
*****************************************************************************
* Copyright (C) 2004 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -26,6 +27,9 @@
#ifndef _STATE_TEST_H_
#define _STATE_TEST_H_
+/* Determine if specific slurm node is already in DOWN or DRAIN state */
+extern bool node_already_down(char *node_name);
+
/*
* Search MMCS for failed switches and nodes. Failed resources are DRAINED in
* SLURM. This relies upon rm_get_BGL(), which is slow (10+ seconds) so run
@@ -184,10 +184,8 @@ static void _dump_proctable(srun_job_t *job)
for (node_inx=0; node_inx<job->nhosts; node_inx++) {
if (!opt.overcommit)
max_task = job->cpus[node_inx];
- for (task_inx=0; task_inx<max_task; task_inx++) {
+ for (task_inx=0; task_inx<job->ntask[node_inx]; task_inx++) {
taskid = job->tids[node_inx][task_inx];
- if ((task_inx > 0) && (taskid == 0))
- break;
tv = &MPIR_proctable[taskid];
info("task:%d, host:%s, pid:%d",
taskid, tv->host_name, tv->pid);
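
Previously each node's task-id list carried a trailing zero as a sentinel, and the (task_inx > 0) && (taskid == 0) test stopped the scan when it hit the padding; with an exact count available in job->ntask[node_inx], both the sentinel and the special case disappear. This pairs with the allocation change in the next hunk.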
@@ -545,11 +545,8 @@ _job_create_internal(allocation_info_t *info)
/* Build task id list for each host */
job->tids = xmalloc(job->nhosts * sizeof(uint32_t *));
job->hostid = xmalloc(opt.nprocs * sizeof(uint32_t));
- for (i = 0; i < job->nhosts; i++) {
- /* leave space for trailing zero */
- job->tids[i] = xmalloc((job->ntask[i] + 1)
- * sizeof(uint32_t));
- }
+ for (i = 0; i < job->nhosts; i++)
+ job->tids[i] = xmalloc(job->ntask[i] * sizeof(uint32_t));
if (opt.distribution == SRUN_DIST_UNKNOWN) {
if (opt.nprocs <= job->nhosts)
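
The hunk above is the allocation side of the same cleanup: each per-host array is sized to exactly job->ntask[i] entries, with no slot reserved for a trailing zero. A self-contained sketch of the resulting shape, with plain calloc standing in for SLURM's xmalloc and build_tids a hypothetical name:

#include <stdint.h>
#include <stdlib.h>

/* Allocate one exactly-sized task-id array per host; no sentinel
 * entry is reserved because callers iterate ntask[i] elements. */
static uint32_t **build_tids(uint32_t nhosts, const uint32_t *ntask)
{
	uint32_t **tids = calloc(nhosts, sizeof(uint32_t *));
	uint32_t i;

	if (!tids)
		return NULL;
	for (i = 0; i < nhosts; i++)
		tids[i] = calloc(ntask[i], sizeof(uint32_t));
	return tids;
}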
