Skip to content
Browse files

blobstore cleanup, OOM fix for NC + related

    - fixed re-attachment after reboot
    - fixed double-free in doDetachVolume
    - renamed libvirt_error_handler to libvirt_err_handler
      (to make 'grep -i error' in nc.log more useful)
    - fixes NC's unresponsiveness and ultimate crash due to a
      non-deterministic ultra-fast memory growth (corrupted value
      of metricsLen in sensor state was the culprit, which is now
      checked for, though corruption is not yet resolved)
    - added '-t' for NCclient for timestamp conversion checking
    - removed several memory leaks identified by valgrind
    - moved dump_trace into log.c for use across C components
    - reworked code in configFileValue to make it more readable
    - fixed one case that was failing in test_sensors binary
    - tweaks to log messages, mainly to clean up levels
    - improved blobstore cleanup (nixing unfinished) by
      adding BLOCKBLOB_STATUS_ABANDONED and using it in fsck

Fixes EUCA-3645 and EUCA-3820
  • Loading branch information...
1 parent 16c71e3 commit af3e9ee665619bfe0ea97325aaefce94f18d7c44 @dmitrii dmitrii committed Oct 24, 2012
Showing with 217 additions and 84 deletions.
  1. +14 −1 node/NCclient.c
  2. +3 −3 node/handlers.c
  3. +0 −2 node/handlers.h
  4. +11 −6 node/handlers_default.c
  5. +1 −2 node/handlers_kvm.c
  6. +9 −3 node/server-marshal.c
  7. +31 −35 storage/blobstore.c
  8. +5 −4 storage/blobstore.h
  9. +1 −1 storage/diskutil.c
  10. +3 −3 storage/vbr.c
  11. +1 −0 storage/walrus.c
  12. +20 −0 util/adb-helpers.h
  13. +28 −15 util/config.c
  14. +23 −0 util/log.c
  15. +1 −0 util/log.h
  16. +65 −9 util/sensor.c
  17. +1 −0 util/sensor.h
View
15 node/NCclient.c
@@ -70,6 +70,7 @@
#include "misc.h"
#include "euca_axis.h"
#include "sensor.h"
+#include "adb-helpers.h"
#define NC_ENDPOINT "/axis2/services/EucalyptusNC"
#define WALRUS_ENDPOINT "/services/Walrus"
@@ -197,12 +198,13 @@ int main (int argc, char **argv)
char * launch_index = NULL;
char ** group_names = NULL;
int group_names_size = 0;
+ char * timestamp_str = NULL;
char * command = NULL;
int local = 0;
int count = 1;
int ch;
- while ((ch = getopt(argc, argv, "lhdn:w:i:m:k:r:e:a:c:h:u:p:V:R:L:FU:I:G:v:")) != -1) {
+ while ((ch = getopt(argc, argv, "lhdn:w:i:m:k:r:e:a:c:h:u:p:V:R:L:FU:I:G:v:t:")) != -1) {
switch (ch) {
case 'c':
count = atoi (optarg);
@@ -299,6 +301,9 @@ int main (int argc, char **argv)
exit (1);
}
break;
+ case 't':
+ timestamp_str = optarg;
+ break;
case 'h':
usage (); // will exit
case '?':
@@ -614,6 +619,14 @@ int main (int argc, char **argv)
printf ("resources: %d\n%s\n", resSize, buf);
/***********************************************************/
+ } else if (!strcmp(command, "_convertTimestamp")) {
+ CHECK_PARAM(timestamp_str, "timestamp");
+
+ axutil_date_time_t * dt = unixms_to_datetime (stub->env, 0123L);
+ long long ts_l = datetime_to_unixms (dt, stub->env);
+ printf ("timestamp: %lld\n", ts_l);
+
+ /***********************************************************/
} else {
fprintf (stderr, "ERROR: command %s unknown (try -h)\n", command);
exit (1);
View
6 node/handlers.c
@@ -170,7 +170,7 @@ get_value( char *s,
return (sscanf_lines (s, buf, valp)==1 ? OK : ERROR);
}
-void libvirt_error_handler ( void *userData,
+void libvirt_err_handler ( void *userData,
virErrorPtr error)
{
if ( error==NULL) {
@@ -1117,7 +1117,7 @@ void adopt_instances()
return;
logprintfl (EUCAINFO, "looking for existing domains\n");
- virSetErrorFunc (NULL, libvirt_error_handler);
+ virSetErrorFunc (NULL, libvirt_err_handler);
num_doms = virConnectListDomains(nc_state.conn, dom_ids, MAXDOMS);
if (num_doms == 0) {
@@ -1963,7 +1963,7 @@ int doTerminateInstance (ncMetadata *meta, char *instanceId, int force, int *shu
if (init())
return 1;
- logprintfl (EUCAINFO, "[%s] invoked\n", instanceId);
+ logprintfl (EUCAINFO, "[%s] termination requested\n", instanceId);
if (nc_state.H->doTerminateInstance)
ret = nc_state.H->doTerminateInstance(&nc_state, meta, instanceId, force, shutdownState, previousState);
View
2 node/handlers.h
@@ -287,8 +287,6 @@ int get_value( char *s,
int convert_dev_names( const char *localDev,
char *localDevReal,
char *localDevTag);
-void libvirt_error_handler( void * userData,
- virErrorPtr error);
void print_running_domains( void);
virConnectPtr *check_hypervisor_conn();
void change_state( ncInstance * instance,
View
17 node/handlers_default.c
@@ -808,6 +808,7 @@ doDetachVolume ( struct nc_state_t *nc,
int ret = OK;
int is_iscsi_target = 0;
int have_remote_device = 0;
+ char * xml = NULL;
char * tagBuf;
char * localDevName;
@@ -908,7 +909,7 @@ doDetachVolume ( struct nc_state_t *nc,
snprintf (lpath, sizeof (lpath), EUCALYPTUS_VOLUME_LIBVIRT_XML_PATH_FORMAT, instance->instancePath, volumeId); // vol-XXX-libvirt.xml
// read in libvirt XML
- char * xml = file2str (lpath);
+ xml = file2str (lpath);
if (xml == NULL) {
logprintfl (EUCAERROR, "[%s][%s] failed to read volume XML from %s\n", instance->instanceId, volumeId, lpath);
ret = ERROR;
@@ -972,7 +973,7 @@ doDetachVolume ( struct nc_state_t *nc,
if (xml)
free (xml);
-
+
if (force) {
return(OK);
}
@@ -1652,10 +1653,13 @@ doDescribeSensors (struct nc_state_t *nc,
else
total = instIdsLen;
- sensorResource ** rss = malloc (total * sizeof (sensorResource *));
- if (rss == NULL) {
- sem_v (inst_copy_sem);
- return OUT_OF_MEMORY;
+ sensorResource ** rss = NULL;
+ if (total > 0) {
+ rss = malloc (total * sizeof (sensorResource *));
+ if (rss == NULL) {
+ sem_v (inst_copy_sem);
+ return OUT_OF_MEMORY;
+ }
}
int k = 0;
@@ -1679,6 +1683,7 @@ doDescribeSensors (struct nc_state_t *nc,
continue;
}
+ assert (k<total);
rss [k] = malloc (sizeof (sensorResource));
sensor_get_instance_data (instance->instanceId, sensorIds, sensorIdsLen, rss + k, 1);
k++;
View
3 node/handlers_kvm.c
@@ -195,13 +195,12 @@ static void * rebooting_thread (void *arg)
snprintf (lpath, sizeof (lpath), EUCALYPTUS_VOLUME_LIBVIRT_XML_PATH_FORMAT, instance->instancePath, volume->volumeId); // vol-XXX-libvirt.xml
// read in libvirt XML, which may have been modified by the hook above
- char * xml = file2str (lpath);
+ xml = file2str (lpath);
if (xml == NULL) {
logprintfl (EUCAERROR, "[%s][%s] failed to read volume XML from %s\n", instance->instanceId, volume->volumeId, lpath);
rc = 1;
}
}
-
if (remoteDevStr)
free (remoteDevStr);
View
12 node/server-marshal.c
@@ -222,11 +222,11 @@ adb_ncStartNetworkResponse_t* ncStartNetworkMarshal (adb_ncStartNetwork_t* ncSta
adb_ncStartNetworkResponseType_set_networkStatus(output, env, "SUCCESS");
adb_ncStartNetworkResponseType_set_statusMessage(output, env, "0");
}
-
- if (peersLen)
- free (peers);
}
+ if (peers)
+ free (peers);
+
// set response to output
adb_ncStartNetworkResponse_set_ncStartNetworkResponse(response, env, output);
pthread_mutex_unlock(&ncHandlerLock);
@@ -511,6 +511,8 @@ adb_ncDescribeInstancesResponse_t* ncDescribeInstancesMarshal (adb_ncDescribeIns
}
// eventlog("NC", userId, correlationId, "DescribeInstances", "end");
}
+ if (instIds)
+ free (instIds);
// set response to output
adb_ncDescribeInstancesResponse_set_ncDescribeInstancesResponse(response, env, output);
@@ -1068,6 +1070,10 @@ adb_ncDescribeSensorsResponse_t* ncDescribeSensorsMarshal (adb_ncDescribeSensors
}
}
// eventlog("NC", userId, correlationId, "DescribeSensors", "end");
+ if (instIds)
+ free (instIds);
+ if (sensorIds)
+ free (sensorIds);
reply:
View
66 storage/blobstore.c
@@ -79,7 +79,6 @@
#include <sys/file.h> // flock
#include <dirent.h>
#include <sys/wait.h> // wait
-#include <execinfo.h> // backtrace
#include <pthread.h>
#include "blobstore.h"
#include <sys/types.h> // gettid
@@ -179,28 +178,6 @@ static void myprintf (int loglevel, const char * format, ...)
puts (buf);
}
-static void dump_trace (char * buf, int buf_size)
-{
- void *array[64];
- size_t size;
- char **strings;
- size_t i;
-
- size = backtrace (array, sizeof(array)/sizeof(void *));
- strings = backtrace_symbols (array, size);
-
- buf [0] = '\0';
- for (i = 0; i < size; i++) {
- int left = buf_size - 1 - strlen (buf);
- if (left < 0) break;
- char line [512];
- snprintf (line, sizeof(line), "\t%s\n", strings [i]);
- strncat (buf, line, left);
- }
-
- free (strings);
-}
-
const char * blobstore_get_error_str ( blobstore_error_t error ) {
return _blobstore_error_strings [error];
}
@@ -229,7 +206,7 @@ static void err (blobstore_error_t error, const char * custom_msg, const int src
msg = blobstore_get_error_str (error);
}
snprintf (_blobstore_last_msg, sizeof(_blobstore_last_msg), "%s:%d %s", src_file_name, src_line_no, msg);
- dump_trace (_blobstore_last_trace, sizeof(_blobstore_last_trace));
+ log_dump_trace (_blobstore_last_trace, sizeof(_blobstore_last_trace));
if (_do_print_errors) {
myprintf (EUCAERROR, "error: %s\n", _blobstore_last_msg);
@@ -1481,6 +1458,12 @@ static unsigned int check_in_use ( blobstore * bs, const char * bb_id, long long
_err_off(); // do not complain if metadata files do not exist
int fd = open_and_lock (path, BLOBSTORE_FLAG_RDWR, timeout_usec, BLOBSTORE_FILE_PERM); // try opening to see what happens
if (fd != -1) {
+ struct stat s;
+ if (fstat (fd, &s) == 0) {
+ if (s.st_size > 0) { // lock file was not truncated before being released => file not properly closed
+ in_use |= BLOCKBLOB_STATUS_ABANDONED;
+ }
+ }
close_and_unlock (fd);
} else {
in_use |= BLOCKBLOB_STATUS_OPENED; // TODO: check if open failed for other reason?
@@ -1685,10 +1668,11 @@ static long long purge_blockblobs_lru ( blobstore * bs, blockblob * bb_list, lon
code = 'D';
deleted++;
}
- logprintfl (EUCADEBUG, "LRU %d %08d: %29s %c%c%c %c %9llu %s", iteration, purged, bb->id,
- (bb->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
- (bb->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
- (bb->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ logprintfl (EUCADEBUG, "LRU %d %08d: %29s %c%c%c%c %c %9llu %s", iteration, purged, bb->id,
+ (bb->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
+ (bb->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
+ (bb->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ (bb->in_use & BLOCKBLOB_STATUS_ABANDONED) ? ('a') : ('-'), // a = was abandoned
code, // outcome codes: D=deleted, else C=children, !=undeletable, O=open
bb->size_bytes / 512L, // size is in sectors
ctime (&(bb->last_modified))); // ctime adds a newline
@@ -1725,7 +1709,7 @@ int blobstore_stat (blobstore * bs, blobstore_meta * meta)
meta->blocks_unlocked = 0;
meta->blocks_locked = 0;
meta->num_blobs = 0;
- for (blockblob * abb = bbs; abb; abb=abb->next) { // TODO: unify this with locked/unlocked calculation in open()
+ for (blockblob * abb = bbs; abb; ) { // TODO: unify this with locked/unlocked calculation in open()
long long abb_size_blocks = round_up_sec (abb->size_bytes) / 512;
if (abb->in_use & BLOCKBLOB_STATUS_OPENED) {
meta->blocks_locked += abb_size_blocks; // these can't be purged if we need space (TODO: look into recursive purging of unused references?)
@@ -1734,6 +1718,11 @@ int blobstore_stat (blobstore * bs, blobstore_meta * meta)
}
meta->blocks_allocated += abb->blocks_allocated;
meta->num_blobs++;
+
+ // free this node and move the pointer
+ blockblob * old_bb = abb;
+ abb=abb->next;
+ free (old_bb);
}
unlock:
@@ -2652,9 +2641,14 @@ static int blockblob_check (const blockblob * bb)
}
}
+ // check on .refs that point to blobs that no longer exist
if (get_stale_refs (bb, NULL)>0)
err++;
+ // check on .lock files that are non-zero => blobs that were not closed properly
+ if (bb->in_use & BLOCKBLOB_STATUS_ABANDONED)
+ err++;
+
_err_on();
return err;
}
@@ -2718,7 +2712,7 @@ static int delete_blob_state (blockblob * bb, long long timeout_usec, char do_fo
// TODO: print a warning about store/blob corruption?
}
- if (!check_in_use(dep_bs, blob_id, 0)) {
+ if (! (check_in_use(dep_bs, blob_id, 0) & ~(BLOCKBLOB_STATUS_ABANDONED))) { // in use except abandoned
loop_remove (dep_bs, blob_id); // TODO: do we care about errors?
}
if (dep_bs != bs) {
@@ -2768,7 +2762,8 @@ int blockblob_delete ( blockblob * bb, long long timeout_usec, char do_force )
// do not delete the blob if it is used by another one
bb->in_use = check_in_use (bs, bb->id, 0); // update in_use status
- if (!do_force && bb->in_use & ~(BLOCKBLOB_STATUS_OPENED|BLOCKBLOB_STATUS_BACKED)) { // in use other than opened (by this thread) or backed
+ // if in use other than opened (by this thread), backed, or abandoned
+ if (!do_force && bb->in_use & ~(BLOCKBLOB_STATUS_OPENED|BLOCKBLOB_STATUS_BACKED|BLOCKBLOB_STATUS_ABANDONED)) {
ERR (BLOBSTORE_ERROR_AGAIN, NULL);
ret = -1;
} else {
@@ -4406,10 +4401,11 @@ static int do_list (const char * regex)
}
char extras [100] = "";
if (show_extras) {
- snprintf (extras, sizeof (extras), "%c%c%c %9llu %s",
- (bm->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
- (bm->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
- (bm->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ snprintf (extras, sizeof (extras), "%c%c%c%c %9llu %s",
+ (bm->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
+ (bm->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
+ (bm->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ (bm->in_use & BLOCKBLOB_STATUS_ABANDONED) ? ('a') : ('-'), // a = was abandoned
bm->size_bytes / 512L, // size is in sectors
ctime (&(bm->last_modified)));
extras [strlen (extras)-1] = ' '; // remove the newline from date
View
9 storage/blobstore.h
@@ -88,10 +88,11 @@
#define BLOBSTORE_FLAG_HOLLOW 02000 // means the blob being created should be viewed as not occupying space
// in-use flags for blockblobs
-#define BLOCKBLOB_STATUS_OPENED 00002 // currently opened by someone (read or write)
-#define BLOCKBLOB_STATUS_LOCKED 00004 // locked by a process (TODO: remove this UNUSED status?)
-#define BLOCKBLOB_STATUS_MAPPED 00010 // loopback device dm-mapped by one or more other blobs
-#define BLOCKBLOB_STATUS_BACKED 00020 // loopback device used as a backing by device mapper
+#define BLOCKBLOB_STATUS_OPENED 00002 // currently opened by someone (read or write)
+#define BLOCKBLOB_STATUS_LOCKED 00004 // locked by a process (TODO: remove this UNUSED status?)
+#define BLOCKBLOB_STATUS_MAPPED 00010 // loopback device dm-mapped by one or more other blobs
+#define BLOCKBLOB_STATUS_BACKED 00020 // loopback device used as a backing by device mapper
+#define BLOCKBLOB_STATUS_ABANDONED 00040 // non-zero lock file size => blob not closed properly
typedef enum { // make sure these match up with _blobstore_error_strings[] entries below
BLOBSTORE_ERROR_OK = 0,
View
2 storage/diskutil.c
@@ -827,7 +827,7 @@ int diskutil_ch (const char * path, const char * user, const char * group, const
int ret = OK;
char * output;
- logprintfl (EUCAINFO, "ch(own|mod) '%s' %s.%s %o\n",
+ logprintfl (EUCADEBUG, "ch(own|mod) '%s' %s.%s %o\n",
path,
user?user:"*",
group?group:"*",
View
6 storage/vbr.c
@@ -439,7 +439,7 @@ vbr_legacy ( // constructs VBRs for {image|kernel|ramdisk}x{Id|URL} entries (DEP
for (i=0; i<EUCA_MAX_VBRS && i<params->virtualBootRecordLen; i++) {
virtualBootRecord * vbr = &(params->virtualBootRecord[i]);
if (strlen(vbr->resourceLocation)>0) {
- logprintfl (EUCAINFO, "[%s] VBR[%d] type=%s id=%s dev=%s size=%lld format=%s %s\n",
+ logprintfl (EUCADEBUG, "[%s] VBR[%d] type=%s id=%s dev=%s size=%lld format=%s %s\n",
instanceId, i, vbr->typeName, vbr->id, vbr->guestDeviceName, vbr->size, vbr->formatName, vbr->resourceLocation);
if (!strcmp(vbr->typeName, "machine"))
found_image = 1;
@@ -455,9 +455,9 @@ vbr_legacy ( // constructs VBRs for {image|kernel|ramdisk}x{Id|URL} entries (DEP
// legacy support for image{Id|URL}
if (imageId && imageURL) {
if (found_image) {
- logprintfl (EUCAINFO, "[%s] IGNORING image %s passed outside the virtual boot record\n", instanceId, imageId);
+ logprintfl (EUCAWARN, "[%s] IGNORING image %s passed outside the virtual boot record\n", instanceId, imageId);
} else {
- logprintfl (EUCAINFO, "[%s] LEGACY pre-VBR image id=%s URL=%s\n", instanceId, imageId, imageURL);
+ logprintfl (EUCAWARN, "[%s] LEGACY pre-VBR image id=%s URL=%s\n", instanceId, imageId, imageURL);
if (i>=EUCA_MAX_VBRS-2) {
logprintfl (EUCAERROR, "[%s] error: out of room in the Virtual Boot Record for legacy image %s\n", instanceId, imageId);
return ERROR;
View
1 storage/walrus.c
@@ -466,6 +466,7 @@ static size_t write_header (void *buffer, size_t size, size_t nmemb, void *param
static size_t write_data (void *buffer, size_t size, size_t nmemb, void *params)
{
assert (params !=NULL);
+
int fd = ((struct request *)params)->fd;
int wrote = write (fd, buffer, size * nmemb);
((struct request *)params)->total_wrote += wrote;
View
20 util/adb-helpers.h
@@ -372,20 +372,40 @@ static inline adb_sensorsResourceType_t * copy_sensor_resource_to_adb (const axu
adb_sensorsResourceType_set_resourceName (resource, env, sr->resourceName);
adb_sensorsResourceType_set_resourceType (resource, env, sr->resourceType);
adb_sensorsResourceType_set_resourceUuid (resource, env, sr->resourceUuid);
+ if (sr->metricsLen<0 || sr->metricsLen>MAX_SENSOR_METRICS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (metricsLen=%d for %s)\n",
+ sr->metricsLen, sr->resourceName);
+ return resource;
+ }
for (int m=0; m<sr->metricsLen; m++) {
const sensorMetric * sm = sr->metrics + m;
adb_metricsResourceType_t * metric = adb_metricsResourceType_create(env);
adb_metricsResourceType_set_metricName (metric, env, sm->metricName);
+ if (sm->countersLen<0 || sm->countersLen>MAX_SENSOR_COUNTERS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (countersLen=%d for %s:%s)\n",
+ sm->countersLen, sr->resourceName, sm->metricName);
+ return resource;
+ }
for (int c=0; c<sm->countersLen; c++) {
const sensorCounter * sc = sm->counters + c;
adb_metricCounterType_t * counter = adb_metricCounterType_create(env);
adb_metricCounterType_set_type (counter, env, sensor_type2str(sc->type));
adb_metricCounterType_set_collectionIntervalMs (counter, env, sc->collectionIntervalMs);
adb_metricCounterType_set_sequenceNum (counter, env, sc->sequenceNum);
+ if (sc->dimensionsLen<0 || sc->dimensionsLen>MAX_SENSOR_DIMENSIONS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (dimensionsLen=%d for %s:%s:%s)\n",
+ sc->dimensionsLen, sr->resourceName, sm->metricName, sensor_type2str(sc->type));
+ return resource;
+ }
for (int d=0; d<sc->dimensionsLen; d++) {
const sensorDimension * sd = sc->dimensions + d;
adb_metricDimensionsType_t * dimension = adb_metricDimensionsType_create(env);
adb_metricDimensionsType_set_dimensionName (dimension, env, sd->dimensionName);
+ if (sd->valuesLen<0 || sd->valuesLen>MAX_SENSOR_VALUES) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (valuesLen=%d for %s:%s:%s:%s)\n",
+ sd->valuesLen, sr->resourceName, sm->metricName, sensor_type2str(sc->type), sd->dimensionName);
+ return resource;
+ }
for (int v=0; v<sd->valuesLen; v++) {
int v_adj = (sd->firstValueIndex + v) % MAX_SENSOR_VALUES;
const sensorValue * sv = sd->values + v_adj;
View
43 util/config.c
@@ -75,22 +75,33 @@ int isConfigModified (char configFiles[][MAX_PATH], int numFiles)
char *configFileValue(const char *key)
{
- int i;
- for (i=0; i<configRestartLen; i++) {
- if (configKeysRestart[i].key) {
- if (!strcmp(configKeysRestart[i].key, key)) {
- return(configValuesRestart[i] ? strdup(configValuesRestart[i]) : (configKeysRestart[i].defaultValue ? strdup(configKeysRestart[i].defaultValue) : NULL));
- }
+ int i;
+ for (i=0; i<configRestartLen; i++) {
+ if (configKeysRestart[i].key) {
+ if (!strcmp (configKeysRestart[i].key, key)) { // have a match among values that restart
+ if (configValuesRestart[i])
+ return strdup(configValuesRestart[i]);
+ else if (configKeysRestart[i].defaultValue)
+ return strdup(configKeysRestart[i].defaultValue);
+ else
+ return NULL;
+ }
+ }
}
- }
- for (i=0; i<configNoRestartLen; i++) {
- if (configKeysNoRestart[i].key) {
- if (!strcmp(configKeysNoRestart[i].key, key)) {
- return(configValuesNoRestart[i] ? strdup(configValuesNoRestart[i]) : (configKeysNoRestart[i].defaultValue ? strdup(configKeysNoRestart[i].defaultValue) : NULL));
- }
+ for (i=0; i<configNoRestartLen; i++) {
+ if (configKeysNoRestart[i].key) {
+ if (!strcmp(configKeysNoRestart[i].key, key)) {
+ if (configValuesNoRestart[i])
+ return strdup(configValuesNoRestart[i]);
+ else if (configKeysNoRestart[i].defaultValue)
+ return strdup(configKeysNoRestart[i].defaultValue);
+ else
+ return NULL;
+ }
+ }
}
- }
- return(NULL);
+
+ return NULL;
}
int configFileValueLong(const char *key, long *val)
@@ -105,6 +116,7 @@ int configFileValueLong(const char *key, long *val)
* val = v;
found = 1;
}
+ free (tmpstr);
}
return found;
}
@@ -162,7 +174,8 @@ void configReadLogParams(int *log_level_out, int *log_roll_number_out, long *log
char * s = configFileValue ("LOGLEVEL");
assert (s!=NULL); // configFileValue should return default
* log_level_out = log_level_int (s);
-
+ return (s);
+
long l;
configFileValueLong ("LOGROLLNUMBER", &l);
* log_roll_number_out = (int)l;
View
23 util/log.c
@@ -75,6 +75,7 @@
#include <unistd.h>
#include <sys/syscall.h> // to get thread id
#include <sys/resource.h> // rusage
+#include <execinfo.h> // backtrace
#include <errno.h>
#include "eucalyptus.h"
@@ -557,3 +558,25 @@ void eventlog(char *hostTag, char *userTag, char *cid, char *eventTag, char *oth
logprintf("TIMELOG %s:%s:%s:%s:%f:%s\n", hostTagFull, userTag, cid, eventTag, ts, other);
}
}
+
+void log_dump_trace (char * buf, int buf_size)
+{
+ void *array[64];
+ size_t size;
+ char **strings;
+ size_t i;
+
+ size = backtrace (array, sizeof(array)/sizeof(void *));
+ strings = backtrace_symbols (array, size);
+
+ buf [0] = '\0';
+ for (i = 0; i < size; i++) {
+ int left = buf_size - 1 - strlen (buf);
+ if (left < 0) break;
+ char line [512];
+ snprintf (line, sizeof(line), "\t%s\n", strings [i]);
+ strncat (buf, line, left);
+ }
+
+ free (strings);
+}
View
1 util/log.h
@@ -152,5 +152,6 @@ int logprintfl(int level, const char *format, ...);
int logcat (int debug_level, const char * file_name);
void eventlog(char *hostTag, char *userTag, char *cid, char *eventTag, char *other);
+void log_dump_trace (char * buf, int buf_size);
#endif
View
74 util/sensor.c
@@ -71,6 +71,8 @@ static getstat * getstat_find (const char * instanceId)
sem_p (state_sem);
if (sensor_state->stats) {
for (int i = 0; (gs = sensor_state->stats[i]) != NULL; i++) {
+ if (instanceId==NULL) // special case, for testing, return first thing in the list
+ break;
if (strncmp (gs->instanceId, instanceId, sizeof (gs->instanceId)) == 0)
break;
}
@@ -119,6 +121,7 @@ static int getstat_refresh (void)
} else {
// Right now !CC means the NC.
output = system_output ("euca_rootwrap getstats.pl"); // invoke th Perl script
+ logprintfl (EUCATRACE, "getstats.pl output:\n%s\n", output);
}
if (hyp_sem) sem_v (hyp_sem);
int ret = ERROR;
@@ -572,6 +575,13 @@ int sensor_get_dummy_instance_data (long long sn, const char * instanceId, const
static sensorResource * find_or_alloc_sr (const boolean do_alloc, const char * resourceName, const char * resourceType, const char * resourceUuid)
{
+ // sanity check
+ if (sensor_state->max_resources < 0 || sensor_state->max_resources > MAX_SENSOR_RESOURCES_HARD) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (max_resources=%d for %s)\n",
+ sensor_state->max_resources, resourceName);
+ return NULL;
+ }
+
sensorResource * unused_sr = NULL;
for (int r=0; r<sensor_state->max_resources; r++) {
sensorResource * sr = sensor_state->resources + r;
@@ -618,6 +628,13 @@ static sensorResource * find_or_alloc_sr (const boolean do_alloc, const char * r
static sensorMetric * find_or_alloc_sm (const boolean do_alloc, sensorResource * sr, const char * metricName)
{
+ // sanity check
+ if (sr->metricsLen<0 || sr->metricsLen>MAX_SENSOR_METRICS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (metricsLen=%d for %s)\n",
+ sr->metricsLen, sr->resourceName);
+ return NULL;
+ }
+
for (int m=0; m < sr->metricsLen; m++) {
sensorMetric * sm = sr->metrics + m;
if (strcmp (sm->metricName, metricName) == 0) {
@@ -640,6 +657,13 @@ static sensorMetric * find_or_alloc_sm (const boolean do_alloc, sensorResource *
static sensorCounter * find_or_alloc_sc (const boolean do_alloc, sensorMetric * sm, const int counterType)
{
+ // sanity check
+ if (sm->countersLen<0 || sm->countersLen>MAX_SENSOR_COUNTERS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (countersLen=%d for %s)\n",
+ sm->countersLen, sm->metricName);
+ return NULL;
+ }
+
for (int c=0; c < sm->countersLen; c++) {
sensorCounter * sc = sm->counters + c;
if (sc->type == counterType) {
@@ -663,6 +687,13 @@ static sensorCounter * find_or_alloc_sc (const boolean do_alloc, sensorMetric *
static sensorDimension * find_or_alloc_sd (const boolean do_alloc, sensorCounter * sc, const char * dimensionName)
{
+ // sanity check
+ if (sc->dimensionsLen<0 || sc->dimensionsLen>MAX_SENSOR_DIMENSIONS) {
+ logprintfl (EUCAERROR, "inconsistency in sensor database (dimensionsLen=%d for %s)\n",
+ sc->dimensionsLen, sensor_type2str(sc->type));
+ return NULL;
+ }
+
for (int d=0; d < sc->dimensionsLen; d++) {
sensorDimension * sd = sc->dimensions + d;
if ((strcmp (sd->dimensionName, dimensionName) == 0) ||
@@ -735,7 +766,9 @@ int sensor_merge_records (const sensorResource * srs[], int srsLen, boolean fail
}
// update the collection interval
- cache_sc->collectionIntervalMs = sc->collectionIntervalMs;
+ if (sc->collectionIntervalMs > 0) {
+ cache_sc->collectionIntervalMs = sc->collectionIntervalMs;
+ }
// run through dimensions merging in their values separately
long long dimension_seq [MAX_SENSOR_DIMENSIONS];
@@ -754,6 +787,12 @@ int sensor_merge_records (const sensorResource * srs[], int srsLen, boolean fail
if (sd->valuesLen < 1) // no values in this dimension at all
continue;
+ if (cache_sd->valuesLen<0 || cache_sd->valuesLen>MAX_SENSOR_VALUES) { // sanity check
+ logprintfl (EUCAERROR, "inconsistency in sensor database (valuesLen=%d for %s)\n",
+ cache_sd->valuesLen, cache_sr->resourceName, cache_sm->metricName, sensor_type2str(cache_sc->type), cache_sd->dimensionName);
+ goto bail;
+ }
+
// correlate new values with values already in the cache:
// phase 1: go backwards through sequence numbers of new and old
@@ -1163,7 +1202,7 @@ int sensor_refresh_resources (const char resourceNames [][MAX_SENSOR_NAME_LEN],
}
if (head == NULL) {
// OK, can't find this thing anywhere.
- logprintfl (EUCADEBUG, "unable to get metrics for instance %s\n", name);
+ logprintfl (EUCADEBUG, "unable to get metrics for instance %s (which is OK if it was terminated)\n", name);
// TODO3.2: decide what to do when some metrics for an instance aren't available
}
}
@@ -1183,6 +1222,7 @@ static void dump_sensor_cache (void)
srs [i] = & (sensor_state->resources [i]);
}
log_sensor_resources ("whole cache", srs, sensor_state->max_resources);
+ free (srs);
}
static void clear_srs (sensorResource ** srs, int srsLen)
@@ -1192,11 +1232,14 @@ static void clear_srs (sensorResource ** srs, int srsLen)
}
}
+const char * euca_this_component_name = "ignore";
+const char * euca_client_component_name = "ignore";
+
int main (int argc, char ** argv)
{
int errors = 0;
- logfile (NULL, EUCADEBUG, 4);
+ logfile (NULL, EUCATRACE, 4);
logprintfl (EUCADEBUG, "testing sensor.c with MAX_SENSOR_VALUES=%d\n", MAX_SENSOR_VALUES);
long long intervalMs = 50000;
@@ -1211,11 +1254,24 @@ int main (int argc, char ** argv)
assert (0 == sensor_config (1, 50000));
assert (0 == sensor_config (3, intervalMs));
- // test the getstat functions
- assert (getstat_refresh () == OK);
- logprintfl (EUCADEBUG, "getstat_refresh() found %d instances\n", getstat_ninstances());
- assert (getstat_refresh () == OK);
- logprintfl (EUCADEBUG, "getstat_refresh(), 2nd time, found %d instances\n", getstat_ninstances());
+#define GETSTAT_ITERS 10
+ // test the getstat function
+ for (int i=1; i<=GETSTAT_ITERS; i++) {
+ if (i%2) {
+ euca_this_component_name = "nc";
+ } else {
+ euca_this_component_name = "cc";
+ }
+ assert (getstat_refresh () == OK);
+ getstat * gs = getstat_find (NULL);
+ if (gs != NULL) {
+ char * id = gs->instanceId;
+ assert (sensor_refresh_resources (id, "", 1) == OK);
+ }
+ if (i%101 == 0) {
+ logprintfl (EUCADEBUG, "getstat_refresh() iteration %d/%d found %d instances\n", i, GETSTAT_ITERS, getstat_ninstances());
+ }
+ }
char * anInstanceId = NULL;
sem_p (state_sem);
if (sensor_state->stats) {
@@ -1236,7 +1292,7 @@ int main (int argc, char ** argv)
ts+=intervalMs;
val+=1;
assert (0 == sensor_add_value ("i-555", "CPUUtilization", SENSOR_AVERAGE, "default", sn, ts, (sn%2)?TRUE:FALSE, val));
- // assert (0 == sensor_add_value ("i-555", "CPUUtilization", SENSOR_AVERAGE, "default", sn, ts, (sn%2)?TRUE:FALSE, val)); // should be a no-op
+ assert (0 == sensor_add_value ("i-555", "CPUUtilization", SENSOR_AVERAGE, "default", sn, ts, (sn%2)?TRUE:FALSE, val)); // should be a no-op
valLen++;
if (valLen>MAX_SENSOR_VALUES)
valLen=MAX_SENSOR_VALUES;
View
1 util/sensor.h
@@ -43,6 +43,7 @@
#define DEFAULT_SENSOR_SLEEP_DURATION_USEC 15000000L
#define MIN_COLLECTION_INTERVAL_MS 1000L // below 1 second is too frequent
#define MAX_COLLECTION_INTERVAL_MS 86400000L // above 24 hours is too infrequent
+#define MAX_SENSOR_RESOURCES_HARD 10000000L // 10 mil resources max, for sanity checking
typedef struct {
long long timestampMs; // in milliseconds

0 comments on commit af3e9ee

Please sign in to comment.
Something went wrong with that request. Please try again.