Permalink
Browse files

blobstore cleanup, OOM fix for NC + related

    - fixed re-attachment after reboot
    - fixed double-free in doDetachVolume
    - renamed libvirt_error_handler to libvirt_err_handler
      (to make 'grep -i error' in nc.log more useful)
    - fixes NC's unresponsiveness and ultimate crash due to a
      non-deterministic ultra-fast memory growth (corrupted value
      of metricsLen in sensor state was the culprit, which is now
      checked for, though corruption is not yet resolved)
    - added '-t' for NCclient for timestamp conversion checking
    - removed several memory leaks identified by valgrind
    - moved dump_trace into log.c for use across C components
    - reworked code in configFileValue to make it more readable
    - fixed one case that was failing in test_sensors binary
    - tweaks to log messages, mainly to clean up levels
    - improved blobstore cleanup (nixing unfinished) by
      adding BLOCKBLOB_STATUS_ABANDONED and using it in fsck

Fixes EUCA-3645 and EUCA-3820
  • Loading branch information...
dmitrii committed Oct 25, 2012
1 parent 16c71e3 commit af3e9ee665619bfe0ea97325aaefce94f18d7c44
Showing with 217 additions and 84 deletions.
  1. +14 −1 node/NCclient.c
  2. +3 −3 node/handlers.c
  3. +0 −2 node/handlers.h
  4. +11 −6 node/handlers_default.c
  5. +1 −2 node/handlers_kvm.c
  6. +9 −3 node/server-marshal.c
  7. +31 −35 storage/blobstore.c
  8. +5 −4 storage/blobstore.h
  9. +1 −1 storage/diskutil.c
  10. +3 −3 storage/vbr.c
  11. +1 −0 storage/walrus.c
  12. +20 −0 util/adb-helpers.h
  13. +28 −15 util/config.c
  14. +23 −0 util/log.c
  15. +1 −0 util/log.h
  16. +65 −9 util/sensor.c
  17. +1 −0 util/sensor.h
View
@@ -70,6 +70,7 @@
#include "misc.h"
#include "euca_axis.h"
#include "sensor.h"
+#include "adb-helpers.h"
#define NC_ENDPOINT "/axis2/services/EucalyptusNC"
#define WALRUS_ENDPOINT "/services/Walrus"
@@ -197,12 +198,13 @@ int main (int argc, char **argv)
char * launch_index = NULL;
char ** group_names = NULL;
int group_names_size = 0;
+ char * timestamp_str = NULL;
char * command = NULL;
int local = 0;
int count = 1;
int ch;
- while ((ch = getopt(argc, argv, "lhdn:w:i:m:k:r:e:a:c:h:u:p:V:R:L:FU:I:G:v:")) != -1) {
+ while ((ch = getopt(argc, argv, "lhdn:w:i:m:k:r:e:a:c:h:u:p:V:R:L:FU:I:G:v:t:")) != -1) {
switch (ch) {
case 'c':
count = atoi (optarg);
@@ -299,6 +301,9 @@ int main (int argc, char **argv)
exit (1);
}
break;
+ case 't':
+ timestamp_str = optarg;
+ break;
case 'h':
usage (); // will exit
case '?':
@@ -613,6 +618,14 @@ int main (int argc, char **argv)
sensor_res2str (buf, sizeof(buf), res, resSize);
printf ("resources: %d\n%s\n", resSize, buf);
+ /***********************************************************/
+ } else if (!strcmp(command, "_convertTimestamp")) {
+ CHECK_PARAM(timestamp_str, "timestamp");
+
+ axutil_date_time_t * dt = unixms_to_datetime (stub->env, 0123L);
+ long long ts_l = datetime_to_unixms (dt, stub->env);
+ printf ("timestamp: %lld\n", ts_l);
+
/***********************************************************/
} else {
fprintf (stderr, "ERROR: command %s unknown (try -h)\n", command);
View
@@ -170,7 +170,7 @@ get_value( char *s,
return (sscanf_lines (s, buf, valp)==1 ? OK : ERROR);
}
-void libvirt_error_handler ( void *userData,
+void libvirt_err_handler ( void *userData,
virErrorPtr error)
{
if ( error==NULL) {
@@ -1117,7 +1117,7 @@ void adopt_instances()
return;
logprintfl (EUCAINFO, "looking for existing domains\n");
- virSetErrorFunc (NULL, libvirt_error_handler);
+ virSetErrorFunc (NULL, libvirt_err_handler);
num_doms = virConnectListDomains(nc_state.conn, dom_ids, MAXDOMS);
if (num_doms == 0) {
@@ -1963,7 +1963,7 @@ int doTerminateInstance (ncMetadata *meta, char *instanceId, int force, int *shu
if (init())
return 1;
- logprintfl (EUCAINFO, "[%s] invoked\n", instanceId);
+ logprintfl (EUCAINFO, "[%s] termination requested\n", instanceId);
if (nc_state.H->doTerminateInstance)
ret = nc_state.H->doTerminateInstance(&nc_state, meta, instanceId, force, shutdownState, previousState);
View
@@ -287,8 +287,6 @@ int get_value( char *s,
int convert_dev_names( const char *localDev,
char *localDevReal,
char *localDevTag);
-void libvirt_error_handler( void * userData,
- virErrorPtr error);
void print_running_domains( void);
virConnectPtr *check_hypervisor_conn();
void change_state( ncInstance * instance,
View
@@ -808,6 +808,7 @@ doDetachVolume ( struct nc_state_t *nc,
int ret = OK;
int is_iscsi_target = 0;
int have_remote_device = 0;
+ char * xml = NULL;
char * tagBuf;
char * localDevName;
@@ -908,7 +909,7 @@ doDetachVolume ( struct nc_state_t *nc,
snprintf (lpath, sizeof (lpath), EUCALYPTUS_VOLUME_LIBVIRT_XML_PATH_FORMAT, instance->instancePath, volumeId); // vol-XXX-libvirt.xml
// read in libvirt XML
- char * xml = file2str (lpath);
+ xml = file2str (lpath);
if (xml == NULL) {
logprintfl (EUCAERROR, "[%s][%s] failed to read volume XML from %s\n", instance->instanceId, volumeId, lpath);
ret = ERROR;
@@ -972,7 +973,7 @@ doDetachVolume ( struct nc_state_t *nc,
if (xml)
free (xml);
-
+
if (force) {
return(OK);
}
@@ -1652,10 +1653,13 @@ doDescribeSensors (struct nc_state_t *nc,
else
total = instIdsLen;
- sensorResource ** rss = malloc (total * sizeof (sensorResource *));
- if (rss == NULL) {
- sem_v (inst_copy_sem);
- return OUT_OF_MEMORY;
+ sensorResource ** rss = NULL;
+ if (total > 0) {
+ rss = malloc (total * sizeof (sensorResource *));
+ if (rss == NULL) {
+ sem_v (inst_copy_sem);
+ return OUT_OF_MEMORY;
+ }
}
int k = 0;
@@ -1679,6 +1683,7 @@ doDescribeSensors (struct nc_state_t *nc,
continue;
}
+ assert (k<total);
rss [k] = malloc (sizeof (sensorResource));
sensor_get_instance_data (instance->instanceId, sensorIds, sensorIdsLen, rss + k, 1);
k++;
View
@@ -195,13 +195,12 @@ static void * rebooting_thread (void *arg)
snprintf (lpath, sizeof (lpath), EUCALYPTUS_VOLUME_LIBVIRT_XML_PATH_FORMAT, instance->instancePath, volume->volumeId); // vol-XXX-libvirt.xml
// read in libvirt XML, which may have been modified by the hook above
- char * xml = file2str (lpath);
+ xml = file2str (lpath);
if (xml == NULL) {
logprintfl (EUCAERROR, "[%s][%s] failed to read volume XML from %s\n", instance->instanceId, volume->volumeId, lpath);
rc = 1;
}
}
-
if (remoteDevStr)
free (remoteDevStr);
View
@@ -222,11 +222,11 @@ adb_ncStartNetworkResponse_t* ncStartNetworkMarshal (adb_ncStartNetwork_t* ncSta
adb_ncStartNetworkResponseType_set_networkStatus(output, env, "SUCCESS");
adb_ncStartNetworkResponseType_set_statusMessage(output, env, "0");
}
-
- if (peersLen)
- free (peers);
}
+ if (peers)
+ free (peers);
+
// set response to output
adb_ncStartNetworkResponse_set_ncStartNetworkResponse(response, env, output);
pthread_mutex_unlock(&ncHandlerLock);
@@ -511,6 +511,8 @@ adb_ncDescribeInstancesResponse_t* ncDescribeInstancesMarshal (adb_ncDescribeIns
}
// eventlog("NC", userId, correlationId, "DescribeInstances", "end");
}
+ if (instIds)
+ free (instIds);
// set response to output
adb_ncDescribeInstancesResponse_set_ncDescribeInstancesResponse(response, env, output);
@@ -1068,6 +1070,10 @@ adb_ncDescribeSensorsResponse_t* ncDescribeSensorsMarshal (adb_ncDescribeSensors
}
}
// eventlog("NC", userId, correlationId, "DescribeSensors", "end");
+ if (instIds)
+ free (instIds);
+ if (sensorIds)
+ free (sensorIds);
reply:
View
@@ -79,7 +79,6 @@
#include <sys/file.h> // flock
#include <dirent.h>
#include <sys/wait.h> // wait
-#include <execinfo.h> // backtrace
#include <pthread.h>
#include "blobstore.h"
#include <sys/types.h> // gettid
@@ -179,28 +178,6 @@ static void myprintf (int loglevel, const char * format, ...)
puts (buf);
}
-static void dump_trace (char * buf, int buf_size)
-{
- void *array[64];
- size_t size;
- char **strings;
- size_t i;
-
- size = backtrace (array, sizeof(array)/sizeof(void *));
- strings = backtrace_symbols (array, size);
-
- buf [0] = '\0';
- for (i = 0; i < size; i++) {
- int left = buf_size - 1 - strlen (buf);
- if (left < 0) break;
- char line [512];
- snprintf (line, sizeof(line), "\t%s\n", strings [i]);
- strncat (buf, line, left);
- }
-
- free (strings);
-}
-
const char * blobstore_get_error_str ( blobstore_error_t error ) {
return _blobstore_error_strings [error];
}
@@ -229,7 +206,7 @@ static void err (blobstore_error_t error, const char * custom_msg, const int src
msg = blobstore_get_error_str (error);
}
snprintf (_blobstore_last_msg, sizeof(_blobstore_last_msg), "%s:%d %s", src_file_name, src_line_no, msg);
- dump_trace (_blobstore_last_trace, sizeof(_blobstore_last_trace));
+ log_dump_trace (_blobstore_last_trace, sizeof(_blobstore_last_trace));
if (_do_print_errors) {
myprintf (EUCAERROR, "error: %s\n", _blobstore_last_msg);
@@ -1481,6 +1458,12 @@ static unsigned int check_in_use ( blobstore * bs, const char * bb_id, long long
_err_off(); // do not complain if metadata files do not exist
int fd = open_and_lock (path, BLOBSTORE_FLAG_RDWR, timeout_usec, BLOBSTORE_FILE_PERM); // try opening to see what happens
if (fd != -1) {
+ struct stat s;
+ if (fstat (fd, &s) == 0) {
+ if (s.st_size > 0) { // lock file was not truncated before being released => file not properly closed
+ in_use |= BLOCKBLOB_STATUS_ABANDONED;
+ }
+ }
close_and_unlock (fd);
} else {
in_use |= BLOCKBLOB_STATUS_OPENED; // TODO: check if open failed for other reason?
@@ -1685,10 +1668,11 @@ static long long purge_blockblobs_lru ( blobstore * bs, blockblob * bb_list, lon
code = 'D';
deleted++;
}
- logprintfl (EUCADEBUG, "LRU %d %08d: %29s %c%c%c %c %9llu %s", iteration, purged, bb->id,
- (bb->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
- (bb->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
- (bb->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ logprintfl (EUCADEBUG, "LRU %d %08d: %29s %c%c%c%c %c %9llu %s", iteration, purged, bb->id,
+ (bb->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
+ (bb->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
+ (bb->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ (bb->in_use & BLOCKBLOB_STATUS_ABANDONED) ? ('a') : ('-'), // a = was abandoned
code, // outcome codes: D=deleted, else C=children, !=undeletable, O=open
bb->size_bytes / 512L, // size is in sectors
ctime (&(bb->last_modified))); // ctime adds a newline
@@ -1725,7 +1709,7 @@ int blobstore_stat (blobstore * bs, blobstore_meta * meta)
meta->blocks_unlocked = 0;
meta->blocks_locked = 0;
meta->num_blobs = 0;
- for (blockblob * abb = bbs; abb; abb=abb->next) { // TODO: unify this with locked/unlocked calculation in open()
+ for (blockblob * abb = bbs; abb; ) { // TODO: unify this with locked/unlocked calculation in open()
long long abb_size_blocks = round_up_sec (abb->size_bytes) / 512;
if (abb->in_use & BLOCKBLOB_STATUS_OPENED) {
meta->blocks_locked += abb_size_blocks; // these can't be purged if we need space (TODO: look into recursive purging of unused references?)
@@ -1734,6 +1718,11 @@ int blobstore_stat (blobstore * bs, blobstore_meta * meta)
}
meta->blocks_allocated += abb->blocks_allocated;
meta->num_blobs++;
+
+ // free this node and move the pointer
+ blockblob * old_bb = abb;
+ abb=abb->next;
+ free (old_bb);
}
unlock:
@@ -2652,9 +2641,14 @@ static int blockblob_check (const blockblob * bb)
}
}
+ // check on .refs that point to blobs that no longer exist
if (get_stale_refs (bb, NULL)>0)
err++;
+ // check on .lock files that are non-zero => blobs that were not closed properly
+ if (bb->in_use & BLOCKBLOB_STATUS_ABANDONED)
+ err++;
+
_err_on();
return err;
}
@@ -2718,7 +2712,7 @@ static int delete_blob_state (blockblob * bb, long long timeout_usec, char do_fo
// TODO: print a warning about store/blob corruption?
}
- if (!check_in_use(dep_bs, blob_id, 0)) {
+ if (! (check_in_use(dep_bs, blob_id, 0) & ~(BLOCKBLOB_STATUS_ABANDONED))) { // in use except abandoned
loop_remove (dep_bs, blob_id); // TODO: do we care about errors?
}
if (dep_bs != bs) {
@@ -2768,7 +2762,8 @@ int blockblob_delete ( blockblob * bb, long long timeout_usec, char do_force )
// do not delete the blob if it is used by another one
bb->in_use = check_in_use (bs, bb->id, 0); // update in_use status
- if (!do_force && bb->in_use & ~(BLOCKBLOB_STATUS_OPENED|BLOCKBLOB_STATUS_BACKED)) { // in use other than opened (by this thread) or backed
+ // if in use other than opened (by this thread), backed, or abandoned
+ if (!do_force && bb->in_use & ~(BLOCKBLOB_STATUS_OPENED|BLOCKBLOB_STATUS_BACKED|BLOCKBLOB_STATUS_ABANDONED)) {
ERR (BLOBSTORE_ERROR_AGAIN, NULL);
ret = -1;
} else {
@@ -4406,10 +4401,11 @@ static int do_list (const char * regex)
}
char extras [100] = "";
if (show_extras) {
- snprintf (extras, sizeof (extras), "%c%c%c %9llu %s",
- (bm->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
- (bm->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
- (bm->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ snprintf (extras, sizeof (extras), "%c%c%c%c %9llu %s",
+ (bm->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
+ (bm->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'), // p = has parents
+ (bm->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'), // c = has children
+ (bm->in_use & BLOCKBLOB_STATUS_ABANDONED) ? ('a') : ('-'), // a = was abandoned
bm->size_bytes / 512L, // size is in sectors
ctime (&(bm->last_modified)));
extras [strlen (extras)-1] = ' '; // remove the newline from date
View
@@ -88,10 +88,11 @@
#define BLOBSTORE_FLAG_HOLLOW 02000 // means the blob being created should be viewed as not occupying space
// in-use flags for blockblobs
-#define BLOCKBLOB_STATUS_OPENED 00002 // currently opened by someone (read or write)
-#define BLOCKBLOB_STATUS_LOCKED 00004 // locked by a process (TODO: remove this UNUSED status?)
-#define BLOCKBLOB_STATUS_MAPPED 00010 // loopback device dm-mapped by one or more other blobs
-#define BLOCKBLOB_STATUS_BACKED 00020 // loopback device used as a backing by device mapper
+#define BLOCKBLOB_STATUS_OPENED 00002 // currently opened by someone (read or write)
+#define BLOCKBLOB_STATUS_LOCKED 00004 // locked by a process (TODO: remove this UNUSED status?)
+#define BLOCKBLOB_STATUS_MAPPED 00010 // loopback device dm-mapped by one or more other blobs
+#define BLOCKBLOB_STATUS_BACKED 00020 // loopback device used as a backing by device mapper
+#define BLOCKBLOB_STATUS_ABANDONED 00040 // non-zero lock file size => blob not closed properly
typedef enum { // make sure these match up with _blobstore_error_strings[] entries below
BLOBSTORE_ERROR_OK = 0,
View
@@ -827,7 +827,7 @@ int diskutil_ch (const char * path, const char * user, const char * group, const
int ret = OK;
char * output;
- logprintfl (EUCAINFO, "ch(own|mod) '%s' %s.%s %o\n",
+ logprintfl (EUCADEBUG, "ch(own|mod) '%s' %s.%s %o\n",
path,
user?user:"*",
group?group:"*",
View
@@ -439,7 +439,7 @@ vbr_legacy ( // constructs VBRs for {image|kernel|ramdisk}x{Id|URL} entries (DEP
for (i=0; i<EUCA_MAX_VBRS && i<params->virtualBootRecordLen; i++) {
virtualBootRecord * vbr = &(params->virtualBootRecord[i]);
if (strlen(vbr->resourceLocation)>0) {
- logprintfl (EUCAINFO, "[%s] VBR[%d] type=%s id=%s dev=%s size=%lld format=%s %s\n",
+ logprintfl (EUCADEBUG, "[%s] VBR[%d] type=%s id=%s dev=%s size=%lld format=%s %s\n",
instanceId, i, vbr->typeName, vbr->id, vbr->guestDeviceName, vbr->size, vbr->formatName, vbr->resourceLocation);
if (!strcmp(vbr->typeName, "machine"))
found_image = 1;
@@ -455,9 +455,9 @@ vbr_legacy ( // constructs VBRs for {image|kernel|ramdisk}x{Id|URL} entries (DEP
// legacy support for image{Id|URL}
if (imageId && imageURL) {
if (found_image) {
- logprintfl (EUCAINFO, "[%s] IGNORING image %s passed outside the virtual boot record\n", instanceId, imageId);
+ logprintfl (EUCAWARN, "[%s] IGNORING image %s passed outside the virtual boot record\n", instanceId, imageId);
} else {
- logprintfl (EUCAINFO, "[%s] LEGACY pre-VBR image id=%s URL=%s\n", instanceId, imageId, imageURL);
+ logprintfl (EUCAWARN, "[%s] LEGACY pre-VBR image id=%s URL=%s\n", instanceId, imageId, imageURL);
if (i>=EUCA_MAX_VBRS-2) {
logprintfl (EUCAERROR, "[%s] error: out of room in the Virtual Boot Record for legacy image %s\n", instanceId, imageId);
return ERROR;
View
@@ -466,6 +466,7 @@ static size_t write_header (void *buffer, size_t size, size_t nmemb, void *param
static size_t write_data (void *buffer, size_t size, size_t nmemb, void *params)
{
assert (params !=NULL);
+
int fd = ((struct request *)params)->fd;
int wrote = write (fd, buffer, size * nmemb);
((struct request *)params)->total_wrote += wrote;
Oops, something went wrong.

0 comments on commit af3e9ee

Please sign in to comment.