Skip to content
Browse files

mcdb_make: reliably detect No space left on device

mcdb_make - reliably detect "No space left on device" using
posix_fallocate() in combination with asynchronous mmap flush
(msync MS_ASYNC).

Separate mcdb on-disk size allocations from mmap allocations so
that a more advantageous block size can be used for each.  Align
filesystem allocations on larger block sizes.  Align mmap
allocations on the system page size (_SC_PAGESIZE), same as before.

posix_fallocate() reliably detects "No space left on device",
but is more than 10% more expensive than ftruncate() for small mcdb
(the differential cost decreases to nearly 0% for large mcdb).

The difference from the prior use of posix_fallocate() in mcdb_make
is that it is now called only on the newly added region each time the
file size increases, instead of on the entire file size, which made
each call more expensive than the prior call as the file grew.
Also, the file is grown in power-of-2 block sizes (currently 4 MB).
  • Loading branch information...
1 parent 6336e94 commit 3d55a2286f8e0f4e3cf18d53c3ef38e5fffed200 @gstrauss committed Sep 30, 2011
Showing with 42 additions and 33 deletions.
  1. +6 −5 mcdb.h
  2. +32 −25 mcdb_make.c
  3. +1 −0 mcdb_make.h
  4. +3 −3 t/PERFORMANCE
View
11 mcdb.h
@@ -186,11 +186,12 @@ mcdb_mmap_reopen_threadsafe(struct mcdb_mmap ** restrict)
|| __builtin_expect(mcdb_thread_register(mcdb), true))
-#define MCDB_SLOT_BITS 8 /* 2^8 = 256 */
-#define MCDB_SLOTS (1u<<MCDB_SLOT_BITS) /* must be power-of-2 */
-#define MCDB_SLOT_MASK (MCDB_SLOTS-1) /* bitmask */
-#define MCDB_HEADER_SZ (MCDB_SLOTS<<4) /* MCDB_SLOTS * 16 (256*16=4096) */
-#define MCDB_MMAP_SZ (1<<19) /* 512KB; must be larger than MCDB_HEADER_SZ */
+#define MCDB_SLOT_BITS 8 /* 2^8 = 256 */
+#define MCDB_SLOTS (1u<<MCDB_SLOT_BITS) /* must be power-of-2 */
+#define MCDB_SLOT_MASK (MCDB_SLOTS-1) /* bitmask */
+#define MCDB_HEADER_SZ (MCDB_SLOTS<<4) /* MCDB_SLOTS * 16 (256*16=4096) */
+#define MCDB_MMAP_SZ (1<<19) /* 512KB; must be > MCDB_HEADER_SZ */
+#define MCDB_BLOCK_SZ (1<<22) /* 4MB; must be >= MCDB_MMAP_SZ */
#define MCDB_PAD_ALIGN 16
#define MCDB_PAD_MASK (MCDB_PAD_ALIGN-1)
View
57 mcdb_make.c
@@ -29,8 +29,8 @@
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200112L
#endif
-#ifndef _XOPEN_SOURCE
-#define _XOPEN_SOURCE 500
+#ifndef _XOPEN_SOURCE /* posix_fallocate() requires _XOPEN_SOURCE 600 */
+#define _XOPEN_SOURCE 600
#endif
/* gcc -std=c99 hides MAP_ANONYMOUS
* _BSD_SOURCE or _SVID_SOURCE needed for mmap MAP_ANONYMOUS on Linux */
@@ -65,6 +65,7 @@
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
+#include <fcntl.h> /* posix_fallocate() */
#include <string.h> /* memcpy() */
#include <stdbool.h> /* bool true false */
#include <stdint.h> /* uint32_t uintptr_t */
@@ -155,7 +156,7 @@ mcdb_mmap_commit(struct mcdb_make * const restrict m,
* call fsync() or fdatasync() on fd to ensure data is written to disk,
* e.g. in case when writing new mcdb to temporary file, before renaming
* temporary file to overwrite existing mcdb. If not sync'd to disk and
- * OS crashes, then the update mcdb can be corrupted. */
+ * OS crashes, then the updated mcdb can be corrupted. */
}
static bool __attribute_noinline__
@@ -164,24 +165,14 @@ mcdb_mmap_upsize(struct mcdb_make * const restrict m, const size_t sz)
static bool __attribute_noinline__
mcdb_mmap_upsize(struct mcdb_make * const restrict m, const size_t sz)
{
- size_t offset;
+ const size_t offset = m->pos & m->pgalign; /* mmap offset must be aligned */
size_t msz;
- /* flush and munmap prior mmap */
- if (m->map != MAP_FAILED) {/*(m->fd==-1 during some large mcdb size tests)*/
- if ((m->fd == -1 || msync(m->map, m->pos - m->offset, MS_ASYNC) == 0)
- && munmap(m->map, m->msz) == 0)
- m->map = MAP_FAILED;
- else
- return false;
- }
-
#if !defined(_LP64) && !defined(__LP64__) /* (no 4 GB limit in 64-bit) */
/* limit max size of mcdb to (4 GB - pagesize) */
if (sz > (UINT_MAX & m->pgalign)) { errno = EOVERFLOW; return false; }
#endif
- offset = m->offset + ((m->pos - m->offset) & m->pgalign);
msz = (MCDB_MMAP_SZ > sz - offset)
? MCDB_MMAP_SZ
: (sz - offset + ~m->pgalign) & m->pgalign;
@@ -190,17 +181,33 @@ mcdb_mmap_upsize(struct mcdb_make * const restrict m, const size_t sz)
msz = (UINT_MAX & m->pgalign) - offset;
#endif
- m->fsz = offset + msz; /* (mcdb_make mmap region is always to end of file)*/
- if (m->fd != -1 && nointr_ftruncate(m->fd,(off_t)m->fsz) != 0) return false;
+ /* increase file size by at least msz (prefer multiple of disk block size)*/
+ if (m->fd != -1 && m->fsz < offset + msz) {
+ m->fsz = (offset + msz + (MCDB_BLOCK_SZ-1)) & ~(MCDB_BLOCK_SZ-1);
+ if ((errno =
+ posix_fallocate(m->fd, (off_t)m->osz, (off_t)(m->fsz-m->osz)))==0)
+ m->osz = m->fsz;
+ else
+ return false;
+ }
+
+ /* flush and munmap prior mmap */
+ if (m->map != MAP_FAILED) {/*(m->fd==-1 during some large mcdb size tests)*/
+ if ((m->fd == -1 || msync(m->map, m->pos - m->offset, MS_ASYNC) == 0)
+ && munmap(m->map, m->msz) == 0)
+ m->map = MAP_FAILED;
+ else
+ return false;
+ }
/* (compilation with large file support enables off_t max > 2 GB in cast) */
m->map = (m->fd != -1) /* (m->fd == -1 during some large mcdb size tests) */
? (char *)mmap(0, msz, PROT_WRITE, MAP_SHARED, m->fd, (off_t)offset)
: (char *)mmap(0, msz, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (m->map == MAP_FAILED) return false;
- posix_madvise(m->map, msz, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);
m->offset = offset;
m->msz = msz;
+ posix_madvise(m->map, msz, POSIX_MADV_SEQUENTIAL);
return true;
}
@@ -221,7 +228,7 @@ mcdb_make_addbegin(struct mcdb_make * const restrict m,
#if !defined(_LP64) && !defined(__LP64__) /* (no 4 GB limit in 64-bit) */
if (pos > UINT_MAX-len) return mcdb_make_err(NULL,ENOMEM);
#endif
- if (m->fsz < pos+len && !mcdb_mmap_upsize(m, pos+len))
+ if (m->offset+m->msz < pos+len && !mcdb_mmap_upsize(m, pos+len))
return mcdb_make_err(NULL,errno);
p = m->map + pos - m->offset;
uint32_strpack_bigendian_macro(p,keylen);
@@ -292,6 +299,7 @@ mcdb_make_start(struct mcdb_make * const restrict m, const int fd,
m->pos = MCDB_HEADER_SZ;
m->offset = 0;
m->fsz = 0;
+ m->osz = 0;
m->msz = 0;
m->hp.p = MCDB_HEADER_SZ;
m->hp.h = 0;
@@ -355,20 +363,19 @@ mcdb_make_finish(struct mcdb_make * const restrict m)
#if !defined(_LP64) && !defined(__LP64__)
if (d > (UINT_MAX-(m->pos+u))) return mcdb_make_err(m,ENOMEM);
#endif
- if (m->fsz < m->pos+d && !mcdb_mmap_upsize(m,m->pos+d))
+ if (m->offset+m->msz < m->pos+d && !mcdb_mmap_upsize(m,m->pos+d))
return mcdb_make_err(m,errno);
if (d) memset(m->map + m->pos - m->offset, ~0, d);
m->pos += d; /*set all bits in hole so code can detect end of data padding*/
b = (m->pos < UINT_MAX) ? 3 : 4;
for (i = 0; i < MCDB_SLOTS; ++i) {
- len = count[i] << 1; /* no overflow possible */
+ len = count[i] << 1;
d = m->pos;
- /* check for sufficient space in mmap to write hash table for this slot
- * (integer overflow not possible: total size checked outside loop) */
- if (m->fsz < d+((size_t)len << b)
- && !mcdb_mmap_upsize(m, d+((size_t)len << b)))
+ /* mmap sufficient space into which to write hash table for this slot */
+ if (m->offset+m->msz < d+((uintptr_t)len << b)
+ && !mcdb_mmap_upsize(m, d+((uintptr_t)len << b)))
break;
/* constant header (16 bytes per header slot, so multiply by 16) */
@@ -423,7 +430,7 @@ mcdb_make_finish(struct mcdb_make * const restrict m)
}
u = (i == MCDB_SLOTS && mcdb_mmap_commit(m, header));
- return mcdb_make_destroy(m) | (u ? 0 : -1);
+ return (u ? 0 : -1) | mcdb_make_destroy(m);
}
/* caller should call mcdb_make_destroy() upon errors from mcdb_make_*() calls
View
1 mcdb_make.h
@@ -43,6 +43,7 @@ struct mcdb_make {
size_t offset;
char * restrict map;
size_t fsz;
+ size_t osz;
size_t msz;
size_t pgalign;
struct mcdb_hp hp;
View
6 t/PERFORMANCE
@@ -56,10 +56,10 @@ processing time scaling linearly as different size databases are tested. The
last parameter to the above two commands is the number of records to create and
can be varied. On my Pentium-M laptop, Tokyo Cabinet falls off a performance
cliff when creating hdb somewhere between 3 million (~21 sec) and 4 million
-records (~2m 45s). mcdb can do 10 million in ~13 sec, 30 million in ~64 sec,
+records (~2m 45s). mcdb can do 10 million in ~15 sec, 30 million in ~60 sec,
and could go further if more than 2 GB free disk space were available on my
-laptop. Some number of records past 30 million, my 1 GB RAM is exhausted and
-swap is used, but mcdbctl does not appear to thrash.
+laptop. Some number of records before 30 million, my 1 GB RAM is exhausted
+and swap is used, but mcdbctl does not appear to thrash.
To compare read (query) performance of mcdb with that of Tokyo Cabinet, a more

0 comments on commit 3d55a22

Please sign in to comment.
Something went wrong with that request. Please try again.