Skip to content

Commit

Permalink
change structures to support cdb > 4 GB file size with 64-bit mcdb cl…
Browse files Browse the repository at this point in the history
…ient.

  Use of a 32-bit hash is the basis for continuing to use mostly 32-bit structures.
  Even a mostly uniform distribution of hash keys will likely show an increasing
  number of collisions as the number of keys approaches 2 billion.
  Note: A 32-bit mcdb client will not be able to open an mcdb larger than 4 GB,
  since mcdb is implemented with mmap of the entire file and the map needs to fit
  in the 4 GB address space (along with the rest of the process).
  • Loading branch information
gstrauss committed Jul 25, 2011
1 parent c8e1656 commit 4af1ec1
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 64 deletions.
32 changes: 19 additions & 13 deletions mcdb.c
Expand Up @@ -54,13 +54,16 @@ mcdb_findtagstart(struct mcdb * const restrict m,
(void) mcdb_thread_refresh_self(m);
/* (ignore rc; continue with previous map in case of failure) */

ptr = m->map->ptr + ((khash << 3) & MCDB_HEADER_MASK) + 4;
/* (size of data in lvl1 hash table element is 16-bytes (shift 4 bits)) */
ptr = m->map->ptr + ((khash & MCDB_SLOT_MASK) << 4) + 8;
m->hslots = uint32_strunpack_bigendian_aligned_macro(ptr);
if (!m->hslots)
return false;
m->hpos = uint32_strunpack_bigendian_aligned_macro(ptr-4);
m->kpos = m->hpos + (((khash >> 8) % m->hslots) << 3);
m->khash = khash;
m->hpos = (((uint64_t)uint32_strunpack_bigendian_aligned_macro(ptr-8))<<31)
|((uint64_t)uint32_strunpack_bigendian_aligned_macro(ptr-4));
/* (size of data in lvl2 hash table element is 16-bytes (shift 4 bits)) */
m->kpos = m->hpos + (((khash >> MCDB_SLOT_BITS) % m->hslots) << 4);
m->khash = khash; /* (use khash bits not used above) */
m->loop = 0;
return true;
}
Expand All @@ -72,21 +75,24 @@ mcdb_findtagnext(struct mcdb * const restrict m,
{
const unsigned char * ptr;
const unsigned char * const restrict mptr = m->map->ptr;
uint32_t vpos, khash, len;
const uint64_t hslots_end = m->hpos + (((uint64_t)m->hslots) << 4);
uint32_t vpos, khash;
size_t len;

while (m->loop < m->hslots) {
ptr = mptr + m->kpos + 4;
vpos = uint32_strunpack_bigendian_aligned_macro(ptr);
ptr = mptr + m->kpos + 8;
vpos = (((uint64_t)uint32_strunpack_bigendian_aligned_macro(ptr))<<31)
|((uint64_t)uint32_strunpack_bigendian_aligned_macro(ptr+4));
if (!vpos)
return false;
khash = uint32_strunpack_bigendian_aligned_macro(ptr-4);
m->kpos += 8;
if (m->kpos == m->hpos + (m->hslots << 3))
khash = uint32_strunpack_bigendian_aligned_macro(ptr-8);
m->kpos += 16;
if (m->kpos == hslots_end)
m->kpos = m->hpos;
m->loop += 1;
if (khash == m->khash) {
ptr = mptr + vpos;
len = uint32_strunpack_bigendian_macro(ptr);
len = (size_t)uint32_strunpack_bigendian_macro(ptr);
if (tagc != 0
? len == klen+1 && tagc == ptr[8] && memcmp(key,ptr+9,klen) == 0
: len == klen && memcmp(key,ptr+8,klen) == 0) {
Expand Down Expand Up @@ -149,7 +155,7 @@ mcdb_mmap_init(struct mcdb_mmap * const restrict map, int fd)
#ifdef __GNUC__
__builtin_prefetch((char *)x+960, 0, 1); /*(touch mem page w/ mcdb header)*/
#endif
if (st.st_size > USHRT_MAX) /*(skip extra syscall overhead for small mcdb)*/
if (st.st_size > 4194304) /*(skip extra syscall overhead for mcdb < 4 MB)*/
posix_madvise(((char *)x), st.st_size, POSIX_MADV_RANDOM);
/*(addr (x) must be aligned on _SC_PAGESIZE for madvise portability)*/
map->ptr = (unsigned char *)x;
Expand Down Expand Up @@ -380,7 +386,7 @@ mcdb_mmap_thread_registration(struct mcdb_mmap ** const restrict mapptr,
return true;
}

/* theaded programs (in while multiple threads are using same struct mcdb_mmap)
/* threaded programs (while multiple threads are using same struct mcdb_mmap)
* must reopen and register (update refcnt on previous and new mcdb_mmap) while
* holding a lock, or else there are race conditions with refcnt'ing. */
bool __attribute_noinline__
Expand Down
25 changes: 12 additions & 13 deletions mcdb.h
Expand Up @@ -25,12 +25,12 @@
* - mmap page alignment requirements and use of address space limits const db
* max size when created by 32-bit process. Sizes approaching 4 GB may fail.
* - arbitrary limit of each key or data set to (INT_MAX - 8 bytes; almost 2 GB)
* (djb cdb doc states there is limit besides cdb fitting into 4 GB)
* (djb cdb doc states there is no limit besides cdb fitting into 4 GB)
* (writev() on some platforms in 32-bit exe might also have 2 GB limit)
* - djb cdb tools work on input stream; mcdbctl operates on file
* The ability to work on an input stream forced certain design choices that
* might not be relevant to working on an mmap'd file, but mcdb preserves the
* same format for compatibility.
* might not be relevant to working on an mmap'd file, but mcdb uses similar
* layout to cdb in format specification.
*
* Incompatibilities with djb cdb
* - padding added at the end of key,value data to 8-byte align hash tables
Expand All @@ -45,7 +45,7 @@
#define MCDB_H

#include <stdbool.h> /* bool */
#include <stdint.h> /* uint32_t */
#include <stdint.h> /* uint32_t, uint64_t */
#include <unistd.h> /* size_t */
#include <sys/time.h> /* time_t */

Expand All @@ -62,8 +62,7 @@ extern "C" {

struct mcdb_mmap {
unsigned char *ptr; /* mmap pointer */
uint32_t size; /* mmap size */
uint32_t pad0; /* (unused; padding) */
uint64_t size; /* mmap size */
time_t mtime; /* mmap file mtime */
struct mcdb_mmap * volatile next; /* updated (new) mcdb_mmap */
void * (*fn_malloc)(size_t); /* fn ptr to malloc() */
Expand All @@ -77,12 +76,12 @@ struct mcdb_mmap {
struct mcdb {
struct mcdb_mmap *map;
uint32_t loop; /* number of hash slots searched under this key */
uint32_t khash; /* initialized if loop is nonzero */
uint32_t kpos; /* initialized if loop is nonzero */
uint32_t hpos; /* initialized if loop is nonzero */
uint32_t hslots; /* initialized if loop is nonzero */
uint32_t dpos; /* initialized if cdb_findnext() returns 1 */
uint64_t kpos; /* initialized if loop is nonzero */
uint64_t hpos; /* initialized if loop is nonzero */
uint64_t dpos; /* initialized if cdb_findnext() returns 1 */
uint32_t dlen; /* initialized if cdb_findnext() returns 1 */
uint32_t khash; /* initialized if loop is nonzero */
};

extern bool
Expand Down Expand Up @@ -170,10 +169,10 @@ mcdb_mmap_reopen_threadsafe(struct mcdb_mmap ** restrict)
|| __builtin_expect(mcdb_thread_register(mcdb), true))


#define MCDB_SLOTS 256 /* must be power-of-2 */
#define MCDB_SLOT_BITS 8 /* 2^8 = 256 */
#define MCDB_SLOTS (1u<<(MCDB_SLOT_BITS-1)) /* must be power-of-2 */
#define MCDB_SLOT_MASK (MCDB_SLOTS-1) /* bitmask */
#define MCDB_HEADER_SZ (MCDB_SLOTS<<3) /* MCDB_SLOTS * 8 (256*8 = 2048) */
#define MCDB_HEADER_MASK (MCDB_HEADER_SZ-1) /* bitmask */
#define MCDB_HEADER_SZ (MCDB_SLOTS<<4) /* MCDB_SLOTS * 16 (256*16=4096) */
#define MCDB_MMAP_SZ (1<<19) /* 512KB; must be larger than MCDB_HEADER_SZ */


Expand Down
89 changes: 53 additions & 36 deletions mcdb_make.c
Expand Up @@ -46,19 +46,19 @@
#include <unistd.h>
#include <errno.h>
#include <string.h> /* memcpy() */
#include <stdint.h> /* uint32_t */
#include <stdint.h> /* uint32_t uint64_t */
#include <stdlib.h> /* posix_fallocate() */
#include <fcntl.h> /* posix_fallocate() */
#include <limits.h> /* UINT_MAX, INT_MAX */

#define MCDB_HPLIST 1000
#define MCDB_HPLIST 4000

struct mcdb_hp { uint32_t h; uint32_t p; };
struct mcdb_hp { uint64_t p; uint32_t h; };

struct mcdb_hplist {
struct mcdb_hp hp[MCDB_HPLIST];
uint32_t num; /* index into struct mcdb_hp hp[MCDB_HPLIST] */
struct mcdb_hplist *next;
uint32_t num;
struct mcdb_hp hp[MCDB_HPLIST];
};

static struct mcdb_hplist * __attribute_noinline__ __attribute_malloc__
Expand Down Expand Up @@ -120,12 +120,14 @@ mcdb_mmap_upsize(struct mcdb_make * const restrict m, const size_t sz)
}

/* limit max size of mcdb to (4GB - pagesize) */
if (sz > (UINT_MAX & m->pgalign)) { errno = EOVERFLOW; return false; }
/*(commented out to remove the 4 GB limit)*/
//if (sz > (UINT_MAX & m->pgalign)) { errno = EOVERFLOW; return false; }

offset = m->offset + ((m->pos - m->offset) & m->pgalign);
msz = (MCDB_MMAP_SZ > sz - offset) ? MCDB_MMAP_SZ : sz - offset;
if (offset > (UINT_MAX & m->pgalign) - msz)
msz = (UINT_MAX & m->pgalign) - offset;
/*(commented out to remove the 4 GB limit)*/
//if (offset > (UINT_MAX & m->pgalign) - msz)
// msz = (UINT_MAX & m->pgalign) - offset;

m->fsz = offset + msz; /* (mcdb_make mmap region is always to end of file)*/
if (m->fd != -1 && nointr_ftruncate(m->fd,(off_t)m->fsz) != 0) return false;
Expand Down Expand Up @@ -153,10 +155,11 @@ mcdb_make_addbegin(struct mcdb_make * const restrict m,
struct mcdb_hplist * const head =
m->head->num < MCDB_HPLIST ? m->head : mcdb_hplist_alloc(m);
if (head == NULL) return -1;
head->hp[head->num].p = pos;
head->hp[head->num].h = UINT32_HASH_DJB_INIT;
head->hp[head->num].p = (uint32_t)pos; /* arbitrary ~2 GB limit for lens */
if (keylen > INT_MAX-8 || datalen > INT_MAX-8) { errno=EINVAL; return -1; }
if (pos > UINT_MAX-len) { errno=ENOMEM; return -1; }
/*(no 4 GB limit in 64-bit or in 32-bit with large file support)*/
//if (pos > UINT_MAX-len) { errno=ENOMEM; return -1; }
if (m->fsz < pos+len && !mcdb_mmap_upsize(m,pos+len)) return -1;
p = m->map + pos - m->offset;
uint32_strpack_bigendian_macro(p,keylen);
Expand Down Expand Up @@ -234,7 +237,7 @@ mcdb_make_start(struct mcdb_make * const restrict m, const int fd,

if (mcdb_mmap_upsize(m, MCDB_MMAP_SZ) /* MCDB_MMAP_SZ >= MCDB_HEADER_SZ */
&& mcdb_hplist_alloc(m) != NULL) { /*init to skip NULL check every add*/
m->head->hp[m->head->num].p = (uint32_t)m->pos;
m->head->hp[m->head->num].p = m->pos;
return 0;
}
else {
Expand All @@ -248,12 +251,16 @@ mcdb_make_start(struct mcdb_make * const restrict m, const int fd,
int
mcdb_make_finish(struct mcdb_make * const restrict m)
{
/* Current code below effectively limits mcdb to approx 2 billion entries.
* Use of 32-bit hash is the basis for continuing to use 32-bit structures.
* Even a mostly uniform distribution of hash keys will likely show
* increasing number of collisions as number of keys approaches 2 billion.*/
uint32_t u;
uint32_t i;
uint32_t d;
uint64_t d;
uint32_t len;
uint32_t cnt;
uint32_t where;
uint32_t w;
struct mcdb_hp *hash;
struct mcdb_hp *split;
struct mcdb_hp *hp;
Expand All @@ -267,7 +274,9 @@ mcdb_make_finish(struct mcdb_make * const restrict m)

cnt = 0; /* num entries */
for (x = m->head; x; x = x->next) {
cnt += (u = x->num);
u = x->num;
if (u > UINT_MAX - cnt) { errno = ENOMEM; return -1; }
cnt += u;
while (u--)
++count[MCDB_SLOT_MASK & x->hp[u].h];
}
Expand All @@ -280,15 +289,18 @@ mcdb_make_finish(struct mcdb_make * const restrict m)
}

/* check for integer overflow and that sufficient space allocated in file */
if (cnt > (UINT_MAX>>4) || len > INT_MAX) { errno = ENOMEM; return -1; }
//if (cnt > (UINT_MAX>>5) || len > INT_MAX){ errno = ENOMEM; return -1; }
if (cnt > INT_MAX || len > INT_MAX) { errno = ENOMEM; return -1; }
len += cnt;
if (len > UINT_MAX/sizeof(struct mcdb_hp)) { errno = ENOMEM; return -1; }
u = cnt << 4; /* multiply by 2 and then by 8 (for 8 chars) */
if (m->pos > (UINT_MAX-u)) { errno = ENOMEM; return -1; }
/*(no 4 GB limit in 64-bit or in 32-bit with large file support)*/
//u = cnt << 5; /* multiply by 2 and then by 16 (for 16 chars) */
//if (m->pos > (UINT_MAX-u)) { errno = ENOMEM; return -1; }

/* add "hole" for alignment; incompatible with djb cdbdump */
d = (8 - (m->pos & 7)) & 7; /* padding to align hash tables to 8 bytes */
if (d > (UINT_MAX-(m->pos+u))) { errno = ENOMEM; return -1; }
/*(no 4 GB limit in 64-bit or in 32-bit with large file support)*/
//if (d > (UINT_MAX-(m->pos+u))) { errno = ENOMEM; return -1; }
if (m->fsz < m->pos+d && !mcdb_mmap_upsize(m,m->pos+d)) return -1;
if (d) memset(m->map + m->pos - m->offset, '\0', d);
m->pos += d; /* clear hole for binary cmp of mcdbs */
Expand All @@ -297,8 +309,7 @@ mcdb_make_finish(struct mcdb_make * const restrict m)
if (!split) return -1;
hash = split + cnt;

u = 0;
for (i = 0; i < MCDB_SLOTS; ++i) {
for (u = 0, i = 0; i < MCDB_SLOTS; ++i) {
u += count[i]; /* bounded by cnt number of entries, so no overflow */
start[i] = u;
}
Expand All @@ -312,36 +323,42 @@ mcdb_make_finish(struct mcdb_make * const restrict m)
for (i = 0; i < MCDB_SLOTS; ++i) {
cnt = count[i];
len = cnt << 1; /* no overflow possible */
u = m->pos;
d = m->pos;

/* check for sufficient space in mmap to write hash table for this slot
* (integer overflow not possible: total size checked outside loop) */
if (m->fsz < u+(len<<3) && !mcdb_mmap_upsize(m,u+(len<<3))) break;
if (m->fsz < d+(len<<4) && !mcdb_mmap_upsize(m,d+(len<<4))) break;

/* constant header (8 bytes per header slot, so multiply by 8) */
p = header + (i << 3); /* (i << 3) == (i * 8) */
uint32_strpack_bigendian_aligned_macro(p,u);
uint32_strpack_bigendian_aligned_macro(p+4,len);
/* constant header (16 bytes per header slot, so multiply by 16) */
p = header + (i << 4); /* (i << 4) == (i * 16) */
u = (uint32_t)(d >> 32);
uint32_strpack_bigendian_aligned_macro(p,u); /* hpos (high bits) */
u = (uint32_t)d;
uint32_strpack_bigendian_aligned_macro(p+4,u); /* hpos (low bits) */
uint32_strpack_bigendian_aligned_macro(p+8,len); /* hslots */

/* generate hash table for this slot */
memset(hash, 0, len * sizeof(struct mcdb_hp));
hp = split + start[i];
for (u = 0; u < cnt; ++u) {
where = (hp->h >> 8) % len;
while (hash[where].p)
if (++where == len)
where = 0;
hash[where] = *hp++;
w = (hp->h >> MCDB_SLOT_BITS) % len;
while (hash[w].p)
if (++w == len)
w = 0;
hash[w] = *hp++;
}

/* write hash table directly to map; allocated space checked above */
for (u = 0; u < len; ++u) {
p = m->map + m->pos - m->offset;
m->pos += 8;
d = hash[u].h;
uint32_strpack_bigendian_aligned_macro(p,d);
d = hash[u].p;
uint32_strpack_bigendian_aligned_macro(p+4,d);
m->pos += 16; /* sizeof(struct mcdb_hp) */
w = hash[u].h;
uint32_strpack_bigendian_aligned_macro(p,w); /* khash */
*(int *)(p+4) = 0; /*(fill hole with 0 only for consistency)*/
w = (uint32_t)(hash[u].p >> 32);
uint32_strpack_bigendian_aligned_macro(p+8,w); /* dpos (high bits)*/
w = (uint32_t)hash[u].p;
uint32_strpack_bigendian_aligned_macro(p+12,w);/* dpos (low bits)*/
}
}
m->fn_free(split);
Expand Down
7 changes: 5 additions & 2 deletions mcdbctl.c
Expand Up @@ -61,7 +61,8 @@ mcdbctl_dump(struct mcdb * const restrict m)
uint32_t klen;
uint32_t dlen;
unsigned char * const eod =
p + uint32_strunpack_bigendian_aligned_macro(p) - 7;
p + ((((uint64_t)uint32_strunpack_bigendian_aligned_macro(p))<<31)
| (uint64_t)uint32_strunpack_bigendian_aligned_macro(p+4)) - 7;
int iovcnt = 0;
size_t iovlen = 0;
size_t buflen = 0;
Expand Down Expand Up @@ -154,7 +155,9 @@ mcdbctl_stats(struct mcdb * const restrict m)
uint32_t klen;
uint32_t dlen;
unsigned char * const eod =
map_ptr + uint32_strunpack_bigendian_aligned_macro(map_ptr) - 7;
map_ptr
+ ((((uint64_t)uint32_strunpack_bigendian_aligned_macro(map_ptr))<<31)
| (uint64_t)uint32_strunpack_bigendian_aligned_macro(map_ptr+4)) - 7;
unsigned long nrec = 0;
unsigned long numd[11] = { 0,0,0,0,0,0,0,0,0,0,0 };
unsigned int rv;
Expand Down

0 comments on commit 4af1ec1

Please sign in to comment.