Skip to content

Commit

Permalink
lib: Add bloom filter support
Browse files Browse the repository at this point in the history
  • Loading branch information
cmouse committed Nov 30, 2017
1 parent f6de86e commit 44d29bb
Show file tree
Hide file tree
Showing 5 changed files with 293 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/lib/Makefile.am
Expand Up @@ -18,6 +18,7 @@ liblib_la_SOURCES = \
base32.c \
base64.c \
bits.c \
bloomfilter.c \
bsearch-insert-pos.c \
buffer.c \
child-wait.c \
Expand Down Expand Up @@ -173,6 +174,7 @@ headers = \
base32.h \
base64.h \
bits.h \
bloomfilter.h \
bsearch-insert-pos.h \
buffer.h \
byteorder.h \
Expand Down Expand Up @@ -330,6 +332,7 @@ test_lib_SOURCES = \
test-base32.c \
test-base64.c \
test-bits.c \
test-bloomfilter.c \
test-bsearch-insert-pos.c \
test-buffer.c \
test-byteorder.c \
Expand Down
140 changes: 140 additions & 0 deletions src/lib/bloomfilter.c
@@ -0,0 +1,140 @@
#include "lib.h"
#include "bloomfilter.h"
#include "murmurhash3.h"
#include "md5.h"
#include "randgen.h"

#include <math.h>

/* Internal state of one bloom filter instance. The struct is opaque to
   callers; it is created by bloomfilter_create() and released by
   bloomfilter_unref(). */
struct bloomfilter {
/* pool that both this struct and the bitmap were allocated from */
pool_t pool;
/* reference count; set to 1 on creation, the filter is freed when it
   drops to 0 */
int refcount;
/* number of bits addressable in the bitmap; every hash result is
   reduced modulo this value */
size_t size;
/* number of bloomfilter_set_data() calls so far, saturating at the
   maximum size_t value */
size_t total_added;
/* number of hash functions in k (not counting the NULL terminator) */
unsigned int nk;
/* random seed chosen at creation and passed to every hash function */
uint32_t seed;

/* NULL-terminated list of hash functions; the pointer given to
   bloomfilter_create() is stored as-is, the list is not copied */
bloomfilter_hash_func_t *const *k;

/* bit array of size/CHAR_BIT + 1 bytes */
uint8_t *bitmap;
};

/* Bit number idx of a bitmap lives in byte idx / CHAR_BIT, at bit position
   idx % CHAR_BIT counted from the least significant bit.
   Note that both macros evaluate idx twice. */
#define BITMAP_HAS_BIT(map, idx) \
	(((map)[(idx) / CHAR_BIT] & (1U << ((idx) % CHAR_BIT))) != 0)
#define BITMAP_SET_BIT(map, idx) \
	((map)[(idx) / CHAR_BIT] |= (1U << ((idx) % CHAR_BIT)))
/* Length in bytes of the digests accepted by bloomfilter_hash_fold() */
#define BLOOMFILTER_HASH_BYTES 16

/* Default hash function list: murmurhash3 only. The list is
   NULL-terminated, as bloomfilter_create() counts entries up to the
   first NULL. */
bloomfilter_hash_func_t *const bloomfilter_default_functions[] = {
bloomfilter_murmur3_hash,
NULL
};

/* Reduces a 128 bit (BLOOMFILTER_HASH_BYTES) digest to a single size_t,
   mixing in the given seed. The digest is read as big-endian words, so
   the result does not depend on host byte order. */
static inline size_t
bloomfilter_hash_fold(unsigned char result[STATIC_ARRAY BLOOMFILTER_HASH_BYTES],
		      uint32_t seed)
{
	size_t folded;

#ifdef _LP64
	/* 64 bit size_t: xor the two 64 bit halves together and place the
	   seed in the upper 32 bits. */
	folded = ((size_t)seed) << 32;
	folded ^= be64_to_cpu_unaligned(&result[0]);
	folded ^= be64_to_cpu_unaligned(&result[8]);
#else
	/* otherwise: xor all four successive 32 bit words together with
	   the seed. */
	folded = seed;
	folded ^= be32_to_cpu_unaligned(&result[0]);
	folded ^= be32_to_cpu_unaligned(&result[4]);
	folded ^= be32_to_cpu_unaligned(&result[8]);
	folded ^= be32_to_cpu_unaligned(&result[12]);
#endif
	return folded;
}

/* Hash function for bloom filters based on 128 bit murmurhash3.
   Hashes len bytes at data with the given seed and folds the digest
   into a size_t. */
size_t bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed)
{
	unsigned char digest[MURMURHASH3_128_RESULTBYTES];

	/* the seed is consumed by murmurhash3 itself, so nothing more is
	   mixed in while folding */
	murmurhash3_128(data, len, seed, digest);
	return bloomfilter_hash_fold(digest, 0);
}

/* Hash function for bloom filters based on MD5.
   MD5 takes no seed, so the seed is only mixed in when the digest is
   folded into a size_t. */
size_t bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed)
{
	unsigned char digest[MD5_RESULTLEN];

	md5_get_digest(data, len, digest);
	return bloomfilter_hash_fold(digest, seed);
}

/* Creates a bloom filter with size bits, allocated from pool.
   hash_functions is a NULL-terminated list with at least one entry; the
   pointer is stored, not copied. The filter gets a random seed and starts
   with a reference count of 1. */
struct bloomfilter *
bloomfilter_create(pool_t pool, size_t size,
		   bloomfilter_hash_func_t *const *hash_functions)
{
	struct bloomfilter *filter = p_new(pool, struct bloomfilter, 1);
	bloomfilter_hash_func_t *const *func;

	i_assert(size > 0);
	filter->pool = pool;
	/* size/CHAR_BIT rounds down, so one extra byte covers the
	   remaining bits */
	filter->bitmap = p_malloc(pool, size/CHAR_BIT + 1);
	filter->k = hash_functions;
	filter->size = size;

	/* count the hash functions up to the NULL terminator */
	for (func = hash_functions; *func != NULL; func++)
		filter->nk++;
	i_assert(filter->nk > 0);

	random_fill(&filter->seed, sizeof(filter->seed));
	filter->refcount = 1;
	return filter;
}

/* Takes one more reference to the filter. The filter must still be
   alive, i.e. have a positive reference count. */
void bloomfilter_ref(struct bloomfilter *bf)
{
	i_assert(bf->refcount > 0);
	bf->refcount += 1;
}

/* Drops one reference and sets *_bf to NULL. Does nothing if *_bf is
   already NULL. When the last reference is dropped, the bitmap and the
   filter itself are freed back to the pool. */
void bloomfilter_unref(struct bloomfilter **_bf)
{
	struct bloomfilter *filter = *_bf;

	if (filter == NULL)
		return;
	*_bf = NULL;
	i_assert(filter->refcount > 0);

	filter->refcount--;
	if (filter->refcount <= 0) {
		/* explicit frees matter when the pool is e.g. the system
		   pool, which does not release memory on its own */
		p_free(filter->pool, filter->bitmap);
		p_free(filter->pool, filter);
	}
}

/* Returns the number of insertions done with bloomfilter_set_data().
   This is only an estimate of the item count: repeated insertions of the
   same element are each counted, and the counter saturates. */
size_t bloomfilter_estimated_item_count(struct bloomfilter *bf)
{
	const size_t added = bf->total_added;

	return added;
}

/* Returns TRUE if every hash function's bit for the given data is set,
   i.e. the element is probably present. Returns FALSE as soon as one bit
   is found unset, in which case the element was never added.
   data may be NULL only when len is 0. */
bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len)
{
	i_assert(data != NULL || len == 0);

	for (size_t i = 0; bf->k[i] != NULL; i++) {
		size_t bit = bf->k[i](data, len, bf->seed) % bf->size;

		if (!BITMAP_HAS_BIT(bf->bitmap, bit))
			return FALSE;
	}
	return TRUE;
}

/* Adds the given data to the filter by setting the bit selected by each
   hash function. data may be NULL only when len is 0. */
void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len)
{
	i_assert(data != NULL || len == 0);

	/* the insertion counter is only an estimate, so let it saturate
	   at the maximum size_t value instead of wrapping */
	if (bf->total_added != (size_t)-1)
		bf->total_added++;

	for (size_t i = 0; bf->k[i] != NULL; i++) {
		size_t bit = bf->k[i](data, len, bf->seed) % bf->size;

		BITMAP_SET_BIT(bf->bitmap, bit);
	}
}
117 changes: 117 additions & 0 deletions src/lib/bloomfilter.h
@@ -0,0 +1,117 @@
#ifndef BLOOMFILTER_H
#define BLOOMFILTER_H

#include "buffer.h"

/* Short explanation of bloom filter:
Bloom filter is a space-efficient probabilistic filter. Each element
that gets added is hashed through one or more hash functions, and for
each function the bit at index (hash modulo table size) is set.
When checking whether an element is present, the same bits are looked
up. If any of them is not set, the element is definitely missing. If
all of them are set, the element is probably present.
A bloom filter will never report a false negative, but it might
report a false positive.
Elements cannot be removed from this bloom filter.
*/

struct bloomfilter;

/* Hash function used by the filter: hashes len bytes at data using the
   filter's seed. The filter reduces the returned value modulo its size. */
typedef size_t bloomfilter_hash_func_t(const void *data, size_t len, uint32_t seed);

/* Creates a bloom filter with size bits, allocated from pool, using the
   NULL-terminated list of hash functions. size must be greater than 0 and
   the list must contain at least one function. The list pointer is
   stored, not copied. The returned filter has one reference; release it
   with bloomfilter_unref(). */
struct bloomfilter *
bloomfilter_create(pool_t pool, size_t size,
bloomfilter_hash_func_t *const *hash_functions) ATTR_RETURNS_NONNULL;

/* Some helpers: create a filter with the default hash functions
   (bloomfilter_default_functions) from the given pool, from default_pool
   (i_), or from a newly created data stack pool (t_). */
#define p_bloomfilter_create(pool, size) \
bloomfilter_create(pool, size, bloomfilter_default_functions)
#define i_bloomfilter_create(size) p_bloomfilter_create(default_pool, size)
#define t_bloomfilter_create(size) \
p_bloomfilter_create(pool_datastack_create(), size)

/* Reference counting. bloomfilter_unref() sets *_bf to NULL, accepts
   *_bf == NULL as a no-op, and frees the filter when the last reference
   is dropped. */
void bloomfilter_ref(struct bloomfilter *bf);
void bloomfilter_unref(struct bloomfilter **_bf);

/* Returns estimated number of items in this filter. This is the number
   of insertions performed, so re-adding the same element is counted
   again. */
size_t bloomfilter_estimated_item_count(struct bloomfilter *bf);

/* Returns TRUE if the element is probably in the filter, FALSE if it
   definitely is not. data may be NULL only if len is 0. */
bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);

/* Inserts element into filter. data may be NULL only if len is 0. */
void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);

/* Returns TRUE if the NUL-terminated string is probably in the filter.
   The terminating NUL is not part of the hashed data. */
static inline bool
bloomfilter_has_string(struct bloomfilter *bf, const char *data)
{
	size_t len = strlen(data);

	return bloomfilter_has_data(bf, data, len);
}

/* Inserts a NUL-terminated string into the filter. The terminating NUL
   is not part of the hashed data. */
static inline void
bloomfilter_set_string(struct bloomfilter *bf, const char *data)
{
	size_t len = strlen(data);

	bloomfilter_set_data(bf, data, len);
}

/* Inserts every string of a NULL-terminated string array into the
   filter, each as a separate element. */
static inline void
bloomfilter_set_strings(struct bloomfilter *bf, const char *const *datum)
{
	for (; *datum != NULL; datum++) {
		const char *str = *datum;

		bloomfilter_set_data(bf, str, strlen(str));
	}
}

/* Returns TRUE if the used contents of the buffer are probably in the
   filter. */
static inline bool
bloomfilter_has_buffer(struct bloomfilter *bf, const buffer_t *data)
{
	const void *bytes = data->data;
	size_t len = data->used;

	return bloomfilter_has_data(bf, bytes, len);
}

/* Inserts the used contents of the buffer into the filter as one
   element. */
static inline void
bloomfilter_set_buffer(struct bloomfilter *bf, const buffer_t *data)
{
	const void *bytes = data->data;
	size_t len = data->used;

	bloomfilter_set_data(bf, bytes, len);
}

/* Returns TRUE if the signed integer is probably in the filter. The raw
   bytes of the intmax_t value are what gets hashed. */
static inline bool
bloomfilter_has_int(struct bloomfilter *bf, intmax_t value)
{
	const void *bytes = &value;

	return bloomfilter_has_data(bf, bytes, sizeof(intmax_t));
}

/* Inserts a signed integer into the filter. The raw bytes of the
   intmax_t value are what gets hashed. */
static inline void
bloomfilter_set_int(struct bloomfilter *bf, intmax_t value)
{
	const void *bytes = &value;

	bloomfilter_set_data(bf, bytes, sizeof(intmax_t));
}

/* Returns TRUE if the unsigned integer is probably in the filter. The
   raw bytes of the uintmax_t value are what gets hashed. */
static inline bool
bloomfilter_has_uint(struct bloomfilter *bf, uintmax_t value)
{
	const void *bytes = &value;

	return bloomfilter_has_data(bf, bytes, sizeof(uintmax_t));
}

/* Inserts an unsigned integer into the filter. The raw bytes of the
   uintmax_t value are what gets hashed. */
static inline void
bloomfilter_set_uint(struct bloomfilter *bf, uintmax_t value)
{
	const void *bytes = &value;

	bloomfilter_set_data(bf, bytes, sizeof(uintmax_t));
}

/* Hash functions provided for use with bloomfilter_create(). Both reduce
   a 128 bit digest of the data to a size_t. murmur3 feeds the seed into
   the hash itself; md5 mixes the seed into the result afterwards. */
size_t
bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE;
size_t
bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE;

/* By default, only murmur3 is used. The list is NULL-terminated. */
extern bloomfilter_hash_func_t *const bloomfilter_default_functions[];

#endif
32 changes: 32 additions & 0 deletions src/lib/test-bloomfilter.c
@@ -0,0 +1,32 @@
#include "test-lib.h"
#include "randgen.h"
#include "bloomfilter.h"

/* Basic sanity test for the bloom filter: everything that was inserted
   must be reported as present (no false negatives), lookups of missing
   items must not crash, and the insertion counter must match the number
   of inserted items. */
void test_bloomfilter(void)
{
test_begin("bloomfilter");
/* 18 bit filter with the default hash functions */
struct bloomfilter *bf = i_bloomfilter_create(18);
const char *const strings[] = {
"correct", "horse", "battery", "staple", NULL
};

/* set some items */
bloomfilter_set_strings(bf, strings);
bloomfilter_set_int(bf, 500);

/* make sure they exist */
for(unsigned int i = 0; strings[i] != NULL; i++) {
test_assert(bloomfilter_has_string(bf, strings[i]));
}

test_assert(bloomfilter_has_int(bf, 500));

/* make sure nothing bad happens with non-existing items; the result is
   ignored because a false positive is a legitimate answer */
(void)bloomfilter_has_string(bf, "hello, world");

/* four strings + one integer were inserted */
test_assert(bloomfilter_estimated_item_count(bf) == 5);

bloomfilter_unref(&bf);

test_end();
}
1 change: 1 addition & 0 deletions src/lib/test-lib.inc
Expand Up @@ -8,6 +8,7 @@ FATAL(fatal_array)
TEST(test_base32)
TEST(test_base64)
TEST(test_bits)
TEST(test_bloomfilter)
TEST(test_bsearch_insert_pos)
TEST(test_buffer)
TEST(test_byteorder)
Expand Down

0 comments on commit 44d29bb

Please sign in to comment.