Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
293 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
#include "lib.h" | ||
#include "bloomfilter.h" | ||
#include "murmurhash3.h" | ||
#include "md5.h" | ||
#include "randgen.h" | ||
|
||
#include <math.h> | ||
|
||
struct bloomfilter { | ||
pool_t pool; | ||
int refcount; | ||
size_t size; | ||
size_t total_added; | ||
unsigned int nk; | ||
uint32_t seed; | ||
|
||
bloomfilter_hash_func_t *const *k; | ||
|
||
uint8_t *bitmap; | ||
}; | ||
|
||
/* Bit helpers for a byte-array bitmap: bit idx is stored in byte
   idx/CHAR_BIT at bit position idx%CHAR_BIT. Both macros evaluate idx
   twice, so idx must not have side effects. */
#define BITMAP_HAS_BIT(map, idx) \
	((((map)[(idx) / CHAR_BIT] >> ((idx) % CHAR_BIT)) & 0x1) != 0)
#define BITMAP_SET_BIT(map, idx) \
	((map)[(idx) / CHAR_BIT] |= (0x1 << ((idx) % CHAR_BIT)))
/* Number of digest bytes consumed by bloomfilter_hash_fold(). */
#define BLOOMFILTER_HASH_BYTES 16
|
||
/* use only murmurhash3 by default */ | ||
bloomfilter_hash_func_t *const bloomfilter_default_functions[] = { | ||
bloomfilter_murmur3_hash, | ||
NULL | ||
}; | ||
|
||
static inline size_t | ||
bloomfilter_hash_fold(unsigned char result[STATIC_ARRAY BLOOMFILTER_HASH_BYTES], | ||
uint32_t seed) | ||
{ | ||
#ifdef _LP64 | ||
/* rolls 128 bit result into a 64 bit result by xoring the first 64 bits | ||
and seed, and remaining 64 bits. */ | ||
return be64_to_cpu_unaligned(&result[0]) ^ | ||
be64_to_cpu_unaligned(&result[8]) ^ | ||
(((size_t)seed) << 32); | ||
#else | ||
/* rolls 128 bit result into a 32 bit result by folding | ||
all the successive 32 bit values into one together with seed. */ | ||
return be32_to_cpu_unaligned(&result[0]) ^ | ||
be32_to_cpu_unaligned(&result[4]) ^ | ||
be32_to_cpu_unaligned(&result[8]) ^ | ||
be32_to_cpu_unaligned(&result[12]) ^ | ||
seed; | ||
#endif | ||
} | ||
|
||
size_t bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed) | ||
{ | ||
unsigned char result[MURMURHASH3_128_RESULTBYTES]; | ||
murmurhash3_128(data, len, seed, result); | ||
/* murmur includes seed already */ | ||
return bloomfilter_hash_fold(result, 0); | ||
} | ||
|
||
size_t bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed) | ||
{ | ||
unsigned char result[MD5_RESULTLEN]; | ||
md5_get_digest(data, len, result); | ||
return bloomfilter_hash_fold(result, seed); | ||
} | ||
|
||
struct bloomfilter * | ||
bloomfilter_create(pool_t pool, size_t size, | ||
bloomfilter_hash_func_t *const *hash_functions) | ||
{ | ||
struct bloomfilter *bf = p_new(pool, struct bloomfilter, 1); | ||
i_assert(size > 0); | ||
bf->pool = pool; | ||
/* allocate extra byte to round up result */ | ||
bf->bitmap = p_malloc(pool, size/CHAR_BIT + 1); | ||
bf->k = hash_functions; | ||
bf->size = size; | ||
while(*hash_functions != NULL) { | ||
bf->nk++; | ||
hash_functions++; | ||
} | ||
i_assert(bf->nk > 0); | ||
random_fill(&bf->seed, sizeof(bf->seed)); | ||
bf->refcount = 1; | ||
return bf; | ||
} | ||
|
||
void bloomfilter_ref(struct bloomfilter *bf) | ||
{ | ||
i_assert(bf->refcount > 0); | ||
bf->refcount++; | ||
} | ||
|
||
void bloomfilter_unref(struct bloomfilter **_bf) | ||
{ | ||
struct bloomfilter *bf = *_bf; | ||
if (*_bf == NULL) | ||
return; | ||
*_bf = NULL; | ||
i_assert(bf->refcount > 0); | ||
|
||
if (--bf->refcount > 0) | ||
return; | ||
/* in case system pool was used .. */ | ||
p_free(bf->pool, bf->bitmap); | ||
p_free(bf->pool, bf); | ||
} | ||
|
||
size_t bloomfilter_estimated_item_count(struct bloomfilter *bf) | ||
{ | ||
return bf->total_added; | ||
} | ||
|
||
bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len) | ||
{ | ||
i_assert(data != NULL || len == 0); | ||
bloomfilter_hash_func_t *const *k = bf->k; | ||
for(;*k != NULL; k++) { | ||
size_t result; | ||
result = (*k)(data, len, bf->seed) % bf->size; | ||
if (!BITMAP_HAS_BIT(bf->bitmap, result)) | ||
return FALSE; | ||
} | ||
return TRUE; | ||
} | ||
|
||
void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len) | ||
{ | ||
i_assert(data != NULL || len == 0); | ||
bloomfilter_hash_func_t *const *k = bf->k; | ||
/* total added will cap at size_t, because it's an estimate */ | ||
if (bf->total_added < (size_t)-1) | ||
bf->total_added++; | ||
for(;*k != NULL; k++) { | ||
size_t result; | ||
result = (*k)(data, len, bf->seed) % bf->size; | ||
BITMAP_SET_BIT(bf->bitmap, result); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#ifndef BLOOMFILTER_H | ||
#define BLOOMFILTER_H | ||
|
||
#include "buffer.h" | ||
|
||
/* Short explanation of bloom filter:
Bloom filter is a space-efficient probabilistic filter. The idea is
that each element that gets added is hashed through one or more hashing
functions, and for each function the bit at index (hash modulo table
size) is set.
When checking whether an element is present, the same bits are looked
up: one per hashing function, at index (hash modulo table size). If any
of them is not set, the element is definitely missing. If all of them
are set, the element is probably present.
A bloom filter will never report a false negative, but it might
report a false positive.
Elements cannot be removed from this bloom filter.
*/
|
||
struct bloomfilter; | ||
|
||
/* Hash callback type: maps len bytes at data, combined with seed, to a
   size_t hash value. */
typedef size_t
bloomfilter_hash_func_t(const void *data, size_t len, uint32_t seed);
|
||
/* create bloomfilter of size with hash functions */ | ||
struct bloomfilter * | ||
bloomfilter_create(pool_t pool, size_t size, | ||
bloomfilter_hash_func_t *const *hash_functions) ATTR_RETURNS_NONNULL; | ||
|
||
/* Convenience constructors; all of them use bloomfilter_default_functions. */

/* filter allocated from the given pool */
#define p_bloomfilter_create(pool, size) \
	bloomfilter_create(pool, size, bloomfilter_default_functions)
/* filter allocated from default_pool */
#define i_bloomfilter_create(size) \
	p_bloomfilter_create(default_pool, size)
/* filter allocated from pool_datastack_create() */
#define t_bloomfilter_create(size) \
	p_bloomfilter_create(pool_datastack_create(), size)
|
||
/* Reference counting. bloomfilter_unref() sets *_bf to NULL and frees the
   filter when the last reference is dropped; it does nothing when *_bf
   is already NULL. */
void bloomfilter_ref(struct bloomfilter *bf);
void bloomfilter_unref(struct bloomfilter **_bf);
|
||
/* Returns the estimated number of items in this filter. The estimate is
   the number of insertions made, so re-adding an existing item counts
   again. */
size_t bloomfilter_estimated_item_count(struct bloomfilter *bf);
|
||
/* Returns TRUE if the element is probably in the filter */ | ||
bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2); | ||
|
||
/* Inserts element into filter */ | ||
void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2); | ||
|
||
/* Returns TRUE if the NUL-terminated string is probably in the filter.
   The terminating NUL is not part of the hashed data. */
static inline bool
bloomfilter_has_string(struct bloomfilter *bf, const char *data)
{
	size_t len = strlen(data);

	return bloomfilter_has_data(bf, data, len);
}
|
||
/* Inserts the NUL-terminated string into the filter. The terminating NUL
   is not part of the hashed data. */
static inline void
bloomfilter_set_string(struct bloomfilter *bf, const char *data)
{
	size_t len = strlen(data);

	bloomfilter_set_data(bf, data, len);
}
|
||
/* Inserts every string of the NULL-terminated datum array into the
   filter, one element per string. */
static inline void
bloomfilter_set_strings(struct bloomfilter *bf, const char *const *datum)
{
	for (; *datum != NULL; datum++)
		bloomfilter_set_data(bf, *datum, strlen(*datum));
}
|
||
static inline bool | ||
bloomfilter_has_buffer(struct bloomfilter *bf, const buffer_t *data) | ||
{ | ||
return bloomfilter_has_data(bf, data->data, data->used); | ||
} | ||
|
||
static inline void | ||
bloomfilter_set_buffer(struct bloomfilter *bf, const buffer_t *data) | ||
{ | ||
bloomfilter_set_data(bf, data->data, data->used); | ||
} | ||
|
||
/* Returns TRUE if the signed integer is probably in the filter. The
   hashed data is the in-memory representation of the intmax_t value. */
static inline bool
bloomfilter_has_int(struct bloomfilter *bf, intmax_t value)
{
	const void *raw = &value;

	return bloomfilter_has_data(bf, raw, sizeof(value));
}
|
||
/* Inserts the signed integer into the filter. The hashed data is the
   in-memory representation of the intmax_t value. */
static inline void
bloomfilter_set_int(struct bloomfilter *bf, intmax_t value)
{
	const void *raw = &value;

	bloomfilter_set_data(bf, raw, sizeof(value));
}
|
||
/* Returns TRUE if the unsigned integer is probably in the filter. The
   hashed data is the in-memory representation of the uintmax_t value. */
static inline bool
bloomfilter_has_uint(struct bloomfilter *bf, uintmax_t value)
{
	const void *raw = &value;

	return bloomfilter_has_data(bf, raw, sizeof(value));
}
|
||
/* Inserts the unsigned integer into the filter. The hashed data is the
   in-memory representation of the uintmax_t value. */
static inline void
bloomfilter_set_uint(struct bloomfilter *bf, uintmax_t value)
{
	const void *raw = &value;

	bloomfilter_set_data(bf, raw, sizeof(value));
}
|
||
size_t | ||
bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE; | ||
size_t | ||
bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE; | ||
|
||
/* By default, only murmur3 is used. */ | ||
extern bloomfilter_hash_func_t *const bloomfilter_default_functions[]; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#include "test-lib.h" | ||
#include "randgen.h" | ||
#include "bloomfilter.h" | ||
|
||
/* Unit test: items added to a bloom filter must be reported as present,
   a lookup of a missing item must be harmless, and the item counter
   must match the number of insertions. */
void test_bloomfilter(void)
{
	const char *const strings[] = {
		"correct", "horse", "battery", "staple", NULL
	};
	struct bloomfilter *bf;
	unsigned int i;

	test_begin("bloomfilter");
	bf = i_bloomfilter_create(18);

	/* insert four strings and one integer */
	bloomfilter_set_strings(bf, strings);
	bloomfilter_set_int(bf, 500);

	/* a bloom filter has no false negatives, so all of them must be
	   found */
	i = 0;
	while (strings[i] != NULL) {
		test_assert(bloomfilter_has_string(bf, strings[i]));
		i++;
	}

	test_assert(bloomfilter_has_int(bf, 500));

	/* the result for a missing item may be a false positive, so it is
	   ignored; the lookup just has to be safe */
	(void)bloomfilter_has_string(bf, "hello, world");

	/* 4 strings + 1 integer */
	test_assert(bloomfilter_estimated_item_count(bf) == 5);

	bloomfilter_unref(&bf);

	test_end();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters