Skip to content

Commit

Permalink
Merge pull request #14 from jbapple-cloudera/hash-functor
Browse files Browse the repository at this point in the history
Parameterize cuckoo filter on hash family.
  • Loading branch information
apc999 committed Jun 27, 2017
2 parents 357ef73 + 0448604 commit 3785fab
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 52 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CC = g++
#OPT = -O3 -DNDEBUG
OPT = -g -ggdb

CFLAGS += -fno-strict-aliasing -Wall -c -I. -I./include -I/usr/include/ -I./src/ $(OPT)
CFLAGS += --std=c++11 -fno-strict-aliasing -Wall -c -I. -I./include -I/usr/include/ -I./src/ $(OPT)

LDFLAGS+= -Wall -lpthread -lssl -lcrypto

Expand Down
6 changes: 3 additions & 3 deletions benchmarks/bulk-insert-and-query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ struct FilterAPI<CuckooFilter<ItemType, bits_per_item, TableType>> {
};

template <>
struct FilterAPI<SimdBlockFilter> {
using Table = SimdBlockFilter;
struct FilterAPI<SimdBlockFilter<>> {
using Table = SimdBlockFilter<>;
static Table ConstructFromAddCount(size_t add_count) {
Table ans(ceil(log2(add_count * 8.0 / CHAR_BIT)));
return ans;
Expand Down Expand Up @@ -248,7 +248,7 @@ int main(int argc, char * argv[]) {

cout << setw(NAME_WIDTH) << "SemiSort17" << cf << endl;

cf = FilterBenchmark<SimdBlockFilter>(add_count, to_add, to_lookup);
cf = FilterBenchmark<SimdBlockFilter<>>(add_count, to_add, to_lookup);

cout << setw(NAME_WIDTH) << "SimdBlock8" << cf << endl;

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ ::std::vector<::std::uint64_t> GenerateRandom64(::std::size_t count) {
template <typename T>
::std::vector<T> MixIn(const T* x_begin, const T* x_end, const T* y_begin, const T* y_end,
double y_probability) {
const auto x_size = x_end - x_begin, y_size = y_end - y_begin;
const size_t x_size = x_end - x_begin, y_size = y_end - y_begin;
if (y_size > (1ull << 32)) throw ::std::length_error("y is too long");
::std::vector<T> result(x_begin, x_end);
::std::random_device random;
Expand Down
40 changes: 19 additions & 21 deletions src/cuckoofilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ const size_t kMaxCuckooCount = 500;
// TableType: the storage of table, SingleTable by default, and
// PackedTable to enable semi-sorting
template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType = SingleTable>
template <size_t> class TableType = SingleTable,
typename HashFamily = TwoIndependentMultiplyShift>
class CuckooFilter {
// Storage of items
TableType<bits_per_item> *table_;
Expand All @@ -45,6 +46,8 @@ class CuckooFilter {

VictimCache victim_;

HashFamily hasher_;

inline size_t IndexHash(uint32_t hv) const {
// table_->num_buckets is always a power of two, so modulo can be replaced
// with
Expand All @@ -59,9 +62,9 @@ class CuckooFilter {
return tag;
}

inline void GenerateIndexTagHash(const ItemType &item, size_t *index,
uint32_t *tag) const {
const uint64_t hash = HashUtil::TwoIndependentMultiplyShift(item);
inline void GenerateIndexTagHash(const ItemType& item, size_t* index,
uint32_t* tag) const {
const uint64_t hash = hasher_(item);
*index = IndexHash(hash >> 32);
*tag = TagHash(hash);
}
Expand All @@ -82,7 +85,7 @@ class CuckooFilter {
double BitsPerItem() const { return 8.0 * table_->SizeInBytes() / Size(); }

public:
explicit CuckooFilter(const size_t max_num_keys) : num_items_(0), victim_() {
explicit CuckooFilter(const size_t max_num_keys) : num_items_(0), victim_(), hasher_() {
size_t assoc = 4;
size_t num_buckets = upperpower2(max_num_keys / assoc);
double frac = (double)max_num_keys / num_buckets / assoc;
Expand Down Expand Up @@ -116,8 +119,8 @@ class CuckooFilter {
};

template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType>
Status CuckooFilter<ItemType, bits_per_item, TableType>::Add(
template <size_t> class TableType, typename HashFamily>
Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Add(
const ItemType &item) {
size_t i;
uint32_t tag;
Expand All @@ -131,8 +134,8 @@ Status CuckooFilter<ItemType, bits_per_item, TableType>::Add(
}

template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType>
Status CuckooFilter<ItemType, bits_per_item, TableType>::AddImpl(
template <size_t> class TableType, typename HashFamily>
Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::AddImpl(
const size_t i, const uint32_t tag) {
size_t curindex = i;
uint32_t curtag = tag;
Expand All @@ -158,8 +161,8 @@ Status CuckooFilter<ItemType, bits_per_item, TableType>::AddImpl(
}

template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType>
Status CuckooFilter<ItemType, bits_per_item, TableType>::Contain(
template <size_t> class TableType, typename HashFamily>
Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Contain(
const ItemType &key) const {
bool found = false;
size_t i1, i2;
Expand All @@ -181,8 +184,8 @@ Status CuckooFilter<ItemType, bits_per_item, TableType>::Contain(
}

template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType>
Status CuckooFilter<ItemType, bits_per_item, TableType>::Delete(
template <size_t> class TableType, typename HashFamily>
Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Delete(
const ItemType &key) {
size_t i1, i2;
uint32_t tag;
Expand Down Expand Up @@ -215,15 +218,10 @@ Status CuckooFilter<ItemType, bits_per_item, TableType>::Delete(
}

template <typename ItemType, size_t bits_per_item,
template <size_t> class TableType>
std::string CuckooFilter<ItemType, bits_per_item, TableType>::Info() const {
template <size_t> class TableType, typename HashFamily>
std::string CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Info() const {
std::stringstream ss;
ss << "CuckooFilter Status:\n"
#ifdef QUICK_N_DIRTY_HASHING
<< "\t\tQuick hashing used\n"
#else
<< "\t\tBob hashing used\n"
#endif
<< "\t\t" << table_->Info() << "\n"
<< "\t\tKeys stored: " << Size() << "\n"
<< "\t\tLoad factor: " << LoadFactor() << "\n"
Expand All @@ -236,4 +234,4 @@ std::string CuckooFilter<ItemType, bits_per_item, TableType>::Info() const {
return ss.str();
}
} // namespace cuckoofilter
#endif // CUCKOO_FILTER_CUCKOO_FILTER_H_
#endif // CUCKOO_FILTER_CUCKOO_FILTER_H_
62 changes: 47 additions & 15 deletions src/hashutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <string>

#include <openssl/evp.h>
#include <random>

namespace cuckoofilter {

Expand All @@ -19,8 +20,7 @@ class HashUtil {

// Bob Jenkins Hash that returns two indices in one call
// Useful for Cuckoo hashing, power of two choices, etc.
// Use idx1 before idx2, when possible. idx1 and idx2 should be initialized to
// seeds.
// Use idx1 before idx2, when possible. idx1 and idx2 should be initialized to seeds.
static void BobHash(const void *buf, size_t length, uint32_t *idx1,
uint32_t *idx2);
static void BobHash(const std::string &s, uint32_t *idx1, uint32_t *idx2);
Expand All @@ -40,22 +40,54 @@ class HashUtil {
static std::string MD5Hash(const char *inbuf, size_t in_length);
static std::string SHA1Hash(const char *inbuf, size_t in_length);

// See Martin Dietzfelbinger, "Universal hashing and k-wise independent random
// variables via integer arithmetic without primes".
static uint64_t TwoIndependentMultiplyShift(uint64_t key) {
const uint64_t SEED[4] = {0x818c3f78ull, 0x672f4a3aull, 0xabd04d69ull,
0x12b51f95ull};
const unsigned __int128 m =
*reinterpret_cast<const unsigned __int128 *>(&SEED[0]);
const unsigned __int128 a =
*reinterpret_cast<const unsigned __int128 *>(&SEED[2]);
return (a + m * key) >> 64;
}

private:
HashUtil();
};

} // namespace cuckoofilter
// See Martin Dietzfelbinger, "Universal hashing and k-wise independent random
// variables via integer arithmetic without primes".
//
// Hash functor mapping a 64-bit key to a 64-bit hash: the upper 64 bits of
// (add_ + multiply_ * key), evaluated in 128-bit unsigned arithmetic. Both
// parameters are drawn from ::std::random_device at construction, so every
// default-constructed instance is a different function; copies of an instance
// hash identically.
class TwoIndependentMultiplyShift {
  unsigned __int128 multiply_, add_;

  // Builds one 128-bit value out of exactly four draws from `random`, shifting
  // in 32 bits per draw (assumes each draw yields a 32-bit word). A fifth draw
  // would be shifted out of the 128-bit value entirely, so none is taken.
  static unsigned __int128 Random128(::std::random_device &random) {
    unsigned __int128 result = 0;
    for (int i = 0; i < 4; ++i) {
      result = (result << 32) | random();
    }
    return result;
  }

 public:
  // Picks a fresh random multiplier and addend (multiplier drawn first).
  TwoIndependentMultiplyShift() {
    ::std::random_device random;
    multiply_ = Random128(random);
    add_ = Random128(random);
  }

  // Returns the hash of `key`. The cast widens `key` so the product is formed
  // in 128 bits before the top half is extracted.
  uint64_t operator()(uint64_t key) const {
    return (add_ + multiply_ * static_cast<decltype(multiply_)>(key)) >> 64;
  }
};

// See Patrascu and Thorup's "The Power of Simple Tabulation Hashing"
//
// Hash functor over 64-bit keys: the key is viewed as sizeof(uint64_t) bytes,
// each byte selects one entry from the table for its position, and the
// selected entries are XORed together.
class SimpleTabulation {
  // One table per byte position of the key; each table holds one 64-bit entry
  // for every possible byte value.
  uint64_t tables_[sizeof(uint64_t)][1 << CHAR_BIT];

 public:
  // Fills every table entry with 64 random bits, assembled from two draws of
  // ::std::random_device.
  SimpleTabulation() {
    ::std::random_device entropy;
    for (auto &table : tables_) {
      for (uint64_t &entry : table) {
        const uint64_t first = entropy();
        const uint64_t second = entropy();
        entry = first | (second << 32);
      }
    }
  }

  // Returns the XOR of the table entries selected by the bytes of `key`, taken
  // in the key's in-memory byte order.
  uint64_t operator()(uint64_t key) const {
    const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(&key);
    uint64_t hash = 0;
    unsigned position = 0;
    for (const auto &table : tables_) {
      hash ^= table[bytes[position]];
      ++position;
    }
    return hash;
  }
};
}

#endif // CUCKOO_FILTER_HASHUTIL_H_
33 changes: 22 additions & 11 deletions src/simd-block.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
using uint32_t = ::std::uint32_t;
using uint64_t = ::std::uint64_t;

template<typename HashFamily = ::cuckoofilter::TwoIndependentMultiplyShift>
class SimdBlockFilter {
private:
// The filter is divided up into Buckets:
Expand All @@ -46,13 +47,16 @@ class SimdBlockFilter {

Bucket* directory_;

HashFamily hasher_;

public:
// Consumes at most (1 << log_heap_space) bytes on the heap:
explicit SimdBlockFilter(const int log_heap_space);
// Move constructor: takes over `that`'s bucket directory and copies its
// sizing fields and hasher.
SimdBlockFilter(SimdBlockFilter&& that)
: log_num_buckets_(that.log_num_buckets_),
directory_mask_(that.directory_mask_),
directory_(that.directory_),
hasher_(that.hasher_) {
// The destructor free()s directory_, so the moved-from filter must give up
// its pointer; otherwise both objects would free the same allocation.
that.directory_ = nullptr;
}
~SimdBlockFilter() noexcept;
void Add(const uint64_t key) noexcept;
bool Find(const uint64_t key) const noexcept;
Expand All @@ -67,14 +71,16 @@ class SimdBlockFilter {
void operator=(const SimdBlockFilter&) = delete;
};

SimdBlockFilter::SimdBlockFilter(const int log_heap_space)
template<typename HashFamily>
SimdBlockFilter<HashFamily>::SimdBlockFilter(const int log_heap_space)
: // Since log_heap_space is in bytes, we need to convert it to the number of Buckets
// we will use.
log_num_buckets_(::std::max(1, log_heap_space - LOG_BUCKET_BYTE_SIZE)),
// Don't use log_num_buckets_ if it will lead to undefined behavior by a shift that is
// too large.
directory_mask_((1ull << ::std::min(63, log_num_buckets_)) - 1),
directory_(nullptr) {
directory_(nullptr),
hasher_() {
if (!__builtin_cpu_supports("avx2")) {
throw ::std::runtime_error("SimdBlockFilter does not work without AVX2 instructions");
}
Expand All @@ -85,15 +91,17 @@ SimdBlockFilter::SimdBlockFilter(const int log_heap_space)
memset(directory_, 0, alloc_size);
}

SimdBlockFilter::~SimdBlockFilter() noexcept {
template<typename HashFamily>
SimdBlockFilter<HashFamily>::~SimdBlockFilter() noexcept {
free(directory_);
directory_ = nullptr;
}

// The SIMD reinterpret_casts technically violate C++'s strict aliasing rules. However, we
// compile with -fno-strict-aliasing.
[[gnu::always_inline]] inline __m256i SimdBlockFilter::MakeMask(
const uint32_t hash) noexcept {
template <typename HashFamily>
[[gnu::always_inline]] inline __m256i
SimdBlockFilter<HashFamily>::MakeMask(const uint32_t hash) noexcept {
const __m256i ones = _mm256_set1_epi32(1);
// Odd constants for hashing:
const __m256i rehash = _mm256_setr_epi32(0x47b6137bU, 0x44974d91U, 0x8824ad5bU,
Expand All @@ -108,17 +116,20 @@ SimdBlockFilter::~SimdBlockFilter() noexcept {
return _mm256_sllv_epi32(ones, hash_data);
}

[[gnu::always_inline]] inline void SimdBlockFilter::Add(const uint64_t key) noexcept {
const auto hash = ::cuckoofilter::HashUtil::TwoIndependentMultiplyShift(key);
template <typename HashFamily>
[[gnu::always_inline]] inline void
SimdBlockFilter<HashFamily>::Add(const uint64_t key) noexcept {
const auto hash = hasher_(key);
const uint32_t bucket_idx = hash & directory_mask_;
const __m256i mask = MakeMask(hash >> log_num_buckets_);
__m256i* const bucket = &reinterpret_cast<__m256i*>(directory_)[bucket_idx];
_mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
}

[[gnu::always_inline]] inline bool SimdBlockFilter::Find(const uint64_t key) const
noexcept {
const auto hash = ::cuckoofilter::HashUtil::TwoIndependentMultiplyShift(key);
template <typename HashFamily>
[[gnu::always_inline]] inline bool
SimdBlockFilter<HashFamily>::Find(const uint64_t key) const noexcept {
const auto hash = hasher_(key);
const uint32_t bucket_idx = hash & directory_mask_;
const __m256i mask = MakeMask(hash >> log_num_buckets_);
const __m256i bucket = reinterpret_cast<__m256i*>(directory_)[bucket_idx];
Expand Down

0 comments on commit 3785fab

Please sign in to comment.