Permalink
Browse files

IMPALA-2270: avoid FnvHash64to32 with empty inputs

FnvHash64to32 produces pathologically bad results when hashing zero-byte
input: it always returns 0 regardless of the input hash seed. This
is a result of it xoring the 32-bit hash seed with itself. This patch
adds a DCHECK to this function to verify that this function is not
invoked with zero-byte inputs, and updates all callsites to check for
the zero-length case.

This patch also improves hashing of booleans: false and NULL no longer
hash to the same value.

Change-Id: I6706f6ea167e5362d55351f7cc0c637c680a315d
Reviewed-on: http://gerrit.cloudera.org:8080/720
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins
  • Loading branch information...
timarmstrong authored and Internal Jenkins committed Aug 31, 2015
1 parent 3a34b95 commit 8ed5130e1d83243491ad4d787921495fa17f864f
@@ -44,9 +44,11 @@ using namespace llvm;
//
// The different hash functions benchmarked:
// 1. FNV Hash: Fowler-Noll-Vo hash function
-// 2. Boost Hash: boost hash function
-// 3. Crc: hash using sse4 crc hash instruction
-// 4. Codegen: hash using sse4 with the tuple types baked into the codegen function
+// 2. FNV Hash with empty string handling: FNV with special-case for empty string
+// 3. Murmur2_64 Hash: Murmur2 hash function
+// 4. Boost Hash: boost hash function
+// 5. Crc: hash using sse4 crc hash instruction
+// 6. Codegen: hash using sse4 with the tuple types baked into the codegen function
//
// n is the number of buckets, k is the number of items
// Expected(collisions) = n - k + E(X)
@@ -56,20 +58,23 @@ using namespace llvm;
// = n / e
// = 367
-// Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
+// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
// Int Hash: Function Rate (iters/ms) Comparison
// ----------------------------------------------------------------------
-// Fnv 76.46 1X
-// Boost 203.7 2.665X
-// Crc 305.3 3.993X
-// Codegen 703.4 9.199X
-
+// Fnv 114.6 1X
+// Murmur2_64 129.9 1.134X
+// Boost 294.1 2.567X
+// Crc 536.2 4.68X
+// Codegen 1413 12.33X
+//
// Mixed Hash: Function Rate (iters/ms) Comparison
// ----------------------------------------------------------------------
-// Fnv 55.51 1X
-// Boost 82.86 1.493X
-// Crc 215.1 3.874X
-// Codegen 219.8 3.959X
+// Fnv 90.88 1X
+// FnvEmpty 91.58 1.008X
+// Murmur2_64 124.9 1.374X
+// Boost 133.5 1.469X
+// Crc 435.8 4.795X
+// Codegen 379.3 4.174X
typedef uint32_t (*CodegenHashFn)(int rows, char* data, int32_t* results);
@@ -143,7 +148,25 @@ void TestCodegenIntHash(int batch, void* d) {
}
}
-void TestFnvMixedHash(int batch, void* d) {
+void TestMurmur2_64IntHash(int batch, void* d) {
+ TestData* data = reinterpret_cast<TestData*>(d);
+ int rows = data->num_rows;
+ int cols = data->num_cols;
+ for (int i = 0; i < batch; ++i) {
+ char* values = reinterpret_cast<char*>(data->data);
+ for (int j = 0; j < rows; ++j) {
+ uint64_t hash = 0;
+ for (int k = 0; k < cols; ++k) {
+ hash = HashUtil::MurmurHash2_64(&values[k], sizeof(uint32_t), hash);
+ }
+ data->results[j] = hash;
+ values += cols;
+ }
+ }
+}
+
+template <bool handle_empty>
+void TestFnvMixedHashTemplate(int batch, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
int rows = data->num_rows;
for (int i = 0; i < batch; ++i) {
@@ -161,14 +184,26 @@ void TestFnvMixedHash(int batch, void* d) {
values += sizeof(int64_t);
StringValue* str = reinterpret_cast<StringValue*>(values);
- hash = HashUtil::FnvHash64to32(str->ptr, str->len, hash);
+ if (handle_empty && str->len == 0) {
+ hash = HashUtil::HashCombine32(0, hash);
+ } else {
+ hash = HashUtil::FnvHash64to32(str->ptr, str->len, hash);
+ }
values += sizeof(StringValue);
data->results[j] = hash;
}
}
}
+void TestFnvMixedHash(int batch, void* d) {
+ return TestFnvMixedHashTemplate<false>(batch, d);
+}
+
+void TestFnvEmptyMixedHash(int batch, void* d) {
+ return TestFnvMixedHashTemplate<true>(batch, d);
+}
+
void TestCrcMixedHash(int batch, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
int rows = data->num_rows;
@@ -235,6 +270,32 @@ void TestBoostMixedHash(int batch, void* d) {
}
}
+void TestMurmur2_64MixedHash(int batch, void* d) {
+ TestData* data = reinterpret_cast<TestData*>(d);
+ int rows = data->num_rows;
+ for (int i = 0; i < batch; ++i) {
+ char* values = reinterpret_cast<char*>(data->data);
+ for (int j = 0; j < rows; ++j) {
+ uint64_t hash = 0;
+
+ hash = HashUtil::MurmurHash2_64(values, sizeof(int8_t), hash);
+ values += sizeof(int8_t);
+
+ hash = HashUtil::MurmurHash2_64(values, sizeof(int32_t), hash);
+ values += sizeof(int32_t);
+
+ hash = HashUtil::MurmurHash2_64(values, sizeof(int64_t), hash);
+ values += sizeof(int64_t);
+
+ StringValue* str = reinterpret_cast<StringValue*>(values);
+ hash = HashUtil::MurmurHash2_64(str->ptr, str->len, hash);
+ values += sizeof(StringValue);
+
+ data->results[j] = hash;
+ }
+ }
+}
+
int NumCollisions(TestData* data, int num_buckets) {
vector<bool> buckets;
buckets.resize(num_buckets);
@@ -426,13 +487,16 @@ int main(int argc, char **argv) {
Benchmark int_suite("Int Hash");
int_suite.AddBenchmark("Fnv", TestFnvIntHash, &int_data);
+ int_suite.AddBenchmark("Murmur2_64", TestMurmur2_64IntHash, &int_data);
int_suite.AddBenchmark("Boost", TestBoostIntHash, &int_data);
int_suite.AddBenchmark("Crc", TestCrcIntHash, &int_data);
int_suite.AddBenchmark("Codegen", TestCodegenIntHash, &int_data);
cout << int_suite.Measure() << endl;
Benchmark mixed_suite("Mixed Hash");
mixed_suite.AddBenchmark("Fnv", TestFnvMixedHash, &mixed_data);
+ mixed_suite.AddBenchmark("FnvEmpty", TestFnvEmptyMixedHash, &mixed_data);
+ mixed_suite.AddBenchmark("Murmur2_64", TestMurmur2_64MixedHash, &mixed_data);
mixed_suite.AddBenchmark("Boost", TestBoostMixedHash, &mixed_data);
mixed_suite.AddBenchmark("Crc", TestCrcMixedHash, &mixed_data);
mixed_suite.AddBenchmark("Codegen", TestCodegenMixedHash, &mixed_data);
@@ -454,6 +454,19 @@ TEST_F(HashTableTest, QuadraticInsertFullTest) {
InsertFullTest(true, 65536);
}
+// Test that hashing empty string updates hash value.
+TEST_F(HashTableTest, HashEmpty) {
+ HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 2, 1);
+ uint32_t seed = 9999;
+ ht_ctx.set_level(0);
+ EXPECT_NE(seed, ht_ctx.Hash(NULL, 0, seed));
+ // TODO: level 0 uses CRC hash, which only swaps bytes around on empty input.
+ // EXPECT_NE(seed, ht_ctx.Hash(NULL, 0, ht_ctx.Hash(NULL, 0, seed)));
+ ht_ctx.set_level(1);
+ EXPECT_NE(seed, ht_ctx.Hash(NULL, 0, seed));
+ EXPECT_NE(seed, ht_ctx.Hash(NULL, 0, ht_ctx.Hash(NULL, 0, seed)));
+}
+
}
int main(int argc, char** argv) {
@@ -161,6 +161,7 @@ uint32_t HashTableCtx::HashVariableLenRow() {
hash = Hash(loc, sizeof(StringValue), hash);
} else {
// Hash the string
+ // TODO: when using CRC hash on empty string, this only swaps bytes.
StringValue* str = reinterpret_cast<StringValue*>(loc);
hash = Hash(str->ptr, str->len, hash);
}
View
@@ -171,6 +171,7 @@ class HashTableCtx {
private:
friend class HashTable;
+ friend class HashTableTest_HashEmpty_Test;
/// Compute the hash of the values in expr_values_buffer_.
/// This will be replaced by codegen. We don't want this inlined for replacing
@@ -73,6 +73,52 @@ TEST_F(RawValueTest, TypeChar) {
EXPECT_EQ(memcmp(&val, s.data(), sizeof(int)), 0);
}
+// IMPALA-2270: "", false, and NULL should hash to distinct values.
+TEST_F(RawValueTest, HashEmptyAndNull) {
+ uint32_t seed = 12345;
+ uint32_t null_hash = RawValue::GetHashValue(NULL, TYPE_STRING, seed);
+ uint32_t null_hash_fnv = RawValue::GetHashValueFnv(NULL, TYPE_STRING, seed);
+ StringValue empty(NULL, 0);
+ uint32_t empty_hash = RawValue::GetHashValue(&empty, TYPE_STRING, seed);
+ uint32_t empty_hash_fnv = RawValue::GetHashValueFnv(&empty, TYPE_STRING, seed);
+ bool false_val = false;
+ uint32_t false_hash = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed);
+ uint32_t false_hash_fnv = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed);
+ EXPECT_NE(seed, null_hash);
+ EXPECT_NE(seed, empty_hash);
+ EXPECT_NE(seed, false_hash);
+ EXPECT_NE(seed, null_hash_fnv);
+ EXPECT_NE(seed, empty_hash_fnv);
+ EXPECT_NE(seed, false_hash_fnv);
+ EXPECT_NE(null_hash, empty_hash);
+ EXPECT_NE(null_hash_fnv, empty_hash_fnv);
+ EXPECT_NE(null_hash, false_hash);
+ EXPECT_NE(false_hash, null_hash_fnv);
+}
+
+/// IMPALA-2270: Test that FNV hash of (int, "") is not skewed.
+TEST(HashUtil, IntNullSkew) {
+ int num_values = 100000;
+ int num_buckets = 16;
+ int buckets[num_buckets];
+ memset(buckets, 0, sizeof(buckets));
+ for (int32_t i = 0; i < num_values; ++i) {
+ uint32_t hash = RawValue::GetHashValueFnv(&i, TYPE_INT, 9999);
+ StringValue empty(NULL, 0);
+ hash = RawValue::GetHashValueFnv(&empty, TYPE_STRING, hash);
+ ++buckets[hash % num_buckets];
+ }
+
+ for (int i = 0; i < num_buckets; ++i) {
+ LOG(INFO) << "Bucket " << i << ": " << buckets[i];
+ double exp_count = num_values / (double) num_buckets;
+ EXPECT_GT(buckets[i], 0.9 * exp_count) << "Bucket " << i
+ << " has <= 90%% of expected";
+ EXPECT_LT(buckets[i], 1.1 * exp_count) << "Bucket " << i
+ << " has >= 110%% of expected";
+ }
+}
+
}
int main(int argc, char **argv) {
View
@@ -90,17 +90,6 @@ class RawValue {
/// This is more performant than Compare() == 0 for string equality, mostly because of
/// the length comparison check.
static bool Eq(const void* v1, const void* v2, const ColumnType& type);
-
- private:
- /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
- static const uint32_t HASH32_COMBINE_SEED = 0x9e3779b9;
-
- /// Combine hashes 'value' and 'seed' to get a new hash value.
- /// Similar to boost::hash_combine(), but for uint32_t.
- /// Used for NULL and boolean inputs in GetHashValue().
- static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
- return seed ^ (HASH32_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
- }
};
inline bool RawValue::Eq(const void* v1, const void* v2, const ColumnType& type) {
@@ -163,20 +152,28 @@ inline bool RawValue::Eq(const void* v1, const void* v2, const ColumnType& type)
};
}
-/// Use boost::hash_combine for corner cases. boost::hash_combine is reimplemented
-/// here to use uint32_t's (instead of size_t)
+/// Arbitrary constants used to compute hash values for special cases. Constants were
+/// obtained by taking lower bytes of generated UUID. NULL and empty strings should
+/// hash to different values.
+static const uint32_t HASH_VAL_NULL = 0x58081667;
+static const uint32_t HASH_VAL_EMPTY = 0x7dca7eee;
+
inline uint32_t RawValue::GetHashValue(const void* v, const ColumnType& type,
uint32_t seed) {
- // Hash_combine with v = 0
- if (v == NULL) return HashCombine32(0, seed);
+ // Use HashCombine with arbitrary constant to ensure we don't return seed.
+ if (v == NULL) return HashUtil::HashCombine32(HASH_VAL_NULL, seed);
switch (type.type) {
case TYPE_STRING:
case TYPE_VARCHAR: {
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
+ if (string_value->len == 0) {
+ return HashUtil::HashCombine32(HASH_VAL_EMPTY, seed);
+ }
return HashUtil::Hash(string_value->ptr, string_value->len, seed);
}
- case TYPE_BOOLEAN: return HashCombine32(*reinterpret_cast<const bool*>(v), seed);
+ case TYPE_BOOLEAN:
+ return HashUtil::HashCombine32(*reinterpret_cast<const bool*>(v), seed);
case TYPE_TINYINT: return HashUtil::Hash(v, 1, seed);
case TYPE_SMALLINT: return HashUtil::Hash(v, 2, seed);
case TYPE_INT: return HashUtil::Hash(v, 4, seed);
@@ -195,16 +192,20 @@ inline uint32_t RawValue::GetHashValue(const void* v, const ColumnType& type,
inline uint32_t RawValue::GetHashValueFnv(const void* v, const ColumnType& type,
uint32_t seed) {
- // Hash_combine with v = 0
- if (v == NULL) return HashCombine32(0, seed);
+ // Use HashCombine with arbitrary constant to ensure we don't return seed.
+ if (v == NULL) return HashUtil::HashCombine32(HASH_VAL_NULL, seed);
switch (type.type ) {
case TYPE_STRING:
case TYPE_VARCHAR: {
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
+ if (string_value->len == 0) {
+ return HashUtil::HashCombine32(HASH_VAL_EMPTY, seed);
+ }
return HashUtil::FnvHash64to32(string_value->ptr, string_value->len, seed);
}
- case TYPE_BOOLEAN: return HashCombine32(*reinterpret_cast<const bool*>(v), seed);
+ case TYPE_BOOLEAN:
+ return HashUtil::HashCombine32(*reinterpret_cast<const bool*>(v), seed);
case TYPE_TINYINT: return HashUtil::FnvHash64to32(v, 1, seed);
case TYPE_SMALLINT: return HashUtil::FnvHash64to32(v, 2, seed);
case TYPE_INT: return HashUtil::FnvHash64to32(v, 4, seed);
View
@@ -122,7 +122,13 @@ class HashUtil {
/// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the
/// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs.
/// The input seed 'hash' is duplicated so the top half of the seed is not all zero.
+ /// Data length must be at least 1 byte: zero-length data should be handled separately,
+ /// for example using CombineHash with a unique constant value to avoid returning the
+ /// hash argument. Zero-length data gives terrible results: the initial hash value is
+ /// xored with itself cancelling all bits.
static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
+ // IMPALA-2270: this function should never be used for zero-byte inputs.
+ DCHECK_GT(bytes, 0);
uint64_t hash_u64 = hash | ((uint64_t)hash << 32);
hash_u64 = FnvHash64(data, bytes, hash_u64);
return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF);
@@ -140,6 +146,17 @@ class HashUtil {
}
}
+ /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
+ static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
+
+ /// Combine hashes 'value' and 'seed' to get a new hash value. Similar to
+ /// boost::hash_combine(), but for uint32_t. This function should be used with a
+ /// constant first argument to update the hash value for zero-length values such as
+ /// NULL, boolean, and empty strings.
+ static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
+ return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
+ }
+
};
}

0 comments on commit 8ed5130

Please sign in to comment.