Skip to content

Commit

Permalink
apacheGH-17211: refresh history of scalar hash benchmark
Browse files Browse the repository at this point in the history
This commit includes additions to the general hashing benchmarks that
cover the use of hashing functions in key_hash.h without carrying the
burden of a long dev history.

Some existing benchmark names were changed to distinguish between the
Int32 and Int64 variants, and new benchmarks were added that use the
functions declared in key_hash.h. The new benchmarks were added because
those functions are claimed to prioritize speed over cryptographic
strength, as they're primarily used for join algorithms and other
processing tasks; the hashing benchmark can now provide observability
for that claim.

Issue: apacheGH-17211
Issue: ARROW-8991
  • Loading branch information
drin committed Jan 12, 2024
1 parent 63f1b34 commit 5a635b2
Showing 1 changed file with 106 additions and 2 deletions.
108 changes: 106 additions & 2 deletions cpp/src/arrow/util/hashing_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,22 @@
#include "benchmark/benchmark.h"

#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/util/hashing.h"

#include "arrow/array/builder_primitive.h"
#include "arrow/compute/key_hash.h"

namespace arrow {
namespace internal {

namespace {

// Seed handed to the random array generator below
// (value copied from scalar_string_benchmark).
constexpr auto kSeed = 0x94378165;

// Random array generator used by the KeyHash* benchmarks in this file.
// The enclosing anonymous namespace already gives it internal linkage.
random::RandomArrayGenerator hashing_rng(kSeed);

}  // namespace

template <class Integer>
static std::vector<Integer> MakeIntegers(int32_t n_values) {
std::vector<Integer> values(n_values);
Expand Down Expand Up @@ -62,7 +73,22 @@ static std::vector<std::string> MakeStrings(int32_t n_values, int32_t min_length
return values;
}

static void HashIntegers(benchmark::State& state) { // NOLINT non-const reference
// Benchmarks ScalarHelper<int32_t, N>::ComputeHash, for N = 0 and N = 1, over
// the 10000 values returned by MakeIntegers<int32_t>.
//
// Every input is hashed twice per pass (once per ScalarHelper variant), which
// is why the reported item and byte counts carry a factor of two.
static void HashIntegers32(benchmark::State& state) {  // NOLINT non-const reference
  const std::vector<int32_t> inputs = MakeIntegers<int32_t>(10000);

  while (state.KeepRunning()) {
    hash_t hash_sum = 0;
    for (const int32_t value : inputs) {
      hash_sum += ScalarHelper<int32_t, 0>::ComputeHash(value);
      hash_sum += ScalarHelper<int32_t, 1>::ComputeHash(value);
    }
    // Hand the accumulated sum to the benchmark library so the result is used.
    benchmark::DoNotOptimize(hash_sum);
  }

  const auto hashes_computed = 2 * state.iterations() * inputs.size();
  state.SetItemsProcessed(hashes_computed);
  state.SetBytesProcessed(hashes_computed * sizeof(int32_t));
}

static void HashIntegers64(benchmark::State& state) { // NOLINT non-const reference
const std::vector<int64_t> values = MakeIntegers<int64_t>(10000);

while (state.KeepRunning()) {
Expand Down Expand Up @@ -111,13 +137,91 @@ static void HashLargeStrings(benchmark::State& state) { // NOLINT non-const ref
BenchmarkStringHashing(state, values);
}

// Benchmarks compute::Hashing32::HashMultiColumn on a single column of 10000
// int32 values generated by hashing_rng.Int32 with bounds 0 and INT32_MAX.
//
// Reports one item and sizeof(int32_t) bytes per input value per iteration.
static void KeyHashIntegers32(benchmark::State& state) {  // NOLINT non-const reference
  auto keys = hashing_rng.Int32(10000, 0, std::numeric_limits<int32_t>::max());
  const auto num_keys = keys->length();

  auto* exec_ctx = compute::default_exec_context();

  // Scratch stack that the hash function reaches through the LightContext.
  util::TempVectorStack temp_stack;
  ASSERT_OK(temp_stack.Init(exec_ctx->memory_pool(),
                            3 * sizeof(int32_t) * util::MiniBatch::kMiniBatchLength));

  // Context passed to Hashing32: hardware flags plus the scratch stack.
  compute::LightContext light_ctx;
  light_ctx.hardware_flags = exec_ctx->cpu_info()->hardware_flags();
  light_ctx.stack = &temp_stack;

  // Output buffer with room for one 32-bit hash per input value.
  ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hashes,
                       AllocateBuffer(num_keys * sizeof(int32_t)));

  while (state.KeepRunning()) {
    // The key-column wrapper is rebuilt on every iteration, so building it is
    // part of the measured work.
    ASSERT_OK_AND_ASSIGN(compute::KeyColumnArray key_column,
                         compute::ColumnArrayFromArrayData(keys->data(), 0, num_keys));

    compute::Hashing32::HashMultiColumn(
        {key_column}, &light_ctx, reinterpret_cast<uint32_t*>(hashes->mutable_data()));

    // Left disabled: benchmark::DoNotOptimize(hashes);
  }

  state.SetItemsProcessed(state.iterations() * num_keys);
  state.SetBytesProcessed(state.iterations() * num_keys * sizeof(int32_t));
}

// Benchmarks compute::Hashing64::HashMultiColumn on a single column of 10000
// int64 values generated by hashing_rng.Int64 with bounds 0 and INT64_MAX.
//
// Reports one item and sizeof(int64_t) bytes per input value per iteration.
static void KeyHashIntegers64(benchmark::State& state) {  // NOLINT non-const reference
  auto test_vals = hashing_rng.Int64(10000, 0, std::numeric_limits<int64_t>::max());

  // initialize the stack allocator
  // Sized with the 64-bit hash width so it matches the result buffer below
  // (sizeof(int32_t) here was a leftover from the 32-bit benchmark).
  // NOTE(review): presumably Hashing64's per-mini-batch temporaries scale with
  // the hash width -- confirm the required size against key_hash.cc.
  util::TempVectorStack stack_memallocator;
  ASSERT_OK(
      stack_memallocator.Init(compute::default_exec_context()->memory_pool(),
                              3 * sizeof(int64_t) * util::MiniBatch::kMiniBatchLength));

  // prepare the execution context for Hashing64
  compute::LightContext hash_ctx;
  hash_ctx.hardware_flags = compute::default_exec_context()->cpu_info()->hardware_flags();
  hash_ctx.stack = &stack_memallocator;

  // allocate memory for results: one 64-bit hash per input value
  ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hash_buffer,
                       AllocateBuffer(test_vals->length() * sizeof(int64_t)));

  // run the benchmark
  while (state.KeepRunning()) {
    // Prepare input data structure for propagation to hash function.
    // This happens inside the loop, so it is part of the measured work.
    ASSERT_OK_AND_ASSIGN(
        compute::KeyColumnArray input_keycol,
        compute::ColumnArrayFromArrayData(test_vals->data(), 0, test_vals->length()));

    compute::Hashing64::HashMultiColumn(
        {input_keycol}, &hash_ctx,
        reinterpret_cast<uint64_t*>(hash_buffer->mutable_data()));

    // benchmark::DoNotOptimize(hash_buffer);
  }

  state.SetBytesProcessed(state.iterations() * test_vals->length() * sizeof(int64_t));
  state.SetItemsProcessed(state.iterations() * test_vals->length());
}

// ----------------------------------------------------------------------
// Benchmark declarations

// NOTE(review): HashIntegers looks like the pre-rename name of HashIntegers32
// (both signatures appear back to back earlier in this file) -- confirm that
// this registration is still wanted.
BENCHMARK(HashIntegers);
// Directly uses "Hashing" hash functions from hashing.h (xxHash)
// The integer benchmarks call ScalarHelper<T, N>::ComputeHash.
BENCHMARK(HashIntegers32);
BENCHMARK(HashIntegers64);
BENCHMARK(HashSmallStrings);
BENCHMARK(HashMediumStrings);
BENCHMARK(HashLargeStrings);

// Directly uses "KeyHash" hash functions from key_hash.h (xxHash-like)
// These call compute::Hashing32::HashMultiColumn and
// compute::Hashing64::HashMultiColumn respectively.
BENCHMARK(KeyHashIntegers32);
BENCHMARK(KeyHashIntegers64);

} // namespace internal
} // namespace arrow

0 comments on commit 5a635b2

Please sign in to comment.