Use Hugepages to back AllocationPool/HashStringAllocator (#5516)
Summary:
Use Hugepages to back AllocationPool/HashStringAllocator

Simplifies AllocationPool to handle a mix of small and large
allocations. We present a single set of allocated ranges to the user,
e.g. RowContainer or HashStringAllocator, and no longer export
memory::Allocations.

Adaptively starts using huge pages in AllocationPool after passing a
threshold size. We allocate large chunks of 32MB - 512MB and request
huge pages for the range. We do not give these out as a unit,
however. Instead, we increase the reservation one 2MB huge page at a
time as these so-far unbacked addresses are given out. The OS will at
some point migrate the ranges to huge pages.
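
Below is a minimal standalone sketch of this scheme using raw Linux mmap/madvise; the HugeRun struct and function names are illustrative only, not Velox's code, which routes allocation through MemoryAllocator and memory pools rather than raw syscalls.

#include <sys/mman.h>

#include <cstddef>
#include <cstdint>

constexpr size_t kHugePageSize = 2 << 20; // 2MB

struct HugeRun {
  char* start{nullptr}; // Start of the mmapped virtual range.
  size_t capacity{0};   // Full mapped size, e.g. 32MB - 512MB.
  size_t reserved{0};   // Bytes so far counted against the pool's cap.
};

// Maps a large range and advises the kernel to back it with
// transparent huge pages as it is touched.
HugeRun reserveRun(size_t capacity) {
  void* ptr = mmap(
      nullptr,
      capacity,
      PROT_READ | PROT_WRITE,
      MAP_PRIVATE | MAP_ANONYMOUS,
      -1,
      0);
  if (ptr == MAP_FAILED) {
    return {};
  }
  // Migration to 2MB huge pages happens asynchronously in the kernel.
  madvise(ptr, capacity, MADV_HUGEPAGE);
  return HugeRun{static_cast<char*>(ptr), capacity, 0};
}

// Grows the reservation one 2MB huge page at a time as previously
// unbacked addresses are handed out; 'newEnd' is the offset just past
// the last byte given out so far.
void growReservation(HugeRun& run, size_t newEnd) {
  while (run.reserved < newEnd && run.reserved < run.capacity) {
    run.reserved += kHugePageSize; // Account one huge page to the pool.
  }
}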

This is seen to speed up hash tables by some 20% over only backing the hash table arrays with huge pages.

Adds a SIMD-word-wide tail to address ranges in HashStringAllocator. This means that any byte of content, including at the end of an allocation, is safe to access at full SIMD width.
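
As a hedged illustration of why the tail pad helps, assuming a 32-byte SIMD width; the helper names here are made up for the example, not HashStringAllocator's API.

#include <immintrin.h>

#include <cstddef>
#include <cstring>

constexpr size_t kSimdWidth = 32; // One AVX2 SIMD word.

char* allocateRange(size_t payloadBytes) {
  // Pad by one SIMD word so a full-width load starting at the last
  // payload byte does not read past the allocation.
  return new char[payloadBytes + kSimdWidth];
}

void example() {
  char* data = allocateRange(100);
  std::memset(data, 0, 100 + kSimdWidth);
  // In bounds even though only bytes [0, 100) are payload.
  __m256i v =
      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + 99));
  (void)v;
  delete[] data;
}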

Changes the iterators in RowContainer to use the simplified AllocationPool range API.
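
A hedged sketch of a consumer of that range API: rangeAt() appears in this diff, while numRanges() and the process() callback are assumed here for illustration.

#include "velox/common/memory/AllocationPool.h"

#include <cstddef>
#include <cstdint>

void process(char* data, size_t size); // Hypothetical per-range consumer.

void scanAllRanges(facebook::velox::AllocationPool& pool) {
  // Small and large allocations appear as one sequence of ranges.
  for (int32_t i = 0; i < pool.numRanges(); ++i) {
    folly::Range<char*> range = pool.rangeAt(i);
    process(range.data(), range.size());
  }
}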

Pull Request resolved: #5516

Reviewed By: mbasmanova

Differential Revision: D47220175

Pulled By: oerling

fbshipit-source-id: a60127d46fed2960724ac65e780e7ea90937c6f1
Orri Erling authored and facebook-github-bot committed Jul 12, 2023
1 parent e846ebb commit 9134514
Showing 25 changed files with 777 additions and 307 deletions.
16 changes: 13 additions & 3 deletions velox/common/caching/AsyncDataCache.cpp
@@ -623,13 +623,23 @@ bool AsyncDataCache::allocateNonContiguous(
 }
 
 bool AsyncDataCache::allocateContiguous(
-    MachinePageCount numPages,
+    memory::MachinePageCount numPages,
     memory::Allocation* collateral,
     memory::ContiguousAllocation& allocation,
-    ReservationCallback reservationCB) {
+    ReservationCallback reservationCB,
+    memory::MachinePageCount maxPages) {
   return makeSpace(numPages, [&]() {
     return allocator_->allocateContiguous(
-        numPages, collateral, allocation, reservationCB);
+        numPages, collateral, allocation, reservationCB, maxPages);
   });
 }
 
+bool AsyncDataCache::growContiguous(
+    MachinePageCount increment,
+    memory::ContiguousAllocation& allocation,
+    ReservationCallback reservationCB) {
+  return makeSpace(increment, [&]() {
+    return allocator_->growContiguous(increment, allocation, reservationCB);
+  });
+}
+
8 changes: 7 additions & 1 deletion velox/common/caching/AsyncDataCache.h
@@ -663,12 +663,18 @@ class AsyncDataCache : public memory::MemoryAllocator {
       memory::MachinePageCount numPages,
       memory::Allocation* FOLLY_NULLABLE collateral,
       memory::ContiguousAllocation& allocation,
-      ReservationCallback reservationCB = nullptr) override;
+      ReservationCallback reservationCB = nullptr,
+      memory::MachinePageCount maxPages = 0) override;
 
   void freeContiguous(memory::ContiguousAllocation& allocation) override {
     allocator_->freeContiguous(allocation);
   }
 
+  bool growContiguous(
+      memory::MachinePageCount increment,
+      memory::ContiguousAllocation& allocation,
+      ReservationCallback reservationCB = nullptr) override;
+
   void* allocateBytes(uint64_t bytes, uint16_t alignment) override;
 
   void freeBytes(void* p, uint64_t size) noexcept override {
22 changes: 20 additions & 2 deletions velox/common/memory/Allocation.cpp
@@ -76,12 +76,17 @@ ContiguousAllocation::~ContiguousAllocation() {
   }
 }
 
-void ContiguousAllocation::set(void* data, uint64_t size) {
+void ContiguousAllocation::set(void* data, uint64_t size, uint64_t maxSize) {
   data_ = data;
   size_ = size;
+  maxSize_ = maxSize != 0 ? maxSize : size;
   sanityCheck();
 }
 
+void ContiguousAllocation::grow(MachinePageCount increment) {
+  pool_->growContiguous(increment, *this);
+}
+
 void ContiguousAllocation::clear() {
   pool_ = nullptr;
   set(nullptr, 0);
@@ -92,11 +97,24 @@ MachinePageCount ContiguousAllocation::numPages() const {
       AllocationTraits::kPageSize;
 }
 
+std::optional<folly::Range<char*>> ContiguousAllocation::hugePageRange() const {
+  auto begin = reinterpret_cast<uintptr_t>(data_);
+  auto roundedBegin = bits::roundUp(begin, AllocationTraits::kHugePageSize);
+  auto roundedEnd = (begin + maxSize_) / AllocationTraits::kHugePageSize *
+      AllocationTraits::kHugePageSize;
+  if (roundedEnd <= roundedBegin) {
+    return std::nullopt;
+  }
+  return folly::Range<char*>(
+      reinterpret_cast<char*>(roundedBegin), roundedEnd - roundedBegin);
+}
+
 std::string ContiguousAllocation::toString() const {
   return fmt::format(
-      "ContiguousAllocation[data:{}, size:{}, pool:{}]",
+      "ContiguousAllocation[data:{}, size:{}, maxSize: {}, pool:{}]",
       data_,
       size_,
+      maxSize_,
       pool_ == nullptr ? "null" : "set");
 }
 } // namespace facebook::velox::memory
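
To make the rounding in hugePageRange() concrete, here is a small worked example with assumed addresses; the numbers are not part of the commit.

#include <cassert>
#include <cstdint>

constexpr uint64_t kHugePageSize = 2 << 20; // 2MB

uint64_t roundUp(uint64_t value, uint64_t factor) {
  return (value + factor - 1) / factor * factor;
}

int main() {
  // Suppose data_ starts 1MB past a 2MB boundary and maxSize_ is 8MB.
  uint64_t begin = 0x7f0000100000;
  uint64_t maxSize = 8ull << 20;
  uint64_t roundedBegin = roundUp(begin, kHugePageSize);
  uint64_t roundedEnd = (begin + maxSize) / kHugePageSize * kHugePageSize;
  // Only the three fully covered 2MB pages are returned; the partial
  // megabyte at each end of the mapping is excluded.
  assert(roundedEnd - roundedBegin == 3 * kHugePageSize);
  return 0;
}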
34 changes: 32 additions & 2 deletions velox/common/memory/Allocation.h
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <folly/Range.h>
 #include <cstdint>
 
 #include "velox/common/base/BitUtil.h"
@@ -33,6 +34,9 @@ struct AllocationTraits {
   /// Defines a machine page size in bytes.
   static constexpr uint64_t kPageSize = 4096;
 
+  /// Size of a huge page as intended with MADV_HUGEPAGE.
+  static constexpr uint64_t kHugePageSize = 2 << 20; // 2MB
+
   /// Returns the bytes of the given number of pages.
   FOLLY_ALWAYS_INLINE static uint64_t pageBytes(MachinePageCount numPages) {
     return numPages * kPageSize;
@@ -187,6 +191,7 @@ class ContiguousAllocation {
     pool_ = other.pool_;
     data_ = other.data_;
     size_ = other.size_;
+    maxSize_ = other.maxSize_;
     other.clear();
     sanityCheck();
     return *this;
@@ -196,6 +201,7 @@ class ContiguousAllocation {
     pool_ = other.pool_;
     data_ = other.data_;
     size_ = other.size_;
+    maxSize_ = other.maxSize_;
     other.clear();
     sanityCheck();
   }
@@ -212,6 +218,10 @@ class ContiguousAllocation {
     return size_;
   }
 
+  /// Returns the largest huge page range covered by 'this' or std::nullopt if
+  /// no full huge page is contained in 'this'.
+  std::optional<folly::Range<char*>> hugePageRange() const;
+
   /// Invoked by memory pool to set the ownership on allocation success. All
   /// the external contiguous memory allocations go through memory pool.
   ///
@@ -228,12 +238,27 @@ class ContiguousAllocation {
 
   bool empty() const {
     sanityCheck();
-    return size_ == 0;
+    return maxSize_ == 0;
   }
 
-  void set(void* data, uint64_t size);
+  /// Sets the pointer and sizes. If maxSize is not specified it defaults to
+  /// 'size'.
+  void set(void* data, uint64_t size, uint64_t maxSize = 0);
+
+  // Adjusts 'size' towards 'maxSize' by 'increment' pages. Rounds
+  // 'increment' to huge pages, since this is the unit of growth of
+  // RSS for large contiguous runs. Increases the reservation in
+  // 'pool_' and its allocator. May fail if the cap is exceeded; on
+  // failure, the size is not changed. 'size_' cannot exceed 'maxSize_'.
+  void grow(MachinePageCount increment);
 
   void clear();
 
+  /// Returns the maximum size.
+  uint64_t maxSize() const {
+    return maxSize_;
+  }
+
   std::string toString() const;
 
  private:
@@ -244,6 +269,11 @@
 
   MemoryPool* pool_{nullptr};
   void* data_{nullptr};
+
+  // Offset of the first byte in 'data_' not counted as reserved in 'pool_'.
   uint64_t size_{0};
+
+  // Offset of the first byte after the mmap of 'data_'.
+  uint64_t maxSize_{0};
 };
 } // namespace facebook::velox::memory
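
A hedged sketch of how the new maxPages argument and grow() are meant to combine; the page counts are illustrative, not prescribed by the commit.

#include "velox/common/memory/Memory.h"

using namespace facebook::velox;

void growExample(memory::MemoryPool& pool) {
  memory::ContiguousAllocation allocation;
  // Reserve 512 machine pages (2MB) now, but map address space for up
  // to 8192 pages (32MB) so the run can grow without moving.
  pool.allocateContiguous(512, allocation, 8192);
  // Later, when the reserved 2MB is used up, back one more 2MB huge
  // page (512 4KB machine pages) and charge it to the pool.
  allocation.grow(512);
}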
106 changes: 76 additions & 30 deletions velox/common/memory/AllocationPool.cpp
@@ -21,39 +21,48 @@
 
 namespace facebook::velox {
 
+folly::Range<char*> AllocationPool::rangeAt(int32_t index) const {
+  if (index < allocations_.size()) {
+    auto run = allocations_[index].runAt(0);
+    return folly::Range<char*>(
+        run.data<char>(),
+        run.data<char>() == startOfRun_ ? currentOffset_ : run.numBytes());
+  }
+  const auto largeIndex = index - allocations_.size();
+  if (largeIndex < largeAllocations_.size()) {
+    auto range = largeAllocations_[largeIndex].hugePageRange().value();
+    if (range.data() == startOfRun_) {
+      return folly::Range<char*>(range.data(), currentOffset_);
+    }
+    return range;
+  }
+  VELOX_FAIL("Out of range index for rangeAt(): {}", index);
+}
+
 void AllocationPool::clear() {
-  // Trigger Allocation's destructor to free allocated memory
-  auto copy = std::move(allocation_);
   allocations_.clear();
-  auto copyLarge = std::move(largeAllocations_);
   largeAllocations_.clear();
+  startOfRun_ = nullptr;
+  bytesInRun_ = 0;
+  currentOffset_ = 0;
+  usedBytes_ = 0;
 }
 
 char* AllocationPool::allocateFixed(uint64_t bytes, int32_t alignment) {
   VELOX_CHECK_GT(bytes, 0, "Cannot allocate zero bytes");
   if (availableInRun() >= bytes && alignment == 1) {
-    auto* result = currentRun().data<char>() + currentOffset_;
+    auto* result = startOfRun_ + currentOffset_;
     currentOffset_ += bytes;
+    if (currentOffset_ > endOfReservedRun()) {
+      growLastAllocation();
+    }
     return result;
   }
   VELOX_CHECK_EQ(
       __builtin_popcount(alignment), 1, "Alignment can only be power of 2");
 
   auto numPages = memory::AllocationTraits::numPages(bytes + alignment - 1);
-
-  // Use contiguous allocations from mapped memory if allocation size is large
-  if (numPages > pool_->largestSizeClass()) {
-    auto largeAlloc = std::make_unique<memory::ContiguousAllocation>();
-    pool_->allocateContiguous(numPages, *largeAlloc);
-    largeAllocations_.emplace_back(std::move(largeAlloc));
-    auto result = largeAllocations_.back()->data<char>();
-    VELOX_CHECK_NOT_NULL(
-        result, "Unexpected nullptr for large contiguous allocation");
-    // Should be at page boundary and always aligned.
-    VELOX_CHECK_EQ(reinterpret_cast<uintptr_t>(result) % alignment, 0);
-    return result;
-  }
-
   if (availableInRun() == 0) {
     newRunImpl(numPages);
   } else {
@@ -63,31 +72,68 @@ char* AllocationPool::allocateFixed(uint64_t bytes, int32_t alignment) {
       newRunImpl(numPages);
     }
   }
-  auto run = currentRun();
   currentOffset_ += memory::alignmentPadding(firstFreeInRun(), alignment);
-  uint64_t size = run.numBytes();
-  VELOX_CHECK_LE(bytes + currentOffset_, size);
-  auto* result = run.data<char>() + currentOffset_;
+  VELOX_CHECK_LE(bytes + currentOffset_, bytesInRun_);
+  auto* result = startOfRun_ + currentOffset_;
   VELOX_CHECK_EQ(reinterpret_cast<uintptr_t>(result) % alignment, 0);
   currentOffset_ += bytes;
+  if (currentOffset_ > endOfReservedRun()) {
+    growLastAllocation();
+  }
   return result;
 }
 
+void AllocationPool::growLastAllocation() {
+  VELOX_CHECK_GT(bytesInRun_, kHugePageSize);
+  auto bytesToReserve =
+      bits::roundUp(currentOffset_ - endOfReservedRun(), kHugePageSize);
+  largeAllocations_.back().grow(bytesToReserve / kPageSize);
+  usedBytes_ += bytesToReserve;
+}
+
 void AllocationPool::newRunImpl(memory::MachinePageCount numPages) {
-  ++currentRun_;
-  if (currentRun_ >= allocation_.numRuns()) {
-    if (allocation_.numRuns() > 0) {
-      allocations_.push_back(
-          std::make_unique<memory::Allocation>(std::move(allocation_)));
+  if (usedBytes_ >= hugePageThreshold_ ||
+      numPages > pool_->sizeClasses().back()) {
+    // At least 16 huge pages, no more than kMaxMmapBytes. The next is
+    // double the previous. Because the previous is a hair under the
+    // power of two because of fractional pages at ends of allocation,
+    // add an extra huge page size.
+    int64_t nextSize = std::min(
+        kMaxMmapBytes,
+        std::max<int64_t>(
+            16 * kHugePageSize,
+            bits::nextPowerOfTwo(usedBytes_ + kHugePageSize)));
+    // Round 'numPages' to the number of pages in a huge page. Allocating
+    // this plus an extra huge page guarantees that 'numPages' worth of
+    // contiguous aligned huge pages will be found in the allocation.
+    numPages = bits::roundUp(numPages, kHugePageSize / kPageSize);
+    if (numPages * kPageSize + kHugePageSize > nextSize) {
+      // Extra large single request.
+      nextSize = numPages * kPageSize + kHugePageSize;
+    }
-    pool_->allocateNonContiguous(
-        std::max<int32_t>(kMinPages, numPages), allocation_, numPages);
-    currentRun_ = 0;
+    memory::ContiguousAllocation largeAlloc;
+    pool_->allocateContiguous(
+        kHugePageSize / kPageSize, largeAlloc, nextSize / kPageSize);
+    auto range = largeAlloc.hugePageRange().value();
+    startOfRun_ = range.data();
+    bytesInRun_ = range.size();
+    largeAllocations_.emplace_back(std::move(largeAlloc));
+    currentOffset_ = 0;
+    usedBytes_ += kHugePageSize;
+    return;
   }
+  memory::Allocation allocation;
+  auto roundedPages = std::max<int32_t>(kMinPages, numPages);
+  pool_->allocateNonContiguous(roundedPages, allocation, roundedPages);
+  VELOX_CHECK_EQ(allocation.numRuns(), 1);
+  startOfRun_ = allocation.runAt(0).data<char>();
+  bytesInRun_ = allocation.runAt(0).numBytes();
   currentOffset_ = 0;
+  allocations_.push_back(std::move(allocation));
+  usedBytes_ += bytesInRun_;
 }
 
-void AllocationPool::newRun(int32_t preferredSize) {
+void AllocationPool::newRun(int64_t preferredSize) {
   newRunImpl(memory::AllocationTraits::numPages(preferredSize));
 }
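
The growth schedule in newRunImpl() can be checked with a small standalone computation (C++20); kMaxMmapBytes = 512MB here is inferred from the 32MB - 512MB range in the summary, not read from the diff.

#include <algorithm>
#include <bit>
#include <cstdint>

constexpr int64_t kHugePageSize = 2 << 20;     // 2MB
constexpr int64_t kMaxMmapBytes = 512ll << 20; // 512MB, per the summary.

// Mirrors the sizing expression in newRunImpl() above; std::bit_ceil
// stands in for bits::nextPowerOfTwo.
int64_t nextRunSize(int64_t usedBytes) {
  return std::min<int64_t>(
      kMaxMmapBytes,
      std::max<int64_t>(
          16 * kHugePageSize,
          std::bit_ceil(static_cast<uint64_t>(usedBytes + kHugePageSize))));
}
// usedBytes = 0    -> 32MB (the 16 huge page floor).
// usedBytes ~32MB  -> 64MB; ~96MB -> 128MB; ... capped at 512MB.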