Skip to content

Commit

Permalink
Added support to configure lower bound on per-thread cache size
Browse files Browse the repository at this point in the history
[alkondratenko@gmail.com: removed spurious new line at thread_cache.h]
Signed-off-by: Aliaksey Kandratsenka <alkondratenko@gmail.com>
  • Loading branch information
coder-saab001 authored and alk committed Jun 4, 2024
1 parent 9fb05f3 commit addf751
Show file tree
Hide file tree
Showing 9 changed files with 192 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@
/test-driver
/thread_dealloc_unittest
/thread_dealloc_unittest.exe
/min_per_thread_cache_size_test
/min_per_thread_cache_size_test.exe
/unique_path_unittest
/unique_path_unittest.exe
/unwind_bench
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,10 @@ if(BUILD_TESTING)
src/tests/testutil.cc)
target_link_libraries(thread_dealloc_unittest tcmalloc_minimal)
add_test(thread_dealloc_unittest thread_dealloc_unittest)

add_executable(min_per_thread_cache_size_test src/tests/min_per_thread_cache_size_test.cc)
target_link_libraries(min_per_thread_cache_size_test tcmalloc_minimal gtest)
add_test(min_per_thread_cache_size_test min_per_thread_cache_size_test)
endif()

### ------- tcmalloc_minimal_debug (thread-caching malloc with debugallocation)
Expand Down
6 changes: 6 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,12 @@ thread_dealloc_unittest_SOURCES = src/tests/thread_dealloc_unittest.cc \
thread_dealloc_unittest_LDFLAGS = $(TCMALLOC_FLAGS) $(AM_LDFLAGS)
thread_dealloc_unittest_LDADD = libtcmalloc_minimal.la

TESTS += min_per_thread_cache_size_test
min_per_thread_cache_size_test_SOURCES = src/tests/min_per_thread_cache_size_test.cc
min_per_thread_cache_size_test_LDFLAGS = $(TCMALLOC_FLAGS) $(AM_LDFLAGS)
min_per_thread_cache_size_test_CPPFLAGS = $(gtest_CPPFLAGS)
min_per_thread_cache_size_test_LDADD = libtcmalloc_minimal.la libgtest.la

### Documentation
dist_doc_DATA += docs/tcmalloc.html \
docs/overview.gif \
Expand Down
14 changes: 12 additions & 2 deletions docs/tcmalloc.html
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ <h3>PTMalloc2 unittest</h3>
</table>


<ul>
<ul>
<li> TCMalloc is much more consistently scalable than PTMalloc2 - for
all thread counts &gt;1 it achieves ~7-9 million ops/sec for small
allocations, falling to ~2 million ops/sec for larger
Expand Down Expand Up @@ -517,7 +517,7 @@ <H2><A NAME="runtime">Modifying Runtime Behavior</A></H2>

<tr valign=top>
<td><code>TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES</code></td>
<td>default: 16777216</td>
<td>default: 33554432</td>
<td>
Bound on the total amount of bytes allocated to thread caches. This
bound is not strict, so it is possible for the cache to go over this
Expand Down Expand Up @@ -746,6 +746,16 @@ <h3>Generic Tcmalloc Status</h3>
</td>
</tr>

<tr valign=top>
<td><code>tcmalloc.min_per_thread_cache_bytes</code></td>
<td>
A lower limit on how much memory TCMalloc dedicates to small objects per
thread. Note that this property only takes effect when the per-thread cache
size calculated from tcmalloc.max_total_thread_cache_bytes would otherwise
be less than tcmalloc.min_per_thread_cache_bytes.
</td>
</tr>

</table>

<h2><A NAME="caveats">Caveats</A></h2>
Expand Down
9 changes: 8 additions & 1 deletion src/gperftools/malloc_extension.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,14 @@ class PERFTOOLS_DLL_DECL MallocExtension {
// --------
// "tcmalloc.max_total_thread_cache_bytes"
// Upper limit on total number of bytes stored across all
// per-thread caches. Default: 16MB.
// per-thread caches. Default: 32MB.
//
// "tcmalloc.min_per_thread_cache_bytes"
// Lower limit on the number of bytes stored in each per-thread cache.
// Default: 512kB.
// Note that this property only takes effect when the per-thread cache
// size calculated from tcmalloc.max_total_thread_cache_bytes would
// otherwise be less than tcmalloc.min_per_thread_cache_bytes.
//
// "tcmalloc.current_total_thread_cache_bytes"
// Number of bytes used across all thread caches.
Expand Down
10 changes: 10 additions & 0 deletions src/tcmalloc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,11 @@ class TCMallocImplementation : public MallocExtension {
return true;
}

if (strcmp(name, "tcmalloc.min_per_thread_cache_bytes") == 0) {
*value = ThreadCache::min_per_thread_cache_size();
return true;
}

if (strcmp(name, "tcmalloc.current_total_thread_cache_bytes") == 0) {
TCMallocStats stats;
ExtractStats(&stats, NULL, NULL, NULL);
Expand Down Expand Up @@ -876,6 +881,11 @@ class TCMallocImplementation : public MallocExtension {
return true;
}

if (strcmp(name, "tcmalloc.min_per_thread_cache_bytes") == 0) {
ThreadCache::set_min_per_thread_cache_size(value);
return true;
}

if (strcmp(name, "tcmalloc.aggressive_memory_decommit") == 0) {
SpinLockHolder l(Static::pageheap_lock());
Static::pageheap()->SetAggressiveDecommit(value != 0);
Expand Down
124 changes: 124 additions & 0 deletions src/tests/min_per_thread_cache_size_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
* Copyright (c) 2024, gperftools Contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "config_for_unittests.h"

#include <algorithm>
#include <condition_variable>
#include <mutex>
#include <new>
#include <thread>
#include <vector>

#include <gperftools/malloc_extension.h>
#include <gperftools/malloc_extension_c.h>

#include "base/logging.h"
#include "gtest/gtest.h"

// Number of allocate/free pairs each thread performs in every pass.
static const int kAllocationsPerThread = 10000;

// Number of filler threads the test creates.
static const int kNumThreads = 50;

// Value (64 KiB) the test installs as "tcmalloc.min_per_thread_cache_bytes",
// i.e. the lower bound on per-thread cache size.
static const size_t kPerThreadCacheSize = 64 << 10;

// Number of fill passes each thread runs.
static const int kNumPasses = 10;

// Returns the value of the "tcmalloc.current_total_thread_cache_bytes"
// property. A failed property lookup is reported as a (non-fatal) test
// failure, in which case 0 is returned.
static size_t CurrentThreadCacheSize() {
  size_t total_bytes = 0;
  const bool found = MallocExtension::instance()->GetNumericProperty(
      "tcmalloc.current_total_thread_cache_bytes", &total_bytes);
  EXPECT_TRUE(found);
  return total_bytes;
}

// Maximum total thread-cache size seen so far. Filler updates it while
// holding filler_mtx; the test body reads it after joining all threads.
static size_t max_cache_size;

// Mutex and condition variable used to make the filler threads take turns.
std::mutex filler_mtx;
std::condition_variable filler_cv;
// Id of the filler thread whose turn it is to run a pass. Filler reads and
// advances it while holding filler_mtx.
int current_thread = 0;

// Thread body that cycles through allocating lots of objects of varying
// size, in an attempt to fill up its thread cache.
//
// Threads take turns in round-robin order: a thread only runs a pass while
// current_thread equals its thread_id, and hands the turn over to the next
// id (modulo num_threads) once the pass is done. filler_mtx is held for the
// whole pass (it is only released while waiting on filler_cv), so the update
// of max_cache_size below is serialized across threads.
//
//   thread_id   - this thread's position in the round-robin order.
//   num_threads - total number of filler threads taking turns.
void Filler(int thread_id, int num_threads) {
  std::unique_lock<std::mutex> filler_lock(filler_mtx);
  for (int pass = 0; pass < kNumPasses; pass++) {
    // Wait for the current thread to be the one that should run.
    filler_cv.wait(filler_lock, [thread_id] { return thread_id == current_thread; });

    // Fill the cache by allocating and deallocating objects of varying sizes.
    // Sizes grow in 64-byte steps and wrap back to 0 once they exceed 32 KiB.
    // The inner index is named distinctly so it no longer shadows the pass
    // counter of the enclosing loop.
    int size = 0;
    for (int alloc = 0; alloc < kAllocationsPerThread; alloc++) {
      void* p = ::operator new(size);
      ::operator delete(p);
      size += 64;
      if (size > (32 << 10)) size = 0;
    }

    // Record the largest total thread-cache size observed so far.
    const size_t cache_size = CurrentThreadCacheSize();
    max_cache_size = std::max(max_cache_size, cache_size);

    // Hand the turn to the next thread and wake every waiter so that the
    // one whose id now matches current_thread can proceed.
    current_thread = (current_thread + 1) % num_threads;
    filler_cv.notify_all();
  }
}

// Runs kNumThreads filler threads with the per-thread lower bound set to
// kPerThreadCacheSize and the overall thread cache budget set to 0, then
// checks that the largest total thread-cache size observed stays below
// kPerThreadCacheSize * kNumThreads.
TEST(MinPerThreadCacheSizeTest, Basics) {
  // Storage for the filler threads; they are started further below.
  std::vector<std::thread> threads;
  threads.reserve(kNumThreads);

  // Set the lower bound on per-thread cache size.
  CHECK(MallocExtension::instance()->SetNumericProperty(
      "tcmalloc.min_per_thread_cache_bytes", kPerThreadCacheSize));

  // Request a max total thread cache size of 0 so that the per-thread
  // cache size ends up being determined by the lower bound.
  CHECK(MallocExtension::instance()->SetNumericProperty(
      "tcmalloc.max_total_thread_cache_bytes", 0));

  // Start all threads.
  for (int id = 0; id < kNumThreads; id++) {
    threads.emplace_back(Filler, id, kNumThreads);
  }

  // Wait for all threads to finish.
  for (std::thread& worker : threads) {
    worker.join();
  }

  // Check that the maximum cache size does not exceed the limit set.
  ASSERT_LT(max_cache_size, kPerThreadCacheSize * kNumThreads);
}

18 changes: 13 additions & 5 deletions src/thread_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ namespace tcmalloc {
static bool phinited = false;

// Current per-thread cache size; starts out at kMaxThreadCacheSize.
volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;

// Lower bound on per-thread cache size. Starts out at kMinThreadCacheSize
// and can be changed at runtime via set_min_per_thread_cache_size().
std::atomic<size_t> ThreadCache::min_per_thread_cache_size_ = kMinThreadCacheSize;
// Overall thread cache budget and the part of it not yet claimed by any
// thread cache; both start out at kDefaultOverallThreadCacheSize.
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
Expand All @@ -84,10 +86,11 @@ ThreadCache::ThreadCache() {
if (max_size_ == 0) {
// There isn't enough memory to go around. Just give the minimum to
// this thread.
SetMaxSize(kMinThreadCacheSize);
size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
SetMaxSize(min_size);

// Take unclaimed_cache_space_ negative.
unclaimed_cache_space_ -= kMinThreadCacheSize;
unclaimed_cache_space_ -= min_size;
ASSERT(unclaimed_cache_space_ < 0);
}

Expand Down Expand Up @@ -267,7 +270,8 @@ void ThreadCache::IncreaseCacheLimitLocked() {
next_memory_steal_ = thread_heaps_;
}
if (next_memory_steal_ == this ||
next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
next_memory_steal_->max_size_
<= min_per_thread_cache_size_.load(std::memory_order_relaxed)) {
continue;
}
next_memory_steal_->SetMaxSize(next_memory_steal_->max_size_ - kStealAmount);
Expand Down Expand Up @@ -352,8 +356,9 @@ void ThreadCache::RecomputePerThreadCacheSize() {
int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
size_t space = overall_thread_cache_size_ / n;

size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
// Limit to allowed range
if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
if (space < min_size) space = min_size;
if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

double ratio = space / max<double>(1, per_thread_cache_size_);
Expand Down Expand Up @@ -383,7 +388,10 @@ void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
// Clip the value to a reasonable range
if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
if (new_size < min_size) {
new_size = min_size;
}
if (new_size > (1<<30)) new_size = (1<<30); // Limit to 1GB
overall_thread_cache_size_ = new_size;

Expand Down
13 changes: 13 additions & 0 deletions src/thread_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#include <atomic>
#include <stddef.h> // for size_t, NULL
#include <stdint.h> // for uint32_t, uint64_t
#include <sys/types.h> // for ssize_t
Expand Down Expand Up @@ -114,6 +115,15 @@ class ThreadCache {
return overall_thread_cache_size_;
}

// Replaces the lower bound on per-thread cache size with new_size.
// The value is written with relaxed atomic ordering, i.e. the store itself
// does not order any other memory operations.
static void set_min_per_thread_cache_size(size_t new_size) {
  min_per_thread_cache_size_.store(new_size,
                                   std::memory_order_relaxed);
}

// Returns the current lower bound on per-thread cache size (read with
// relaxed atomic ordering).
static size_t min_per_thread_cache_size() {
  const size_t lower_bound =
      min_per_thread_cache_size_.load(std::memory_order_relaxed);
  return lower_bound;
}

static int thread_heap_count() {
return thread_heap_count_;
}
Expand Down Expand Up @@ -263,6 +273,9 @@ class ThreadCache {
// thread_heaps_. Protected by Static::pageheap_lock.
static ThreadCache* next_memory_steal_;

// Lower bound on per-thread cache size. Default value is 512 KB.
static std::atomic<size_t> min_per_thread_cache_size_;

// Overall thread cache size. Protected by Static::pageheap_lock.
static size_t overall_thread_cache_size_;

Expand Down

0 comments on commit addf751

Please sign in to comment.