Implement virtual CPU support and provide rseq ABI extension.
This is a prototype extension to the rseq() syscall.  Since a process may run on only a few cores at a time, we can use a dense set of "v(irtual) cpus."  This can reduce cache requirements, as we only need N caches for the cores we actually run on simultaneously, rather than a cache for every physical core.

This can reduce the RAM footprint of caches (less metadata, thanks to lazy cache initialization [b058521], and fewer freelisted objects) and improve hit rates, since a cache is more likely to have been used recently even after core migration.
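As a back-of-the-envelope illustration of that claim (the constants and helper below are invented for this note, not part of the change): a job confined to a handful of cores on a large shared machine needs proportionally fewer per-cpu slabs when they are indexed by virtual CPU.

// Illustrative only: compares slab footprint when caches are indexed by
// physical CPU versus by (dense) virtual CPU.  All constants are assumptions.
#include <cstddef>
#include <cstdio>

int main() {
  constexpr size_t kSlabBytes = 256 * 1024;  // assumed per-cpu slab size
  constexpr size_t kPhysicalCpus = 256;      // large shared machine
  constexpr size_t kConcurrentCpus = 16;     // cores this job runs on at once
  std::printf("slabs indexed by physical cpu: %zu KiB\n",
              kPhysicalCpus * kSlabBytes / 1024);
  std::printf("slabs indexed by virtual cpu:  %zu KiB\n",
              kConcurrentCpus * kSlabBytes / 1024);
}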

PiperOrigin-RevId: 318347733
Change-Id: I91eda0c35bfd0a7b658dbee38e5c1a469191b458
ckennelly authored and Copybara-Service committed Jun 25, 2020
1 parent 4ff9fa2 commit ad136d4
Showing 8 changed files with 283 additions and 18 deletions.
7 changes: 7 additions & 0 deletions tcmalloc/background.cc
@@ -18,6 +18,7 @@
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/internal_malloc_extension.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/parameters.h"
@@ -49,6 +50,12 @@ void ReleasePerCpuMemoryToOS() {
    return;
  }

  if (subtle::percpu::UsingFlatVirtualCpus()) {
    // Our (real) CPU mask does not provide useful information about the state
    // of our virtual CPU set.
    return;
  }

  // This can only fail due to a sandbox or similar intercepting the syscall.
  if (sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus)) {
    // We log periodically as start-up errors are frequently ignored and this is
8 changes: 4 additions & 4 deletions tcmalloc/cpu_cache.cc
@@ -35,7 +35,7 @@

namespace tcmalloc {

using subtle::percpu::GetCurrentCpuUnsafe;
using subtle::percpu::GetCurrentVirtualCpuUnsafe;

// MaxCapacity() determines how we distribute memory in the per-cpu cache
// to the various class sizes.
@@ -166,7 +166,7 @@ void *CPUCache::Refill(int cpu, size_t cl) {
}
}
} while (got == batch_length && i == 0 && total < target &&
cpu == GetCurrentCpuUnsafe());
cpu == GetCurrentVirtualCpuUnsafe());

for (size_t i = 0; i < returned; ++i) {
ObjectClass *ret = &to_return[i];
@@ -389,7 +389,7 @@ size_t CPUCache::Steal(int cpu, size_t dest_cl, size_t bytes,
acquired += size;
}

if (cpu != GetCurrentCpuUnsafe() || acquired >= bytes) {
if (cpu != GetCurrentVirtualCpuUnsafe() || acquired >= bytes) {
// can't steal any more or don't need to
break;
}
@@ -421,7 +421,7 @@ int CPUCache::Overflow(void *ptr, size_t cl, int cpu) {
Static::transfer_cache()[cl].InsertRange(absl::Span<void *>(batch), count);
if (count != batch_length) break;
count = 0;
} while (total < target && cpu == GetCurrentCpuUnsafe());
} while (total < target && cpu == GetCurrentVirtualCpuUnsafe());
tracking::Report(kFreeTruncations, cl, 1);
return 1;
}
13 changes: 13 additions & 0 deletions tcmalloc/internal/linux_syscall_support.h
@@ -22,6 +22,19 @@ struct kernel_rseq {
  unsigned cpu_id;
  unsigned long long rseq_cs;
  unsigned flags;
  unsigned padding[2];
  // This is a prototype extension to the rseq() syscall.  Since a process may
  // run on only a few cores at a time, we can use a dense set of "v(irtual)
  // cpus."  This can reduce cache requirements, as we only need N caches for
  // the cores we actually run on simultaneously, rather than a cache for every
  // physical core.
  union {
    struct {
      short numa_node_id;
      short vcpu_id;
    };
    int vcpu_flat;
  };
} __attribute__((aligned(4 * sizeof(unsigned long long))));

static_assert(sizeof(kernel_rseq) == (4 * sizeof(unsigned long long)),
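For orientation, a minimal sketch of how user space could read the new fields once a thread has registered its rseq area; this is an illustration layered on the struct above, not code from the change, and it assumes the little-endian x86_64 layout in which vcpu_flat aliases the {numa_node_id, vcpu_id} pair.

#include <cstdint>

// Hypothetical reader; `rseq_area` stands in for the thread's __rseq_abi.
struct VcpuInfo {
  int16_t numa_node_id;
  int16_t vcpu_id;
};

inline VcpuInfo ReadVcpu(const volatile kernel_rseq &rseq_area) {
  // The anonymous union means the two shorts and vcpu_flat occupy the same
  // four bytes, so reading the fields individually is equivalent to decoding
  // vcpu_flat by hand on a little-endian machine.
  return VcpuInfo{static_cast<int16_t>(rseq_area.numa_node_id),
                  static_cast<int16_t>(rseq_area.vcpu_id)};
}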
30 changes: 29 additions & 1 deletion tcmalloc/internal/percpu.cc
@@ -55,6 +55,8 @@ ABSL_PER_THREAD_TLS_KEYWORD ABSL_ATTRIBUTE_WEAK volatile kernel_rseq
    static_cast<unsigned>(kCpuIdUninitialized),
    0,
    0,
    {0, 0},
    {{kCpuIdUninitialized, kCpuIdUninitialized}},
};

ABSL_PER_THREAD_TLS_KEYWORD ABSL_ATTRIBUTE_WEAK volatile uint32_t __rseq_refcount;
@@ -72,6 +74,9 @@ ABSL_ATTRIBUTE_UNUSED ABSL_ATTRIBUTE_NOINLINE void *tcmalloc_tls_fetch_pic() {
}
#endif

ABSL_CONST_INIT size_t tcmalloc_virtual_cpu_id_offset =
    offsetof(kernel_rseq, cpu_id);

} // extern "C"

enum PerCpuInitStatus {
@@ -99,11 +104,26 @@ static bool InitThreadPerCpu() {
  return false;
}

bool UsingFlatVirtualCpus() {
  return false;
}

static void InitPerCpu() {
  CHECK_CONDITION(absl::base_internal::NumCPUs() <=
                  std::numeric_limits<uint16_t>::max());

  // Based on the results of successfully initializing the first thread, mark
  // init_status to initialize all subsequent threads.
  if (InitThreadPerCpu()) {
    init_status = kFastMode;

#if PERCPU_USE_RSEQ
#ifdef __x86_64__
    if (UsingFlatVirtualCpus()) {
      tcmalloc_virtual_cpu_id_offset = offsetof(kernel_rseq, vcpu_id);
    }
#endif  // __x86_64__
#endif  // PERCPU_USE_RSEQ
  }
}

@@ -272,9 +292,17 @@ void FenceCpu(int cpu) {

  // A useful fast path: nothing needs doing at all to order us with respect
  // to our own CPU.
  if (GetCurrentCpu() == cpu) {
  if (GetCurrentVirtualCpu() == cpu) {
    return;
  }

  if (UsingFlatVirtualCpus()) {
    // With virtual CPUs, we cannot identify the true physical core we need to
    // interrupt.
    FenceInterruptCPUs(nullptr);
    return;
  }

  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
79 changes: 76 additions & 3 deletions tcmalloc/internal/percpu.h
@@ -83,9 +83,26 @@ inline constexpr int kCpuIdInitialized = 0;
extern "C" ABSL_PER_THREAD_TLS_KEYWORD volatile kernel_rseq __rseq_abi;
extern "C" ABSL_PER_THREAD_TLS_KEYWORD volatile uint32_t __rseq_refcount;

// This is in units of bytes.
extern "C" size_t tcmalloc_virtual_cpu_id_offset;

static inline int RseqCpuId() { return __rseq_abi.cpu_id; }

static inline int VirtualRseqCpuId() {
#ifdef __x86_64__
  ASSERT(tcmalloc_virtual_cpu_id_offset == offsetof(kernel_rseq, cpu_id) ||
         tcmalloc_virtual_cpu_id_offset == offsetof(kernel_rseq, vcpu_id));
  return *reinterpret_cast<short *>(reinterpret_cast<uintptr_t>(&__rseq_abi) +
                                    tcmalloc_virtual_cpu_id_offset);
#else
  ASSERT(tcmalloc_virtual_cpu_id_offset == offsetof(kernel_rseq, cpu_id));
  return RseqCpuId();
#endif
}
#else // !PERCPU_USE_RSEQ
static inline int RseqCpuId() { return kCpuIdUnsupported; }

static inline int VirtualRseqCpuId() { return kCpuIdUnsupported; }
#endif
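// Illustrative note, not part of the diff: when tcmalloc_virtual_cpu_id_offset
// still points at the 32-bit cpu_id field, the short-sized load in
// VirtualRseqCpuId() relies on x86_64 being little-endian and on CPU ids
// fitting in the low half-word (InitPerCpu() CHECKs NumCPUs() against
// UINT16_MAX); negative sentinels such as kCpuIdUninitialized (-1) also
// survive the narrowing.  Those assumptions, restated as compile-time checks:
static_assert(sizeof(short) == 2, "rseq (v)cpu ids are 16-bit");
static_assert(sizeof(unsigned) == 4, "kernel_rseq::cpu_id is 32-bit");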

typedef int (*OverflowHandler)(int cpu, size_t cl, void *item);
@@ -96,6 +113,7 @@ typedef void *(*UnderflowHandler)(int cpu, size_t cl);
extern "C" {
int TcmallocSlab_PerCpuCmpxchg64(int target_cpu, intptr_t *p, intptr_t old_val,
intptr_t new_val);

#ifndef __x86_64__
int TcmallocSlab_Push(void *ptr, size_t cl, void *item, size_t shift,
OverflowHandler f);
@@ -109,15 +127,29 @@ void *TcmallocSlab_Pop_FixedShift(void *ptr, size_t cl, UnderflowHandler f);
// PERCPU_TCMALLOC_FIXED_SLAB_SHIFT
size_t TcmallocSlab_PushBatch_FixedShift(void *ptr, size_t cl, void **batch,
size_t len);

// Pop a batch for a slab whose Shift is equal to
// PERCPU_TCMALLOC_FIXED_SLAB_SHIFT
size_t TcmallocSlab_PopBatch_FixedShift(void *ptr, size_t cl, void **batch,
size_t len);

#ifdef __x86_64__
int TcmallocSlab_PerCpuCmpxchg64_VCPU(int target_cpu, intptr_t *p,
intptr_t old_val, intptr_t new_val);
size_t TcmallocSlab_PushBatch_FixedShift_VCPU(void *ptr, size_t cl,
void **batch, size_t len);
size_t TcmallocSlab_PopBatch_FixedShift_VCPU(void *ptr, size_t cl, void **batch,
size_t len);
#endif
}

// NOTE: We skirt the usual naming convention slightly above using "_" to
// increase the visibility of functions embedded into the root-namespace (by
// virtue of C linkage) in the supported case.

// Return whether we are using flat virtual CPUs.
bool UsingFlatVirtualCpus();

inline int GetCurrentCpuUnsafe() {
// On PowerPC, Linux maintains the current CPU in the bottom 12 bits of special
// purpose register SPRG3, which is readable from user mode. References:
@@ -163,6 +195,32 @@ inline int GetCurrentCpu() {
  return cpu;
}

inline int GetCurrentVirtualCpuUnsafe() { return VirtualRseqCpuId(); }

inline int GetCurrentVirtualCpu() {
  // We can't use the unsafe version unless we have the appropriate version of
  // the rseq extension. This also allows us a convenient escape hatch if the
  // kernel changes the way it uses special-purpose registers for CPU IDs.
  int cpu = VirtualRseqCpuId();

  // We open-code the check for fast-cpu availability since we do not want to
  // force initialization in the first-call case.  This is done so that we can
  // use this in places where it may not always be safe to initialize and so
  // that it may serve in the future as a proxy for callers such as
  // CPULogicalId() without introducing an implicit dependence on the fast-path
  // extensions. Initialization is also simply unneeded on some platforms.
  if (ABSL_PREDICT_TRUE(cpu >= kCpuIdInitialized)) {
    return cpu;
  }

#ifdef TCMALLOC_HAVE_SCHED_GETCPU
  cpu = sched_getcpu();
  ASSERT(cpu >= 0);
#endif  // TCMALLOC_HAVE_SCHED_GETCPU

  return cpu;
}
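// Hypothetical caller (not from this change), showing the intended split:
// GetCurrentVirtualCpuUnsafe() is only meaningful on the rseq fast path (once
// IsFast() has returned true), while GetCurrentVirtualCpu() may fall back to
// sched_getcpu().  `counters` is a made-up array with one slot per possible
// id; NumCPUs() is a safe bound, since virtual CPU ids are dense and never
// exceed the physical CPU count.
inline void BumpPerVcpuCounter(std::atomic<int> *counters) {
  int vcpu = GetCurrentVirtualCpu();
  if (vcpu < 0) {
    // Neither rseq nor sched_getcpu() is available; fall back to slot 0.
    vcpu = 0;
  }
  counters[vcpu].fetch_add(1, std::memory_order_relaxed);
}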

bool InitFastPerCpu();

inline bool IsFast() {
@@ -237,9 +295,24 @@ inline void TSANMemoryBarrierOn(void *p) {
inline int CompareAndSwapUnsafe(int target_cpu, std::atomic<intptr_t> *p,
                                intptr_t old_val, intptr_t new_val) {
  TSANMemoryBarrierOn(p);
  return TcmallocSlab_PerCpuCmpxchg64(
      target_cpu, tcmalloc_internal::atomic_danger::CastToIntegral(p), old_val,
      new_val);
#if PERCPU_USE_RSEQ
  switch (tcmalloc_virtual_cpu_id_offset) {
    case offsetof(kernel_rseq, cpu_id):
      return TcmallocSlab_PerCpuCmpxchg64(
          target_cpu, tcmalloc_internal::atomic_danger::CastToIntegral(p),
          old_val, new_val);
#ifdef __x86_64__
    case offsetof(kernel_rseq, vcpu_id):
      return TcmallocSlab_PerCpuCmpxchg64_VCPU(
          target_cpu, tcmalloc_internal::atomic_danger::CastToIntegral(p),
          old_val, new_val);
#endif  // __x86_64__
    default:
      __builtin_unreachable();
  }
#else  // !PERCPU_USE_RSEQ
  __builtin_unreachable();
#endif  // !PERCPU_USE_RSEQ
}
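// Sketch of how the batch operations can key off the same offset; the real
// wrappers live in files not shown in this excerpt, and the name below is
// made up for illustration only.
inline size_t PushBatchDispatch(void *slabs, size_t cl, void **batch,
                                size_t len) {
#if PERCPU_USE_RSEQ
  switch (tcmalloc_virtual_cpu_id_offset) {
    case offsetof(kernel_rseq, cpu_id):
      return TcmallocSlab_PushBatch_FixedShift(slabs, cl, batch, len);
#ifdef __x86_64__
    case offsetof(kernel_rseq, vcpu_id):
      return TcmallocSlab_PushBatch_FixedShift_VCPU(slabs, cl, batch, len);
#endif  // __x86_64__
    default:
      __builtin_unreachable();
  }
#else  // !PERCPU_USE_RSEQ
  __builtin_unreachable();
#endif  // !PERCPU_USE_RSEQ
}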

void FenceCpu(int cpu);
(Diffs for the remaining three changed files are not shown.)
