runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
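
For illustration only, here is a rough Go sketch of that invariant; the
names (fakeType, elemsDescribed) are made up for this example, and the
real header is simply a raw *_type written into the first word of the
allocation rather than a Go struct:

    package main

    import (
        "fmt"
        "unsafe"
    )

    // fakeType stands in for runtime._type; only the fields the sketch needs.
    type fakeType struct {
        Size_    uintptr // size of one value of this type
        PtrBytes uintptr // prefix of the value that may contain pointers
    }

    // headerSize is the cost of an allocation header: one pointer-word.
    const headerSize = unsafe.Sizeof(uintptr(0))

    // elemsDescribed reports how many whole values of typ the allocation is
    // treated as containing once the header word is subtracted.
    func elemsDescribed(slotSize uintptr, typ *fakeType) uintptr {
        return (slotSize - headerSize) / typ.Size_
    }

    func main() {
        t := &fakeType{Size_: 24, PtrBytes: 16}
        fmt.Println(elemsDescribed(112, t)) // a 112-byte slot holds 4 such values
    }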

As the name implies, a header is usually stored as the first word of an
object. There are two exceptions to where headers are stored and how
they're used.

Objects smaller than 512 bytes do not have headers: a full word of
overhead is too much for such small objects. Instead, a heap bitmap is
reserved at the end of the spans that hold them. The bitmap has the
same format as the old bitmap, minus the noMorePtrs bits, which are
unnecessary here: every object smaller than 512 bytes has a bitmap
smaller than a pointer-word, and a pointer-word was the granularity at
which noMorePtrs could stop scanning early anyway.
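
As a back-of-the-envelope check of that reservation (it mirrors the
size/goarch.PtrSize/8 expression used for arena chunks in the arena.go
hunk below), here is a small runnable sketch; the constants assume a
64-bit platform and an 8 KiB single-page span, and the helper name is
made up:

    package main

    import "fmt"

    const (
        ptrSize  = 8       // bytes per pointer-word on a 64-bit platform
        spanSize = 8 << 10 // an 8 KiB single-page span
    )

    // bitmapReserve returns how many bytes at the end of a span are set aside
    // for a 1-bit-per-pointer-word bitmap covering the whole span.
    func bitmapReserve(span int) int {
        return span / ptrSize / 8
    }

    func main() {
        // 8192 bytes / 8 bytes per word / 8 bits per byte = 128 bytes reserved.
        fmt.Println(bitmapReserve(spanSize))
    }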

Objects larger than 32 KiB (which get their own span) have their
headers stored directly in the span, so that power-of-two-sized
allocations do not spill over into an extra page.
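
A quick worked example of why the header lives in the span for these
large objects; the page size and object size below are illustrative:

    package main

    import "fmt"

    const pageSize = 8 << 10 // 8 KiB runtime page

    // pagesNeeded rounds an allocation size up to whole pages.
    func pagesNeeded(bytes int) int {
        return (bytes + pageSize - 1) / pageSize
    }

    func main() {
        obj := 64 << 10                   // a 64 KiB object fills exactly 8 pages
        fmt.Println(pagesNeeded(obj))     // 8, with the header kept in the span
        fmt.Println(pagesNeeded(obj + 8)) // 9, if the header word were inline
    }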

The full implementation is behind GOEXPERIMENT=allocheaders.

The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on demand, and it's also simpler to unroll than
before: the unrolled data is never written anywhere, and for arrays we
get the benefit of reusing the same type data for each element, as
opposed to looking it up from scratch for each pointer-word of bitmap.
Lastly, because we no longer have a heap bitmap that spans the entire
heap, there's a flat 1.5% reduction in memory use. This is offset
slightly by some objects being bumped up a size class to make room for
the header, but most objects are not sized tightly to their size class,
so there is usually slack to spare, making the header essentially free
in those cases.
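
Two quick numbers behind that paragraph, as a runnable sketch (64-bit
platform assumed; the size class values are a small excerpt of the
current table and should be treated as illustrative):

    package main

    import "fmt"

    // sizeClasses is a small excerpt of the runtime's size class table
    // around 512 bytes; treat the exact values as illustrative.
    var sizeClasses = []int{480, 512, 576, 640}

    // roundUp rounds a request up to the smallest size class that fits it.
    func roundUp(n int) int {
        for _, c := range sizeClasses {
            if n <= c {
                return c
            }
        }
        return n
    }

    func main() {
        // Old scheme: 1 bit of bitmap per 8-byte word = 1/64 of the heap, ~1.5%.
        fmt.Printf("old heap bitmap overhead: %.2f%%\n", 100.0/64.0)

        const header = 8
        // A 496-byte pointerful object absorbs the header within its class...
        fmt.Println(roundUp(496 + header)) // 512
        // ...while an exactly 512-byte one is bumped to the next class.
        fmt.Println(roundUp(512 + header)) // 576
    }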

For benchmark results, see the follow-up CL that turns on this
experiment by default (CL 538217).

Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
mknyszek committed Nov 9, 2023
1 parent 2586748 commit 38ac7c4
Showing 23 changed files with 1,131 additions and 526 deletions.
3 changes: 3 additions & 0 deletions src/cmd/compile/internal/test/inl_test.go
@@ -73,11 +73,13 @@ func TestIntendedInlining(t *testing.T) {
"gclinkptr.ptr",
"guintptr.ptr",
"writeHeapBitsForAddr",
"heapBitsSlice",
"markBits.isMarked",
"muintptr.ptr",
"puintptr.ptr",
"spanOf",
"spanOfUnchecked",
"typePointers.nextFast",
"(*gcWork).putFast",
"(*gcWork).tryGetFast",
"(*guintptr).set",
@@ -86,6 +88,7 @@ func TestIntendedInlining(t *testing.T) {
"(*mspan).base",
"(*mspan).markBitsForBase",
"(*mspan).markBitsForIndex",
"(*mspan).writeHeapBits",
"(*muintptr).set",
"(*puintptr).set",
"(*wbBuf).get1",
27 changes: 20 additions & 7 deletions src/reflect/all_test.go
@@ -7030,10 +7030,18 @@ func verifyGCBits(t *testing.T, typ Type, bits []byte) {
// e.g. with rep(2, lit(1, 0)).
bits = trimBitmap(bits)

if !bytes.Equal(heapBits, bits) {
_, _, line, _ := runtime.Caller(1)
t.Errorf("line %d: heapBits incorrect for %v\nhave %v\nwant %v", line, typ, heapBits, bits)
if bytes.HasPrefix(heapBits, bits) {
// Just the prefix matching is OK.
//
// The Go runtime's pointer/scalar iterator generates pointers beyond
// the size of the type, up to the size of the size class. This space
// is safe for the GC to scan since it's zero, and GCBits checks to
// make sure that's true. But we need to handle the fact that the bitmap
// may be larger than we expect.
return
}
_, _, line, _ := runtime.Caller(1)
t.Errorf("line %d: heapBits incorrect for %v\nhave %v\nwant %v", line, typ, heapBits, bits)
}

func verifyGCBitsSlice(t *testing.T, typ Type, cap int, bits []byte) {
@@ -7042,15 +7050,20 @@ func verifyGCBitsSlice(t *testing.T, typ Type, cap int, bits []byte) {
// repeat a bitmap for a small array or executing a repeat in
// a GC program.
val := MakeSlice(typ, 0, cap)
data := NewAt(ArrayOf(cap, typ.Elem()), val.UnsafePointer())
data := NewAt(typ.Elem(), val.UnsafePointer())
heapBits := GCBits(data.Interface())
// Repeat the bitmap for the slice size, trimming scalars in
// the last element.
bits = trimBitmap(rep(cap, bits))
if !bytes.Equal(heapBits, bits) {
_, _, line, _ := runtime.Caller(1)
t.Errorf("line %d: heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", line, typ, cap, heapBits, bits)
if bytes.Equal(heapBits, bits) {
return
}
if len(heapBits) > len(bits) && bytes.Equal(heapBits[:len(bits)], bits) {
// Just the prefix matching is OK.
return
}
_, _, line, _ := runtime.Caller(1)
t.Errorf("line %d: heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", line, typ, cap, heapBits, bits)
}

func TestGCBits(t *testing.T) {
70 changes: 49 additions & 21 deletions src/runtime/arena.go
@@ -83,6 +83,8 @@
package runtime

import (
"internal/goarch"
"internal/goexperiment"
"runtime/internal/atomic"
"runtime/internal/math"
"unsafe"
@@ -218,6 +220,19 @@ func init() {
lockInit(&userArenaState.lock, lockRankUserArenaState)
}

// userArenaChunkReserveBytes returns the amount of additional bytes to reserve for
// heap metadata.
func userArenaChunkReserveBytes() uintptr {
if goexperiment.AllocHeaders {
// In the allocation headers experiment, we reserve the end of the chunk for
// a pointer/scalar bitmap. We also reserve space for a dummy _type that
// refers to the bitmap. The PtrBytes field of the dummy _type indicates how
// many of those bits are valid.
return userArenaChunkBytes/goarch.PtrSize/8 + unsafe.Sizeof(_type{})
}
return 0
}

type userArena struct {
// full is a list of full chunks that have not enough free memory left, and
// that we'll free once this user arena is freed.
@@ -491,9 +506,9 @@ func (s *mspan) userArenaNextFree(typ *_type, cap int) unsafe.Pointer {
// Set up heap bitmap and do extra accounting.
if typ.PtrBytes != 0 {
if cap >= 0 {
userArenaHeapBitsSetSliceType(typ, cap, ptr, s.base())
userArenaHeapBitsSetSliceType(typ, cap, ptr, s)
} else {
userArenaHeapBitsSetType(typ, ptr, s.base())
userArenaHeapBitsSetType(typ, ptr, s)
}
c := getMCache(mp)
if c == nil {
@@ -523,13 +538,13 @@ func (s *mspan) userArenaNextFree(typ *_type, cap int) unsafe.Pointer {
// userArenaHeapBitsSetSliceType is the equivalent of heapBitsSetType but for
// Go slice backing store values allocated in a user arena chunk. It sets up the
// heap bitmap for n consecutive values with type typ allocated at address ptr.
func userArenaHeapBitsSetSliceType(typ *_type, n int, ptr unsafe.Pointer, base uintptr) {
func userArenaHeapBitsSetSliceType(typ *_type, n int, ptr unsafe.Pointer, s *mspan) {
mem, overflow := math.MulUintptr(typ.Size_, uintptr(n))
if overflow || n < 0 || mem > maxAlloc {
panic(plainError("runtime: allocation size out of range"))
}
for i := 0; i < n; i++ {
userArenaHeapBitsSetType(typ, add(ptr, uintptr(i)*typ.Size_), base)
userArenaHeapBitsSetType(typ, add(ptr, uintptr(i)*typ.Size_), s)
}
}

@@ -591,9 +606,12 @@ func newUserArenaChunk() (unsafe.Pointer, *mspan) {
// TODO(mknyszek): Track individual objects.
rzSize := computeRZlog(span.elemsize)
span.elemsize -= rzSize
span.limit -= rzSize
span.userArenaChunkFree = makeAddrRange(span.base(), span.limit)
asanpoison(unsafe.Pointer(span.limit), span.npages*pageSize-span.elemsize)
if goexperiment.AllocHeaders {
span.largeType.Size_ = span.elemsize
}
rzStart := span.base() + span.elemsize
span.userArenaChunkFree = makeAddrRange(span.base(), rzStart)
asanpoison(unsafe.Pointer(rzStart), span.limit-rzStart)
asanunpoison(unsafe.Pointer(span.base()), span.elemsize)
}

@@ -694,7 +712,7 @@ func (s *mspan) setUserArenaChunkToFault() {
// the span gets off the quarantine list. The main reason is so that the
// amount of bytes allocated doesn't exceed how much is counted as
// "mapped ready," which could cause a deadlock in the pacer.
gcController.totalFree.Add(int64(s.npages * pageSize))
gcController.totalFree.Add(int64(s.elemsize))

// Update consistent stats to match.
//
@@ -704,11 +722,11 @@
atomic.Xaddint64(&stats.committed, -int64(s.npages*pageSize))
atomic.Xaddint64(&stats.inHeap, -int64(s.npages*pageSize))
atomic.Xadd64(&stats.largeFreeCount, 1)
atomic.Xadd64(&stats.largeFree, int64(s.npages*pageSize))
atomic.Xadd64(&stats.largeFree, int64(s.elemsize))
memstats.heapStats.release()

// This counts as a free, so update heapLive.
gcController.update(-int64(s.npages*pageSize), 0)
gcController.update(-int64(s.elemsize), 0)

// Mark it as free for the race detector.
if raceenabled {
@@ -856,6 +874,10 @@ func (h *mheap) allocUserArenaChunk() *mspan {
spc := makeSpanClass(0, false)
h.initSpan(s, spanAllocHeap, spc, base, userArenaChunkPages)
s.isUserArenaChunk = true
s.elemsize -= userArenaChunkReserveBytes()
s.limit = s.base() + s.elemsize
s.freeindex = 1
s.allocCount = 1

// Account for this new arena chunk memory.
gcController.heapInUse.add(int64(userArenaChunkBytes))
@@ -866,22 +888,15 @@ func (h *mheap) allocUserArenaChunk() *mspan {
atomic.Xaddint64(&stats.committed, int64(userArenaChunkBytes))

// Model the arena as a single large malloc.
atomic.Xadd64(&stats.largeAlloc, int64(userArenaChunkBytes))
atomic.Xadd64(&stats.largeAlloc, int64(s.elemsize))
atomic.Xadd64(&stats.largeAllocCount, 1)
memstats.heapStats.release()

// Count the alloc in inconsistent, internal stats.
gcController.totalAlloc.Add(int64(userArenaChunkBytes))
gcController.totalAlloc.Add(int64(s.elemsize))

// Update heapLive.
gcController.update(int64(userArenaChunkBytes), 0)

// Put the large span in the mcentral swept list so that it's
// visible to the background sweeper.
h.central[spc].mcentral.fullSwept(h.sweepgen).push(s)
s.limit = s.base() + userArenaChunkBytes
s.freeindex = 1
s.allocCount = 1
gcController.update(int64(s.elemsize), 0)

// This must clear the entire heap bitmap so that it's safe
// to allocate noscan data without writing anything out.
@@ -902,6 +917,19 @@ func (h *mheap) allocUserArenaChunk() *mspan {
s.freeIndexForScan = 1

// Set up the range for allocation.
s.userArenaChunkFree = makeAddrRange(base, s.limit)
s.userArenaChunkFree = makeAddrRange(base, base+s.elemsize)

// Put the large span in the mcentral swept list so that it's
// visible to the background sweeper.
h.central[spc].mcentral.fullSwept(h.sweepgen).push(s)

if goexperiment.AllocHeaders {
// Set up an allocation header. Avoid write barriers here because this type
// is not a real type, and it exists in an invalid location.
*(*uintptr)(unsafe.Pointer(&s.largeType)) = uintptr(unsafe.Pointer(s.limit))
*(*uintptr)(unsafe.Pointer(&s.largeType.GCData)) = s.limit + unsafe.Sizeof(_type{})
s.largeType.PtrBytes = 0
s.largeType.Size_ = s.elemsize
}
return s
}
33 changes: 23 additions & 10 deletions src/runtime/cgocall.go
@@ -664,19 +664,32 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
if base == 0 {
return
}
n := span.elemsize
hbits := heapBitsForAddr(base, n)
for {
var addr uintptr
if hbits, addr = hbits.next(); addr == 0 {
break
if goexperiment.AllocHeaders {
tp := span.typePointersOfUnchecked(base)
for {
var addr uintptr
if tp, addr = tp.next(base + span.elemsize); addr == 0 {
break
}
pp := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(pp) && !isPinned(pp) {
panic(errorString(msg))
}
}
pp := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(pp) && !isPinned(pp) {
panic(errorString(msg))
} else {
n := span.elemsize
hbits := heapBitsForAddr(base, n)
for {
var addr uintptr
if hbits, addr = hbits.next(); addr == 0 {
break
}
pp := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(pp) && !isPinned(pp) {
panic(errorString(msg))
}
}
}

return
}

32 changes: 23 additions & 9 deletions src/runtime/cgocheck.go
@@ -9,6 +9,7 @@ package runtime

import (
"internal/goarch"
"internal/goexperiment"
"unsafe"
)

@@ -176,16 +177,29 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) {
}

// src must be in the regular heap.

hbits := heapBitsForAddr(uintptr(src), size)
for {
var addr uintptr
if hbits, addr = hbits.next(); addr == 0 {
break
if goexperiment.AllocHeaders {
tp := s.typePointersOf(uintptr(src), size)
for {
var addr uintptr
if tp, addr = tp.next(uintptr(src) + size); addr == 0 {
break
}
v := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(v) && !isPinned(v) {
throw(cgoWriteBarrierFail)
}
}
v := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(v) && !isPinned(v) {
throw(cgoWriteBarrierFail)
} else {
hbits := heapBitsForAddr(uintptr(src), size)
for {
var addr uintptr
if hbits, addr = hbits.next(); addr == 0 {
break
}
v := *(*unsafe.Pointer)(unsafe.Pointer(addr))
if cgoIsGoPointer(v) && !isPinned(v) {
throw(cgoWriteBarrierFail)
}
}
}
}
11 changes: 10 additions & 1 deletion src/runtime/export_test.go
@@ -9,6 +9,7 @@ package runtime
import (
"internal/abi"
"internal/goarch"
"internal/goexperiment"
"internal/goos"
"runtime/internal/atomic"
"runtime/internal/sys"
@@ -326,6 +327,14 @@ func BenchSetTypeSlice[T any](n int, resetTimer func(), len int) {
// no valid racectx, but if we're instantiated in the runtime_test package,
// we might accidentally cause runtime code to be incorrectly instrumented.
func benchSetType(n int, resetTimer func(), len int, x unsafe.Pointer, t *_type) {
// This benchmark doesn't work with the allocheaders experiment. It sets up
// an elaborate scenario to be able to benchmark the function safely, but doing
// this work for the allocheaders' version of the function would be complex.
// Just fail instead and rely on the test code making sure we never get here.
if goexperiment.AllocHeaders {
panic("called benchSetType with allocheaders experiment enabled")
}

// Compute the input sizes.
size := t.Size() * uintptr(len)

@@ -340,7 +349,7 @@ func benchSetType(n int, resetTimer func(), len int, x unsafe.Pointer, t *_type)

// Round up the size to the size class to make the benchmark a little more
// realistic. However, validate it, to make sure this is safe.
allocSize := roundupsize(size)
allocSize := roundupsize(size, t.PtrBytes == 0)
if s.npages*pageSize < allocSize {
panic("backing span not large enough for benchmark")
}
7 changes: 7 additions & 0 deletions src/runtime/gc_test.go
@@ -6,6 +6,7 @@ package runtime_test

import (
"fmt"
"internal/goexperiment"
"math/rand"
"os"
"reflect"
@@ -457,11 +458,17 @@ func BenchmarkSetTypeNode1024Slice(b *testing.B) {
}

func benchSetType[T any](b *testing.B) {
if goexperiment.AllocHeaders {
b.Skip("not supported with allocation headers experiment")
}
b.SetBytes(int64(unsafe.Sizeof(*new(T))))
runtime.BenchSetType[T](b.N, b.ResetTimer)
}

func benchSetTypeSlice[T any](b *testing.B, len int) {
if goexperiment.AllocHeaders {
b.Skip("not supported with allocation headers experiment")
}
b.SetBytes(int64(unsafe.Sizeof(*new(T)) * uintptr(len)))
runtime.BenchSetTypeSlice[T](b.N, b.ResetTimer, len)
}
11 changes: 9 additions & 2 deletions src/runtime/gcinfo_test.go
@@ -91,10 +91,17 @@ func TestGCInfo(t *testing.T) {

func verifyGCInfo(t *testing.T, name string, p any, mask0 []byte) {
mask := runtime.GCMask(p)
if !bytes.Equal(mask, mask0) {
t.Errorf("bad GC program for %v:\nwant %+v\ngot %+v", name, mask0, mask)
if bytes.HasPrefix(mask, mask0) {
// Just the prefix matching is OK.
//
// The Go runtime's pointer/scalar iterator generates pointers beyond
// the size of the type, up to the size of the size class. This space
// is safe for the GC to scan since it's zero, and GCBits checks to
// make sure that's true. But we need to handle the fact that the bitmap
// may be larger than we expect.
return
}
t.Errorf("bad GC program for %v:\nwant %+v\ngot %+v", name, mask0, mask)
}

func trimDead(mask []byte) []byte {
