runtime: support a two-level arena map

Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.

However, there are two problems with this:

1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.

2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena map, these are inversely proportional.

This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
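
The split is mediated by an arenaIdx type with l1 and l2 accessors, defined
in mheap.go (whose hunk is not reproduced below). A rough, illustrative
sketch of those helpers, using the constant names that appear in the
malloc.go hunk — the bodies here are a sketch, not quoted from the change:

type arenaIdx uint

// l1 returns the index into the first-level (L1) arena map.
func (i arenaIdx) l1() uint {
	if arenaL1Bits == 0 {
		// No L1 map; the compiler can fold this to a constant 0.
		return 0
	}
	return uint(i) >> arenaL1Shift
}

// l2 returns the index into a second-level (L2) arena map.
func (i arenaIdx) l2() uint {
	if arenaL1Bits == 0 {
		return uint(i)
	}
	return uint(i) & (1<<arenaL2Bits - 1)
}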

At the moment, arenaL1Bits is always 0, so we effectively have a
single-level map. We do a few things so that this has no cost beyond
the current single-level map:

1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.

2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently
   (see the sketch after this list).

3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
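
As a concrete illustration of point 2: the map is read through both levels,
but with arenaL1Bits = 0 the l1() index is the constant 0, so the generated
code can collapse the double index to essentially the old single-level
lookup. The helper below is hypothetical (the runtime open-codes this
pattern, e.g. in heapBitsForAddr, rather than defining such a function); it
only shows the shape of the access:

// arenaOf is a hypothetical helper showing the two-level lookup.
// With arenaL1Bits == 0, ai.l1() is the constant 0, so this is
// effectively a single indexed load, as with the old one-level map.
func arenaOf(p uintptr) *heapArena {
	ai := arenaIndex(p)
	return mheap_.arenas[ai.l1()][ai.l2()]
}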

Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:

name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)

(https://perf.golang.org/search?q=upload:20180223.2)

For #23900.

Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
aclements committed Feb 23, 2018
1 parent 2dbf15e commit ec25210564562571aeb39cdfd6e02270d7f3fb1d
Showing with 197 additions and 70 deletions.
  1. +15 −0 src/cmd/compile/internal/gc/inl.go
  2. +9 −3 src/runtime/heapdump.go
  3. +57 −20 src/runtime/malloc.go
  4. +49 −26 src/runtime/mbitmap.go
  5. +67 −21 src/runtime/mheap.go
@@ -304,6 +304,21 @@ func (v *hairyVisitor) visit(n *Node) bool {
 		if t.Nname() == nil {
 			Fatalf("no function definition for [%p] %+v\n", t, t)
 		}
+		if isRuntimePkg(n.Left.Sym.Pkg) {
+			fn := n.Left.Sym.Name
+			if fn == "heapBits.nextArena" {
+				// Special case: explicitly allow
+				// mid-stack inlining of
+				// runtime.heapBits.next even though
+				// it calls slow-path
+				// runtime.heapBits.nextArena.
+				//
+				// TODO(austin): Once mid-stack
+				// inlining is the default, remove
+				// this special case.
+				break
+			}
+		}
 		if inlfn := asNode(t.FuncType().Nname).Func; inlfn.Inl.Len() != 0 {
 			v.budget -= inlfn.InlCost
 			break
@@ -489,9 +489,15 @@ func dumpparams() {
 	}
 	dumpint(sys.PtrSize)
 	var arenaStart, arenaEnd uintptr
-	for i, ha := range mheap_.arenas {
-		if ha != nil {
-			base := arenaBase(uint(i))
+	for i1 := range mheap_.arenas {
+		if mheap_.arenas[i1] == nil {
+			continue
+		}
+		for i, ha := range mheap_.arenas[i1] {
+			if ha == nil {
+				continue
+			}
+			base := arenaBase(arenaIdx(i1)<<arenaL1Shift | arenaIdx(i))
 			if arenaStart == 0 || base < arenaStart {
 				arenaStart = base
 			}
@@ -92,8 +92,10 @@
 // Since arenas are aligned, the address space can be viewed as a
 // series of arena frames. The arena map (mheap_.arenas) maps from
 // arena frame number to *heapArena, or nil for parts of the address
-// space not backed by the Go heap. Since arenas are large, the arena
-// index is just a single-level mapping.
+// space not backed by the Go heap. The arena map is structured as a
+// two-level array consisting of a "L1" arena map and many "L2" arena
+// maps; however, since arenas are large, on many architectures, the
+// arena map consists of a single, large L2 map.
 //
 // The arena map covers the entire possible address space, allowing
 // the Go heap to use any part of the address space. The allocator
@@ -202,11 +204,6 @@ const (
 	// space because doing so is cheap.
 	// mips32 only has access to the low 2GB of virtual memory, so
 	// we further limit it to 31 bits.
-	//
-	// The size of the arena map is proportional to
-	// 1<<heapAddrBits, so it's important that this not be too
-	// large. 48 bits is about the threshold; above that we would
-	// need to go to a two level arena map.
 	heapAddrBits = _64bit*48 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
 
 	// maxAlloc is the maximum size of an allocation. On 64-bit,
@@ -219,13 +216,49 @@ const (
 	// heapArenaBytes is the size of a heap arena. The heap
 	// consists of mappings of size heapArenaBytes, aligned to
 	// heapArenaBytes. The initial heap mapping is one arena.
-	heapArenaBytes = (64<<20)*_64bit + (4<<20)*(1-_64bit)
+	//
+	// This is currently 64MB on 64-bit and 4MB on 32-bit.
+	heapArenaBytes = 1 << logHeapArenaBytes
 
+	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
+	// prefer using heapArenaBytes where possible (we need the
+	// constant to compute some other constants).
+	logHeapArenaBytes = (6+20)*_64bit + (2+20)*(1-_64bit)
+
 	// heapArenaBitmapBytes is the size of each heap arena's bitmap.
 	heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
 
 	pagesPerArena = heapArenaBytes / pageSize
 
+	// arenaL1Bits is the number of bits of the arena number
+	// covered by the first level arena map.
+	//
+	// This number should be small, since the first level arena
+	// map requires PtrSize*(1<<arenaL1Bits) of space in the
+	// binary's BSS. It can be zero, in which case the first level
+	// index is effectively unused. There is a performance benefit
+	// to this, since the generated code can be more efficient,
+	// but comes at the cost of having a large L2 mapping.
+	arenaL1Bits = 0
+
+	// arenaL2Bits is the number of bits of the arena number
+	// covered by the second level arena index.
+	//
+	// The size of each arena map allocation is proportional to
+	// 1<<arenaL2Bits, so it's important that this not be too
+	// large. 48 bits leads to 32MB arena index allocations, which
+	// is about the practical threshold.
+	arenaL2Bits = heapAddrBits - logHeapArenaBytes - arenaL1Bits
+
+	// arenaL1Shift is the number of bits to shift an arena frame
+	// number by to compute an index into the first level arena map.
+	arenaL1Shift = arenaL2Bits
+
+	// arenaBits is the total bits in a combined arena map index.
+	// This is split between the index into the L1 arena map and
+	// the L2 arena map.
+	arenaBits = arenaL1Bits + arenaL2Bits
+
 	// arenaBaseOffset is the pointer value that corresponds to
 	// index 0 in the heap arena map.
 	//
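
To make the arithmetic behind these constants concrete, the 64-bit values
they imply are as follows (the 32 MB figure is the one mentioned in the
arenaL2Bits comment above):

// 64-bit values implied by the constants above:
//	heapAddrBits      = 48
//	logHeapArenaBytes = 26                 // 64 MB arenas
//	arenaL1Bits       = 0
//	arenaL2Bits       = 48 - 26 - 0 = 22
//	L2 map size       = (1 << 22) * 8 bytes = 32 MB of *heapArena slots
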
@@ -323,12 +356,6 @@ func mallocinit() {
throw("bad system page size")
}

// Map the arena map. Most of this will never be written to,
mheap_.arenas = (*[(1 << heapAddrBits) / heapArenaBytes]*heapArena)(persistentalloc(unsafe.Sizeof(*mheap_.arenas), sys.PtrSize, nil))
if mheap_.arenas == nil {
throw("failed to allocate arena map")
}

// Initialize the heap.
mheap_.init()
_g_ := getg()
@@ -398,7 +425,7 @@ func mallocinit() {
 		// 3. We try to stake out a reasonably large initial
 		// heap reservation.
 
-		const arenaMetaSize = unsafe.Sizeof(heapArena{}) * uintptr(len(*mheap_.arenas))
+		const arenaMetaSize = unsafe.Sizeof([1 << arenaBits]heapArena{})
 		meta := uintptr(sysReserve(nil, arenaMetaSize))
 		if meta != 0 {
 			mheap_.heapArenaAlloc.init(meta, arenaMetaSize)
@@ -476,7 +503,7 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 		if p+n < p {
 			// We can't use this, so don't ask.
 			v = nil
-		} else if arenaIndex(p+n-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+n-1) >= 1<<arenaBits {
 			// Outside addressable heap. Can't use.
 			v = nil
 		} else {
@@ -528,9 +555,9 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 		p := uintptr(v)
 		if p+size < p {
 			bad = "region exceeds uintptr range"
-		} else if arenaIndex(p) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p) >= 1<<arenaBits {
 			bad = "base outside usable address space"
-		} else if arenaIndex(p+size-1) >= uint(len(mheap_.arenas)) {
+		} else if arenaIndex(p+size-1) >= 1<<arenaBits {
 			bad = "end outside usable address space"
 		}
 		if bad != "" {
@@ -551,7 +578,17 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
 mapped:
 	// Create arena metadata.
 	for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
-		if h.arenas[ri] != nil {
+		l2 := h.arenas[ri.l1()]
+		if l2 == nil {
+			// Allocate an L2 arena map.
+			l2 = (*[1 << arenaL2Bits]*heapArena)(persistentalloc(unsafe.Sizeof(*l2), sys.PtrSize, nil))
+			if l2 == nil {
+				throw("out of memory allocating heap arena map")
+			}
+			atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri.l1()]), unsafe.Pointer(l2))
+		}
+
+		if l2[ri.l2()] != nil {
 			throw("arena already initialized")
 		}
 		var r *heapArena
@@ -567,7 +604,7 @@ mapped:
 		// new heap arena becomes visible before the heap lock
 		// is released (which shouldn't happen, but there's
 		// little downside to this).
-		atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri]), unsafe.Pointer(r))
+		atomic.StorepNoWB(unsafe.Pointer(&l2[ri.l2()]), unsafe.Pointer(r))
 	}
 
 	// Tell the race detector about the new heap memory.
@@ -332,21 +332,23 @@ func (m *markBits) advance() {
 //
 // nosplit because it is used during write barriers and must not be preempted.
 //go:nosplit
-func heapBitsForAddr(addr uintptr) heapBits {
+func heapBitsForAddr(addr uintptr) (h heapBits) {
 	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
-	off := addr / sys.PtrSize
 	arena := arenaIndex(addr)
-	ha := mheap_.arenas[arena]
+	ha := mheap_.arenas[arena.l1()][arena.l2()]
 	// The compiler uses a load for nil checking ha, but in this
 	// case we'll almost never hit that cache line again, so it
 	// makes more sense to do a value check.
 	if ha == nil {
-		// addr is not in the heap. Crash without inhibiting inlining.
-		_ = *ha
+		// addr is not in the heap. Return nil heapBits, which
+		// we expect to crash in the caller.
+		return
 	}
-	bitp := &ha.bitmap[(off/4)%heapArenaBitmapBytes]
-	last := &ha.bitmap[len(ha.bitmap)-1]
-	return heapBits{bitp, uint32(off & 3), uint32(arena), last}
+	h.bitp = &ha.bitmap[(addr/(sys.PtrSize*4))%heapArenaBitmapBytes]
+	h.shift = uint32((addr / sys.PtrSize) & 3)
+	h.arena = uint32(arena)
+	h.last = &ha.bitmap[len(ha.bitmap)-1]
+	return
 }
 
 // findObject returns the base address for the heap object containing
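
For orientation, the per-arena bitmap indexing that heapBitsForAddr now
computes works out as follows on 64-bit (sys.PtrSize = 8, 64 MB arenas); this
is just the arithmetic implied by the expressions above, not code from the
change:

// For a word-aligned address addr:
//	bitmap byte:   (addr / 32) % heapArenaBitmapBytes   // addr/(sys.PtrSize*4); one byte covers 4 words
//	shift in byte: (addr / 8) & 3                        // which of the byte's 4 entries
// With 64 MB arenas, heapArenaBitmapBytes = 64 MB / 32 = 2 MB per arena.
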
@@ -432,18 +434,36 @@ func (h heapBits) next() heapBits {
 		h.bitp, h.shift = add1(h.bitp), 0
 	} else {
 		// Move to the next arena.
-		h.arena++
-		a := mheap_.arenas[h.arena]
-		if a == nil {
-			// We just passed the end of the object, which
-			// was also the end of the heap. Poison h. It
-			// should never be dereferenced at this point.
-			h.bitp, h.last = nil, nil
-		} else {
-			h.bitp, h.shift = &a.bitmap[0], 0
-			h.last = &a.bitmap[len(a.bitmap)-1]
-		}
+		return h.nextArena()
 	}
 	return h
 }
 
+// nextArena advances h to the beginning of the next heap arena.
+//
+// This is a slow-path helper to next. gc's inliner knows that
+// heapBits.next can be inlined even though it calls this. This is
+// marked noinline so it doesn't get inlined into next and cause next
+// to be too big to inline.
+//
+//go:nosplit
+//go:noinline
+func (h heapBits) nextArena() heapBits {
+	h.arena++
+	ai := arenaIdx(h.arena)
+	l2 := mheap_.arenas[ai.l1()]
+	if l2 == nil {
+		// We just passed the end of the object, which
+		// was also the end of the heap. Poison h. It
+		// should never be dereferenced at this point.
+		return heapBits{}
+	}
+	ha := l2[ai.l2()]
+	if ha == nil {
+		return heapBits{}
+	}
+	h.bitp, h.shift = &ha.bitmap[0], 0
+	h.last = &ha.bitmap[len(ha.bitmap)-1]
+	return h
+}

@@ -465,12 +485,13 @@ func (h heapBits) forward(n uintptr) heapBits {
 	// We're in a new heap arena.
 	past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
 	h.arena += 1 + uint32(past/heapArenaBitmapBytes)
-	a := mheap_.arenas[h.arena]
-	if a == nil {
-		h.bitp, h.last = nil, nil
-	} else {
+	ai := arenaIdx(h.arena)
+	if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil {
+		a := l2[ai.l2()]
 		h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
 		h.last = &a.bitmap[len(a.bitmap)-1]
+	} else {
+		h.bitp, h.last = nil, nil
 	}
 	return h
 }
@@ -971,7 +992,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// machine instructions.
 
 	outOfPlace := false
-	if arenaIndex(x+size-1) != uint(h.arena) {
+	if arenaIndex(x+size-1) != arenaIdx(h.arena) {
 		// This object spans heap arenas, so the bitmap may be
 		// discontiguous. Unroll it into the object instead
 		// and then copy it out.
@@ -1375,12 +1396,14 @@ Phase4:
 		// x+size may not point to the heap, so back up one
 		// word and then call next().
 		end := heapBitsForAddr(x + size - sys.PtrSize).next()
-		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[end.arena].bitmap[0])) {
+		endAI := arenaIdx(end.arena)
+		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0])) {
 			// The unrolling code above walks hbitp just
 			// past the bitmap without moving to the next
 			// arena. Synthesize this for end.bitp.
-			end.bitp = addb(&mheap_.arenas[end.arena-1].bitmap[0], heapArenaBitmapBytes)
+			end.arena--
+			endAI = arenaIdx(end.arena)
+			end.bitp = addb(&mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0], heapArenaBitmapBytes)
 			end.last = nil
 		}
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
