runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by huge pages or not,
using heuristics based on density.
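
As background, the mechanism being managed here is the kernel's
transparent huge page advice. Below is a minimal, hedged sketch of that
mechanism using golang.org/x/sys/unix purely for illustration; the
runtime itself calls madvise through its own internal stubs, and the
advice only has an effect on kernels with THP support.

    // thp_sketch.go: an illustration of the madvise advice this change
    // manages. This is not runtime code.
    package main

    import (
        "log"

        "golang.org/x/sys/unix"
    )

    func main() {
        const chunkBytes = 4 << 20 // the 4 MiB page-chunk granularity described below

        // Reserve an anonymous mapping standing in for a page heap chunk.
        mem, err := unix.Mmap(-1, 0, chunkBytes,
            unix.PROT_READ|unix.PROT_WRITE, unix.MAP_ANON|unix.MAP_PRIVATE)
        if err != nil {
            log.Fatal(err)
        }
        defer unix.Munmap(mem)

        // Dense (or new) memory: allow the kernel to back it with huge pages.
        if err := unix.Madvise(mem, unix.MADV_HUGEPAGE); err != nil {
            log.Fatal(err)
        }

        // Sparse memory about to be scavenged: forbid huge pages so khugepaged
        // cannot recoalesce what is about to be released.
        if err := unix.Madvise(mem, unix.MADV_NOHUGEPAGE); err != nil {
            log.Fatal(err)
        }
    }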

The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce huge pages
   even after the scavenger breaks them up, resulting in significant
   memory overheads for applications with small heaps when those heaps
   shrink.
2. The Go runtime already has some heuristics about this, but those
   heuristics appear to have bit-rotted and result in haphazard
   hugepage management. Unlucky (but otherwise fairly dense) regions of
   memory end up not backed by huge pages while sparse regions end up
   accidentally marked MADV_HUGEPAGE and are not later broken up by the
   scavenger, because it already got the memory it needed from more
   dense sections (this is more likely to happen with small heaps that
   go idle).

In this change, the runtime uses a new policy:

1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
   cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
   scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.

This policy is intended to back memory that is a good candidate for
huge pages (high occupancy) with huge pages, and to give memory that is
not (low occupancy) to the scavenger. Occupancy here means not just
occupancy at any instant, but also expected occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
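
The following toy model illustrates the per-chunk bookkeeping this
policy implies. All names here (chunkState, pagesPerChunk,
denseFraction, observe, endCycle) are invented for illustration and do
not correspond to the runtime's internal data structures.

    // A toy model of the policy above, not the runtime's implementation.
    package main

    import "fmt"

    const (
        pagesPerChunk = 512  // 4 MiB chunk / 8 KiB runtime pages
        denseFraction = 0.96 // the "high density" threshold described below
    )

    // chunkState is an invented stand-in for the runtime's per-chunk state.
    type chunkState struct {
        denseThisCycle bool // rose above the threshold during this GC cycle
        idleCycles     int  // consecutive GC cycles spent below the threshold
    }

    // observe records a chunk's occupancy as pages are allocated into it.
    func (c *chunkState) observe(inUsePages int) {
        if float64(inUsePages) >= denseFraction*pagesPerChunk {
            c.denseThisCycle = true
        }
    }

    // endCycle runs at GC mark termination and decides the chunk's fate:
    // dense chunks stay MADV_HUGEPAGE and hidden from the scavenger; a chunk
    // that stayed sparse for a full cycle becomes a scavenging candidate.
    func (c *chunkState) endCycle() (hugePageBacked, visibleToScavenger bool) {
        if c.denseThisCycle {
            c.idleCycles = 0
        } else {
            c.idleCycles++
        }
        c.denseThisCycle = false
        return c.idleCycles == 0, c.idleCycles >= 1
    }

    func main() {
        var c chunkState
        c.observe(500)            // dense at some point during the cycle
        fmt.Println(c.endCycle()) // true false
        c.observe(40)             // sparse for the whole next cycle
        fmt.Println(c.endCycle()) // false true
    }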

Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now generally release memory with a delay of roughly one
GC cycle.
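
A hedged sketch of the ordering this paragraph describes, again using
golang.org/x/sys/unix purely for illustration; the runtime has its own
madvise path and prefers MADV_FREE over MADV_DONTNEED where available.

    // Package scavsketch illustrates the release ordering only; it is not
    // the runtime's scavenger.
    package scavsketch

    import "golang.org/x/sys/unix"

    // releaseSparseChunk first breaks up any huge pages backing the chunk and
    // forbids recoalescing, then releases the physical memory.
    func releaseSparseChunk(chunk []byte) error {
        if err := unix.Madvise(chunk, unix.MADV_NOHUGEPAGE); err != nil {
            return err
        }
        // MADV_DONTNEED keeps the example simple; the runtime prefers
        // MADV_FREE when the kernel supports it.
        return unix.Madvise(chunk, unix.MADV_DONTNEED)
    }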

Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_HUGEPAGE for around 1 GC cycle to mitigate churn.
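
For reference, these two cases correspond to the user-facing APIs in
the sketch below; the limit value is arbitrary.

    package main

    import "runtime/debug"

    func main() {
        // Scavenging to maintain a memory limit: the background scavenger
        // works toward this soft limit (256 MiB here, an arbitrary value).
        debug.SetMemoryLimit(256 << 20)

        // Explicitly returning memory to the OS: per the description above,
        // this path also sees all memory regardless of density.
        debug.FreeOSMemory()
    }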

The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
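
As a rough worked example of what that threshold means at chunk
granularity, assuming the usual 8 KiB runtime page size:

    package main

    import "fmt"

    func main() {
        const (
            pageSize   = 8 << 10               // 8 KiB runtime pages
            chunkBytes = 4 << 20               // 4 MiB page chunk
            pages      = chunkBytes / pageSize // 512 pages per chunk
        )
        // "High density" is 96% occupancy, i.e. roughly 491 of 512 pages in use.
        fmt.Println(pages * 96 / 100) // 491
    }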

Fixes #55328.

Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
mknyszek committed Apr 19, 2023
1 parent 1f9d80e commit 8fa9e3b
Showing 12 changed files with 791 additions and 373 deletions.
2 changes: 1 addition & 1 deletion src/runtime/debug/garbage_test.go
@@ -146,7 +146,7 @@ func TestFreeOSMemory(t *testing.T) {
return
}
if after.HeapReleased-before.HeapReleased < bigBytes-slack {
t.Fatalf("less than %d released: %d -> %d", bigBytes, before.HeapReleased, after.HeapReleased)
t.Fatalf("less than %d released: %d -> %d", bigBytes-slack, before.HeapReleased, after.HeapReleased)
}
}

127 changes: 99 additions & 28 deletions src/runtime/export_test.go
@@ -831,7 +831,7 @@ func (p *PageAlloc) Free(base, npages uintptr) {
// None of the tests need any higher-level locking, so we just
// take the lock internally.
lock(pp.mheapLock)
pp.free(base, npages, true)
pp.free(base, npages)
unlock(pp.mheapLock)
})
}
@@ -841,7 +841,7 @@ func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) {
pp := (*pageAlloc)(p)
systemstack(func() {
r = pp.scavenge(nbytes, nil)
r = pp.scavenge(nbytes, nil, true)
})
return
}
@@ -995,9 +995,8 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
p := new(pageAlloc)

// We've got an entry, so initialize the pageAlloc.
p.init(new(mutex), testSysStat)
p.init(new(mutex), testSysStat, true)
lockInit(p.mheapLock, lockRankMheap)
p.test = true
for i, init := range chunks {
addr := chunkBase(chunkIdx(i))

@@ -1009,11 +1008,18 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
})

// Initialize the bitmap and update pageAlloc metadata.
chunk := p.chunkOf(chunkIndex(addr))
ci := chunkIndex(addr)
chunk := p.chunkOf(ci)

// Clear all the scavenged bits which grow set.
chunk.scavenged.clearRange(0, pallocChunkPages)

// Simulate the allocation and subsequent free of all pages in
// the chunk for the scavenge index. This sets the state equivalent
// with all pages within the index being free.
p.scav.index.alloc(ci, pallocChunkPages)
p.scav.index.free(ci, 0, pallocChunkPages)

// Apply scavenge state if applicable.
if scav != nil {
if scvg, ok := scav[i]; ok {
@@ -1033,19 +1039,10 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
// it and it's a no-op anyway.
if s.N != 0 {
chunk.allocRange(s.I, s.N)
}
}

// Make sure the scavenge index is updated.
//
// This is an inefficient way to do it, but it's also the simplest way.
minPages := physPageSize / pageSize
if minPages < 1 {
minPages = 1
}
_, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, minPages)
if npages != 0 {
p.scav.index.mark(addr, addr+pallocChunkBytes)
// Make sure the scavenge index is updated.
p.scav.index.alloc(ci, s.N)
}
}

// Update heap metadata for the allocRange calls above.
@@ -1070,8 +1067,6 @@ func FreePageAlloc(pp *PageAlloc) {
for l := 0; l < summaryLevels; l++ {
sysFreeOS(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes)
}
// Only necessary on 64-bit. This is a global on 32-bit.
sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks)))
} else {
resSize := uintptr(0)
for _, s := range p.summary {
@@ -1080,6 +1075,9 @@ func FreePageAlloc(pp *PageAlloc) {
sysFreeOS(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize))
}

// Free extra data structures.
sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks))*unsafe.Sizeof(atomicScavChunkData{}))

// Subtract back out whatever we mapped for the summaries.
// sysUsed adds to p.sysStat and memstats.mappedReady no matter what
// (and in anger should actually be accounted for), and there's no other
@@ -1629,23 +1627,96 @@ type ScavengeIndex struct {

func NewScavengeIndex(min, max ChunkIdx) *ScavengeIndex {
s := new(ScavengeIndex)
s.i.chunks = make([]atomic.Uint8, uintptr(1<<heapAddrBits/pallocChunkBytes/8))
s.i.min.Store(int32(min / 8))
s.i.max.Store(int32(max / 8))
// This is a bit lazy but we easily guarantee we'll be able
// to reference all the relevant chunks. The worst-case
// memory usage here is 512 MiB, but tests generally use
// small offsets from BaseChunkIdx, which results in ~100s
// of KiB in memory use.
//
// This may still be worth making better, at least by sharing
// this fairly large array across calls with a sync.Pool or
// something. Currently, when the tests are run serially,
// it takes around 0.5s. Not all that much, but if we have
// a lot of tests like this it could add up.
s.i.chunks = make([]atomicScavChunkData, max)
s.i.min.Store(uintptr(min))
s.i.max.Store(uintptr(max))
s.i.test = true
return s
}

func (s *ScavengeIndex) Find() (ChunkIdx, uint) {
ci, off := s.i.find()
func (s *ScavengeIndex) Find(force bool) (ChunkIdx, uint) {
ci, off := s.i.find(force)
return ChunkIdx(ci), off
}

func (s *ScavengeIndex) Mark(base, limit uintptr) {
s.i.mark(base, limit)
func (s *ScavengeIndex) AllocRange(base, limit uintptr) {
sc, ec := chunkIndex(base), chunkIndex(limit-1)
si, ei := chunkPageIndex(base), chunkPageIndex(limit-1)

if sc == ec {
// The range doesn't cross any chunk boundaries.
s.i.alloc(sc, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
s.i.alloc(sc, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
s.i.alloc(c, pallocChunkPages)
}
s.i.alloc(ec, ei+1)
}
}

func (s *ScavengeIndex) FreeRange(base, limit uintptr) {
sc, ec := chunkIndex(base), chunkIndex(limit-1)
si, ei := chunkPageIndex(base), chunkPageIndex(limit-1)

if sc == ec {
// The range doesn't cross any chunk boundaries.
s.i.free(sc, si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
s.i.free(sc, si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
s.i.free(c, 0, pallocChunkPages)
}
s.i.free(ec, 0, ei+1)
}
}

func (s *ScavengeIndex) ResetSearchAddrs() {
for _, a := range []*atomicOffAddr{&s.i.searchAddrBg, &s.i.searchAddrForce} {
addr, marked := a.Load()
if marked {
a.StoreUnmark(addr, addr)
}
a.Clear()
}
s.i.freeHWM = minOffAddr
}

func (s *ScavengeIndex) NextGen() {
s.i.nextGen()
}

func (s *ScavengeIndex) SetEmpty(ci ChunkIdx) {
s.i.setEmpty(chunkIdx(ci))
}

func (s *ScavengeIndex) Clear(ci ChunkIdx) {
s.i.clear(chunkIdx(ci))
func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) bool {
return s.i.setNoHugePage(chunkIdx(ci))
}

func CheckPackScavChunkData(gen uint32, inUse, lastInUse uint16, flags uint8) bool {
sc0 := scavChunkData{
gen: gen,
inUse: inUse,
lastInUse: lastInUse,
scavChunkFlags: scavChunkFlags(flags),
}
scp := sc0.pack()
sc1 := unpackScavChunkData(scp)
return sc0 == sc1
}

const GTrackingPeriod = gTrackingPeriod
70 changes: 0 additions & 70 deletions src/runtime/mem_linux.go
@@ -37,64 +37,6 @@ func sysAllocOS(n uintptr) unsafe.Pointer {
var adviseUnused = uint32(_MADV_FREE)

func sysUnusedOS(v unsafe.Pointer, n uintptr) {
// By default, Linux's "transparent huge page" support will
// merge pages into a huge page if there's even a single
// present regular page, undoing the effects of madvise(adviseUnused)
// below. On amd64, that means khugepaged can turn a single
// 4KB page to 2MB, bloating the process's RSS by as much as
// 512X. (See issue #8832 and Linux kernel bug
// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
//
// To work around this, we explicitly disable transparent huge
// pages when we release pages of the heap. However, we have
// to do this carefully because changing this flag tends to
// split the VMA (memory mapping) containing v in to three
// VMAs in order to track the different values of the
// MADV_NOHUGEPAGE flag in the different regions. There's a
// default limit of 65530 VMAs per address space (sysctl
// vm.max_map_count), so we must be careful not to create too
// many VMAs (see issue #12233).
//
// Since huge pages are huge, there's little use in adjusting
// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
// exploding the number of VMAs by only adjusting the
// MADV_NOHUGEPAGE flag on a large granularity. This still
// gets most of the benefit of huge pages while keeping the
// number of VMAs under control. With hugePageSize = 2MB, even
// a pessimal heap can reach 128GB before running out of VMAs.
if physHugePageSize != 0 {
// If it's a large allocation, we want to leave huge
// pages enabled. Hence, we only adjust the huge page
// flag on the huge pages containing v and v+n-1, and
// only if those aren't aligned.
var head, tail uintptr
if uintptr(v)&(physHugePageSize-1) != 0 {
// Compute huge page containing v.
head = alignDown(uintptr(v), physHugePageSize)
}
if (uintptr(v)+n)&(physHugePageSize-1) != 0 {
// Compute huge page containing v+n-1.
tail = alignDown(uintptr(v)+n-1, physHugePageSize)
}

// Note that madvise will return EINVAL if the flag is
// already set, which is quite likely. We ignore
// errors.
if head != 0 && head+physHugePageSize == tail {
// head and tail are different but adjacent,
// so do this in one call.
madvise(unsafe.Pointer(head), 2*physHugePageSize, _MADV_NOHUGEPAGE)
} else {
// Advise the huge pages containing v and v+n-1.
if head != 0 {
madvise(unsafe.Pointer(head), physHugePageSize, _MADV_NOHUGEPAGE)
}
if tail != 0 && tail != head {
madvise(unsafe.Pointer(tail), physHugePageSize, _MADV_NOHUGEPAGE)
}
}
}

if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 {
// madvise will round this to any physical page
// *covered* by this range, so an unaligned madvise
@@ -133,19 +75,7 @@ func sysUsedOS(v unsafe.Pointer, n uintptr) {
throw("runtime: cannot remap pages in address space")
}
return

// Don't do the sysHugePage optimization in hard decommit mode.
// We're breaking up pages everywhere, there's no point.
}
// Partially undo the NOHUGEPAGE marks from sysUnused
// for whole huge pages between v and v+n. This may
// leave huge pages off at the end points v and v+n
// even though allocations may cover these entire huge
// pages. We could detect this and undo NOHUGEPAGE on
// the end points as well, but it's probably not worth
// the cost because when neighboring allocations are
// freed sysUnused will just set NOHUGEPAGE again.
sysHugePageOS(v, n)
}

func sysHugePageOS(v unsafe.Pointer, n uintptr) {
6 changes: 6 additions & 0 deletions src/runtime/mgc.go
@@ -1069,6 +1069,12 @@ func gcMarkTermination() {
injectglist(&work.sweepWaiters.list)
unlock(&work.sweepWaiters.lock)

// Increment the scavenge generation now.
//
// This moment represents peak heap in use because we're
// about to start sweeping.
mheap_.pages.scav.index.nextGen()

// Release the CPU limiter.
gcCPULimiter.finishGCTransition(now)
