Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: windows tests fail with race detector #23900

Closed
alexbrainman opened this issue Feb 18, 2018 · 13 comments

Comments

Projects
None yet
4 participants
@alexbrainman
Copy link
Member

commented Feb 18, 2018

windows-adm64-2008 and windows-adm64-2012 builders fail with tests that use race detector. Like:

##### Testing race detector
--- FAIL: TestOutput (6.19s)
	output_test.go:29: go install -race: exit status 2
		# strconv
		runtime: VirtualAlloc of 67108864 bytes failed with errno=1455
		fatal error: out of memory
		
		runtime stack:
		runtime.throw(0xc3cb28, 0xd)
			C:/workdir/go/src/runtime/panic.go:619 +0x88 fp=0x22fa80 sp=0x22fa60 pc=0x42cba8
		runtime.sysMap(0xc000000000, 0x4000000, 0x1050ef8)
			C:/workdir/go/src/runtime/mem_windows.go:122 +0x13b fp=0x22fac8 sp=0x22fa80 pc=0x415b6b
		runtime.(*mheap).sysAlloc(0x102b920, 0x4000000, 0x0, 0xc0000139)
			C:/workdir/go/src/runtime/malloc.go:513 +0x1be fp=0x22fb38 sp=0x22fac8 pc=0x409e4e
		runtime.(*mheap).grow(0x102b920, 0x1, 0x0)
			C:/workdir/go/src/runtime/mheap.go:849 +0x49 fp=0x22fb90 sp=0x22fb38 pc=0x422919
		runtime.(*mheap).allocSpanLocked(0x102b920, 0x1, 0x1050f08, 0x458f4e)
			C:/workdir/go/src/runtime/mheap.go:777 +0x335 fp=0x22fbd0 sp=0x22fb90 pc=0x422795
		runtime.(*mheap).alloc_m(0x102b920, 0x1, 0x23, 0xe659100001000)
			C:/workdir/go/src/runtime/mheap.go:624 +0x126 fp=0x22fc10 sp=0x22fbd0 pc=0x421fb6
		runtime.(*mheap).alloc.func1()
			C:/workdir/go/src/runtime/mheap.go:691 +0x54 fp=0x22fc48 sp=0x22fc10 pc=0x453484
		runtime.(*mheap).alloc(0x102b920, 0x1, 0x10023, 0x0)
			C:/workdir/go/src/runtime/mheap.go:690 +0x91 fp=0x22fc98 sp=0x22fc48 pc=0x422261
		runtime.(*mcentral).grow(0x102d518, 0x0)
			C:/workdir/go/src/runtime/mcentral.go:232 +0x9b fp=0x22fce0 sp=0x22fc98 pc=0x41537b
		runtime.(*mcentral).cacheSpan(0x102d518, 0x40000)
			C:/workdir/go/src/runtime/mcentral.go:106 +0x2eb fp=0x22fd28 sp=0x22fce0 pc=0x414ebb
		runtime.(*mcache).refill(0x2e0000, 0x1050f23)
			C:/workdir/go/src/runtime/mcache.go:122 +0xaa fp=0x22fd58 sp=0x22fd28 pc=0x414a7a
		runtime.(*mcache).nextFree.func1()
			C:/workdir/go/src/runtime/malloc.go:633 +0x39 fp=0x22fd78 sp=0x22fd58 pc=0x4527e9
		runtime.(*mcache).nextFree(0x2e0000, 0x23, 0x1050f60, 0x2e4000, 0x22fe10)
			C:/workdir/go/src/runtime/malloc.go:632 +0xb0 fp=0x22fdd0 sp=0x22fd78 pc=0x40a410
		runtime.mallocgc(0x100, 0xbdad80, 0x22fe01, 0x409ade)
			C:/workdir/go/src/runtime/malloc.go:787 +0x7de fp=0x22fe70 sp=0x22fdd0 pc=0x40ad9e
		runtime.newobject(0xbdad80, 0x101f340)
			C:/workdir/go/src/runtime/malloc.go:916 +0x3f fp=0x22fea0 sp=0x22fe70 pc=0x40b19f
		runtime.mcommoninit(0x101f340)
			C:/workdir/go/src/runtime/proc.go:579 +0x131 fp=0x22fed8 sp=0x22fea0 pc=0x42f851
		runtime.schedinit()
			C:/workdir/go/src/runtime/proc.go:486 +0x6d fp=0x22ff30 sp=0x22fed8 pc=0x42f4cd
		runtime.rt0_go(0x22ff60, 0x776059cd, 0x22ff60, 0x0, 0x776059cd, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
			C:/workdir/go/src/runtime/asm_amd64.s:252 +0x200 fp=0x22ff38 sp=0x22ff30 pc=0x4552e0
		# bytes
		fatal error: failed to allocate arena index
		runtime: panic before malloc heap initialized
		
		runtime stack:
		runtime.throw(0xc48599, 0x1e)
			C:/workdir/go/src/runtime/panic.go:619 +0x88 fp=0x22fe90 sp=0x22fe70 pc=0x42cba8
		runtime.mallocinit()
			C:/workdir/go/src/runtime/malloc.go:308 +0x1b1 fp=0x22fed8 sp=0x22fe90 pc=0x409b51
		runtime.schedinit()
			C:/workdir/go/src/runtime/proc.go:485 +0x5b fp=0x22ff30 sp=0x22fed8 pc=0x42f4bb
		runtime.rt0_go(0x22ff60, 0x776059cd, 0x22ff60, 0x0, 0x776059cd, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
			C:/workdir/go/src/runtime/asm_amd64.s:252 +0x200 fp=0x22ff38 sp=0x22ff30 pc=0x4552e0
FAIL
FAIL	runtime/race	6.469s
2018/02/17 18:49:44 Failed: exit status 1
ok  	flag	1.188s
ok  	os	1.068s [no tests to run]
--- FAIL: TestEcho (0.17s)
	exec_test.go:54: echo: exit status 2
	exec_test.go:57: echo: want "foo bar baz\n", got ""
FAIL
FAIL	os/exec	1.341s
ok  	encoding/gob	1.055s
2018/02/17 18:49:45 Failed: exit status 1
PASS
scatter = 000000000076C9E0
hello from C
sqrt is: 0
ok  	_/C_/workdir/go/misc/cgo/test	2.604s
runtime: VirtualAlloc of 67108864 bytes failed with errno=1455
fatal error: out of memory

runtime stack:
runtime.throw(0x5bfb6f, 0xd)
	C:/workdir/go/src/runtime/panic.go:619 +0x88 fp=0x22f910 sp=0x22f8f0 pc=0x4314f8
runtime.sysMap(0xc000000000, 0x4000000, 0x6f7138)
	C:/workdir/go/src/runtime/mem_windows.go:122 +0x13b fp=0x22f958 sp=0x22f910 pc=0x41981b
runtime.(*mheap).sysAlloc(0x6dcb20, 0x4000000, 0x3e0000, 0x48)
	C:/workdir/go/src/runtime/malloc.go:513 +0x2bd fp=0x22f9c8 sp=0x22f958 pc=0x40b58d
runtime.(*mheap).grow(0x6dcb20, 0x1, 0x0)
	C:/workdir/go/src/runtime/mheap.go:849 +0x49 fp=0x22fa20 sp=0x22f9c8 pc=0x426b59
runtime.(*mheap).allocSpanLocked(0x6dcb20, 0x1, 0x6f7148, 0x56000003000)
	C:/workdir/go/src/runtime/mheap.go:777 +0x35f fp=0x22fa78 sp=0x22fa20 pc=0x4269cf
runtime.(*mheap).alloc_m(0x6dcb20, 0x1, 0x23, 0x77603380)
	C:/workdir/go/src/runtime/mheap.go:624 +0x126 fp=0x22fab8 sp=0x22fa78 pc=0x426136
runtime.(*mheap).alloc.func1()
	C:/workdir/go/src/runtime/mheap.go:691 +0x54 fp=0x22faf0 sp=0x22fab8 pc=0x45ad54
runtime.(*mheap).alloc(0x6dcb20, 0x1, 0x1010023, 0x63)
	C:/workdir/go/src/runtime/mheap.go:690 +0x91 fp=0x22fb40 sp=0x22faf0 pc=0x4263f1
runtime.(*mcentral).grow(0x6de718, 0x0)
	C:/workdir/go/src/runtime/mcentral.go:232 +0xd5 fp=0x22fb90 sp=0x22fb40 pc=0x419015
runtime.(*mcentral).cacheSpan(0x6de718, 0x41913b)
	C:/workdir/go/src/runtime/mcentral.go:106 +0x2f3 fp=0x22fbd8 sp=0x22fb90 pc=0x418b13
runtime.(*mcache).refill(0x1180000, 0x6d8223)
	C:/workdir/go/src/runtime/mcache.go:122 +0xaa fp=0x22fc08 sp=0x22fbd8 pc=0x41865a
runtime.(*mcache).nextFree.func1()
	C:/workdir/go/src/runtime/malloc.go:633 +0x39 fp=0x22fc28 sp=0x22fc08 pc=0x45a0f9
runtime.(*mcache).nextFree(0x1180000, 0x23, 0x8, 0x6f71a0, 0x1184000)
	C:/workdir/go/src/runtime/malloc.go:632 +0xb0 fp=0x22fc88 sp=0x22fc28 pc=0x40bb50
runtime.mallocgc(0x100, 0x596840, 0x6d7201, 0x6d8280)
	C:/workdir/go/src/runtime/malloc.go:787 +0x6d0 fp=0x22fd30 sp=0x22fc88 pc=0x40c400
runtime.newobject(0x596840, 0x6d8800)
	C:/workdir/go/src/runtime/malloc.go:916 +0x3f fp=0x22fd60 sp=0x22fd30 pc=0x40c77f
runtime.mcommoninit(0x6d8800)
	C:/workdir/go/src/runtime/proc.go:579 +0x13f fp=0x22fd98 sp=0x22fd60 pc=0x43442f
runtime.schedinit()
	C:/workdir/go/src/runtime/proc.go:486 +0x8e fp=0x22fdf0 sp=0x22fd98 pc=0x433ede
runtime.rt0_go(0x6b38a0, 0x2f77e0, 0x6b38a0, 0x2f7cd0, 0x6ae155, 0x2f7cd0, 0x7feff3e13d2, 0x3, 0x30, 0x0, ...)
	C:/workdir/go/src/runtime/asm_amd64.s:252 +0x200 fp=0x22fdf8 sp=0x22fdf0 pc=0x45cbb0
FAIL	flag	0.073s
ok  	os/exec	3.097s
2018/02/17 18:50:01 Failed: exit status 1
2018/02/17 18:50:02 FAILED

see https://build.golang.org/log/58c395893d056c4144f33d21648db2385b55b885 for full log. windows-adm64-race is also broken, but windows-386-2008 and windows-amd64-2016 are fine. This seems started with 2b41554 so paging @aclements .

Alex

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 20, 2018

It seems like my change may be genuinely causing us to use significantly more memory on Windows. Particularly, the "fatal error: failed to allocate arena index" buried in there suggests that the machine is actually low on RAM at that point. I'll try running some of the x/benchmarks benchmarks on the gomote since they report their own memory use, and I'll make that panic dump more information about memory consumption and the layout of the heap. @alexbrainman, any suggestions of other good ways to inspect memory usage and memory layout on Windows?

@aclements aclements self-assigned this Feb 20, 2018

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 20, 2018

Every failure above and most (though not all) of the failures I've looked at on the dashboard have been mapping the very first heap arena. We haven't even mapped the race shadow for the heap yet. The failing process can't possibly be using much memory itself at this point, so it's almost certainly the other things running concurrently that are using the memory.

It may simply be that TSAN maps 4.5 bytes for every byte we map (MapShadow,
kShadowMultiplier), so each 64MB heap mapping turns into a 352MB mapping. If there are several small processes running that are using significantly less than their initial 64MB mapping, the combined overhead may be too much. Plus, if I'm reading the TSAN code correctly, all 352MB are committed.

I could change Go so that it calls racemapshadow more incrementally as it uses more of each 64MB arena, though this definitely complicates things.

Ping @dvyukov for thoughts. Dmitry, last week I changed the Go memory allocator to map 64MB at a time (always 64MB aligned) and eliminated the assumption that the heap is contiguous in memory. TSAN seems fine with this on Linux, but is running into "out of memory" errors on Windows. I know there are various Go-specific changes to the TSAN runtime. Have I just violated some of its assumptions about the Go heap?

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 20, 2018

The fact that we reserve and commit 32MB of memory for the arena index may also be contributing to the problem. @alexbrainman, it would be fine to map that memory incrementally, but it would be really nice to have zero-filled memory backing most of it, even if it's read-only. Is there a way to get zeroed memory in Windows that doesn't count against the commit limit?

@alexbrainman

This comment has been minimized.

Copy link
Member Author

commented Feb 21, 2018

@alexbrainman, any suggestions of other good ways to inspect memory usage and memory layout on Windows?

Sometimes I use https://docs.microsoft.com/en-us/sysinternals/downloads/vmmap and https://docs.microsoft.com/en-us/sysinternals/downloads/rammap Unfortunately these are GUI apps and they won't work over gomote. But if you need a Windows computer to play with, you can always build one on GCE.

You might be interested to watch

https://www.youtube.com/watch?v=AjTl53I_qzY
and
https://www.youtube.com/watch?v=6KZdNsq1YV4

if you have 3 spare hours in your life.

I also use https://docs.microsoft.com/en-us/sysinternals/downloads/process-explorer (not for memory problems), but for debugging in general.

but it would be really nice to have zero-filled memory backing most of it, even if it's read-only. Is there a way to get zeroed memory in Windows that doesn't count against the commit limit?

I do not know. I will try to search old my records.

But you can also search yourself. For example looking at my usual hunting ground (Raymond Chen's blog) - searching for "Virtual­Alloc zero", I get this list

https://social.msdn.microsoft.com/search/en-US?rq=site%3Ablogs.msdn.microsoft.com%2Foldnewthing&rn=oldnewthing&ral=1&query=Virtual%C2%ADAlloc%20zero&ac=5

and https://blogs.msdn.microsoft.com/oldnewthing/20170113-00/?p=95185 on that list looks interesting. But ultimately you should try your ideas and see what happens with VMMap.

I hope it helps.

Alex

@alexbrainman

This comment has been minimized.

Copy link
Member Author

commented Feb 21, 2018

I will try to search old my records.

I did. There is nothing much there:

https://blogs.msdn.microsoft.com/oldnewthing/20170317-00/?p=95755
http://aeops.blogspot.com.au/2009/10/art-of-memreset-flag.html

Maybe @dvyukov can also help with your question. He does know a bit about Windows, but he is shy.

Alex

@dvyukov

This comment has been minimized.

Copy link
Member

commented Feb 21, 2018

MapShadow assumes that Go heap is continuous and grows up. See the mapped_meta_end variable in MapShadow. Tsan also has pretty tight assumptions about layout of virtual address space:
https://github.com/llvm-mirror/compiler-rt/blob/409d44eac7a558f5de389599503db213044819b7/lib/tsan/rtl/tsan_platform.h#L386

That MapShadow complexity was actually imposed by Windows. On linux/bsd we simply map 128TB of memory at start and we are done. But Windows eagerly allocates page tables (potentially even for reserved memory) and charges the process for it. So if we map 128TB and page tables take, say, 1/1000-th, Windows will try to eagerly allocate 128GB which always fails. So we had to do this lazy shadow allocation. There is additional problem with the "meta" shadow which is 1/2 of application memory, so even if an app allocates in dwAllocationGranularity chunks, the shadow is smaller than dwAllocationGranularity.

I've looked at a dozen of failures on the build dashboard and it seems that the broken assumption about continuous heap is not the problem (at least not yet) and they simply fail due to OOM errors:

==2424==ERROR: ThreadSanitizer failed to allocate 0x000010000000 (268435456) bytes at 0x040000000000 (error code: 1455)
==2684==ERROR: ThreadSanitizer failed to allocate 0x000002000000 (33554432) bytes at 0x07c000000000 (error code: 1455)

These are allocations of memory shadow and meta shadow of expected sizes at expected addresses.

Is it possible to reduce 64MB const under race detector on Windows?
Note that this is also penalizes programs without race to some degree. Overallocation of unused memory is not completely free on Windows.

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 21, 2018

@alexbrainman, thanks for the pointers. I'd looked over the MSDN docs for virtual memory functions and didn't get much out of it, but those gave me some ideas I can at least try, though it may just not be possible to do what I want.

Tsan also has pretty tight assumptions about layout of virtual address space

Thanks for pointing me to the memory map. Go will still grow the heap contiguously from 0xc000000000 unless it runs into some other mapping. If it does, it will jump to 0xc100000000 and so on. It looks like TSAN is okay with that from the memory layout standpoint, but that MapShadow will try to map a huge amount of meta shadow to cover the 4GB jump. It looks like the mapping of the regular shadow will be fine; does it have to map all of that in-between space for the meta shadow?

If Go falls all the way back to 0xe000000000, it will ask the OS for whatever it can give, but something would have to go pretty wrong to get to that point.

Is it possible to reduce 64MB const under race detector on Windows?

Unfortunately, that would increase the size of the arena index proportionally, and it's already 32MB. That said,

  1. Since that 32MB of arena index is committed on Windows, it may already be a problem and maybe I need to add some indirection there. If I add the indirection, then maybe it's okay to reduce the arena size and make the arena index bigger.

  2. Since we consume memory within each 64MB arena sequentially, I could commit the memory backing each arena more incrementally (both for the OS and for TSAN).

@dvyukov

This comment has been minimized.

Copy link
Member

commented Feb 21, 2018

It looks like the mapping of the regular shadow will be fine; does it have to map all of that in-between space for the meta shadow?

No, it's just am implementation detail to deal with dwAllocationGranularity for the meta shadow.
The only thing that matters in the end is that shadow for regions that Go program will actually touch is mapped.

Since we consume memory within each 64MB arena sequentially, I could commit the memory backing each arena more incrementally (both for the OS and for TSAN).

Sounds like this can work.
If Go runtime promises to map heap by at least aligned 128KB blocks, then MapShadow can be significantly simplified. I.e. we could them map both shadows with a single VirtualAlloc without accounting for alignment, previous allocations, etc.

@gopherbot

This comment has been minimized.

Copy link

commented Feb 23, 2018

Change https://golang.org/cl/96780 mentions this issue: runtime: reduce arena size to 4MB on 64-bit Windows

@gopherbot

This comment has been minimized.

Copy link

commented Feb 23, 2018

Change https://golang.org/cl/96779 mentions this issue: runtime: support a two-level arena map

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 23, 2018

If Go runtime promises to map heap by at least aligned 128KB blocks, then MapShadow can be significantly simplified. I.e. we could them map both shadows with a single VirtualAlloc without accounting for alignment, previous allocations, etc.

That would be great. We can definitely guarantee 128KB alignment. In fact, it's 64MB-aligned right now, and will still be 4MB-aligned after CL 96780. Should I file a bug to track that, and should I do it here or is there a better place to file it?

gopherbot pushed a commit that referenced this issue Feb 23, 2018

runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.

However, there are two problems with this:

1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.

2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.

This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.

At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:

1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.

2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.

3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.

Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:

name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)

(https://perf.golang.org/search?q=upload:20180223.2)

For #23900.

Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>

@gopherbot gopherbot closed this in 7884647 Feb 23, 2018

@dvyukov

This comment has been minimized.

Copy link
Member

commented Feb 26, 2018

This tracker is good (add label:RaceDetector).

@aclements

This comment has been minimized.

Copy link
Member

commented Feb 26, 2018

Done: #24133.

@golang golang locked and limited conversation to collaborators Feb 26, 2019

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
You can’t perform that action at this time.