Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime/pprof: apparent deadlock in TestGoroutineSwitch on linux-armv6l #47505

Closed
toothrot opened this issue Aug 2, 2021 · 14 comments
Closed
Labels
FrozenDueToAge NeedsFix The path to resolution is known, but the work has not been done.
Milestone

Comments

@toothrot
Copy link
Contributor

toothrot commented Aug 2, 2021

What version of Go are you using (go version)?

go1.17rc2

Does this issue reproduce with the latest release?

Literally yes.

What operating system and processor architecture are you using (go env)?

linux-armv6l

What did you do?

Ran tests as part of building a release. They failed the first 2 of 3 attempts.

release -target linux-armv6l -user go-dev -version go1.17rc2 -staging_dir [REDACTED]/go-releasebot-work/go1.17rc2/release-staging/d0232c682c4023cba6e6ce8e9c4384446a47918a_707736155 -rev d0232c682c4023cba6e6ce8e9c4384446a47918a

See #47502

What did you expect to see?

Success

What did you see instead?

SIGQUIT: quit
PC=0x7dfc4 m=1 sigcode=0

goroutine 0 [idle]:
runtime.usleep()
	/usr/local/go/src/runtime/sys_linux_arm.s:604 +0x2c fp=0x203df7c sp=0x203df6c pc=0x7dfc4
runtime.sysmon()
	/usr/local/go/src/runtime/proc.go:5330 +0xe4 fp=0x203dfd8 sp=0x203df7c pc=0x57d7c
runtime.mstart1()
	/usr/local/go/src/runtime/proc.go:1407 +0x7c fp=0x203dfe8 sp=0x203dfd8 pc=0x4e380
runtime.mstart0()
	/usr/local/go/src/runtime/proc.go:1365 +0x68 fp=0x203dffc sp=0x203dfe8 pc=0x4e2e8
runtime.mstart()
	/usr/local/go/src/runtime/asm_arm.s:211 +0x8 fp=0x203e000 sp=0x203dffc pc=0x7b17c

goroutine 1 [chan receive]:
runtime.gopark(0x196d28, 0x211c2f0, 0xe, 0x17, 0x2)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x2104d0c sp=0x2104cf8 pc=0x4ba2c
runtime.chanrecv(0x211c2c0, 0x2104d93, 0x1)
	/usr/local/go/src/runtime/chan.go:576 +0x530 fp=0x2104d58 sp=0x2104d0c pc=0x15a58
runtime.chanrecv1(0x211c2c0, 0x2104d93)
	/usr/local/go/src/runtime/chan.go:439 +0x20 fp=0x2104d6c sp=0x2104d58 pc=0x15524
testing.(*T).Run(0x21180f0, {0x189c2d, 0x11}, 0x196f00)
	/usr/local/go/src/testing/testing.go:1307 +0x37c fp=0x2104db8 sp=0x2104d6c pc=0xdedd8
testing.runTests.func1(0x21180f0)
	/usr/local/go/src/testing/testing.go:1598 +0x6c fp=0x2104ddc sp=0x2104db8 pc=0xe0788
testing.tRunner(0x21180f0, 0x208de70)
	/usr/local/go/src/testing/testing.go:1259 +0xf0 fp=0x2104e1c sp=0x2104ddc pc=0xde1f0
testing.runTests(0x21241d0, {0x2869b8, 0x1e, 0x1e}, {0xc03a29201607a930, 0xfb750b9c04, 0x288618})
	/usr/local/go/src/testing/testing.go:1596 +0x3d8 fp=0x2104e80 sp=0x2104e1c pc=0xe0604
testing.(*M).Run(0x2166050)
	/usr/local/go/src/testing/testing.go:1504 +0x514 fp=0x2104f88 sp=0x2104e80 pc=0xdf490
main.main()
	_testmain.go:101 +0x150 fp=0x2104fbc sp=0x2104f88 pc=0x15a1d0
runtime.main()
	/usr/local/go/src/runtime/proc.go:255 +0x278 fp=0x2104fe4 sp=0x2104fbc pc=0x4b54c
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x2104fe4 sp=0x2104fe4 pc=0x7cd94

goroutine 2 [running]:
	goroutine running on other thread; stack unavailable
created by runtime.init.6
	/usr/local/go/src/runtime/proc.go:294 +0x24

goroutine 3 [GC sweep wait]:
runtime.gopark(0x196dfc, 0x2885c0, 0xc, 0x14, 0x1)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x202b7d8 sp=0x202b7c4 pc=0x4ba2c
runtime.goparkunlock(...)
	/usr/local/go/src/runtime/proc.go:372
runtime.bgsweep()
	/usr/local/go/src/runtime/mgcsweep.go:182 +0x104 fp=0x202b7ec sp=0x202b7d8 pc=0x341c4
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x202b7ec sp=0x202b7ec pc=0x7cd94
created by runtime.gcenable
	/usr/local/go/src/runtime/mgc.go:181 +0x60

goroutine 4 [GC scavenge wait]:
runtime.gopark(0x196dfc, 0x288548, 0xd, 0x14, 0x1)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x202bfb0 sp=0x202bf9c pc=0x4ba2c
runtime.goparkunlock(...)
	/usr/local/go/src/runtime/proc.go:372
runtime.bgscavenge()
	/usr/local/go/src/runtime/mgcscavenge.go:314 +0x2ac fp=0x202bfec sp=0x202bfb0 pc=0x322e0
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x202bfec sp=0x202bfec pc=0x7cd94
created by runtime.gcenable
	/usr/local/go/src/runtime/mgc.go:182 +0x74

goroutine 17 [finalizer wait, 18 minutes]:
runtime.gopark(0x196dfc, 0x29a02c, 0x10, 0x14, 0x1)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x202a790 sp=0x202a77c pc=0x4ba2c
runtime.goparkunlock(...)
	/usr/local/go/src/runtime/proc.go:372
runtime.runfinq()
	/usr/local/go/src/runtime/mfinal.go:177 +0xbc fp=0x202a7ec sp=0x202a790 pc=0x27258
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x202a7ec sp=0x202a7ec pc=0x7cd94
created by runtime.createfing
	/usr/local/go/src/runtime/mfinal.go:157 +0x5c

goroutine 26 [running]:
	goroutine running on other thread; stack unavailable
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1306 +0x35c

goroutine 21 [GC worker (idle), 18 minutes]:
runtime.gopark(0x196d60, 0x2588000, 0x18, 0x14, 0x0)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x2026f9c sp=0x2026f88 pc=0x4ba2c
runtime.gcBgMarkWorker()
	/usr/local/go/src/runtime/mgc.go:1200 +0x100 fp=0x2026fec sp=0x2026f9c pc=0x2a6b8
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x2026fec sp=0x2026fec pc=0x7cd94
created by runtime.gcBgMarkStartWorkers
	/usr/local/go/src/runtime/mgc.go:1124 +0x28

goroutine 33 [GC worker (idle), 18 minutes]:
runtime.gopark(0x196d60, 0x2154078, 0x18, 0x14, 0x0)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x259079c sp=0x2590788 pc=0x4ba2c
runtime.gcBgMarkWorker()
	/usr/local/go/src/runtime/mgc.go:1200 +0x100 fp=0x25907ec sp=0x259079c pc=0x2a6b8
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x25907ec sp=0x25907ec pc=0x7cd94
created by runtime.gcBgMarkStartWorkers
	/usr/local/go/src/runtime/mgc.go:1124 +0x28

goroutine 22 [GC worker (idle)]:
runtime.gopark(0x196d60, 0x2154090, 0x18, 0x14, 0x0)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x202779c sp=0x2027788 pc=0x4ba2c
runtime.gcBgMarkWorker()
	/usr/local/go/src/runtime/mgc.go:1200 +0x100 fp=0x20277ec sp=0x202779c pc=0x2a6b8
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x20277ec sp=0x20277ec pc=0x7cd94
created by runtime.gcBgMarkStartWorkers
	/usr/local/go/src/runtime/mgc.go:1124 +0x28

goroutine 23 [GC worker (idle)]:
runtime.gopark(0x196d60, 0x21540a8, 0x18, 0x14, 0x0)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x2027f9c sp=0x2027f88 pc=0x4ba2c
runtime.gcBgMarkWorker()
	/usr/local/go/src/runtime/mgc.go:1200 +0x100 fp=0x2027fec sp=0x2027f9c pc=0x2a6b8
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x2027fec sp=0x2027fec pc=0x7cd94
created by runtime.gcBgMarkStartWorkers
	/usr/local/go/src/runtime/mgc.go:1124 +0x28

goroutine 27 [sleep]:
runtime.gopark(0x196e18, 0x20100c0, 0x13, 0x13, 0x1)
	/usr/local/go/src/runtime/proc.go:366 +0x100 fp=0x2028790 sp=0x202877c pc=0x4ba2c
time.Sleep(0x5f5e100)
	/usr/local/go/src/runtime/time.go:193 +0x170 fp=0x20287ac sp=0x2028790 pc=0x7aed8
runtime/pprof.profileWriter({0x1be2a8, 0x22c8300})
	/usr/local/go/src/runtime/pprof/pprof.go:799 +0x5c fp=0x20287e4 sp=0x20287ac pc=0x13c2cc
runtime.goexit()
	/usr/local/go/src/runtime/asm_arm.s:838 +0x4 fp=0x20287e4 sp=0x20287e4 pc=0x7cd94
created by runtime/pprof.StartCPUProfile
	/usr/local/go/src/runtime/pprof/pprof.go:784 +0x140

trap    0x0
error   0x0
oldmask 0x0
r0      0xfffffffc
r1      0x0
r2      0x0
r3      0x2710
r4      0x0
r5      0x4
r6      0x200c010
r7      0xa2
r8      0x6
r9      0x1
r10     0x20002d0
fp      0x2710
ip      0x80000000
sp      0x203df6c
lr      0x7dfa4
pc      0x7dfc4
cpsr    0x20800010
fault   0x0
*** Test killed with quit: ran too long (19m0s).
@toothrot toothrot added the NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. label Aug 2, 2021
@toothrot toothrot added this to the Go1.17 milestone Aug 2, 2021
@bcmills
Copy link
Member

bcmills commented Aug 7, 2021

@bcmills
Copy link
Member

bcmills commented Aug 12, 2021

$ greplogs --dashboard -md -l -e (?ms)SIGQUIT.*finalizer wait.*FAIL\s+runtime/pprof

2021-08-11T22:07:50-dea23e9/linux-arm-aws
2021-08-06T20:44:03-63b968f/linux-arm-aws

@bcmills
Copy link
Member

bcmills commented Aug 12, 2021

But, it appears this bug might be older still. I see a lot of logs with apparent deadlocks involving goroutines in the finalizer wait state, especially on slower or memory-constrained builders.

$ greplogs --dashboard -md -l -e (?ms)SIGQUIT.*finalizer wait.*FAIL\s+runtime

2021-08-11T22:07:50-dea23e9/linux-arm-aws
2021-08-06T20:44:03-63b968f/linux-arm-aws
2021-05-09T17:07:01-bedf2c4/openbsd-arm-jsing
2021-05-09T17:06:45-603f43c/openbsd-arm-jsing
2021-04-30T00:32:42-303b194/linux-amd64-noopt
2021-04-16T21:50:38-9481789/freebsd-amd64-11_2
2021-03-10T17:36:55-1811aea/netbsd-amd64-9_0
2021-03-08T19:07:25-7419a86/freebsd-arm64-dmgk
2021-02-19T00:02:30-4dd77bd/linux-386-longtest
2021-02-19T00:02:30-4dd77bd/linux-amd64-longtest
2021-02-19T00:02:23-aa0388f/linux-386-longtest
2021-02-19T00:02:23-aa0388f/linux-amd64-longtest
2021-02-19T00:02:15-6fe8981/linux-386-longtest
2021-02-19T00:02:15-6fe8981/linux-amd64-longtest
2021-02-19T00:02:06-01f05d8/linux-386-longtest
2021-02-19T00:02:06-01f05d8/linux-amd64-longtest
2021-02-09T18:40:13-e9c9683/freebsd-arm64-dmgk
2020-12-07T02:31:33-e10c94a/freebsd-amd64-11_2
2020-11-12T13:47:13-9ef65ff/openbsd-amd64-62
2020-11-11T20:48:31-4c174a7/openbsd-amd64-62
2020-11-11T20:26:44-b641f0d/openbsd-amd64-62
2020-11-10T16:09:36-e3de852/openbsd-amd64-62
2020-11-09T21:03:36-d495712/openbsd-amd64-62
2020-11-09T19:46:24-a2d0147/openbsd-amd64-62
2020-11-09T18:34:47-fdecb5c/openbsd-amd64-62
2020-11-09T18:32:36-6560768/openbsd-amd64-62
2020-11-09T17:24:38-51f4168/openbsd-amd64-62
2020-11-09T15:09:58-3fad58f/openbsd-mips64-jsing
2020-11-09T13:12:41-2231243/openbsd-mips64-jsing
2020-11-08T12:19:55-7307e86/openbsd-mips64-jsing
2020-11-07T22:38:01-afe7c8d/linux-386-longtest
2020-11-07T22:38:01-afe7c8d/openbsd-mips64-jsing
2020-11-07T16:59:55-5e371e0/openbsd-mips64-jsing
2020-11-07T16:59:40-e8379ab/openbsd-mips64-jsing
2020-11-07T16:31:02-2c80de7/openbsd-mips64-jsing
2020-11-07T14:23:20-bb9a96d/openbsd-mips64-jsing
2020-11-07T07:33:23-a6755fc/openbsd-mips64-jsing
2020-11-07T03:52:47-d51ae66/openbsd-mips64-jsing
2020-11-07T03:19:27-33bc8ce/openbsd-mips64-jsing
2020-11-07T02:12:20-c9b9cd7/openbsd-mips64-jsing
2020-11-06T23:59:04-f7ef5ca/openbsd-mips64-jsing
2020-11-06T23:53:49-3a81338/openbsd-mips64-jsing
2020-11-06T20:49:11-5736eb0/openbsd-mips64-jsing
2020-11-06T19:42:05-362d25f/openbsd-mips64-jsing
2020-11-06T15:33:23-d21af00/openbsd-mips64-jsing
2020-11-06T10:45:31-854e892/openbsd-mips64-jsing
2020-11-06T07:55:52-b7e0adf/openbsd-mips64-jsing
2020-11-05T16:47:53-67bf1c9/openbsd-amd64-62
2020-11-05T16:20:01-8ab8125/openbsd-amd64-62
2020-11-05T12:57:20-0556831/openbsd-amd64-62
2020-11-04T21:17:01-f532f19/openbsd-amd64-62
2020-11-04T20:58:06-3a1d84b/openbsd-amd64-62
2020-11-02T23:58:08-3a76627/openbsd-amd64-62
2020-11-02T13:02:40-7be8358/openbsd-amd64-62
2020-11-02T03:02:35-fdba080/openbsd-amd64-62
2020-10-31T00:35:18-79fb187/openbsd-amd64-62
2020-10-31T00:17:28-07e4f0f/openbsd-amd64-62
2020-10-30T21:14:09-f96b62b/openbsd-amd64-62
2020-10-30T20:20:58-6abbfc1/openbsd-amd64-62
2020-10-30T18:06:01-8494a62/openbsd-amd64-62
2020-10-30T17:41:35-a313eec/openbsd-amd64-62
2020-10-30T00:23:50-faa4426/openbsd-amd64-62
2020-10-30T00:13:25-60f42ea/openbsd-amd64-62
2020-10-29T15:11:47-ecb79e8/openbsd-amd64-62
2020-10-29T08:00:50-8b51798/openbsd-amd64-62
2020-10-29T03:52:23-3089ef6/openbsd-amd64-62
2020-10-28T17:10:08-642329f/openbsd-amd64-62
2020-10-28T17:08:06-e3c58bb/netbsd-amd64-9_0
2020-10-28T16:17:54-421d4e7/netbsd-386-9_0
2020-10-28T00:49:40-94f3762/openbsd-amd64-62
2020-10-28T00:25:05-40d1ec5/openbsd-amd64-62
2020-10-27T19:52:40-c515852/openbsd-amd64-62
2020-10-27T18:13:59-3f6b1a0/openbsd-amd64-62
2020-10-20T20:13:16-5e9582e/linux-amd64-longtest
2020-10-01T15:16:43-89f687d/freebsd-386-12_0
2020-08-18T17:08:05-20b4987/linux-amd64-longtest
2020-06-26T17:07:58-d1015f3/freebsd-386-11_2
2020-05-12T15:01:56-a0698a6/darwin-amd64-10_15
2020-02-19T21:18:17-2f2e97c/linux-amd64-longtest
2020-01-31T23:17:00-8390c47/android-amd64-emu
2019-12-06T23:44:28-76d2f6c/freebsd-amd64-11_2
2019-11-07T18:06:59-ee2268c/linux-amd64
2019-11-07T15:04:03-73d57bf/linux-amd64-sid
2019-11-07T14:51:06-543c6d2/linux-amd64-clang
2019-11-07T06:58:44-f5949b6/linux-amd64-fedora
2019-11-07T05:59:33-a8f57f4/freebsd-386-11_2
2019-11-06T18:27:09-3c29796/linux-386-clang
2019-11-06T16:17:30-a5936a4/linux-amd64-nocgo
2019-11-06T14:33:39-f891b7c/linux-386-clang
2019-11-06T09:09:59-0c5d545/linux-amd64-jessie
2019-11-06T02:49:10-de70de6/freebsd-amd64-race
2019-11-05T20:21:34-552987f/linux-386-longtest
2019-11-05T14:44:56-e457cc3/android-amd64-emu
2019-11-05T03:42:00-11db7e4/linux-arm
2019-11-05T02:49:48-2ff746d/linux-arm64-packet
2019-11-05T01:01:31-376c4ce/freebsd-386-11_2
2019-11-05T00:17:34-04e80fb/linux-386-387
2019-11-04T23:41:34-383b447/darwin-amd64-race
2019-11-04T22:52:25-74af7fc/linux-amd64-race
2019-11-04T05:27:25-fb29e22/linux-386
2019-11-03T01:44:46-d2c039f/linux-386-clang
2019-11-02T21:51:23-7955ece/linux-amd64-sid
2019-11-01T16:05:22-8d45e61/freebsd-386-11_2
2019-10-30T08:17:29-f4e32ae/aix-ppc64
2019-10-30T04:55:05-01e7a15/aix-ppc64
2019-10-29T21:21:09-70331a3/aix-ppc64
2019-10-29T20:10:15-1fb7d54/aix-ppc64
2019-10-29T19:41:46-9f93fd2/aix-ppc64
2019-10-29T14:52:57-e7ce862/aix-ppc64
2019-10-29T14:22:15-a0c1e8d/aix-ppc64
2019-10-29T13:48:29-ec10e6f/aix-ppc64
2019-10-29T13:46:23-28a15e3/aix-ppc64
2019-10-29T13:46:21-b6bdf45/aix-ppc64
2019-10-29T05:39:25-25f5044/aix-ppc64
2019-10-29T03:18:55-97592b3/aix-ppc64
2019-10-29T03:18:52-d2101e5/aix-ppc64
2019-10-28T23:59:10-fd1e60f/aix-ppc64
2019-10-28T23:33:26-bf670bd/aix-ppc64
2019-10-28T22:00:19-81d6ec2/aix-ppc64
2019-10-28T19:31:17-18e9cdc/aix-ppc64
2019-10-28T17:28:12-503bccb/aix-ppc64
2019-10-28T16:15:54-5c6db9a/aix-ppc64
2019-10-28T09:26:37-152ddde/aix-ppc64
2019-10-28T01:00:20-1f3339f/aix-ppc64
2019-10-27T16:02:29-22d3770/aix-ppc64
2019-10-26T15:17:28-0f55994/aix-ppc64
2019-10-26T08:29:23-5d000a8/aix-ppc64
2019-10-26T08:29:23-5d000a8/linux-ppc64-buildlet
2019-10-26T07:52:35-c70e547/aix-ppc64
2019-10-26T07:16:46-18d57bc/aix-ppc64
2019-10-26T07:06:49-7e68f81/aix-ppc64
2019-10-26T04:00:03-65a1e24/aix-ppc64
2019-10-26T02:52:32-8dc1a15/aix-ppc64
2019-10-26T02:52:30-42aab4b/aix-ppc64
2019-10-26T02:52:28-8714e39/aix-ppc64
2019-10-26T02:36:26-3706cd8/aix-ppc64
2019-10-26T01:46:02-06bdd52/aix-ppc64
2019-10-26T00:05:43-813d8e8/aix-ppc64
2019-10-26T00:05:37-8031532/aix-ppc64
2019-10-25T23:25:35-6058603/aix-ppc64
2019-10-25T23:25:32-8c58615/aix-ppc64
2019-10-25T20:46:57-46e0d72/aix-ppc64
2019-10-25T20:21:24-dcad830/aix-ppc64
2019-10-25T18:44:06-096126d/aix-ppc64
2019-10-25T17:59:05-6b6e67f/aix-ppc64
2019-10-25T16:59:36-f82956b/aix-ppc64
2019-10-25T16:31:41-00d6b28/aix-ppc64
2019-10-25T04:15:59-4f70c15/aix-ppc64
2019-10-24T20:19:06-680ed10/aix-ppc64
2019-10-24T20:15:14-c4c3754/aix-ppc64
2019-10-24T17:40:35-f922cc6/aix-ppc64
2019-10-24T16:18:09-fc33506/aix-ppc64
2019-10-24T12:09:29-a42a396/aix-ppc64
2019-10-24T08:47:14-722b0e3/aix-ppc64
2019-10-24T04:36:53-68981bf/aix-ppc64
2019-10-23T21:41:15-e9e4f6c/aix-ppc64
2019-10-23T17:57:23-7879064/aix-ppc64
2019-10-23T15:12:15-f91e895/aix-ppc64
2019-10-23T07:02:10-c824420/aix-ppc64
2019-10-23T05:20:12-5f92939/aix-ppc64
2019-10-22T23:14:03-5a5854c/aix-ppc64
2019-10-22T21:40:27-220150f/aix-ppc64
2019-10-22T21:34:48-b2ea4e6/aix-ppc64
2019-10-22T21:04:09-4665470/aix-ppc64
2019-10-22T20:22:27-eff3c1e/aix-ppc64
2019-10-22T19:46:38-88186e5/aix-ppc64
2019-10-22T19:46:38-88186e5/linux-ppc64-buildlet
2019-10-22T19:36:42-03fb1f6/aix-ppc64
2019-10-22T19:35:04-48eb79e/aix-ppc64
2019-10-22T19:21:02-a813d3c/aix-ppc64
2019-10-22T18:26:35-3db6d46/aix-ppc64
2019-10-22T18:09:03-a97ccc8/aix-ppc64
2019-10-22T15:28:05-19d2a1c/aix-ppc64
2019-10-22T15:15:06-95544cc/aix-ppc64
2019-10-22T14:26:04-ea726d8/aix-ppc64
2019-10-22T13:57:02-d29c14f/aix-ppc64
2019-10-22T08:54:50-7f98c0e/aix-ppc64
2019-10-22T01:03:29-9979366/aix-ppc64
2019-10-22T01:03:09-2e1a6a2/aix-ppc64
2019-10-21T23:36:02-a59808e/aix-ppc64
2019-10-21T23:18:36-6b3bb4b/aix-ppc64
2019-10-21T23:16:36-9989221/aix-ppc64
2019-10-21T23:16:27-7b58581/aix-ppc64
2019-10-21T22:01:12-dba19c6/aix-ppc64
2019-10-21T21:43:41-10e7bc9/aix-ppc64
2019-10-21T20:43:40-3409ce3/aix-ppc64
2019-10-21T19:06:29-71d127a/aix-ppc64
2019-10-21T19:04:47-ba97d0d/aix-ppc64
2019-10-21T17:10:59-ff9f7bc/aix-ppc64
2019-10-21T16:37:45-50f4896/aix-ppc64
2019-09-26T21:24:48-0a51164/linux-ppc64-buildlet
2019-09-17T20:11:12-575386d/linux-ppc64le-buildlet
2019-09-10T09:12:32-b38be35/linux-ppc64-buildlet
2019-09-06T17:34:02-e3d3e11/freebsd-amd64-11_2
2019-09-03T17:32:37-663680b/linux-arm64-packet
2019-08-26T17:14:41-bdd0ff0/linux-ppc64le-buildlet
2019-08-26T02:18:15-78f6856/linux-ppc64-buildlet
2019-08-22T20:06:29-65e624e/linux-ppc64-buildlet
2019-08-12T20:14:38-4550406/linux-amd64-longtest
2019-07-27T23:29:18-a25c287/solaris-amd64-smartosbuildlet
2019-07-03T14:55:39-b412fde/linux-ppc64-buildlet
2019-06-20T19:27:15-0884bca/linux-ppc64le-buildlet
2019-06-19T05:46:46-1962dc8/linux-amd64-race
2019-06-06T10:49:25-f31b7b9/linux-ppc64-buildlet
2019-06-03T01:37:58-98100c5/freebsd-amd64-race
2019-05-30T12:00:59-a0c96a9/linux-ppc64le-buildlet
2019-05-23T06:16:38-6f51082/linux-ppc64le-buildlet
2019-05-23T01:38:26-7a567a6/solaris-amd64-smartosbuildlet
2019-05-22T20:41:27-983986f/solaris-amd64-smartosbuildlet
2019-05-16T22:39:38-97ecc43/android-amd64-emu
2019-05-14T18:48:16-6081a9f/solaris-amd64-smartosbuildlet
2019-05-13T13:53:07-afd7915/aix-ppc64
2019-05-09T21:13:56-a4f5c9c/aix-ppc64
2019-05-09T21:13:46-56c1cf3/aix-ppc64
2019-05-09T21:13:39-13723d4/aix-ppc64
2019-05-09T21:13:23-60f6763/aix-ppc64
2019-05-09T21:13:18-6ed2ec4/aix-ppc64
2019-05-09T17:49:12-50a1d89/aix-ppc64
2019-05-08T16:38:55-bef1534/linux-ppc64le-buildlet
2019-04-29T19:09:29-6ee8324/linux-ppc64-buildlet

@bcmills bcmills changed the title runtime/pprof: flaky test timeouts on linux-armv6l runtime: apparent GC deadlock in runtime/pprof tests on linux-armv6l Aug 12, 2021
@bcmills
Copy link
Member

bcmills commented Aug 12, 2021

The trace in 2021-08-11T22:07:50-dea23e9/linux-arm-aws seems to have better detail: similar failure mode, but those two goroutine running on other thread; stack unavailable entries actually have stacks there:


goroutine 2 [runnable]:
runtime.gopark(0x196dfc, 0x2883e0, 0x11, 0x14, 0x1)
	/workdir/go/src/runtime/proc.go:366 +0x100 fp=0x242cfd8 sp=0x242cfc4 pc=0x4ba2c
runtime.goparkunlock(...)
	/workdir/go/src/runtime/proc.go:372
runtime.forcegchelper()
	/workdir/go/src/runtime/proc.go:306 +0xc8 fp=0x242cfec sp=0x242cfd8 pc=0x4b890
runtime.goexit()
	/workdir/go/src/runtime/asm_arm.s:838 +0x4 fp=0x242cfec sp=0x242cfec pc=0x7cd94
created by runtime.init.6
	/workdir/go/src/runtime/proc.go:294 +0x24

…

goroutine 49 [runnable]:
runtime.Gosched(...)
	/workdir/go/src/runtime/proc.go:322
runtime/pprof.TestGoroutineSwitch(0x2576000)
	/workdir/go/src/runtime/pprof/pprof_test.go:520 +0x180 fp=0x242a7a4 sp=0x242a768 pc=0x14abfc
testing.tRunner(0x2576000, 0x196ee4)
	/workdir/go/src/testing/testing.go:1259 +0xf0 fp=0x242a7e4 sp=0x242a7a4 pc=0xde1f0
runtime.goexit()
	/workdir/go/src/runtime/asm_arm.s:838 +0x4 fp=0x242a7e4 sp=0x242a7e4 pc=0x7cd94
created by testing.(*T).Run
	/workdir/go/src/testing/testing.go:1306 +0x35c

@bcmills
Copy link
Member

bcmills commented Aug 12, 2021

For comparison, a successful run in https://build.golang.org/log/8b12b357fab1605247238fe334c5b2aec2e090c0 shows the entire runtime/pprof test binary completing in ~5s.

@bcmills bcmills changed the title runtime: apparent GC deadlock in runtime/pprof tests on linux-armv6l runtime/pprof: apparent deadlock in TestGoroutineSwitch on linux-armv6l Aug 12, 2021
@mknyszek
Copy link
Contributor

Reproduced in a gomote, turns out this was a latent bug brought to the forefront by #47554, which turned on GOARM=6 (not reproducible with GOARM=7). This explains why this only started occurring 6 days ago.

@cherrymui figured out the full issue: on ARMv6 we are missing some atomics, so we use a kernel helper and lock a spinlock from a table indexed by the atomically accessed value's address (I guess the kernel helper needs to be synchronized for the same address?). When the profiling signal is enabled, we try to use the atomics in the signal handler for the profiling signal. If it happens to land while one of these atomics is being used (so a spinlock is held), and it indexes into the same spinlock in the table, the program can self-deadlock. I don't know the full details as to why it works this way, but the fix is trivial: just back out of the profiling signal handler if it discovers it has landed in the kernel helper.

@prattmic is working on putting the fix up for review.

@mknyszek
Copy link
Contributor

@gopherbot Please open backport issues for 1.15 and 1.16.

This bug is not new and can cause deadlocks on linux/arm (GOARM=6) when profiling is turned on with no workaround. The biggest immediate issue for us is that this will cause new failures on all release builders, if not fixed.

@gopherbot
Copy link

Backport issue(s) opened: #47674 (for 1.15), #47675 (for 1.16).

Remember to create the cherry-pick CL(s) as soon as the patch is submitted to master, according to https://golang.org/wiki/MinorReleases.

@gopherbot
Copy link

Change https://golang.org/cl/341889 mentions this issue: runtime: drop SIGPROF while in ARM < 7 kernel helpers

@prattmic
Copy link
Member

(I guess the kernel helper needs to be synchronized for the same address?)

I'm not sure what you mean by this, but I'll note that the kernel helper is used to lock the spinlock (the spinlock is locked with Cas, Cas calls the cas kernel helper). So if SIGPROF arrives when we are still in the helper but after the lock is successfully taken (probably on the RET), then another use of atomics from the signal handler may try to lock the same spinlock and deadlock. I believe it is simply bad luck when we happen to pick the same spinlock.

@mknyszek
Copy link
Contributor

Got it. I misunderstood the purpose of the kernel helper.

@prattmic
Copy link
Member

@gopherbot Please open a backport issue for 1.17.

This bug is not new and can cause deadlocks on linux/arm (GOARM=6) when profiling is turned on with no workaround. The biggest immediate issue for us is that this will cause new failures on all release builders, if not fixed.

@gopherbot
Copy link

Change https://golang.org/cl/341853 mentions this issue: [release-branch.go1.16] runtime: drop SIGPROF while in ARM < 7 kernel helpers

@gopherbot
Copy link

Change https://golang.org/cl/341890 mentions this issue: [release-branch.go1.17] runtime: drop SIGPROF while in ARM < 7 kernel helpers

@dmitshur dmitshur modified the milestones: Go1.17, Go1.18 Aug 13, 2021
@dmitshur dmitshur added NeedsFix The path to resolution is known, but the work has not been done. and removed NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. labels Aug 13, 2021
gopherbot pushed a commit that referenced this issue Aug 13, 2021
… helpers

On Linux ARMv6 and below runtime/internal/atomic.Cas calls into a kernel
cas helper at a fixed address. If a SIGPROF arrives while executing the
kernel helper, the sigprof lostAtomic logic will miss that we are
potentially in the spinlock critical section, which could cause
a deadlock when using atomics later in sigprof.

For #47505
Fixes #47688

Change-Id: If8ba0d0fc47e45d4e6c68eca98fac4c6ed4e43c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/341889
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
(cherry picked from commit 20a620f)
Reviewed-on: https://go-review.googlesource.com/c/go/+/341890
gopherbot pushed a commit that referenced this issue Aug 13, 2021
… helpers

On Linux ARMv6 and below runtime/internal/atomic.Cas calls into a kernel
cas helper at a fixed address. If a SIGPROF arrives while executing the
kernel helper, the sigprof lostAtomic logic will miss that we are
potentially in the spinlock critical section, which could cause
a deadlock when using atomics later in sigprof.

For #47505
Fixes #47675

Change-Id: If8ba0d0fc47e45d4e6c68eca98fac4c6ed4e43c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/341889
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
(cherry picked from commit 20a620f)
Reviewed-on: https://go-review.googlesource.com/c/go/+/341853
steeve pushed a commit to znly/go that referenced this issue Aug 19, 2021
… helpers

On Linux ARMv6 and below runtime/internal/atomic.Cas calls into a kernel
cas helper at a fixed address. If a SIGPROF arrives while executing the
kernel helper, the sigprof lostAtomic logic will miss that we are
potentially in the spinlock critical section, which could cause
a deadlock when using atomics later in sigprof.

For golang#47505
Fixes golang#47688

Change-Id: If8ba0d0fc47e45d4e6c68eca98fac4c6ed4e43c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/341889
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
(cherry picked from commit 20a620f)
Reviewed-on: https://go-review.googlesource.com/c/go/+/341890
@golang golang locked and limited conversation to collaborators Aug 13, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
FrozenDueToAge NeedsFix The path to resolution is known, but the work has not been done.
Projects
None yet
Development

No branches or pull requests

6 participants