Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: TestPanicSystemstack is flaky on linux #33626

Closed
bcmills opened this issue Aug 13, 2019 · 9 comments
Closed

runtime: TestPanicSystemstack is flaky on linux #33626

bcmills opened this issue Aug 13, 2019 · 9 comments
Labels
Milestone

Comments

@bcmills
Copy link
Member

@bcmills bcmills commented Aug 13, 2019

On the linux-amd64-longtest builder (https://build.golang.org/log/d032877df5968710842d285d5874a466cd80c986):

--- FAIL: TestPanicSystemstack (5.09s)
    crash_unix_test.go:229: traceback missing user stack:
         SIGQUIT: quit
        PC=0x467315 m=0 sigcode=0
        
        goroutine 0 [idle]:
        runtime.write(0x2, 0x66a462, 0x2)
        	/workdir/go/src/runtime/sys_linux_amd64.s:98 +0x15 fp=0x7ffe6832dec0 sp=0x7ffe6832deb8 pc=0x467315
        runtime.writeErr(...)
        	/workdir/go/src/runtime/write_err.go:12
        runtime.gwrite(0x66a462, 0x2, 0x2)
        	/workdir/go/src/runtime/print.go:98 +0xa3 fp=0x7ffe6832df00 sp=0x7ffe6832dec0 pc=0x432b73
        runtime.printstring(0x66a462, 0x2)
        	/workdir/go/src/runtime/print.go:242 +0x78 fp=0x7ffe6832df50 sp=0x7ffe6832df00 pc=0x433398
        runtime.blockOnSystemStackInternal()
        	/workdir/go/src/runtime/export_test.go:363 +0x3b fp=0x7ffe6832df70 sp=0x7ffe6832df50 pc=0x45df4b
        runtime.systemstack(0x463834)
        	/workdir/go/src/runtime/asm_amd64.s:370 +0x66 fp=0x7ffe6832df78 sp=0x7ffe6832df70 pc=0x4639a6
        runtime.mstart()
        	/workdir/go/src/runtime/proc.go:1146 fp=0x7ffe6832df80 sp=0x7ffe6832df78 pc=0x4365c0
        
        goroutine 1 [running, locked to thread]:
        runtime.systemstack_switch()
        	/workdir/go/src/runtime/asm_amd64.s:330 fp=0xc00004eeb8 sp=0xc00004eeb0 pc=0x463930
        runtime.BlockOnSystemStack(...)
        	/workdir/go/src/runtime/export_test.go:359
        runtime_test.testPanicSystemstackInternal()
        	/workdir/go/src/runtime/crash_unix_test.go:246 +0x2d fp=0xc00004eed0 sp=0xc00004eeb8 pc=0x58399d
        runtime_test.init.2()
        	/workdir/go/src/runtime/crash_unix_test.go:241 +0x7a fp=0xc00004ef00 sp=0xc00004eed0 pc=0x58393a
        runtime.doInit(0x818dc0)
        	/workdir/go/src/runtime/proc.go:5222 +0x8a fp=0xc00004ef30 sp=0xc00004ef00 pc=0x43fe5a
        runtime.doInit(0x815640)
        	/workdir/go/src/runtime/proc.go:5217 +0x57 fp=0xc00004ef60 sp=0xc00004ef30 pc=0x43fe27
        runtime.main()
        	/workdir/go/src/runtime/proc.go:190 +0x1da fp=0xc00004efe0 sp=0xc00004ef60 pc=0x43395a
        runtime.goexit()
        	/workdir/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc00004efe8 sp=0xc00004efe0 pc=0x465a01
        
        goroutine 2 [force gc (idle)]:
        runtime.gopark(0x67e938, 0x832550, 0x1411, 0x1)
        	/workdir/go/src/runtime/proc.go:304 +0xe0 fp=0xc000040fb0 sp=0xc000040f90 pc=0x433d70
        runtime.goparkunlock(...)
        	/workdir/go/src/runtime/proc.go:310
        runtime.forcegchelper()
        	/workdir/go/src/runtime/proc.go:253 +0xb7 fp=0xc000040fe0 sp=0xc000040fb0 pc=0x433c27
        runtime.goexit()
        	/workdir/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc000040fe8 sp=0xc000040fe0 pc=0x465a01
        created by runtime.init.5
        	/workdir/go/src/runtime/proc.go:242 +0x35
        
        goroutine 3 [GC sweep wait]:
        runtime.gopark(0x67e938, 0x832940, 0x140c, 0x1)
        	/workdir/go/src/runtime/proc.go:304 +0xe0 fp=0xc0000417a8 sp=0xc000041788 pc=0x433d70
        runtime.goparkunlock(...)
        	/workdir/go/src/runtime/proc.go:310
        runtime.bgsweep(0xc000014070)
        	/workdir/go/src/runtime/mgcsweep.go:70 +0x9c fp=0xc0000417d8 sp=0xc0000417a8 pc=0x4253fc
        runtime.goexit()
        	/workdir/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc0000417e0 sp=0xc0000417d8 pc=0x465a01
        created by runtime.gcenable
        	/workdir/go/src/runtime/mgc.go:210 +0x5c
        
        goroutine 4 [GC scavenge wait]:
        runtime.gopark(0x67e938, 0x832b00, 0x140d, 0x1)
        	/workdir/go/src/runtime/proc.go:304 +0xe0 fp=0xc000041f38 sp=0xc000041f18 pc=0x433d70
        runtime.goparkunlock(...)
        	/workdir/go/src/runtime/proc.go:310
        runtime.bgscavenge(0xc000014070)
        	/workdir/go/src/runtime/mgcscavenge.go:257 +0xe1 fp=0xc000041fd8 sp=0xc000041f38 pc=0x424a51
        runtime.goexit()
        	/workdir/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc000041fe0 sp=0xc000041fd8 pc=0x465a01
        created by runtime.gcenable
        	/workdir/go/src/runtime/mgc.go:211 +0x7e
        
        goroutine 18 [finalizer wait]:
        runtime.gopark(0x67e938, 0x84ec18, 0x1410, 0x1)
        	/workdir/go/src/runtime/proc.go:304 +0xe0 fp=0xc000040758 sp=0xc000040738 pc=0x433d70
        runtime.goparkunlock(...)
        	/workdir/go/src/runtime/proc.go:310
        runtime.runfinq()
        	/workdir/go/src/runtime/mfinal.go:175 +0xa3 fp=0xc0000407e0 sp=0xc000040758 pc=0x41acd3
        runtime.goexit()
        	/workdir/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc0000407e8 sp=0xc0000407e0 pc=0x465a01
        created by runtime.createfing
        	/workdir/go/src/runtime/mfinal.go:156 +0x61
        
        goroutine 5 [running]:
        	goroutine running on other thread; stack unavailable
        created by runtime_test.init.2
        	/workdir/go/src/runtime/crash_unix_test.go:240 +0x75
        
        rax    0x2
        rbx    0x0
        rcx    0x467315
        rdx    0x2
        rdi    0x2
        rsi    0x66a462
        rbp    0x7ffe6832def0
        rsp    0x7ffe6832deb8
        r8     0x2
        r9     0x2
        r10    0x4
        r11    0x246
        r12    0xf1
        r13    0x0
        r14    0x6b2fa8
        r15    0x0
        rip    0x467315
        rflags 0x246
        cs     0x33
        fs     0x0
        gs     0x0
        
        -----
        
        
FAIL
FAIL	runtime	174.345s

I've only seen this once, so it's not clear to me whether this is an existing issue, but see previously #26393.

CC @aclements @mknyszek

@bcmills bcmills added this to the Go1.14 milestone Aug 13, 2019
@rsc rsc modified the milestones: Go1.14, Backlog Oct 9, 2019
@bcmills bcmills changed the title runtime: TestPanicSystemstack flake on linux-amd64-longtest runtime: TestPanicSystemstack is flaky on linux-amd64-longtest builder Feb 19, 2020
@bcmills
Copy link
Member Author

@bcmills bcmills commented Aug 18, 2020

@bcmills bcmills changed the title runtime: TestPanicSystemstack is flaky on linux-amd64-longtest builder runtime: TestPanicSystemstack is flaky on linux Nov 9, 2020
@bcmills
Copy link
Member Author

@bcmills bcmills commented Nov 9, 2020

@prattmic
Copy link
Member

@prattmic prattmic commented Nov 9, 2020

This test is racy. The child process prints "x", then blocks on a lock. The parent waits for "x", sends SIGQUIT, and then expects the resulting stack trace to reflect the blocked state. However, the signal may be delivered before the child gets to blocking, resulting in this failure.

There isn't a great way for the parent to determine that the child is blocked, but at least it could sleep a bit to make it less likely to hit the bad case.

@prattmic prattmic added NeedsFix and removed NeedsInvestigation labels Nov 9, 2020
@gopherbot
Copy link

@gopherbot gopherbot commented Nov 9, 2020

Change https://golang.org/cl/268578 mentions this issue: runtime: give test child time to block

@bcmills
Copy link
Member Author

@bcmills bcmills commented Nov 10, 2020

There isn't a great way for the parent to determine that the child is blocked, but at least it could sleep a bit to make it less likely to hit the bad case.

Would it be possible to move the blockOnSystemStackInternal call to a separate goroutine, and let runtime.BlockOnSystemStack use some other hook to wait for it before doing the print itself?

(Or is the systemstack goroutine something that can't be introspected through the runtime's usual hooks?)

@bcmills bcmills added the Testing label Nov 10, 2020
@prattmic
Copy link
Member

@prattmic prattmic commented Nov 10, 2020

Hm, we could perhaps have runtime.BlockOnSystemStack spin waiting on the lock word to change to mutex_sleeping, though that is still a bit racy (and ugly): https://cs.opensource.google/go/go/+/master:src/runtime/lock_futex.go;l=100-106;drc=d36169120199e7f2b8c517fa6d82333496bb0a0a;bpv=0;bpt=1

However, that may be worse than sleeping. :)

@prattmic
Copy link
Member

@prattmic prattmic commented Nov 10, 2020

There's a bit of an orthogonal issue with this test, which I'm not sure if it is a regression since golang.org/cl/38012 or not.

The passing version of this test looks like this:

        goroutine 0 [idle]:                                                                                                                  
        runtime.futex(0x85d4a8, 0x200000080, 0x0, 0x0, 0x0, 0x7ffc233c9650, 0xc000030800, 0x7ffc233c9650, 0x7ffc233c9650, 0x40e72d, ...)     
                /usr/local/google/home/mpratt/src/go/src/runtime/sys_linux_amd64.s:579 +0x21 fp=0x7ffc233c95c8 sp=0x7ffc233c95c0 pc=0x47e521
        runtime.futexsleep(0x85d4a8, 0x2, 0xffffffffffffffff)
                /usr/local/google/home/mpratt/src/go/src/runtime/os_linux.go:44 +0x46 fp=0x7ffc233c9618 sp=0x7ffc233c95c8 pc=0x438586        
        runtime.lock2(0x85d4a8)                                                                                                              
                /usr/local/google/home/mpratt/src/go/src/runtime/lock_futex.go:106 +0x14d fp=0x7ffc233c9660 sp=0x7ffc233c9618 pc=0x40e72d    
        runtime.lockWithRank(...)                                                                                                            
                /usr/local/google/home/mpratt/src/go/src/runtime/lockrank_off.go:22                                                          
        runtime.lock(...)                                                                                                                    
                /usr/local/google/home/mpratt/src/go/src/runtime/lock_futex.go:47                                             
        runtime.blockOnSystemStackInternal()                                                                                                 
                /usr/local/google/home/mpratt/src/go/src/runtime/export_test.go:421 +0x68 fp=0x7ffc233c9680 sp=0x7ffc233c9660 pc=0x46dd68
        runtime.systemstack(0x47a754)                                                                                                        
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:375 +0x66 fp=0x7ffc233c9688 sp=0x7ffc233c9680 pc=0x47a906       
        runtime.mstart()                                                                                                                     
                /usr/local/google/home/mpratt/src/go/src/runtime/proc.go:1183 fp=0x7ffc233c9690 sp=0x7ffc233c9688 pc=0x441cc0                
                                                                                                                                             
        goroutine 1 [running, locked to thread]:                                                                                             
        runtime.systemstack_switch()                                                                                                         
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:335 fp=0xc000063ca0 sp=0xc000063c98 pc=0x47a880                 
        runtime.BlockOnSystemStack(...)                                                                                                      
                /usr/local/google/home/mpratt/src/go/src/runtime/export_test.go:415                                                          
        runtime_test.testPanicSystemstackInternal()                                                                                          
                /usr/local/google/home/mpratt/src/go/src/runtime/crash_unix_test.go:282 +0x2d fp=0xc000063cb8 sp=0xc000063ca0 pc=0x58decd    
        runtime_test.init.2()
                /usr/local/google/home/mpratt/src/go/src/runtime/crash_unix_test.go:277 +0x85 fp=0xc000063ce8 sp=0xc000063cb8 pc=0x58de65
        runtime.doInit(0x811120)
                /usr/local/google/home/mpratt/src/go/src/runtime/proc.go:6172 +0xec fp=0xc000063e38 sp=0xc000063ce8 pc=0x44cf6c
        runtime.doInit(0x80e2a0)
                /usr/local/google/home/mpratt/src/go/src/runtime/proc.go:6149 +0x72 fp=0xc000063f88 sp=0xc000063e38 pc=0x44cef2
        runtime.main()
                /usr/local/google/home/mpratt/src/go/src/runtime/proc.go:208 +0x205 fp=0xc000063fe0 sp=0xc000063f88 pc=0x43f205
        runtime.goexit()
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:1367 +0x1 fp=0xc000063fe8 sp=0xc000063fe0 pc=0x47c6c1

...

        goroutine 6 [running]:                                                                                                               
                goroutine running on other thread; stack unavailable                                                                         
        created by runtime_test.init.2                                                                                                       
                /usr/local/google/home/mpratt/src/go/src/runtime/crash_unix_test.go:276 +0x7d 

...

        goroutine 0 [idle]:
        runtime.futex(0x85d4a8, 0x200000080, 0x0, 0x0, 0xc000000000, 0x0, 0x0, 0x7fbd3a526e48, 0x7fbd3a526e48, 0x40e72d, ...)
                /usr/local/google/home/mpratt/src/go/src/runtime/sys_linux_amd64.s:579 +0x21 fp=0x7fbd3a526dc0 sp=0x7fbd3a526db8 pc=0x47e521
        runtime.futexsleep(0x85d4a8, 0x2, 0xffffffffffffffff)
                /usr/local/google/home/mpratt/src/go/src/runtime/os_linux.go:44 +0x46 fp=0x7fbd3a526e10 sp=0x7fbd3a526dc0 pc=0x438586
        runtime.lock2(0x85d4a8)
                /usr/local/google/home/mpratt/src/go/src/runtime/lock_futex.go:106 +0x14d fp=0x7fbd3a526e58 sp=0x7fbd3a526e10 pc=0x40e72d
        runtime.lockWithRank(...)
                /usr/local/google/home/mpratt/src/go/src/runtime/lockrank_off.go:22
        runtime.lock(...)
                /usr/local/google/home/mpratt/src/go/src/runtime/lock_futex.go:47
        runtime.blockOnSystemStackInternal()
                /usr/local/google/home/mpratt/src/go/src/runtime/export_test.go:420 +0x57 fp=0x7fbd3a526e78 sp=0x7fbd3a526e58 pc=0x46dd57
        runtime.systemstack(0x7fbd30000020)
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:375 +0x66 fp=0x7fbd3a526e80 sp=0x7fbd3a526e78 pc=0x47a906
        runtime.mstart()
                /usr/local/google/home/mpratt/src/go/src/runtime/proc.go:1183 fp=0x7fbd3a526e88 sp=0x7fbd3a526e80 pc=0x441cc0
        goroutine 6 [running]:
        runtime.systemstack_switch()
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:335 fp=0xc0000527c8 sp=0xc0000527c0 pc=0x47a880
        runtime.BlockOnSystemStack(...)
                /usr/local/google/home/mpratt/src/go/src/runtime/export_test.go:415
        runtime_test.testPanicSystemstackInternal()
                /usr/local/google/home/mpratt/src/go/src/runtime/crash_unix_test.go:282 +0x2d fp=0xc0000527e0 sp=0xc0000527c8 pc=0x58decd
        runtime.goexit()
                /usr/local/google/home/mpratt/src/go/src/runtime/asm_amd64.s:1367 +0x1 fp=0xc0000527e8 sp=0xc0000527e0 pc=0x47c6c1
        created by runtime_test.init.2
                /usr/local/google/home/mpratt/src/go/src/runtime/crash_unix_test.go:276 +0x7d

We print goroutine 6 twice, once on its own (no stack, since it is running), and once with the systemstack g0.

  1. Is it intentional to print goroutine 6 twice?
  2. There is no spacing between g0 and g6 in the stack trace, but there is between g0 and g1 at the beginning.
  3. We nominally don't print running goroutine stacks since it is racy, but we do print when other is on the system stack. While in this case the system stack thread is blocked, we don't have any way to know for sure (since g0 is always _Gidle). Isn't it equally unsafe to print these stacks?

cc @aclements

@gopherbot gopherbot closed this in b63db7f Nov 18, 2020
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Linked pull requests

Successfully merging a pull request may close this issue.

None yet
4 participants