Skip to content

net: fix spurious netpoll "failed to associate" error on Solaris #7410

@4ad

Description

@4ad
I can't reproduce this with go test net, but it sometimes happens (more often now) when
running all.bash.

netpollupdate: failed to associate (81)
fatal error: netpollupdate: failed to associate

runtime stack:
runtime.throw(0x7d8395)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/panic.c:463 +0x69
runtime.netpollupdate(0xfffffd7fff150768, 0x400000000)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/netpoll_solaris.c:101 +0x109
runtime.netpoll(0x7e5601)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/netpoll_solaris.c:179 +0x1d3
findrunnable()
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/proc.c:1244 +0x386
schedule()
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/proc.c:1342 +0xe3
park0(0xc2080027e0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/proc.c:1405 +0xfe
runtime.mcall(0x42afed)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/asm_amd64.s:181 +0x54

goroutine 16 [chan receive]:
testing.RunTests(0x7069b0, 0x7e2b60, 0x79, 0x79, 0x1)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/testing/testing.go:504 +0x909
testing.Main(0x7069b0, 0x7e2b60, 0x79, 0x79, 0x7d93a0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/testing/testing.go:435 +0x9e
main.main()
    /tmp/go-build639245680/net/_test/_testmain.go:321 +0x9c

goroutine 87 [syscall]:
syscall.sysvicall6(0xfffffd7fff2b03f0, 0x3, 0xc2080c3110, 0x800000, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/syscall_solaris.goc:65 +0x54
syscall.Open(0x63fd70, 0xe, 0x800000, 0x0, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/syscall/zsyscall_solaris_amd64.go:447 +0x11e
os.OpenFile(0x63fd70, 0xe, 0x0, 0x0, 0x1, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/os/file_unix.go:77 +0x9d
os.Open(0x63fd70, 0xe, 0x7070d0, 0x0, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/os/file.go:239 +0x66
net.open(0x63fd70, 0xe, 0x0, 0x0, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/parse.go:66 +0x58
net.readProtocols()
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/lookup_unix.go:19 +0x4e
sync.(*Once).Do(0x7ed800, 0x706f90)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/sync/once.go:40 +0x9a
net.lookupProtocol(0x647dd4, 0x4, 0x0, 0x0, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/lookup_unix.go:47 +0x6a
net.parseNetwork(0x647dd0, 0x8, 0x647dd0, 0x3, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/dial.go:88 +0x726
net.ResolveIPAddr(0x647dd0, 0x8, 0x63ff30, 0x9, 0xc2080d9f01, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/iprawsock.go:40 +0x93
net.TestResolveIPAddr(0xc2080e0000)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/ipraw_test.go:72 +0xeb
testing.tRunner(0xc2080e0000, 0x7e2f98)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/testing/testing.go:422 +0x9d
created by testing.RunTests
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/testing/testing.go:503 +0x8d9

goroutine 40 [IO wait]:
net.runtime_pollWait(0xfffffd7fff1506b8, 0x77, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/runtime/netpoll.goc:146 +0x76
net.(*pollDesc).Wait(0xc2080565a0, 0x77, 0x0, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/fd_poll_runtime.go:84 +0x46
net.(*pollDesc).WaitWrite(0xc2080565a0, 0x0, 0x0)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/fd_poll_runtime.go:93 +0x42
net.(*netFD).connect(0xc208056540, 0x0, 0x0, 0xfffffd7fff14f820, 0xc20803eb40, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/fd_unix.go:96 +0x1e9
net.(*netFD).dial(0xc208056540, 0xfffffd7fff14f7d0, 0x0, 0xfffffd7fff14f7d0,
0xc20800cc60, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/sock_posix.go:121 +0x333
net.socket(0x63c690, 0x3, 0x2, 0x2, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/sock_posix.go:91 +0x43b
net.internetSocket(0x63c690, 0x3, 0xfffffd7fff14f7d0, 0x0, 0xfffffd7fff14f7d0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/ipsock_posix.go:136 +0x179
net.dialTCP(0x63c690, 0x3, 0x0, 0xc20800cc60, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/tcpsock_posix.go:155 +0x11c
net.dialSingle(0x63c690, 0x3, 0xc2080011e0, 0xf, 0x0, ...)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/dial.go:238 +0x426
net.func·021(0xfffffd7fff14f768, 0xc20800cc60)
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/dial.go:200 +0xb4
created by net.dialMulti
    /home/dfc/work/solaris-amd64-smartos-da7776a7f91c/go/src/pkg/net/dial.go:210 +0x24d

Running go test net in a loop has been very reliable for me, both on
my 2-core laptop and on a 24-way machine.

    for i in `seq 30`; do go test net || break; done
    ok   net 20.073s
    ok   net 20.356s
    ok   net 19.911s
    ok   net 20.162s
    ok   net 20.160s
    ok   net 19.995s
    ok   net 20.180s
    ok   net 29.973s
    ok   net 25.109s
    ok   net 19.795s
    ok   net 20.055s
    ok   net 20.084s
    ok   net 20.086s
    ok   net 19.938s
    ok   net 24.910s
    ok   net 20.093s
    ok   net 24.745s
    ok   net 20.114s
    ok   net 20.168s
    ok   net 24.943s
    ok   net 20.189s
    ok   net 19.923s
    ok   net 20.305s
    ok   net 20.135s
    ok   net 20.332s
    ok   net 19.953s
    ok   net 20.224s
    ok   net 20.381s
    ok   net 20.212s
    ok   net 20.159s
    : oos:aram;

    : oos:net; for i in `seq 100`; do net.test -test.short || break; done
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    PASS
    : oos:net;

It panics as it violates an assertion I've put in there. That assertion can only fail if
there's a race between closing file descriptors and rearming them in the netpoller.

Thinking about this, I've probably been too cautious and it's probably safe to remove
that assertion. The worst that could happen is a spurious netpoller wake. The best
strategy is fixing the underlying cause, however.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions