Skip to content

runtime: deadlock while all goroutines are "IO wait" #64894

@vincentbernat

Description

@vincentbernat

Go version

go version go1.21.5 linux/amd64

What operating system and processor architecture are you using (go env)?

GO111MODULE=''
GOARCH='amd64'
GOBIN=''
GOCACHE='/home/bernat/.cache/go-build'
GOENV='/home/bernat/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMODCACHE='/home/bernat/src/gocode/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='linux'
GOPATH='/home/bernat/src/gocode'
GOPRIVATE=''
GOPROXY='direct'
GOROOT='/usr/lib/go-1.21'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/lib/go-1.21/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.21.5'
GCCGO='gccgo'
GOAMD64='v1'
AR='ar'
CC='gcc'
CXX='g++'
CGO_ENABLED='1'
GOMOD='/home/bernat/code/free/monitor-rtp/go.mod'
GOWORK=''
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build4122763557=/tmp/go-build -gno-record-gcc-switches'

What did you do?

package main

import (
	"encoding/binary"
	"flag"
	"fmt"
	"log"
	"net"
	"net/netip"
	"os"
	"sync"
	"syscall"
	"time"

	"golang.org/x/net/ipv4"
)

var (
	port            = flag.Int("port", 3001, "port to listen to")
	iface           = flag.String("interface", "eth0", "interface to listen to")
	maxDatagramSize = flag.Int("snaplen", 1500, "datagram size")
	seqThreshold    = flag.Int("seq-threshold", 1, "threshold for sequence number discontinuity")
	timeThreshold   = flag.Duration("time-threshold", time.Second, "threshold for time discontinuity")
)

type state struct {
	Seq  uint16
	Time time.Time
}

func listen(ifi *net.Interface, address string) {
	addr := netip.MustParseAddr(address)
	// We have to use syscalls instead of ListenMulticastUDP(). See
	// https://github.com/golang/go/issues/3820.
	s, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, syscall.IPPROTO_UDP)
	if err != nil {
		log.Fatalf("%s: Socket: ", address, err)
	}
	if err := syscall.SetsockoptInt(s, syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1); err != nil {
		log.Fatalf("%s: SetsockoptInt: ", address, err)
	}
	lsa := syscall.SockaddrInet4{Port: *port, Addr: addr.As4()}
	if err := syscall.Bind(s, &lsa); err != nil {
		log.Fatalf("%s: Bind: ", address, err)
	}
	f := os.NewFile(uintptr(s), "")
	c, err := net.FilePacketConn(f)
	if err != nil {
		log.Fatalf("%s: FilePacketConn: ", address, err)
	}
	p := ipv4.NewPacketConn(c)
	if err := p.JoinGroup(ifi, &net.UDPAddr{IP: addr.AsSlice()}); err != nil {
		log.Fatalf("%s: JoinGroup: ", address, err)
	}
	b := make([]byte, *maxDatagramSize)
	lastState := map[string]state{}
	for {
		n, _, src, err := p.ReadFrom(b)
		if err != nil {
			log.Fatalf("%s: ReadFrom: ", address, err)
		}
		if n < 12 {
			log.Printf("%s: from %s, got only %d bytes", address, src, n)
			continue
		}
		srcIP := src.String()
		currentState := state{
			Seq:  binary.BigEndian.Uint16(b[2:4]),
			Time: time.Now(),
		}
		previousState, ok := lastState[srcIP]
		if ok {
			if diffSeq := currentState.Seq - previousState.Seq; diffSeq > uint16(*seqThreshold) && *seqThreshold > 0 {
				log.Printf("%s: seq discontinuity from %s %d → %d == %d",
					address, srcIP,
					previousState.Seq, currentState.Seq, diffSeq)
			}
			if diffTime := currentState.Time.Sub(previousState.Time); diffTime > *timeThreshold && *timeThreshold > 0 {
				log.Printf("%s: time discontinuity from %s %s → %s == %s",
					address, srcIP,
					previousState.Time.Format("2006-01-02T15:04:05"),
					currentState.Time.Format("2006-01-02T15:04:05"),
					diffTime)
			}
		}
		lastState[srcIP] = currentState
	}
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(flag.CommandLine.Output(), "monitor [OPTION] address...\n")
		flag.PrintDefaults()
	}
	flag.Parse()

	if len(flag.Args()) == 0 {
		flag.Usage()
		os.Exit(1)
	}

	ifi, err := net.InterfaceByName(*iface)
	if err != nil {
		log.Fatal("InterfaceByName: ", err)
	}

	var wg sync.WaitGroup
	for _, address := range flag.Args() {
		address := address
		wg.Add(1)
		go func() {
			defer wg.Done()
			log.Printf("listening to %s on %s", address, ifi.Name)
			listen(ifi, address)
		}()
	}

	wg.Wait()
}

What did you expect to see?

No deadlock.

What did you see instead?

Dead lock.

fatal error: all goroutines are asleep - deadlock!

goroutine 1 [semacquire, 973 minutes]:
sync.runtime_Semacquire(0xc000328ae0?)
        /usr/lib/go-1.21/src/runtime/sema.go:62 +0x25
sync.(*WaitGroup).Wait(0x4dbcbb?)
        /usr/lib/go-1.21/src/sync/waitgroup.go:116 +0x48
main.main()
        /home/bernat/code/free/monitor-rtp/main.go:118 +0x205

goroutine 18 [IO wait]:
internal/poll.runtime_pollWait(0x7f0377ebc6d0, 0x72)
        /usr/lib/go-1.21/src/runtime/netpoll.go:343 +0x85
internal/poll.(*pollDesc).wait(0xc0001a0180?, 0xc00042bd10?, 0x0)
        /usr/lib/go-1.21/src/internal/poll/fd_poll_runtime.go:84 +0x27
internal/poll.(*pollDesc).waitRead(...)
        /usr/lib/go-1.21/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).RawRead(0xc0001a0180, 0xc00042bd10)
        /usr/lib/go-1.21/src/internal/poll/fd_unix.go:708 +0x12a
net.(*rawConn).Read(0xc00019e040, 0xc000089000?)
        /usr/lib/go-1.21/src/net/rawconn.go:44 +0x36
golang.org/x/net/internal/socket.(*Conn).recvMsg(0xc0001a4040, 0xc00050cba0, 0x0)
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/internal/socket/rawconn_msg.go:27 +0x152
golang.org/x/net/internal/socket.(*Conn).RecvMsg(...)
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/internal/socket/socket.go:247
golang.org/x/net/ipv4.(*payloadHandler).ReadFrom(0xc0001a6150, {0xc0001aac00, 0x5dc, 0x5dc})
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/ipv4/payload_cmsg.go:31 +0x229
main.listen(0xc0000a41b0?, {0x7ffff495cb42, 0xa})
        /home/bernat/code/free/monitor-rtp/main.go:58 +0x5dc
main.main.func2()
        /home/bernat/code/free/monitor-rtp/main.go:114 +0x146
created by main.main in goroutine 1
        /home/bernat/code/free/monitor-rtp/main.go:111 +0x13b

goroutine 19 [IO wait]:
internal/poll.runtime_pollWait(0x7f0377ebcca0, 0x72)
        /usr/lib/go-1.21/src/runtime/netpoll.go:343 +0x85
internal/poll.(*pollDesc).wait(0xc000224000?, 0xc00007db80?, 0x0)
        /usr/lib/go-1.21/src/internal/poll/fd_poll_runtime.go:84 +0x27
internal/poll.(*pollDesc).waitRead(...)
        /usr/lib/go-1.21/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).RawRead(0xc000224000, 0xc00007db80)
        /usr/lib/go-1.21/src/internal/poll/fd_unix.go:708 +0x12a
net.(*rawConn).Read(0xc000222010, 0xc000089000?)
        /usr/lib/go-1.21/src/net/rawconn.go:44 +0x36
golang.org/x/net/internal/socket.(*Conn).recvMsg(0xc000228000, 0xc00062d800, 0x0)
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/internal/socket/rawconn_msg.go:27 +0x152
golang.org/x/net/internal/socket.(*Conn).RecvMsg(...)
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/internal/socket/socket.go:247
golang.org/x/net/ipv4.(*payloadHandler).ReadFrom(0xc00022a010, {0xc00022e000, 0x5dc, 0x5dc})
        /home/bernat/src/gocode/pkg/mod/golang.org/x/net@v0.19.0/ipv4/payload_cmsg.go:31 +0x229
main.listen(0xc0000a41b0?, {0x7ffff495cb4d, 0xb})
        /home/bernat/code/free/monitor-rtp/main.go:58 +0x5dc
main.main.func2()
        /home/bernat/code/free/monitor-rtp/main.go:114 +0x146
created by main.main in goroutine 1
        /home/bernat/code/free/monitor-rtp/main.go:111 +0x13b

There are 202 goroutines in total. Except the first one, every other goroutine is exactly in the same state: stuck on /home/bernat/code/free/monitor-rtp/main.go:58//usr/lib/go-1.21/src/runtime/netpoll.go:343 +0x85, status IO wait or IO wait, 973 minutes.

The issue takes some time to appear (here 16 hours, but sometimes several days). I was unable to reproduce it quickly.

Metadata

Metadata

Assignees

No one assigned

    Labels

    NeedsInvestigationSomeone must examine and confirm this is a valid issue and not a duplicate of an existing one.compiler/runtimeIssues related to the Go compiler and/or runtime.

    Type

    No type

    Projects

    Status

    Todo

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions