Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 96 additions & 48 deletions pkg/tcpip/link/fdbased/endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,17 +236,11 @@ type Options struct {
ProcessorsPerChannel int
}

// fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
// support in the host kernel. This allows us to use multiple FD's to receive
// from the same underlying NIC. The fanoutID needs to be the same for a given
// set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT
// option for an FD with a fanoutID already in use by another FD for a different
// NIC will return an EINVAL.
//
// Since fanoutID must be unique within the network namespace, we start with
// the PID to avoid collisions. The only way to be sure of avoiding collisions
// is to run in a new network namespace.
var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid()))
// fallbackFanoutID is used only when PACKET_FANOUT_FLAG_UNIQUEID is not
// supported by the host kernel. It preserves the PID-seeded best-effort behavior:
// seed from unix.Getpid() and increment per endpoint. This is not
// collision-free across sentries that share a network namespace.
var fallbackFanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid()))

// New creates a new fd-based endpoint.
//
Expand Down Expand Up @@ -297,9 +291,10 @@ func New(opts *Options) (stack.LinkEndpoint, error) {
}
}

// Increment fanoutID to ensure that we don't re-use the same fanoutID
// for the next endpoint.
fid := fanoutID.Add(1)
// Fanout id allocated by the kernel for this endpoint. All AF_PACKET FDs
// belonging to this endpoint must use the same id. -1 means no AF_PACKET
// FD has allocated an id yet; 0 is a valid fanout id.
fid := int32(-1)

// Create per channel dispatchers.
for _, fd := range opts.FDs {
Expand All @@ -326,7 +321,20 @@ func New(opts *Options) (stack.LinkEndpoint, error) {
opts.ProcessorsPerChannel = max(1, runtime.GOMAXPROCS(0)/len(opts.FDs))
}

inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts)
if isPacket, err := isPacketSocket(fd, isSocket); err != nil {
return nil, err
} else if isPacket {
if fid < 0 {
fid, err = createPacketFanoutGroup(fd)
} else {
err = joinPacketFanoutGroup(fd, fid)
}
if err != nil {
return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
}
}

inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, opts)
if err != nil {
return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
}
Expand All @@ -336,7 +344,7 @@ func New(opts *Options) (stack.LinkEndpoint, error) {
return e, nil
}

func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) {
func createInboundDispatcher(e *endpoint, fd int, isSocket bool, opts *Options) (linkDispatcher, error) {
// By default use the readv() dispatcher as it works with all kinds of
// FDs (tap/tun/unix domain sockets and af_packet).
inboundDispatcher, err := newReadVDispatcher(fd, e, opts)
Expand All @@ -345,38 +353,6 @@ func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts
}

if isSocket {
sa, err := unix.Getsockname(fd)
if err != nil {
return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
}
switch sa.(type) {
case *unix.SockaddrLinklayer:
// Enable PACKET_FANOUT mode if the underlying socket is of type
// AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will
// prevent gvisor from receiving fragmented packets and the host does the
// reassembly on our behalf before delivering the fragments. This makes it
// hard to test fragmentation reassembly code in Netstack.
//
// See: include/uapi/linux/if_packet.h (struct fanout_args).
//
// NOTE: We are using SetSockOptInt here even though the underlying
// option is actually a struct. The code follows the example in the
// kernel documentation as described at the link below:
//
// See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
//
// This works out because the actual implementation for the option zero
// initializes the structure and will initialize the max_members field
// to a proper value if zero.
//
// See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881
const fanoutType = unix.PACKET_FANOUT_HASH
fanoutArg := (int(fID) & 0xffff) | fanoutType<<16
if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
}
}

switch e.packetDispatchMode {
case PacketMMap:
inboundDispatcher, err = newPacketMMapDispatcher(fd, e, opts)
Expand All @@ -399,6 +375,78 @@ func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts
return inboundDispatcher, nil
}

func isPacketSocket(fd int, isSocket bool) (bool, error) {
if !isSocket {
return false, nil
}
sa, err := unix.Getsockname(fd)
if err != nil {
return false, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
}
_, ok := sa.(*unix.SockaddrLinklayer)
return ok, nil
}

// createPacketFanoutGroup enables PACKET_FANOUT for the first AF_PACKET socket
// in an endpoint and returns the fanout id the group joined.
//
// All AF_PACKET FDs that back the same endpoint must join the same fanout
// group so the host kernel consistently hashes packets for a flow to one FD.
// Fanout ids are unique within the Linux network namespace that owns the
// sockets; reusing an id for a different NIC in that namespace fails with
// EINVAL.
//
// We ask the kernel to allocate the id via PACKET_FANOUT_FLAG_UNIQUEID so the
// id is guaranteed unique within the namespace even when multiple sentries
// share it. If that setsockopt fails (e.g. the host kernel predates
// PACKET_FANOUT_FLAG_UNIQUEID), we fall back to the PID-seeded
// fallbackFanoutID, which is best-effort and not collision-free across
// sentries that share a network namespace.
//
// We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will prevent gvisor from
// receiving fragmented packets and the host does the reassembly on our behalf
// before delivering the fragments. This makes it hard to test fragmentation
// reassembly code in Netstack.
//
// See: include/uapi/linux/if_packet.h (struct fanout_args).
//
// NOTE: We are using SetSockOptInt here even though the underlying option is
// actually a struct. The code follows the example in the kernel documentation
// as described at the link below:
//
// See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
//
// This works out because the actual implementation for the option zero
// initializes the structure and will initialize the max_members field to a
// proper value if zero.
//
// See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881
func createPacketFanoutGroup(fd int) (int32, error) {
const fanoutType = unix.PACKET_FANOUT_HASH
fanoutArg := (fanoutType | unix.PACKET_FANOUT_FLAG_UNIQUEID) << 16
if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
uniqueIDErr := err
fallbackID := fallbackFanoutID.Add(1)
fanoutArg = (int(fallbackID) & 0xffff) | fanoutType<<16
if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
return 0, fmt.Errorf("UNIQUEID failed (%v); fallback fanout id %d also failed: %v", uniqueIDErr, fanoutArg&0xffff, err)
}
return int32(fanoutArg & 0xffff), nil
}

fanoutArg, err := unix.GetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT)
if err != nil {
return 0, fmt.Errorf("getsockopt(PACKET_FANOUT) failed: %v", err)
}
return int32(fanoutArg & 0xffff), nil
}

func joinPacketFanoutGroup(fd int, fID int32) error {
const fanoutType = unix.PACKET_FANOUT_HASH
fanoutArg := (int(fID) & 0xffff) | fanoutType<<16
return unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg)
}

func isSocketFD(fd int) (bool, error) {
var stat unix.Stat_t
if err := unix.Fstat(fd, &stat); err != nil {
Expand Down