diff --git a/hypervisor/cloudhypervisor/console.go b/hypervisor/cloudhypervisor/console.go index ce3ccaee..b0ff277a 100644 --- a/hypervisor/cloudhypervisor/console.go +++ b/hypervisor/cloudhypervisor/console.go @@ -14,14 +14,14 @@ import ( // For direct-boot VMs (OCI): opens the virtio-console PTY allocated by CH. // // The endpoint is stored in VM.ConsolePath at start time. -// The caller is responsible for closing the returned ReadCloser. -func (ch *CloudHypervisor) Console(ctx context.Context, ref string) (io.ReadCloser, error) { +// The caller is responsible for closing the returned ReadWriteCloser. +func (ch *CloudHypervisor) Console(ctx context.Context, ref string) (io.ReadWriteCloser, error) { info, err := ch.Inspect(ctx, ref) if err != nil { return nil, err } - var conn io.ReadCloser + var conn io.ReadWriteCloser if err := ch.withRunningVM(info.ID, func(_ int) error { path := info.ConsolePath if path == "" { diff --git a/hypervisor/hypervisor.go b/hypervisor/hypervisor.go index 2a9bbda7..e18a8b29 100644 --- a/hypervisor/hypervisor.go +++ b/hypervisor/hypervisor.go @@ -26,7 +26,7 @@ type Hypervisor interface { Inspect(ctx context.Context, ref string) (*types.VM, error) List(context.Context) ([]*types.VM, error) Delete(ctx context.Context, refs []string, force bool) ([]string, error) - Console(ctx context.Context, ref string) (io.ReadCloser, error) + Console(ctx context.Context, ref string) (io.ReadWriteCloser, error) // TODO SNAPSHOT // TODO RESTORE diff --git a/network/cni/config.go b/network/cni/config.go index 3526054f..67f239c9 100644 --- a/network/cni/config.go +++ b/network/cni/config.go @@ -17,12 +17,12 @@ import ( const defaultQueueSize = 256 // Config creates the network namespace, runs CNI ADD for each NIC, sets up -// bridge + tap inside the netns, and returns NetworkConfigs ready for CH --net. +// TC redirect (eth↔tap) inside the netns, and returns NetworkConfigs ready for CH --net. // -// Flow per NIC (from issue #1): +// Flow per NIC: // 1. 
Create named netns cocoon-{vmID} // 2. CNI ADD (containerID=vmID, netns path, ifName=eth{i}) -// 3. Inside netns: flush eth{i} IP, create br{i}+tap{i}, bridge them +// 3. Inside netns: flush eth{i} IP, create tap{i}, wire via TC ingress mirred // 4. Return NetworkConfig{Tap: "tap{i}", Mac: generated, Network: CNI result} func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig) (configs []*types.NetworkConfig, retErr error) { if c.networkConfList == nil || c.cniConf == nil { @@ -61,7 +61,6 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types for i := range numNICs { ifName := fmt.Sprintf("eth%d", i) tapName := fmt.Sprintf("tap%d", i) - brName := fmt.Sprintf("br%d", i) // Step 2: CNI ADD — creates veth pair, assigns IP via IPAM. rt := &libcni.RuntimeConf{ @@ -80,19 +79,17 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types return nil, fmt.Errorf("parse CNI result: %w", err) } - // Step 3: inside netns — flush IP, create bridge + tap (platform-specific). - if setupErr := setupBridgeTap(nsPath, ifName, brName, tapName); setupErr != nil { - return nil, fmt.Errorf("setup bridge/tap %s: %w", vmID, setupErr) - } - - mac, err := utils.GenerateMAC() - if err != nil { - return nil, err + // Step 3: inside netns — flush IP, create tap, wire via TC redirect (platform-specific). + // Returns eth0's MAC so the guest virtio-net uses the same address, + // required for anti-spoofing CNI plugins (Cilium, Calico eBPF, VPC ENI). 
+ mac, setupErr := setupTCRedirect(nsPath, ifName, tapName) + if setupErr != nil { + return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr) } configs = append(configs, &types.NetworkConfig{ Tap: tapName, - Mac: mac.String(), + Mac: mac, Queue: int64(vmCfg.CPU), QueueSize: defaultQueueSize, Network: netInfo, diff --git a/network/cni/config_darwin.go b/network/cni/config_darwin.go index 2f8f2779..57b29edb 100644 --- a/network/cni/config_darwin.go +++ b/network/cni/config_darwin.go @@ -12,6 +12,6 @@ func deleteNetns(_ string) error { return errNotSupported } -func setupBridgeTap(_, _, _, _ string) error { - return errNotSupported +func setupTCRedirect(_, _, _ string) (string, error) { + return "", errNotSupported } diff --git a/network/cni/config_linux.go b/network/cni/config_linux.go index b53a7737..ef09922c 100644 --- a/network/cni/config_linux.go +++ b/network/cni/config_linux.go @@ -3,6 +3,7 @@ package cni import ( "fmt" "runtime" + "syscall" "time" cns "github.com/containernetworking/plugins/pkg/ns" @@ -50,73 +51,123 @@ func deleteNetns(name string) error { return netns.DeleteNamed(name) } -// setupBridgeTap enters the target netns via the CNI plugins/pkg/ns closure -// and configures bridge + tap using netlink. -func setupBridgeTap(nsPath, ifName, brName, tapName string) error { - return cns.WithNetNSPath(nsPath, func(_ cns.NetNS) error { - return bridgeTapInNS(ifName, brName, tapName) +// setupTCRedirect enters the target netns, wires ifName ↔ tapName using +// TC ingress + mirred redirect, and returns ifName's MAC address. +// The caller should pass this MAC to CH so the guest's virtio-net MAC +// matches the CNI veth — required for anti-spoofing CNI plugins. 
+func setupTCRedirect(nsPath, ifName, tapName string) (string, error) {
+	// mac is assigned by the closure, which is the part that runs inside the
+	// netns at nsPath; it is only meaningful when err is nil.
+	var mac string
+	err := cns.WithNetNSPath(nsPath, func(_ cns.NetNS) error {
+		var nsErr error
+		mac, nsErr = tcRedirectInNS(ifName, tapName)
+		return nsErr
 	})
+	return mac, err
 }
 
-// bridgeTapInNS runs inside the target netns.
+// tcRedirectInNS runs inside the target netns.
 // 1. Flush IP from ifName (guest owns it, not the netns).
-// 2. Create bridge + tap.
-// 3. Enslave ifName and tap to bridge.
-// 4. Bring everything up.
-func bridgeTapInNS(ifName, brName, tapName string) error {
+// 2. Create tap device, release the fds LinkAdd leaves open, sync its MTU.
+// 3. Bring both interfaces up.
+// 4. Attach ingress qdisc to both.
+// 5. Add U32+mirred filters for bidirectional redirect.
+//
+// It returns ifName's hardware address in string form, or an error naming
+// the step that failed. Nothing created by earlier steps is rolled back here.
+func tcRedirectInNS(ifName, tapName string) (string, error) {
+	// 1. Find CNI veth, capture its MAC, and flush IP addresses.
 	link, err := netlink.LinkByName(ifName)
 	if err != nil {
-		return fmt.Errorf("find %s: %w", ifName, err)
+		return "", fmt.Errorf("find %s: %w", ifName, err)
 	}
+	mac := link.Attrs().HardwareAddr.String()
+
 	addrs, err := netlink.AddrList(link, netlink.FAMILY_ALL)
 	if err != nil {
-		return fmt.Errorf("list addrs on %s: %w", ifName, err)
+		return "", fmt.Errorf("list addrs on %s: %w", ifName, err)
 	}
 	for _, addr := range addrs {
 		if delErr := netlink.AddrDel(link, &addr); delErr != nil {
-			return fmt.Errorf("flush addr %s on %s: %w", addr.IPNet, ifName, delErr)
+			return "", fmt.Errorf("flush addr %s on %s: %w", addr.IPNet, ifName, delErr)
 		}
 	}
 
-	br := &netlink.Bridge{LinkAttrs: netlink.LinkAttrs{Name: brName}}
-	if addErr := netlink.LinkAdd(br); addErr != nil {
-		return fmt.Errorf("add bridge %s: %w", brName, addErr)
-	}
-	brLink, err := netlink.LinkByName(brName)
-	if err != nil {
-		return fmt.Errorf("find bridge %s: %w", brName, err)
-	}
-
+	// 2. Create tap device.
+	// VNET_HDR: allows kernel to parse virtio_net headers for checksum/GSO offload.
+	// ONE_QUEUE: prevents packet drops on older kernels when send buffer is full.
 	tap := &netlink.Tuntap{
 		LinkAttrs: netlink.LinkAttrs{Name: tapName},
 		Mode:      netlink.TUNTAP_MODE_TAP,
+		Queues:    1,
+		Flags:     netlink.TUNTAP_ONE_QUEUE | netlink.TUNTAP_VNET_HDR,
 	}
 	if addErr := netlink.LinkAdd(tap); addErr != nil {
-		return fmt.Errorf("add tap %s: %w", tapName, addErr)
+		return "", fmt.Errorf("add tap %s: %w", tapName, addErr)
 	}
+	// Queues > 0 makes LinkAdd return the opened queue fds in tap.Fds instead
+	// of closing them. The device is persistent (NonPersist is unset), so
+	// close them now: otherwise each call leaks an fd and the tap's only
+	// queue stays attached to this process rather than free for the VMM.
+	for _, f := range tap.Fds {
+		if closeErr := f.Close(); closeErr != nil {
+			return "", fmt.Errorf("close tap %s fd: %w", tapName, closeErr)
+		}
+	}
 	tapLink, err := netlink.LinkByName(tapName)
 	if err != nil {
-		return fmt.Errorf("find tap %s: %w", tapName, err)
+		return "", fmt.Errorf("find tap %s: %w", tapName, err)
 	}
 
-	if masterErr := netlink.LinkSetMaster(link, brLink); masterErr != nil {
-		return fmt.Errorf("set %s master %s: %w", ifName, brName, masterErr)
-	}
-	// Disable MAC learning on the uplink (eth0) port. Without this, frames
-	// from tap0 traverse br0 → eth0 → cni0 and bounce back via eth0, causing
-	// br0 to learn the guest MAC on the eth0 port instead of tap0. ARP replies
-	// then get forwarded to eth0 (back to cni0) instead of tap0 (to the guest).
-	if learnErr := netlink.LinkSetLearning(link, false); learnErr != nil {
-		return fmt.Errorf("set %s learning off: %w", ifName, learnErr)
-	}
-	if masterErr := netlink.LinkSetMaster(tapLink, brLink); masterErr != nil {
-		return fmt.Errorf("set %s master %s: %w", tapName, brName, masterErr)
+	// Sync MTU: tap must match veth to avoid silent large-packet drops
+	// when CNI uses non-default MTU (e.g. 1450 for overlay, 9000 for jumbo).
+	if mtu := link.Attrs().MTU; mtu > 0 {
+		if mtuErr := netlink.LinkSetMTU(tapLink, mtu); mtuErr != nil {
+			return "", fmt.Errorf("set tap %s mtu %d: %w", tapName, mtu, mtuErr)
+		}
 	}
 
-	for _, l := range []netlink.Link{link, tapLink, brLink} {
+	// 3. Bring both interfaces up.
+	for _, l := range []netlink.Link{link, tapLink} {
 		if upErr := netlink.LinkSetUp(l); upErr != nil {
-			return fmt.Errorf("set %s up: %w", l.Attrs().Name, upErr)
+			return "", fmt.Errorf("set %s up: %w", l.Attrs().Name, upErr)
 		}
 	}
-	return nil
+
+	// 4. Attach ingress qdisc to both interfaces.
+	for _, l := range []netlink.Link{link, tapLink} {
+		qdisc := &netlink.Ingress{
+			QdiscAttrs: netlink.QdiscAttrs{
+				LinkIndex: l.Attrs().Index,
+				Parent:    netlink.HANDLE_INGRESS,
+			},
+		}
+		if qdiscErr := netlink.QdiscAdd(qdisc); qdiscErr != nil {
+			return "", fmt.Errorf("add ingress qdisc on %s: %w", l.Attrs().Name, qdiscErr)
+		}
+	}
+
+	// 5. Bidirectional redirect: eth0 ingress → tap0, tap0 ingress → eth0.
+	if err := addTCRedirect(link, tapLink); err != nil {
+		return "", fmt.Errorf("redirect %s -> %s: %w", ifName, tapName, err)
+	}
+	if err := addTCRedirect(tapLink, link); err != nil {
+		return "", fmt.Errorf("redirect %s -> %s: %w", tapName, ifName, err)
+	}
+	return mac, nil
+}
+
+// addTCRedirect adds a U32 catch-all filter on from's ingress that redirects
+// all packets to to's egress via mirred. TC_ACT_STOLEN ensures the packet is
+// consumed and never reaches the netns host stack.
+// It returns whatever error netlink.FilterAdd reports, unwrapped.
+func addTCRedirect(from, to netlink.Link) error {
+	filter := &netlink.U32{
+		FilterAttrs: netlink.FilterAttrs{
+			LinkIndex: from.Attrs().Index,
+			Parent:    netlink.HANDLE_INGRESS,
+			Priority:  1,
+			Protocol:  syscall.ETH_P_ALL,
+		},
+		Sel: &netlink.TcU32Sel{
+			Flags: netlink.TC_U32_TERMINAL,
+			Keys: []netlink.TcU32Key{
+				// Zero mask and value: the single key that makes this a catch-all.
+				{Mask: 0x0, Val: 0x0, Off: 0, OffMask: 0x0},
+			},
+		},
+		Actions: []netlink.Action{
+			&netlink.MirredAction{
+				ActionAttrs:  netlink.ActionAttrs{Action: netlink.TC_ACT_STOLEN},
+				MirredAction: netlink.TCA_EGRESS_REDIR,
+				Ifindex:      to.Attrs().Index,
+			},
+		},
+	}
+	return netlink.FilterAdd(filter)
 }