Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions hypervisor/cloudhypervisor/console.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ import (
// For direct-boot VMs (OCI): opens the virtio-console PTY allocated by CH.
//
// The endpoint is stored in VM.ConsolePath at start time.
// The caller is responsible for closing the returned ReadCloser.
func (ch *CloudHypervisor) Console(ctx context.Context, ref string) (io.ReadCloser, error) {
// The caller is responsible for closing the returned ReadWriteCloser.
func (ch *CloudHypervisor) Console(ctx context.Context, ref string) (io.ReadWriteCloser, error) {
info, err := ch.Inspect(ctx, ref)
if err != nil {
return nil, err
}

var conn io.ReadCloser
var conn io.ReadWriteCloser
if err := ch.withRunningVM(info.ID, func(_ int) error {
path := info.ConsolePath
if path == "" {
Expand Down
2 changes: 1 addition & 1 deletion hypervisor/hypervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ type Hypervisor interface {
Inspect(ctx context.Context, ref string) (*types.VM, error)
List(context.Context) ([]*types.VM, error)
Delete(ctx context.Context, refs []string, force bool) ([]string, error)
Console(ctx context.Context, ref string) (io.ReadCloser, error)
Console(ctx context.Context, ref string) (io.ReadWriteCloser, error)

// TODO SNAPSHOT
// TODO RESTORE
Expand Down
23 changes: 10 additions & 13 deletions network/cni/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ import (
const defaultQueueSize = 256

// Config creates the network namespace, runs CNI ADD for each NIC, sets up
// bridge + tap inside the netns, and returns NetworkConfigs ready for CH --net.
// TC redirect (eth↔tap) inside the netns, and returns NetworkConfigs ready for CH --net.
//
// Flow per NIC (from issue #1):
// Flow per NIC:
// 1. Create named netns cocoon-{vmID}
// 2. CNI ADD (containerID=vmID, netns path, ifName=eth{i})
// 3. Inside netns: flush eth{i} IP, create br{i}+tap{i}, bridge them
// 3. Inside netns: flush eth{i} IP, create tap{i}, wire via TC ingress mirred
// 4. Return NetworkConfig{Tap: "tap{i}", Mac: generated, Network: CNI result}
func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig) (configs []*types.NetworkConfig, retErr error) {
if c.networkConfList == nil || c.cniConf == nil {
Expand Down Expand Up @@ -61,7 +61,6 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types
for i := range numNICs {
ifName := fmt.Sprintf("eth%d", i)
tapName := fmt.Sprintf("tap%d", i)
brName := fmt.Sprintf("br%d", i)

// Step 2: CNI ADD — creates veth pair, assigns IP via IPAM.
rt := &libcni.RuntimeConf{
Expand All @@ -80,19 +79,17 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types
return nil, fmt.Errorf("parse CNI result: %w", err)
}

// Step 3: inside netns — flush IP, create bridge + tap (platform-specific).
if setupErr := setupBridgeTap(nsPath, ifName, brName, tapName); setupErr != nil {
return nil, fmt.Errorf("setup bridge/tap %s: %w", vmID, setupErr)
}

mac, err := utils.GenerateMAC()
if err != nil {
return nil, err
// Step 3: inside netns — flush IP, create tap, wire via TC redirect (platform-specific).
// Returns eth0's MAC so the guest virtio-net uses the same address,
// required for anti-spoofing CNI plugins (Cilium, Calico eBPF, VPC ENI).
mac, setupErr := setupTCRedirect(nsPath, ifName, tapName)
if setupErr != nil {
return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr)
}

configs = append(configs, &types.NetworkConfig{
Tap: tapName,
Mac: mac.String(),
Mac: mac,
Queue: int64(vmCfg.CPU),
QueueSize: defaultQueueSize,
Network: netInfo,
Expand Down
4 changes: 2 additions & 2 deletions network/cni/config_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ func deleteNetns(_ string) error {
return errNotSupported
}

func setupBridgeTap(_, _, _, _ string) error {
return errNotSupported
// setupTCRedirect is the stub used by the darwin build of this package.
// It ignores all three arguments and always returns an empty MAC string
// together with errNotSupported.
func setupTCRedirect(_, _, _ string) (string, error) {
return "", errNotSupported
}
129 changes: 90 additions & 39 deletions network/cni/config_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cni
import (
"fmt"
"runtime"
"syscall"
"time"

cns "github.com/containernetworking/plugins/pkg/ns"
Expand Down Expand Up @@ -50,73 +51,123 @@ func deleteNetns(name string) error {
return netns.DeleteNamed(name)
}

// setupBridgeTap enters the target netns via the CNI plugins/pkg/ns closure
// and configures bridge + tap using netlink.
func setupBridgeTap(nsPath, ifName, brName, tapName string) error {
return cns.WithNetNSPath(nsPath, func(_ cns.NetNS) error {
return bridgeTapInNS(ifName, brName, tapName)
// setupTCRedirect runs tcRedirectInNS inside the network namespace at nsPath
// (entered via cns.WithNetNSPath) and hands back the MAC address string that
// tcRedirectInNS reports for ifName.
//
// Per the caller's contract, the returned MAC is meant to be passed to CH so
// that the guest's virtio-net MAC matches the CNI veth, which anti-spoofing
// CNI plugins require.
func setupTCRedirect(nsPath, ifName, tapName string) (string, error) {
	var hwAddr string

	// The closure captures hwAddr so the result survives the namespace switch.
	wire := func(_ cns.NetNS) error {
		addr, innerErr := tcRedirectInNS(ifName, tapName)
		hwAddr = addr
		return innerErr
	}

	err := cns.WithNetNSPath(nsPath, wire)
	return hwAddr, err
}

// bridgeTapInNS runs inside the target netns.
// tcRedirectInNS runs inside the target netns.
// 1. Flush IP from ifName (guest owns it, not the netns).
// 2. Create bridge + tap.
// 3. Enslave ifName and tap to bridge.
// 4. Bring everything up.
func bridgeTapInNS(ifName, brName, tapName string) error {
// 2. Create tap device.
// 3. Bring both interfaces up.
// 4. Attach ingress qdisc to both.
// 5. Add U32+mirred filters for bidirectional redirect.
func tcRedirectInNS(ifName, tapName string) (string, error) {
// 1. Find CNI veth, capture its MAC, and flush IP addresses.
link, err := netlink.LinkByName(ifName)
if err != nil {
return fmt.Errorf("find %s: %w", ifName, err)
return "", fmt.Errorf("find %s: %w", ifName, err)
}
mac := link.Attrs().HardwareAddr.String()

addrs, err := netlink.AddrList(link, netlink.FAMILY_ALL)
if err != nil {
return fmt.Errorf("list addrs on %s: %w", ifName, err)
return "", fmt.Errorf("list addrs on %s: %w", ifName, err)
}
for _, addr := range addrs {
if delErr := netlink.AddrDel(link, &addr); delErr != nil {
return fmt.Errorf("flush addr %s on %s: %w", addr.IPNet, ifName, delErr)
return "", fmt.Errorf("flush addr %s on %s: %w", addr.IPNet, ifName, delErr)
}
}

br := &netlink.Bridge{LinkAttrs: netlink.LinkAttrs{Name: brName}}
if addErr := netlink.LinkAdd(br); addErr != nil {
return fmt.Errorf("add bridge %s: %w", brName, addErr)
}
brLink, err := netlink.LinkByName(brName)
if err != nil {
return fmt.Errorf("find bridge %s: %w", brName, err)
}

// 2. Create tap device.
// VNET_HDR: allows kernel to parse virtio_net headers for checksum/GSO offload.
// ONE_QUEUE: prevents packet drops on older kernels when send buffer is full.
tap := &netlink.Tuntap{
LinkAttrs: netlink.LinkAttrs{Name: tapName},
Mode: netlink.TUNTAP_MODE_TAP,
Queues: 1,
Flags: netlink.TUNTAP_ONE_QUEUE | netlink.TUNTAP_VNET_HDR,
}
if addErr := netlink.LinkAdd(tap); addErr != nil {
return fmt.Errorf("add tap %s: %w", tapName, addErr)
return "", fmt.Errorf("add tap %s: %w", tapName, addErr)
}
tapLink, err := netlink.LinkByName(tapName)
if err != nil {
return fmt.Errorf("find tap %s: %w", tapName, err)
return "", fmt.Errorf("find tap %s: %w", tapName, err)
}

if masterErr := netlink.LinkSetMaster(link, brLink); masterErr != nil {
return fmt.Errorf("set %s master %s: %w", ifName, brName, masterErr)
}
// Disable MAC learning on the uplink (eth0) port. Without this, frames
// from tap0 traverse br0 → eth0 → cni0 and bounce back via eth0, causing
// br0 to learn the guest MAC on the eth0 port instead of tap0. ARP replies
// then get forwarded to eth0 (back to cni0) instead of tap0 (to the guest).
if learnErr := netlink.LinkSetLearning(link, false); learnErr != nil {
return fmt.Errorf("set %s learning off: %w", ifName, learnErr)
}
if masterErr := netlink.LinkSetMaster(tapLink, brLink); masterErr != nil {
return fmt.Errorf("set %s master %s: %w", tapName, brName, masterErr)
// Sync MTU: tap must match veth to avoid silent large-packet drops
// when CNI uses non-default MTU (e.g. 1450 for overlay, 9000 for jumbo).
if mtu := link.Attrs().MTU; mtu > 0 {
if mtuErr := netlink.LinkSetMTU(tapLink, mtu); mtuErr != nil {
return "", fmt.Errorf("set tap %s mtu %d: %w", tapName, mtu, mtuErr)
}
}

for _, l := range []netlink.Link{link, tapLink, brLink} {
// 3. Bring both interfaces up.
for _, l := range []netlink.Link{link, tapLink} {
if upErr := netlink.LinkSetUp(l); upErr != nil {
return fmt.Errorf("set %s up: %w", l.Attrs().Name, upErr)
return "", fmt.Errorf("set %s up: %w", l.Attrs().Name, upErr)
}
}
return nil

// 4. Attach ingress qdisc to both interfaces.
for _, l := range []netlink.Link{link, tapLink} {
qdisc := &netlink.Ingress{
QdiscAttrs: netlink.QdiscAttrs{
LinkIndex: l.Attrs().Index,
Parent: netlink.HANDLE_INGRESS,
},
}
if qdiscErr := netlink.QdiscAdd(qdisc); qdiscErr != nil {
return "", fmt.Errorf("add ingress qdisc on %s: %w", l.Attrs().Name, qdiscErr)
}
}

// 5. Bidirectional redirect: eth0 ingress → tap0, tap0 ingress → eth0.
if err := addTCRedirect(link, tapLink); err != nil {
return "", fmt.Errorf("redirect %s -> %s: %w", ifName, tapName, err)
}
if err := addTCRedirect(tapLink, link); err != nil {
return "", fmt.Errorf("redirect %s -> %s: %w", tapName, ifName, err)
}
return mac, nil
}

// addTCRedirect installs a match-everything U32 filter on the ingress hook of
// from. Its single action is a mirred egress redirect to to, with
// TC_ACT_STOLEN as the action verdict so the packet is consumed by the
// redirect instead of continuing into the netns host stack.
//
// The error from netlink.FilterAdd is returned unchanged.
func addTCRedirect(from, to netlink.Link) error {
	// Filter placement: ingress of the source link, priority 1, all protocols.
	attrs := netlink.FilterAttrs{
		LinkIndex: from.Attrs().Index,
		Parent:    netlink.HANDLE_INGRESS,
		Priority:  1,
		Protocol:  syscall.ETH_P_ALL,
	}

	// One key with zero mask and zero value matches every packet.
	matchAll := netlink.TcU32Key{Mask: 0x0, Val: 0x0, Off: 0, OffMask: 0x0}
	selector := &netlink.TcU32Sel{
		Flags: netlink.TC_U32_TERMINAL,
		Keys:  []netlink.TcU32Key{matchAll},
	}

	// Redirect to the egress side of the destination link.
	redirect := &netlink.MirredAction{
		ActionAttrs:  netlink.ActionAttrs{Action: netlink.TC_ACT_STOLEN},
		MirredAction: netlink.TCA_EGRESS_REDIR,
		Ifindex:      to.Attrs().Index,
	}

	u32 := &netlink.U32{
		FilterAttrs: attrs,
		Sel:         selector,
		Actions:     []netlink.Action{redirect},
	}
	return netlink.FilterAdd(u32)
}