diff --git a/examples/README.md b/examples/README.md index 12e6b73d7..444ff3cab 100644 --- a/examples/README.md +++ b/examples/README.md @@ -16,8 +16,8 @@ Like kprobes, but with better performance and usability, for kernels 5.5 and later. * [tcp_connect](fentry/) - Trace outgoing IPv4 TCP connections. * [tcp_close](tcprtt/) - Log RTT of IPv4 TCP connections using eBPF CO-RE helpers. -* TC and XDP - Attach a program to a network interface to process incoming (XDP) and outgoing (TC) packets. - * [shared_xdp_tc](./shared_xdp_tc/) - monitor and periodically reset the number of incoming and outgoing packets for each network flow identified with the traditional 5-tuple session identifier (IP addresses, L4 Ports, IP protocol). +* TCx - Attach a program to Linux TC (Traffic Control) to process incoming and outgoing packets. + * [tcx](./tcx/) - monitor the number of incoming and outgoing packets for each network flow identified with the traditional 5-tuple session identifier (IP addresses, L4 Ports, IP protocol). * XDP - Attach a program to a network interface to process incoming packets. * [xdp](xdp/) - Print packet counts by IPv4 source address. * Add your use case(s) here! 
diff --git a/examples/shared_xdp_tc/bpf_bpfeb.o b/examples/shared_xdp_tc/bpf_bpfeb.o deleted file mode 100644 index 0c8e967aa..000000000 Binary files a/examples/shared_xdp_tc/bpf_bpfeb.o and /dev/null differ diff --git a/examples/shared_xdp_tc/bpf_bpfel.o b/examples/shared_xdp_tc/bpf_bpfel.o deleted file mode 100644 index 3dcbae5bb..000000000 Binary files a/examples/shared_xdp_tc/bpf_bpfel.o and /dev/null differ diff --git a/examples/shared_xdp_tc/bpf_bpfeb.go b/examples/tcx/bpf_bpfeb.go similarity index 100% rename from examples/shared_xdp_tc/bpf_bpfeb.go rename to examples/tcx/bpf_bpfeb.go diff --git a/examples/tcx/bpf_bpfeb.o b/examples/tcx/bpf_bpfeb.o new file mode 100644 index 000000000..79626464e Binary files /dev/null and b/examples/tcx/bpf_bpfeb.o differ diff --git a/examples/shared_xdp_tc/bpf_bpfel.go b/examples/tcx/bpf_bpfel.go similarity index 100% rename from examples/shared_xdp_tc/bpf_bpfel.go rename to examples/tcx/bpf_bpfel.go diff --git a/examples/tcx/bpf_bpfel.o b/examples/tcx/bpf_bpfel.o new file mode 100644 index 000000000..f0f64b543 Binary files /dev/null and b/examples/tcx/bpf_bpfel.o differ diff --git a/examples/shared_xdp_tc/main.go b/examples/tcx/main.go similarity index 51% rename from examples/shared_xdp_tc/main.go rename to examples/tcx/main.go index ec95b83c4..af21661c7 100644 --- a/examples/shared_xdp_tc/main.go +++ b/examples/tcx/main.go @@ -1,11 +1,7 @@ // This program demonstrates attaching an eBPF program to a network interface -// with XDP (eXpress Data Path). The program parses the IPv4 source address -// from packets and writes the packet count by IP to an LRU hash map. -// The userspace program (Go code in this file) prints the contents -// of the map to stdout every second. -// It is possible to modify the XDP program to drop or redirect packets -// as well -- give it a try! -// This example depends on bpf_link, available in Linux kernel version 5.7 or newer. +// with Linux TC. 
The program parses the 5-tuple session identifier +// from IPv4 packets and writes the Ingress and Egress packet counts per session to a Hash map. +// The userspace program (Go code in this file) prints the content of the map to stdout. package main import ( @@ -22,20 +18,14 @@ import ( "github.com/cilium/ebpf/link" ) -// erase content of the map after this iterations -const eraseEvery = 5 - // mapping between integer value and L4 protocol string -var ( - currIter = 0 - protoMap = map[uint8]string{ - 1: "ICMP", - 6: "TCP", - 17: "UDP", - } -) +var protoMap = map[uint8]string{ + 1: "ICMP", + 6: "TCP", + 17: "UDP", +} -//go:generate go run github.com/cilium/ebpf/cmd/bpf2go bpf xdp_tcx.c -- -I../headers +//go:generate go run github.com/cilium/ebpf/cmd/bpf2go bpf tcx.c -- -I../headers func main() { if len(os.Args) < 2 { @@ -56,17 +46,18 @@ func main() { } defer objs.Close() - // Attach the program to Ingress XDP. - l, err := link.AttachXDP(link.XDPOptions{ + // Attach the program to Ingress TC. + l, err := link.AttachTCX(link.TCXOptions{ Interface: iface.Index, Program: objs.IngressProgFunc, + Attach: ebpf.AttachTCXIngress, }) if err != nil { log.Fatalf("could not attach TCx program: %s", err) } defer l.Close() - log.Printf("Attached XDP program to INGRESS iface %q (index %d)", iface.Name, iface.Index) + log.Printf("Attached TCx program to INGRESS iface %q (index %d)", iface.Name, iface.Index) // Attach the program to Egress TC. l2, err := link.AttachTCX(link.TCXOptions{ @@ -86,69 +77,46 @@ func main() { ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() for range ticker.C { - handleMapContents(objs.StatsMap) + s, err := formatMapContent(objs.StatsMap) + if err != nil { + log.Printf("Error reading map: %s", err) + continue + } + + log.Printf("Map contents:\n%s", s) } } // handleMapContents prints the content of the map into a string. 
// For each entry (if any), a row is formatted with the following content: // : : Proto: => Ingress: Egress: -// Every nth calls to this function, the entire content of the Hash map is erased -// (lru map would automatically remove old keys, but can also remove additional keys -// so we use hash map to keep constant behaviour) -func handleMapContents(m *ebpf.Map) { +func formatMapContent(m *ebpf.Map) (string, error) { var ( - sb strings.Builder - key bpfSessionKey - val bpfSessionValue - keys []bpfSessionKey + sb strings.Builder + key bpfSessionKey + val bpfSessionValue ) - currIter++ - needsErase := currIter%eraseEvery == 0 - - if needsErase { - keys = make([]bpfSessionKey, 0) - } iter := m.Iterate() for iter.Next(&key, &val) { - sb.WriteString(fmt.Sprintf("\t%s:%d - %s:%d Proto:%s => Ingress:%d Egress:%d\n", - intToIp(key.Saddr), portToLE(key.Sport), intToIp(key.Daddr), portToLE(key.Dport), + sb.WriteString(fmt.Sprintf("\t%15s:%5d - %15s:%5d Proto:%3s => Ingress:%10d Egress:%10d\n", + intToIp(key.Saddr), portToLittleEndian(key.Sport), + intToIp(key.Daddr), portToLittleEndian(key.Dport), protoMap[key.Proto], val.InCount, val.EgCount)) - if needsErase { - keys = append(keys, key) - } - } - if iter.Err() != nil { - log.Printf("Error reading map: %s", iter.Err()) - return } - log.Printf("Map contents:\n%s", sb.String()) - - if !needsErase { - return - } - - n, err := m.BatchDelete(keys, nil) - if err != nil { - log.Printf("Error erasing map: %s", err) - return - } - log.Printf("Successfully Erased Map content (%d elements) at Iteration n. 
%d\n", n, currIter) + return sb.String(), iter.Err() } -// intToIp convert an int32 value retrieved from the network -// traffic (big endian) into a netip.Addr +// intToIp convert an int32 value retrieved from the network traffic (big endian) into a netip.Addr func intToIp(val uint32) netip.Addr { a4 := [4]byte{} binary.LittleEndian.PutUint32(a4[:], val) return netip.AddrFrom4(a4) } -// portToLE convert a uint16 value retrieved from the network -// traffic (big endian) into a little endian -func portToLE(val uint16) uint16 { +// portToLittleEndian convert a uint16 value retrieved from the network traffic (big endian) into a little endian +func portToLittleEndian(val uint16) uint16 { p2 := [2]byte{} binary.LittleEndian.PutUint16(p2[:], val) return binary.LittleEndian.Uint16(p2[:]) diff --git a/examples/shared_xdp_tc/xdp_tcx.c b/examples/tcx/tcx.c similarity index 53% rename from examples/shared_xdp_tc/xdp_tcx.c rename to examples/tcx/tcx.c index eacdf2d6e..bdabed9b5 100644 --- a/examples/shared_xdp_tc/xdp_tcx.c +++ b/examples/tcx/tcx.c @@ -1,6 +1,7 @@ //go:build ignore #include "common.h" +#include "bpf_endian.h" char __license[] SEC("license") = "Dual MIT/GPL"; @@ -8,23 +9,21 @@ char __license[] SEC("license") = "Dual MIT/GPL"; struct session_key { __u32 saddr; // IP source address __u32 daddr; // IP dest address - __u16 sport; // Source port (if ICMP then 0) - __u16 dport; // Dest port (if ICMP then 0) + __u16 sport; // Source port (set to 0 if ICMP) + __u16 dport; // Dest port (set to 0 if ICMP) __u8 proto; // Protocol ID }; // Session value struct session_value { - __u32 in_count; - __u32 eg_count; + __u32 in_count; // Ingress packet count + __u32 eg_count; // Egress packet count }; #define MAX_MAP_ENTRIES 16 -/* -Define an Hash map for storing packet Ingress and Egress count by 5-tuple session identifier -User-space logic is responsible for cleaning the map, if potentially new entries needs to be monitored. 
-*/ +// Define a Hash map for storing packet Ingress and Egress counts by 5-tuple session identifier. +// User-space logic is responsible for cleaning the map if new entries need to be monitored. struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, MAX_MAP_ENTRIES); @@ -32,11 +31,9 @@ struct { __type(value, struct session_value); } stats_map SEC(".maps"); -/* -Attempt to parse the 5-tuple session identifierfrom the packet. -Returns 0 if there is no IPv4 header field or if L4 is not a UDP, TCP or ICMP packet; otherwise returns non-zero. -*/ -static __always_inline int parse_session_identifier(void *data, void *data_end, struct session_key *key, __u8 is_ingress) { +// Attempt to parse the 5-tuple session identifier from the packet. +// Returns 0 if there is no IPv4 header or if the L4 protocol is not UDP, TCP or ICMP; otherwise returns non-zero. +static __always_inline int parse_session_identifier(void *data, void *data_end, struct session_key *key) { // First, parse the ethernet header. struct ethhdr *eth = data; if ((void *)(eth + 1) > data_end) { @@ -89,31 +86,41 @@ static __always_inline int parse_session_identifier(void *data, void *data_end, key->saddr = (__u32)(ip->saddr); key->daddr = (__u32)(ip->daddr); - // In case the function is called from Egress hook, swap IP addresses and L4 ports before - // doing the map lookup - if (!is_ingress) { - __u32 tmp = key->saddr; - key->saddr = key->daddr; - key->daddr = tmp; - __u16 tmp2 = key->sport; - key->sport = key->dport; - key->dport = tmp2; - } return 1; } -/* -Main program logic shared by either XDP and TC hook. The function attempts to update the entry -in the LRU map corresponding to the 5-tuple identifier; it increases either the ingress or egress -packet counter value. In case of a non IP, TCP, UDP, ICMP packet, the program ignores the packet. 
-*/ -static __always_inline int prog_logic(void *data, void *data_end, __u8 is_ingress, int ret_code) { +// Swap the source and destination addresses and ports of a session identifier; used when parsing packets in the Egress hook. +// This is done to align the session identifiers between Ingress and Egress, so that they point to the same +// entry in the Hash map. +static __always_inline void swap_key(struct session_key *key) { + __u32 tmp = key->saddr; + __u16 tmp2 = key->sport; + + key->saddr = key->daddr; + key->sport = key->dport; + key->daddr = tmp; + key->dport = tmp2; + + return; +} + +// Main program logic shared by Ingress and Egress TC hooks. The function attempts to update the entry +// in the Hash map corresponding to the 5-tuple identifier; it increases either the ingress or egress +// packet counter value. If the packet is not an IPv4 packet carrying TCP, UDP or ICMP, the program ignores it. +// This function would also work with data and data_end retrieved from an xdp_md structure, and XDP_PASS as the return code. +static __always_inline int prog_func(void *data, void *data_end, __u8 is_ingress, int ret_code) { struct session_key key = {}; - if (!parse_session_identifier(data, data_end, &key, is_ingress)) { + if (!parse_session_identifier(data, data_end, &key)) { // Not an IPv4 packet, so don't count it. goto done; } + // In case the function is called from Egress hook, swap addresses + // and L4 ports before doing the map lookup + if (!is_ingress) { + swap_key(&key); + } + struct session_value *val = bpf_map_lookup_elem(&stats_map, &key); if (!val) { // No entry in the map for this 5-tuple identifier yet, so set the initial value to 1. 
@@ -132,18 +139,18 @@ static __always_inline int prog_logic(void *data, void *data_end, __u8 is_ingres } done: - // Return code corresponds to the OK action within either XDP or TC + // Return code corresponds to the PASS action in TC return ret_code; } -// XDP Ingress hook -SEC("xdp") -int ingress_prog_func(struct xdp_md *ctx) { - return prog_logic((void *)(long)ctx->data, (void *)(long)ctx->data_end, 0, XDP_PASS); +// TC Ingress hook +SEC("tc") +int ingress_prog_func(struct __sk_buff *skb) { + return prog_func((void *)(long)skb->data, (void *)(long)skb->data_end, 1, TC_ACT_OK); } // TC Egress hook SEC("tc") -int egress_prog_func(struct __sk_buff *ctx) { - return prog_logic((void *)(long)ctx->data, (void *)(long)ctx->data_end, 1, TC_ACT_OK); +int egress_prog_func(struct __sk_buff *skb) { + return prog_func((void *)(long)skb->data, (void *)(long)skb->data_end, 0, TC_ACT_OK); }