From 9bc4004f8cc328db28db1a6d672424dc6d5e68ed Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov"
Date: Thu, 7 Jun 2018 12:15:57 -0700
Subject: [PATCH] sync w/ internal repo.

Summary:
in this sync:
1) updated kernel's includes
2) added support for ICMP "packet too big" generation if the received packet
   is bigger than MAX_PCKT_SIZE (turned off by default, as this requires the
   bpf_xdp_adjust_tail helper from kernel 4.17)
3) unit tests for #2
4) formatting/renaming of a few internal structs
---
 build_bpf_modules_opensource.sh | 1 +
 katran/lib/BpfAdapter.cpp | 33 +-
 katran/lib/BpfAdapter.h | 2 +-
 katran/lib/BpfLoader.cpp | 8 +-
 katran/lib/KatranLb.cpp | 13 +-
 katran/lib/KatranLb.h | 20 +-
 katran/lib/TARGETS | 136 --
 katran/lib/bpf/balancer_consts.h | 16 +-
 katran/lib/bpf/balancer_helpers.h | 81 +
 katran/lib/bpf/balancer_kern.c | 15 +
 katran/lib/bpf/handle_icmp.h | 117 +
 katran/lib/linux_includes/bpf.h | 1966 ++++++++++++++---
 katran/lib/linux_includes/bpf_common.h | 72 +-
 katran/lib/linux_includes/bpf_helpers.h | 188 +-
 katran/lib/linux_includes/libbpf.c | 116 +-
 katran/lib/linux_includes/libbpf.h | 36 +-
 katran/lib/testing/CMakeLists.txt | 1 +
 .../lib/testing/KatranOptionalTestFixtures.h | 59 +
 katran/lib/testing/TARGETS | 1 +
 katran/lib/testing/XdpTester.cpp | 30 +-
 katran/lib/testing/XdpTester.h | 16 +
 katran/lib/testing/katran_tester.cpp | 24 +
 22 files changed, 2223 insertions(+), 728 deletions(-)
 delete mode 100644 katran/lib/TARGETS
 create mode 100644 katran/lib/bpf/balancer_helpers.h
 create mode 100644 katran/lib/testing/KatranOptionalTestFixtures.h

diff --git a/build_bpf_modules_opensource.sh b/build_bpf_modules_opensource.sh
index 3da348229..86836ec66 100755
--- a/build_bpf_modules_opensource.sh
+++ b/build_bpf_modules_opensource.sh
@@ -28,5 +28,6 @@ cp ./katran/lib/Makefile-bpf ./deps/linux/bpfprog/Makefile
 cp -r ./katran/lib/bpf ./deps/linux/bpfprog/
 cp ./katran/lib/linux_includes/bpf_helpers.h ./deps/linux/bpfprog/include/
 cd ./deps/linux/bpfprog && LD_LIBRARY_PATH="${CLANG_PATH}/lib" make \
+ EXTRA_CFLAGS="$*" \
 LLC="${CLANG_PATH}/bin/llc" CLANG="${CLANG_PATH}/bin/clang"
 echo "BPF BUILD COMPLITED"
diff --git a/katran/lib/BpfAdapter.cpp b/katran/lib/BpfAdapter.cpp
index bba22c21d..f5c757d7e 100644
--- a/katran/lib/BpfAdapter.cpp
+++ b/katran/lib/BpfAdapter.cpp
@@ -93,7 +93,7 @@ BpfAdapter::BpfAdapter(bool set_limits) {
 struct rlimit lck_mem = {};
 lck_mem.rlim_cur = RLIM_INFINITY;
 lck_mem.rlim_max = RLIM_INFINITY;
- if(setrlimit(RLIMIT_MEMLOCK, &lck_mem)) {
+ if (setrlimit(RLIMIT_MEMLOCK, &lck_mem)) {
 LOG(ERROR) << "Can't change limit for locked memory";
 throw std::runtime_error("error while setting limit for locked memory");
 }
@@ -131,7 +131,7 @@ int BpfAdapter::createNamedBpfMap(
 int numa_node) {
 const char* name_ptr = !name.empty() ?
name.c_str() : nullptr; - return bpf_create_map_node( + return ebpf_create_map_node( static_cast(type), name_ptr, key_size, @@ -154,7 +154,7 @@ int BpfAdapter::bpfUpdateMap( void* key, void* value, unsigned long long flags) { - auto bpfError = bpf_update_elem(map_fd, key, value, flags); + auto bpfError = ebpf_update_elem(map_fd, key, value, flags); if (bpfError) { VLOG(4) << "Error while updating value in map: " << std::strerror(errno); } @@ -162,7 +162,7 @@ int BpfAdapter::bpfUpdateMap( } int BpfAdapter::bpfMapLookupElement(int map_fd, void* key, void* value) { - auto bpfError = bpf_lookup_elem(map_fd, key, value); + auto bpfError = ebpf_lookup_elem(map_fd, key, value); if (bpfError) { VLOG(4) << "Error while geting value from map: " << std::strerror(errno); } @@ -170,7 +170,7 @@ int BpfAdapter::bpfMapLookupElement(int map_fd, void* key, void* value) { } int BpfAdapter::bpfMapDeleteElement(int map_fd, void* key) { - auto bpfError = bpf_delete_elem(map_fd, key); + auto bpfError = ebpf_delete_elem(map_fd, key); if (bpfError) { VLOG(4) << "Error while deleting key from map: " << std::strerror(errno); } @@ -178,7 +178,7 @@ int BpfAdapter::bpfMapDeleteElement(int map_fd, void* key) { } int BpfAdapter::bpfMapGetNextKey(int map_fd, void* key, void* next_key) { - auto bpfError = bpf_get_next_key(map_fd, key, next_key); + auto bpfError = ebpf_get_next_key(map_fd, key, next_key); if (bpfError) { VLOG(4) << "Error getting next key from map: " << std::strerror(errno); } @@ -186,11 +186,11 @@ int BpfAdapter::bpfMapGetNextKey(int map_fd, void* key, void* next_key) { } int BpfAdapter::pinBpfObject(int fd, const std::string& path) { - return bpf_obj_pin(fd, path.c_str()); + return ebpf_obj_pin(fd, path.c_str()); } int BpfAdapter::getPinnedBpfObject(const std::string& path) { - return bpf_obj_get(path.c_str()); + return ebpf_obj_get(path.c_str()); } int BpfAdapter::getInterfaceIndex(const std::string& interface_name) { @@ -298,7 +298,7 @@ int BpfAdapter::testXdpProg( uint32_t* size_out, uint32_t* retval, uint32_t* duration) { - return bpf_prog_test_run( + return ebpf_prog_test_run( prog_fd, repeat, data, data_size, data_out, size_out, retval, duration); } @@ -460,7 +460,14 @@ int BpfAdapter::modifyTcBpfFilter( const int prog_fd, const unsigned int ifindex, const std::string& bpf_name, - const int direction) { + const int direction) +// TODO: T30063437 fix null-pointer-use undefined behavior +#if defined(__has_feature) +#if __has_feature(__address_sanitizer__) + __attribute__((__no_sanitize__("null"))) +#endif +#endif +{ char buf[MNL_SOCKET_BUFFER_SIZE]; struct nlmsghdr* nlh; struct tcmsg* tc; @@ -572,7 +579,7 @@ int BpfAdapter::attachCgroupProg( SCOPE_EXIT { ::close(target_fd); }; - return bpf_prog_attach(prog_fd, target_fd, type, flags); + return ebpf_prog_attach(prog_fd, target_fd, type, flags); } int BpfAdapter::detachCgroupProg( @@ -585,7 +592,7 @@ int BpfAdapter::detachCgroupProg( SCOPE_EXIT { ::close(target_fd); }; - return bpf_prog_detach(target_fd, type); + return ebpf_prog_detach(target_fd, type); } int BpfAdapter::detachCgroupProg( @@ -599,7 +606,7 @@ int BpfAdapter::detachCgroupProg( SCOPE_EXIT { ::close(target_fd); }; - return bpf_prog_detach2(prog_fd, target_fd, type); + return ebpf_prog_detach2(prog_fd, target_fd, type); } } // namespace katran diff --git a/katran/lib/BpfAdapter.h b/katran/lib/BpfAdapter.h index e3b49adfd..f384ea892 100644 --- a/katran/lib/BpfAdapter.h +++ b/katran/lib/BpfAdapter.h @@ -52,7 +52,7 @@ constexpr unsigned int kBpfMapTypeHashOfMaps = 13; */ class BpfAdapter { 
public: - explicit BpfAdapter(bool set_limits=true); + explicit BpfAdapter(bool set_limits = true); // BpfAdapter is not thread safe. Discourage unsafe use by disabling copy // construction/assignment. diff --git a/katran/lib/BpfLoader.cpp b/katran/lib/BpfLoader.cpp index 689ef45e8..c388fe535 100644 --- a/katran/lib/BpfLoader.cpp +++ b/katran/lib/BpfLoader.cpp @@ -225,14 +225,14 @@ int BpfLoader::loadMaps(Elf* elf) { << "map-in-map prototype"; return 1; } - map_fd = bpf_create_map_in_map( + map_fd = ebpf_create_map_in_map( static_cast(maps[i].type), maps[i].key_size, inner_map_fd, maps[i].max_entries, maps[i].map_flags); } else { - map_fd = bpf_create_map( + map_fd = ebpf_create_map( static_cast(maps[i].type), maps[i].key_size, maps[i].value_size, @@ -579,7 +579,7 @@ int BpfLoader::loadBpfProgs() { << "\nlicense: " << license_ << "\nkernel version: " << kernelVersion_; std::string bpf_log_buf(kLogBufSize, '\0'); - auto prog_fd = bpf_prog_load( + auto prog_fd = ebpf_prog_load( prog_iter.second.type, prog_iter.second.insns, prog_iter.second.size, @@ -607,7 +607,7 @@ int BpfLoader::loadBpfFile(const std::string& path, const bpf_prog_type type) { int fd = -1; SCOPE_EXIT { elf_end(elf_); - if (0 < fd) { + if (fd > 0) { ::close(fd); } }; diff --git a/katran/lib/KatranLb.cpp b/katran/lib/KatranLb.cpp index c982cbc84..2b7e0c6e6 100644 --- a/katran/lib/KatranLb.cpp +++ b/katran/lib/KatranLb.cpp @@ -668,18 +668,19 @@ lb_stats KatranLb::getStatsForVip(const VipKey& vip) { } lb_stats KatranLb::getLruStats() { - uint32_t lru_cntr_pos = maxVips_ + kLruCntrOffset; - return getLbStats(lru_cntr_pos); + return getLbStats(maxVips_ + kLruCntrOffset); } lb_stats KatranLb::getLruMissStats() { - uint32_t lru_miss_pos = maxVips_ + kLruMissOffset; - return getLbStats(lru_miss_pos); + return getLbStats(maxVips_ + kLruMissOffset); } lb_stats KatranLb::getLruFallbackStats() { - uint32_t lru_fallback_pos = maxVips_ + kLruFallbackOffset; - return getLbStats(lru_fallback_pos); + return getLbStats(maxVips_ + kLruFallbackOffset); +} + +lb_stats KatranLb::getIcmpTooBigStats() { + return getLbStats(maxVips_ + kIcmpTooBigOffset); } lb_stats KatranLb::getLbStats(uint32_t position) { diff --git a/katran/lib/KatranLb.h b/katran/lib/KatranLb.h index 1f9c6f11f..04f7f273d 100644 --- a/katran/lib/KatranLb.h +++ b/katran/lib/KatranLb.h @@ -45,6 +45,7 @@ constexpr int kMainIntfPos = 3; constexpr uint32_t kLruCntrOffset = 0; constexpr uint32_t kLruMissOffset = 1; constexpr uint32_t kLruFallbackOffset = 3; +constexpr uint32_t kIcmpTooBigOffset = 4; /** * LRU map related constants @@ -213,7 +214,7 @@ class KatranLb { * @param VipKey vip * @return struct lb_stats w/ statistic for specified vip * - * helper function which return totall ammount of pkts and bytes which + * helper function which return total ammount of pkts and bytes which * has been sent to specified vip. 
it's up to external entity to calculate * actual speed in pps/bps */ @@ -222,7 +223,7 @@ class KatranLb { /** * @return struct lb_stats w/ statistics for lru misses * - * helper function which returns totall amount of processed packets and + * helper function which returns total amount of processed packets and * how much of em was lru misses (when we wasnt able to find entry in * connection table) */ @@ -231,7 +232,7 @@ class KatranLb { /** * @return struct lb_stats w/ statistic of the reasons for lru misses * - * helper function which return totall amount of tcp lru misses because of + * helper function which returns total amount of tcp lru misses because of * the tcp syns (v1) or non-syns (v2) */ lb_stats getLruMissStats(); @@ -239,11 +240,20 @@ class KatranLb { /** * @return struct lb_stats w/ statistic of fallback lru hits * - * helper function which return totall amount of numbers when we fel back + * helper function which return total amount of numbers when we fel back * to fallback_lru (v1); */ lb_stats getLruFallbackStats(); + /** + * @return struct lb_stats w/ statistic of icmp packet too big packets + * + * helper function which returns how many icmpv4/icmpv6 packet too big + * has been generated after we have received packet, which is bigger then + * maximum supported size. + */ + lb_stats getIcmpTooBigStats(); + /** * @param uint32_t somark of the packet * @param std::string dst for a packed w/ specified so_mark @@ -473,7 +483,7 @@ class KatranLb { std::vector lruMapsFd_; /** - * totall LRUs map size; each forwarding cpu/core will have + * total LRUs map size; each forwarding cpu/core will have * total_size/forwarding_cores entries */ uint64_t totalLruSize_; diff --git a/katran/lib/TARGETS b/katran/lib/TARGETS deleted file mode 100644 index 7b315c4d4..000000000 --- a/katran/lib/TARGETS +++ /dev/null @@ -1,136 +0,0 @@ -cpp_library( - name = "bpfadapter", - srcs = [ - "BpfAdapter.cpp", - "BpfLoader.cpp", - "linux_includes/libbpf.c", - ], - headers = [ - "BpfAdapter.h", - "BpfLoader.h", - "BpfLoaderStructs.h", - "linux_includes/bpf.h", - "linux_includes/bpf_common.h", - "linux_includes/bpf_helpers.h", - "linux_includes/libbpf.h", - ], - deps = [ - "//folly:scope_guard", - ], - external_deps = [ - "glog", - ("libelf", "any", "elf"), - ("libmnl", None, "mnl"), - ], -) - -cpp_library( - name = "iphelpers", - srcs = [ - "IpHelpers.cpp", - ], - headers = [ - "IpHelpers.h", - ], - deps = [ - "//folly:network_address", - "//folly/lang:bits", - ], -) - -cpp_library( - name = "chhelpers", - srcs = [ - "CHHelpers.cpp", - ], - headers = [ - "CHHelpers.h", - ], - deps = [ - ":murmurhash3", - ], -) - -cpp_library( - name = "libkatran", - srcs = [ - "KatranLb.cpp", - "Vip.cpp", - ], - headers = [ - "KatranLb.h", - "KatranLbStructs.h", - "Vip.h", - ], - deps = [ - ":balancer_structs", - ":bpfadapter", - ":chhelpers", - ":iphelpers", - "//folly:format", - "//folly:network_address", - "//folly/lang:bits", - ], - external_deps = [ - "gflags", - "glog", - ], -) - -cpp_library( - name = "balancer_structs", - headers = [ - "BalancerStructs.h", - ], -) - -cpp_library( - name = "murmurhash3", - srcs = [ - "MurmurHash3.cpp", - ], - headers = [ - "MurmurHash3.h", - ], -) - -cpp_library( - name = "machelpers", - srcs = [ - "MacHelpers.cpp", - ], - headers = [ - "MacHelpers.h", - ], - deps = [ - "//folly:format", - "//folly:network_address", - ], -) - -cpp_binary( - name = "xdproot", - srcs = [ - "xdproot.cpp", - ], - deps = [ - ":bpfadapter", - ], - external_deps = [ - "gflags", - "glog", - ], -) - 
-cpp_binary( - name = "maglev_integration_test", - srcs = [ - "maglev_integration_test.cpp", - ], - deps = [ - ":chhelpers", - ], - external_deps = [ - "gflags", - ], -) diff --git a/katran/lib/bpf/balancer_consts.h b/katran/lib/bpf/balancer_consts.h index d9490cb27..5208b8f16 100644 --- a/katran/lib/bpf/balancer_consts.h +++ b/katran/lib/bpf/balancer_consts.h @@ -118,13 +118,24 @@ #define MAX_PCKT_SIZE 1514 #endif +// for v4 and v6: initial packet would be truncated to the size of eth header +// plus ipv4/ipv6 header and few bytes of payload +#define ICMP_TOOBIG_SIZE 98 +#define ICMP6_TOOBIG_SIZE 262 + + +#define ICMP6_TOOBIG_PAYLOAD_SIZE (ICMP6_TOOBIG_SIZE - 6) +#define ICMP_TOOBIG_PAYLOAD_SIZE (ICMP_TOOBIG_SIZE - 6) + #define NO_FLAGS 0 -// offset of the lru cache hit related cntrs +// offset of the lru cache hit related counters #define LRU_CNTRS 0 #define LRU_MISS_CNTR 1 #define NEW_CONN_RATE_CNTR 2 #define FALLBACK_LRU_CNTR 3 +//offset of icmp related counters +#define ICMP_TOOBIG_CNTRS 4 // max ammount of new connections per seconda per core for lru update // if we go beyond this value - we will bypass lru update. @@ -154,4 +165,7 @@ #define IPIP_V6_PREFIX3 0 #endif +// optional features (requires kernel support. turned off by default) +//#define ICMP_TOOBIG_GENERATION + #endif // of __BALANCER_CONSTS_H diff --git a/katran/lib/bpf/balancer_helpers.h b/katran/lib/bpf/balancer_helpers.h new file mode 100644 index 000000000..47a5b9381 --- /dev/null +++ b/katran/lib/bpf/balancer_helpers.h @@ -0,0 +1,81 @@ +/* Copyright (C) 2018-present, Facebook, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef __BALANCER_HELPERS +#define __BALANCER_HELPERS +/* + * This file contains common used routines. such as csum helpers etc + */ + +#include + +#include "bpf_helpers.h" + +#define bpf_printk(fmt, ...) 
\ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + + +__attribute__((__always_inline__)) +static inline __u16 csum_fold_helper(__u64 csum) { + int i; +#pragma unroll + for (i = 0; i < 4; i ++) { + if (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + } + return ~csum; +} + +__attribute__((__always_inline__)) +static inline void ipv4_csum(void *data_start, int data_size, __u64 *csum) { + *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); + *csum = csum_fold_helper(*csum); +} + +__attribute__((__always_inline__)) +static inline void ipv4_l4_csum(void *data_start, int data_size, + __u64 *csum, struct iphdr *iph) { + __u32 tmp = 0; + *csum = bpf_csum_diff(0, 0, &iph->saddr, sizeof(__be32), *csum); + *csum = bpf_csum_diff(0, 0, &iph->daddr, sizeof(__be32), *csum); + tmp = __builtin_bswap32((__u32)(iph->protocol)); + *csum = bpf_csum_diff(0, 0, &tmp, sizeof(__u32), *csum); + tmp = __builtin_bswap32((__u32)(data_size)); + *csum = bpf_csum_diff(0, 0, &tmp, sizeof(__u32), *csum); + *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); + *csum = csum_fold_helper(*csum); +} + +__attribute__((__always_inline__)) +static inline void ipv6_csum(void *data_start, int data_size, + __u64 *csum, struct ipv6hdr *ip6h) { + // ipv6 pseudo header + __u32 tmp = 0; + *csum = bpf_csum_diff(0, 0, &ip6h->saddr, sizeof(struct in6_addr), *csum); + *csum = bpf_csum_diff(0, 0, &ip6h->daddr, sizeof(struct in6_addr), *csum); + tmp = __builtin_bswap32((__u32)(data_size)); + *csum = bpf_csum_diff(0, 0, &tmp, sizeof(__u32), *csum); + tmp = __builtin_bswap32((__u32)(ip6h->nexthdr)); + *csum = bpf_csum_diff(0, 0, &tmp, sizeof(__u32), *csum); + // sum over payload + *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); + *csum = csum_fold_helper(*csum); +} +#endif diff --git a/katran/lib/bpf/balancer_kern.c b/katran/lib/bpf/balancer_kern.c index 498d55080..6dd3e26de 100644 --- a/katran/lib/bpf/balancer_kern.c +++ b/katran/lib/bpf/balancer_kern.c @@ -310,7 +310,22 @@ static inline int process_packet(void *data, __u64 off, void *data_end, } if (data_end - data > MAX_PCKT_SIZE) { +#ifdef ICMP_TOOBIG_GENERATION + __u32 stats_key = MAX_VIPS + ICMP_TOOBIG_CNTRS; + BUILD_BUG_ON(stats_key >= STATS_MAP_SIZE); + data_stats = bpf_map_lookup_elem(&stats, &stats_key); + if (!data_stats) { + return XDP_DROP; + } + if (is_ipv6) { + data_stats->v2 += 1; + } else { + data_stats->v1 += 1; + } + return send_icmp_too_big(xdp, is_ipv6, data_end - data); +#else return XDP_DROP; +#endif } __u32 stats_key = MAX_VIPS + LRU_CNTRS; diff --git a/katran/lib/bpf/handle_icmp.h b/katran/lib/bpf/handle_icmp.h index c4c096b6a..4e63bc7e3 100644 --- a/katran/lib/bpf/handle_icmp.h +++ b/katran/lib/bpf/handle_icmp.h @@ -32,6 +32,7 @@ #include "balancer_consts.h" #include "balancer_structs.h" +#include "balancer_helpers.h" __attribute__((__always_inline__)) static inline int swap_mac_and_send(void *data, void *data_end) { @@ -44,6 +45,15 @@ static inline int swap_mac_and_send(void *data, void *data_end) { return XDP_TX; } +__attribute__((__always_inline__)) +static inline void swap_mac(void *data, struct eth_hdr *orig_eth) { + struct eth_hdr *eth; + eth = data; + memcpy(eth->eth_source, orig_eth->eth_dest , ETH_ALEN); + memcpy(eth->eth_dest, orig_eth->eth_source, ETH_ALEN); + eth->eth_proto = orig_eth->eth_proto; +} + __attribute__((__always_inline__)) static inline int send_icmp_reply(void *data, void *data_end) { struct iphdr *iph; @@ -106,6 +116,113 @@ static inline int 
send_icmp6_reply(void *data, void *data_end) { return swap_mac_and_send(data, data_end); } +__attribute__((__always_inline__)) +static inline int send_icmp4_too_big(struct xdp_md *xdp) { + int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr); + if (bpf_xdp_adjust_head(xdp, 0 - headroom)) { + return XDP_DROP; + } + void *data = xdp->data; + void *data_end = xdp->data_end; + if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end) { + return XDP_DROP; + } + struct iphdr *iph, *orig_iph; + struct eth_hdr *orig_eth; + struct icmphdr *icmp_hdr; + __u64 csum = 0; + __u64 off = 0; + orig_eth = data + headroom; + swap_mac(data, orig_eth); + off += sizeof(struct eth_hdr); + iph = data + off; + off += sizeof(struct iphdr); + icmp_hdr = data + off; + off += sizeof(struct icmphdr); + orig_iph = data + off; + icmp_hdr->type = ICMP_DEST_UNREACH; + icmp_hdr->code = ICMP_FRAG_NEEDED; + icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct eth_hdr)); + icmp_hdr->checksum = 0; + ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum); + icmp_hdr->checksum = csum; + iph->ttl = DEFAULT_TTL; + iph->daddr = orig_iph->saddr; + iph->saddr = orig_iph->daddr; + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_ICMP; + iph->tos = 0; + iph->tot_len = htons(ICMP_TOOBIG_SIZE + headroom - sizeof(struct eth_hdr)); + iph->check = 0; + csum = 0; + ipv4_csum(iph, sizeof(struct iphdr), &csum); + iph->check = csum; + return XDP_TX; +} + +__attribute__((__always_inline__)) +static inline int send_icmp6_too_big(struct xdp_md *xdp) { + int headroom = (int)sizeof(struct ipv6hdr) + (int)sizeof(struct icmp6hdr); + if (bpf_xdp_adjust_head(xdp, 0 - headroom)) { + return XDP_DROP; + } + void *data = xdp->data; + void *data_end = xdp->data_end; + if (data + (ICMP6_TOOBIG_SIZE + headroom) > data_end) { + return XDP_DROP; + } + struct ipv6hdr *ip6h, *orig_ip6h; + struct eth_hdr *orig_eth; + struct icmp6hdr *icmp6_hdr; + __u64 csum = 0; + __u64 off = 0; + orig_eth = data + headroom; + swap_mac(data, orig_eth); + off += sizeof(struct eth_hdr); + ip6h = data + off; + off += sizeof(struct ipv6hdr); + icmp6_hdr = data + off; + off += sizeof(struct icmp6hdr); + orig_ip6h = data + off; + ip6h->version = 6; + ip6h->priority = 0; + ip6h->nexthdr = IPPROTO_ICMPV6; + ip6h->hop_limit = DEFAULT_TTL; + ip6h->payload_len = htons(ICMP6_TOOBIG_PAYLOAD_SIZE); + memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); + memcpy(ip6h->daddr.s6_addr32, orig_ip6h->saddr.s6_addr32, 16); + memcpy(ip6h->saddr.s6_addr32, orig_ip6h->daddr.s6_addr32, 16); + icmp6_hdr->icmp6_type = ICMPV6_PKT_TOOBIG; + icmp6_hdr->icmp6_code = 0; + icmp6_hdr->icmp6_mtu = htonl(MAX_PCKT_SIZE-sizeof(struct eth_hdr)); + icmp6_hdr->icmp6_cksum = 0; + ipv6_csum(icmp6_hdr, ICMP6_TOOBIG_PAYLOAD_SIZE, &csum, ip6h); + icmp6_hdr->icmp6_cksum = csum; + return XDP_TX; +} + +__attribute__((__always_inline__)) +static inline int send_icmp_too_big(struct xdp_md *xdp, + bool is_ipv6, int pckt_size) { + + int offset = pckt_size; + if (is_ipv6) { + offset -= ICMP6_TOOBIG_SIZE; + } else { + offset -= ICMP_TOOBIG_SIZE; + } + if(bpf_xdp_adjust_tail(xdp, 0 - offset)) { + return XDP_DROP; + } + if (is_ipv6) { + return send_icmp6_too_big(xdp); + } else { + return send_icmp4_too_big(xdp); + } +} + + __attribute__((__always_inline__)) static inline int parse_icmpv6(void *data, void *data_end, __u64 off, struct packet_description *pckt) { diff --git a/katran/lib/linux_includes/bpf.h b/katran/lib/linux_includes/bpf.h index c0855392e..df38ce07a 100644 --- a/katran/lib/linux_includes/bpf.h +++ 
b/katran/lib/linux_includes/bpf.h @@ -1,4 +1,6 @@ -/* This program is free software; you can redistribute it and/or +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ @@ -14,7 +16,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ @@ -91,6 +93,8 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -130,6 +134,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -140,6 +147,13 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -228,6 +242,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -242,6 +278,10 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -266,6 +306,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -316,374 +361,1445 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); -/* BPF helper function descriptions: +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. 
The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_helpers_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: * - * void *bpf_map_lookup_elem(&map, &key) - * Return: Map value or NULL + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. * - * int bpf_map_update_elem(&map, &key, &value, flags) - * Return: 0 on success or negative error + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(&map, &key) - * Return: 0 on success or negative error + * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, int size, void *src) - * Return: 0 on success or negative error + * int bpf_probe_read(void *dst, u32 size, const void *src) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * address *src* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. * * u64 bpf_ktime_get_ns(void) - * Return: current ktime - * - * int bpf_trace_printk(const char *fmt, int fmt_size, ...) 
- * Return: length of buffer written or negative error - * - * u32 bpf_prandom_u32(void) - * Return: random value - * - * u32 bpf_raw_smp_processor_id(void) - * Return: SMP processor ID - * - * int bpf_skb_store_bytes(skb, offset, from, len, flags) - * store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l3_csum_replace(skb, offset, from, to, flags) - * recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l4_csum_replace(skb, offset, from, to, flags) - * recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_tail_call(ctx, prog_array_map, index) - * jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: 32-bit index inside array that selects specific program to run - * Return: 0 on success or negative error - * - * int bpf_clone_redirect(skb, ifindex, flags) - * redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success or negative error + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Return + * Current *ktime*. + * + * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. 
+ * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * bloc (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with preemption disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. 
+ * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. 
This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 32. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. * * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - * - * int bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success or negative error - * - * u32 bpf_get_cgroup_classid(skb) - * retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - * - * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) - * Return: 0 on success or negative error - * - * int bpf_skb_vlan_pop(skb) - * Return: 0 on success or negative error - * - * int bpf_skb_get_tunnel_key(skb, key, size, flags) - * int bpf_skb_set_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Return: 0 on success or negative error - * - * u64 bpf_perf_event_read(map, flags) - * read perf event counter value - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * Return: value of perf event counter read or error code - * - * int bpf_redirect(ifindex, flags) - * redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: - * cls_bpf: - * bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * xdp_bpf: - * all bits - reserved - * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error - * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error - * int bpf_redirect_map(map, key, flags) - * redirect to endpoint in map - * @map: pointer to dev map - * @key: index in map to lookup - * @flags: -- - * Return: XDP_REDIRECT on success or XDP_ABORT on error - * - * u32 bpf_get_route_realm(skb) - * retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - * - * int bpf_perf_event_output(ctx, map, flags, data, size) - * output perf 
raw sample - * @ctx: struct pt_regs* - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success or negative error - * - * int bpf_get_stackid(ctx, map, flags) - * walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - * - * s64 bpf_csum_diff(from, from_size, to, to_size, seed) - * calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result or negative error code - * - * int bpf_skb_get_tunnel_opt(skb, opt, size) - * retrieve tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: option size - * - * int bpf_skb_set_tunnel_opt(skb, opt, size) - * populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success or negative error - * - * int bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is v4 -> v6, - * v6 -> v4 transitions. The helper will also resize the skb. eBPF - * program is expected to fill the new headers via skb_store_bytes - * and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_change_type(skb, type) - * Change packet type of skb. - * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - * - * int bpf_skb_under_cgroup(skb, map, index) - * Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - * - * u32 bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. + * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. 
See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/cgroup-v1/net_cls.txt*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * Return + * The classid, or 0 for the default unconfigured classid. + * + * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. 
+ * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). 
This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * + * int bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can be attained with the more generic + * **bpf_redirect_map**\ (), which requires specific maps to be + * used but offers better performance. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * indentifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. 
+ * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). 
To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. 
+ * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. * * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - * - * int bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - * - * int bpf_current_task_under_cgroup(map, index) - * Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - * - * int bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, to be used f.e. - * with control messages. - * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the skb is non-linear - * and not all of len are part of the linear section. 
Only needed for - * read/write with direct packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - * - * s64 bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - * - * void bpf_set_hash_invalid(skb) - * Invalidate current skb->hash. - * @skb: pointer to skb - * - * int bpf_get_numa_node_id() - * Return: Id of current NUMA node. - * - * int bpf_skb_change_head() - * Grows headroom of skb and adjusts MAC header offset accordingly. - * Will extends/reallocae as required automatically. - * May change skb data pointer and will thus invalidate any check - * performed for direct packet access. - * @skb: pointer to skb - * @len: length of header to be pushed in front - * @flags: Flags (unused for now) - * Return: 0 on success or negative error - * - * int bpf_xdp_adjust_head(xdp_md, delta) - * Adjust the xdp_md.data by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data - * Return: 0 on success or negative on error + * Return + * A pointer to the current task struct. + * + * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* task belongs to the cgroup2. + * * 1, if the *skb* task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + * + * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. 
Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then the whole length of the *skb* is pulled. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * int bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. 
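The pull-then-retest sequence that the bpf_skb_pull_data() description above walks through is the part that is easy to get wrong, so a minimal sketch of it for a clsact/TC classifier is included here. The section name, the header set, the parse_headers() function and the locally declared skb_pull_data wrapper are illustrative assumptions (the wrapper is not among the helpers declared in this repo's bpf_helpers.h); this is a sketch of the documented pattern, not code shipped by this patch.

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <linux/if_ether.h>
    #include <linux/ip.h>
    #include <linux/udp.h>
    #include "bpf_helpers.h"

    /* Local wrapper, assumed for this sketch; the id comes from the
     * BPF_FUNC_* enum generated by __BPF_FUNC_MAPPER in bpf.h. */
    static int (*skb_pull_data)(void *skb, int len) =
        (void *) BPF_FUNC_skb_pull_data;

    SEC("classifier")
    int parse_headers(struct __sk_buff *skb)
    {
      const int need = sizeof(struct ethhdr) + sizeof(struct iphdr) +
                       sizeof(struct udphdr);
      void *data = (void *)(long)skb->data;
      void *data_end = (void *)(long)skb->data_end;

      if (data + need > data_end) {
        /* The headers may sit in a non-linear part of the skb: pull them
         * into the linear section, then re-derive and re-check the
         * pointers, since the helper call invalidates the old checks. */
        if (skb_pull_data(skb, need))
          return TC_ACT_SHOT;
        data = (void *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
        if (data + need > data_end)
          return TC_ACT_SHOT;
      }
      /* Direct packet access to the first 'need' bytes is now valid. */
      return TC_ACT_OK;
    }

The same two-step (bail out or pull, then retest) applies after any of the helpers in this header whose descriptions note that they may change the underlying packet buffer.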
+ * + * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) - * Copy a NUL terminated string from unsafe address. In case the string - * length is smaller than size, the target is not padded with further NUL - * bytes. In case the string length is larger than size, just count-1 - * bytes are copied and the last byte is set to NUL. - * @dst: destination address - * @size: maximum number of bytes to copy, including the trailing NUL - * @unsafe_ptr: unsafe address - * Return: - * > 0 length of the string including the trailing NUL on success - * < 0 error - * - * u64 bpf_get_socket_cookie(skb) - * Get the cookie for the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: 8 Bytes non-decreasing number on success or 0 if the socket - * field is missing inside sk_buff - * - * u32 bpf_get_socket_uid(skb) - * Get the owner uid of the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: uid of the socket owner on success or overflowuid if failed. - * - * u32 bpf_set_hash(skb, hash) - * Set full skb->hash. - * @skb: pointer to skb - * @hash: hash to set - * - * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) - * Calls setsockopt. Not all opts are available, only those with - * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) - * Calls getsockopt. Not all opts are available. - * Supported levels: IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_skb_adjust_room(skb, len_diff, mode, flags) - * Grow or shrink room in sk_buff. 
- * @skb: pointer to skb - * @len_diff: (signed) amount of room to grow/shrink - * @mode: operation mode (enum bpf_adj_room_mode) - * @flags: reserved for future use - * Return: 0 on success or negative error code - * - * int bpf_sk_redirect_map(map, key, flags) - * Redirect skb to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS - * - * int bpf_sock_map_update(skops, map, key, flags) - * @skops: pointer to bpf_sock_ops - * @map: pointer to sockmap to update - * @key: key to insert/update sock in map - * @flags: same flags as map update elem - * - * int bpf_xdp_adjust_meta(xdp_md, delta) - * Adjust the xdp_md.data_meta by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data_meta - * Return: 0 on success or negative on error - * - * int bpf_perf_event_read_value(map, flags, buf, buf_size) - * read perf event counter value and perf event enabled/running time - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @buf: buf to fill - * @buf_size: size of the buf - * Return: 0 on success or negative error code - * - * int bpf_perf_prog_read_value(ctx, buf, buf_size) - * read perf prog attached perf event counter and enabled/running time - * @ctx: pointer to ctx - * @buf: buf to fill - * @buf_size: size of the buf - * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set + * Description + * Copy a NUL terminated string from an unsafe address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read()** helper here instead + * to read the string would require to estimate the length at + * compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. 
+ * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a unique socket + * identifier per namespace. + * Return + * A 8-byte long non-decreasing number on success, or 0 if the + * socket field is missing inside *skb*. + * + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + * + * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * Return + * 0 + * + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * There is a single supported mode at this time: + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed below the layer 3 header). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * When used to redirect packets to net devices, this helper + * provides a high performance increase over **bpf_redirect**\ (). 
+ * This is due to various implementation details of the underlying + * mechanisms, one of which is the fact that **bpf_redirect_map**\ + * () tries to send packet as a "bulk" to the device. + * Return + * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * + * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. 
The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For en eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the following *level*\ s: + * + * * **IPPROTO_TCP**, which supports *optname* + * **TCP_CONGESTION**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_override_return(struct pt_reg *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * The supported callback values that *argval* can combine are: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. 
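Because bpf_sock_ops_cb_flags_set() is the only way to turn these per-connection callbacks on (the field is readable directly, but only the helper reports back which requested bits the running kernel cannot honour), a short sketch of a BPF_PROG_TYPE_SOCK_OPS program opting in is given below. It assumes the BPF_SOCK_OPS_*_ESTABLISHED_CB operations from the unchanged part of this enum, the SEC() macro from bpf_helpers.h, and a hypothetical function name; it is an illustration, not code added by this patch.

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    SEC("sockops")
    int enable_tcp_callbacks(struct bpf_sock_ops *skops)
    {
      switch (skops->op) {
      /* Assumed present in the synced header, though outside this hunk. */
      case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
      case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
        /* Opt this connection in to RTO and retransmission callbacks.
         * A positive return value would carry the bits that could not
         * be set on this kernel. */
        bpf_sock_ops_cb_flags_set(skops,
                                  BPF_SOCK_OPS_RTO_CB_FLAG |
                                  BPF_SOCK_OPS_RETRANS_CB_FLAG);
        break;
      case BPF_SOCK_OPS_RTO_CB:
        /* args[0] = icsk_retransmits, args[1] = icsk_rto,
         * args[2] = whether the RTO has expired (see the operator list
         * further down in this header). */
        break;
      default:
        break;
      }
      return 1;
    }

    char _license[] SEC("license") = "GPL";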
+ * + * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). 
+ * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). Looking for a free port to bind to can be + * expensive, therefore binding to port is not permitted by the + * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) + * must be set to zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * only possible to shrink the packet as of this writing, + * therefore *delta* must be a negative integer. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. 
Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -744,7 +1860,16 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), \ + FN(msg_apply_bytes), \ + FN(msg_cork_bytes), \ + FN(msg_pull_data), \ + FN(bind), \ + FN(xdp_adjust_tail), \ + FN(skb_get_xfrm_state), \ + FN(get_stack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -778,15 +1903,19 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) +#define BPF_F_SEQ_NUMBER (1ULL << 3) /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. @@ -849,6 +1978,19 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + /* Generic BPF return codes which all BPF program types may support. * The values are binary compatible with their TC_ACT_* counter-part to * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT @@ -872,6 +2014,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. 
+ * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 @@ -896,6 +2047,9 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ }; enum sk_action { @@ -918,6 +2072,10 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { @@ -928,8 +2086,31 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need @@ -939,8 +2120,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -955,8 +2137,39 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -990,6 +2203,43 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 
+ * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/katran/lib/linux_includes/bpf_common.h b/katran/lib/linux_includes/bpf_common.h index e3667e441..899349896 100644 --- a/katran/lib/linux_includes/bpf_common.h +++ b/katran/lib/linux_includes/bpf_common.h @@ -3,51 +3,51 @@ /* Instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) -// @lint-ignore TXT2 T25377293 Grandfathered in -#define BPF_LD 0x00 -#define BPF_LDX 0x01 -#define BPF_ST 0x02 -#define BPF_STX 0x03 -#define BPF_ALU 0x04 -#define BPF_JMP 0x05 -#define BPF_RET 0x06 -#define BPF_MISC 0x07 +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) -#define BPF_IMM 0x00 -#define BPF_ABS 0x20 -#define BPF_IND 0x40 -#define BPF_MEM 0x60 -#define BPF_LEN 0x80 -#define BPF_MSH 0xa0 +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) -#define BPF_ADD 0x00 -#define BPF_SUB 0x10 -#define BPF_MUL 0x20 -#define BPF_DIV 0x30 -#define BPF_OR 0x40 -#define BPF_AND 0x50 -#define BPF_LSH 0x60 -#define BPF_RSH 0x70 -#define BPF_NEG 0x80 -#define BPF_MOD 0x90 -#define BPF_XOR 0xa0 +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 -#define BPF_JA 0x00 -#define BPF_JEQ 0x10 -#define BPF_JGT 0x20 -#define BPF_JGE 0x30 -#define BPF_JSET 0x40 +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 #define BPF_SRC(code) ((code) & 0x08) -#define BPF_K 0x00 -#define BPF_X 0x08 +#define BPF_K 0x00 +#define BPF_X 0x08 #ifndef BPF_MAXINSNS #define BPF_MAXINSNS 4096 diff --git a/katran/lib/linux_includes/bpf_helpers.h b/katran/lib/linux_includes/bpf_helpers.h index e25dbf603..609247e77 100644 --- a/katran/lib/linux_includes/bpf_helpers.h +++ b/katran/lib/linux_includes/bpf_helpers.h @@ -9,151 +9,169 @@ /* helper functions called from eBPF programs written in C */ static void *(*bpf_map_lookup_elem)(void *map, void *key) = - (void *) BPF_FUNC_map_lookup_elem; + (void *) BPF_FUNC_map_lookup_elem; static int (*bpf_map_update_elem)(void *map, void *key, void *value, - unsigned long long flags) = - (void *) BPF_FUNC_map_update_elem; + unsigned long long flags) = + (void *) BPF_FUNC_map_update_elem; static int 
(*bpf_map_delete_elem)(void *map, void *key) = - (void *) BPF_FUNC_map_delete_elem; + (void *) BPF_FUNC_map_delete_elem; static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = - (void *) BPF_FUNC_probe_read; + (void *) BPF_FUNC_probe_read; static unsigned long long (*bpf_ktime_get_ns)(void) = - (void *) BPF_FUNC_ktime_get_ns; + (void *) BPF_FUNC_ktime_get_ns; static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = - (void *) BPF_FUNC_trace_printk; + (void *) BPF_FUNC_trace_printk; static void (*bpf_tail_call)(void *ctx, void *map, int index) = - (void *) BPF_FUNC_tail_call; + (void *) BPF_FUNC_tail_call; static unsigned long long (*bpf_get_smp_processor_id)(void) = - (void *) BPF_FUNC_get_smp_processor_id; + (void *) BPF_FUNC_get_smp_processor_id; static unsigned long long (*bpf_get_current_pid_tgid)(void) = - (void *) BPF_FUNC_get_current_pid_tgid; + (void *) BPF_FUNC_get_current_pid_tgid; static unsigned long long (*bpf_get_current_uid_gid)(void) = - (void *) BPF_FUNC_get_current_uid_gid; + (void *) BPF_FUNC_get_current_uid_gid; static int (*bpf_get_current_comm)(void *buf, int buf_size) = - (void *) BPF_FUNC_get_current_comm; + (void *) BPF_FUNC_get_current_comm; static unsigned long long (*bpf_perf_event_read)(void *map, - unsigned long long flags) = - (void *) BPF_FUNC_perf_event_read; + unsigned long long flags) = + (void *) BPF_FUNC_perf_event_read; static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = - (void *) BPF_FUNC_clone_redirect; + (void *) BPF_FUNC_clone_redirect; static int (*bpf_redirect)(int ifindex, int flags) = - (void *) BPF_FUNC_redirect; + (void *) BPF_FUNC_redirect; static int (*bpf_redirect_map)(void *map, int key, int flags) = - (void *) BPF_FUNC_redirect_map; + (void *) BPF_FUNC_redirect_map; static int (*bpf_perf_event_output)(void *ctx, void *map, - unsigned long long flags, void *data, - int size) = - (void *) BPF_FUNC_perf_event_output; + unsigned long long flags, void *data, + int size) = + (void *) BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = - (void *) BPF_FUNC_get_stackid; + (void *) BPF_FUNC_get_stackid; static int (*bpf_probe_write_user)(void *dst, void *src, int size) = - (void *) BPF_FUNC_probe_write_user; + (void *) BPF_FUNC_probe_write_user; static int (*bpf_current_task_under_cgroup)(void *map, int index) = - (void *) BPF_FUNC_current_task_under_cgroup; + (void *) BPF_FUNC_current_task_under_cgroup; static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = - (void *) BPF_FUNC_skb_get_tunnel_key; + (void *) BPF_FUNC_skb_get_tunnel_key; static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = - (void *) BPF_FUNC_skb_set_tunnel_key; + (void *) BPF_FUNC_skb_set_tunnel_key; static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = - (void *) BPF_FUNC_skb_get_tunnel_opt; + (void *) BPF_FUNC_skb_get_tunnel_opt; static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = - (void *) BPF_FUNC_skb_set_tunnel_opt; + (void *) BPF_FUNC_skb_set_tunnel_opt; static unsigned long long (*bpf_get_prandom_u32)(void) = - (void *) BPF_FUNC_get_prandom_u32; + (void *) BPF_FUNC_get_prandom_u32; static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = - (void *) BPF_FUNC_xdp_adjust_head; + (void *) BPF_FUNC_xdp_adjust_head; static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = - (void *) BPF_FUNC_xdp_adjust_meta; + (void *) BPF_FUNC_xdp_adjust_meta; static int (*bpf_setsockopt)(void *ctx, int level, int 
optname, void *optval, - int optlen) = - (void *) BPF_FUNC_setsockopt; + int optlen) = + (void *) BPF_FUNC_setsockopt; static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = - (void *) BPF_FUNC_sk_redirect_map; + (void *) BPF_FUNC_sk_redirect_map; +static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_getsockopt; +static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = + (void *) BPF_FUNC_sock_ops_cb_flags_set; static int (*bpf_sock_map_update)(void *map, void *key, void *value, - unsigned long long flags) = - (void *) BPF_FUNC_sock_map_update; + unsigned long long flags) = + (void *) BPF_FUNC_sock_map_update; static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, - void *buf, unsigned int buf_size) = - (void *) BPF_FUNC_perf_event_read_value; + void *buf, unsigned int buf_size) = + (void *) BPF_FUNC_perf_event_read_value; static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, - unsigned int buf_size) = - (void *) BPF_FUNC_perf_prog_read_value; - - + unsigned int buf_size) = + (void *) BPF_FUNC_perf_prog_read_value; +static int (*bpf_override_return)(void *ctx, unsigned long rc) = + (void *) BPF_FUNC_override_return; +#ifdef KERNEL_417_PLUS +static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = + (void *) BPF_FUNC_bind; +static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_tail; +static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, + int size, int flags) = + (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = + (void *) BPF_FUNC_get_stack; +#endif /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions */ struct sk_buff; unsigned long long load_byte(void *skb, - unsigned long long off) asm("llvm.bpf.load.byte"); + unsigned long long off) asm("llvm.bpf.load.byte"); unsigned long long load_half(void *skb, - unsigned long long off) asm("llvm.bpf.load.half"); + unsigned long long off) asm("llvm.bpf.load.half"); unsigned long long load_word(void *skb, - unsigned long long off) asm("llvm.bpf.load.word"); + unsigned long long off) asm("llvm.bpf.load.word"); /* a helper structure used by eBPF C program * to describe map attributes to elf_bpf loader */ struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; - unsigned int numa_node; + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; + unsigned int inner_map_idx; + unsigned int numa_node; }; static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = - (void *) BPF_FUNC_skb_load_bytes; + (void *) BPF_FUNC_skb_load_bytes; static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = - (void *) BPF_FUNC_skb_store_bytes; + (void *) BPF_FUNC_skb_store_bytes; static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = - (void *) BPF_FUNC_l3_csum_replace; + (void *) BPF_FUNC_l3_csum_replace; static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = - (void *) BPF_FUNC_l4_csum_replace; + (void *) BPF_FUNC_l4_csum_replace; +static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) = + (void *) BPF_FUNC_csum_diff; static int (*bpf_skb_under_cgroup)(void *ctx, void *map, 
int index) = - (void *) BPF_FUNC_skb_under_cgroup; + (void *) BPF_FUNC_skb_under_cgroup; static int (*bpf_skb_change_head)(void *, int len, int flags) = - (void *) BPF_FUNC_skb_change_head; + (void *) BPF_FUNC_skb_change_head; /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) - #define bpf_target_x86 - #define bpf_target_defined + #define bpf_target_x86 + #define bpf_target_defined #elif defined(__TARGET_ARCH_s930x) - #define bpf_target_s930x - #define bpf_target_defined + #define bpf_target_s930x + #define bpf_target_defined #elif defined(__TARGET_ARCH_arm64) - #define bpf_target_arm64 - #define bpf_target_defined + #define bpf_target_arm64 + #define bpf_target_defined #elif defined(__TARGET_ARCH_mips) - #define bpf_target_mips - #define bpf_target_defined + #define bpf_target_mips + #define bpf_target_defined #elif defined(__TARGET_ARCH_powerpc) - #define bpf_target_powerpc - #define bpf_target_defined + #define bpf_target_powerpc + #define bpf_target_defined #elif defined(__TARGET_ARCH_sparc) - #define bpf_target_sparc - #define bpf_target_defined + #define bpf_target_sparc + #define bpf_target_defined #else - #undef bpf_target_defined + #undef bpf_target_defined #endif /* Fall back to what the compiler says */ #ifndef bpf_target_defined #if defined(__x86_64__) - #define bpf_target_x86 + #define bpf_target_x86 #elif defined(__s390x__) - #define bpf_target_s930x + #define bpf_target_s930x #elif defined(__aarch64__) - #define bpf_target_arm64 + #define bpf_target_arm64 #elif defined(__mips__) - #define bpf_target_mips + #define bpf_target_mips #elif defined(__powerpc__) - #define bpf_target_powerpc + #define bpf_target_powerpc #elif defined(__sparc__) - #define bpf_target_sparc + #define bpf_target_sparc #endif #endif @@ -241,17 +259,17 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #endif #ifdef bpf_target_powerpc -#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) -#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #elif bpf_target_sparc -#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) -#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #else -#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ - bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) -#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ - bpf_probe_read(&(ip), sizeof(ip), \ - (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), \ + (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) #endif #endif diff --git a/katran/lib/linux_includes/libbpf.c b/katran/lib/linux_includes/libbpf.c index c907686f3..e44e70889 100644 --- a/katran/lib/linux_includes/libbpf.c +++ b/katran/lib/linux_includes/libbpf.c @@ -20,12 +20,12 @@ static __u64 ptr_to_u64(const void *ptr) } static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, - unsigned int size) + unsigned int size) { - return syscall(__NR_bpf, cmd, attr, size); + return syscall(__NR_bpf, cmd, attr, size); } -int bpf_create_map_node(enum bpf_map_type map_type, const char *name, +int ebpf_create_map_node(enum bpf_map_type map_type, 
const char *name, int key_size, int value_size, int max_entries, __u32 map_flags, int node) { @@ -50,25 +50,25 @@ int bpf_create_map_node(enum bpf_map_type map_type, const char *name, return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } -int bpf_create_map_name(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags) +int ebpf_create_map_name(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + __u32 map_flags) { - return bpf_create_map_node(map_type, name, key_size, value_size, - max_entries, map_flags, -1); + return ebpf_create_map_node(map_type, name, key_size, value_size, + max_entries, map_flags, -1); } -int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, - int max_entries, __u32 map_flags) +int ebpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries, __u32 map_flags) { - return bpf_create_map_node(map_type, NULL, key_size, value_size, - max_entries, map_flags, -1); + return ebpf_create_map_node(map_type, NULL, key_size, value_size, + max_entries, map_flags, -1); } -int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags, int node) +int ebpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int inner_map_fd, int max_entries, + __u32 map_flags, int node) { __u32 name_len = name ? strlen(name) : 0; union bpf_attr attr; @@ -93,14 +93,14 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, -int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, +int ebpf_create_map_in_map(enum bpf_map_type map_type, int key_size, int inner_map_fd, int max_entries, __u32 map_flags) { - return bpf_create_map_in_map_node(map_type, NULL, key_size, - inner_map_fd, max_entries, map_flags, -1); + return ebpf_create_map_in_map_node(map_type, NULL, key_size, + inner_map_fd, max_entries, map_flags, -1); } -int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) +int ebpf_update_elem(int fd, void *key, void *value, unsigned long long flags) { union bpf_attr attr; @@ -113,7 +113,7 @@ int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } -int bpf_lookup_elem(int fd, void *key, void *value) +int ebpf_lookup_elem(int fd, void *key, void *value) { union bpf_attr attr; @@ -125,7 +125,7 @@ int bpf_lookup_elem(int fd, void *key, void *value) return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } -int bpf_delete_elem(int fd, void *key) +int ebpf_delete_elem(int fd, void *key) { union bpf_attr attr; @@ -136,7 +136,7 @@ int bpf_delete_elem(int fd, void *key) return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); } -int bpf_get_next_key(int fd, void *key, void *next_key) +int ebpf_get_next_key(int fd, void *key, void *next_key) { union bpf_attr attr; @@ -150,49 +150,49 @@ int bpf_get_next_key(int fd, void *key, void *next_key) #define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) -int bpf_prog_load_name(enum bpf_prog_type prog_type, const char *name, - const struct bpf_insn *insns, int prog_len, - const char *license, __u32 kern_version, - char *buf, int buf_size) +int ebpf_prog_load_name(enum bpf_prog_type prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, __u32 kern_version, + char *buf, int buf_size) { - int fd; - union bpf_attr attr; + int 
fd; + union bpf_attr attr; - bzero(&attr, sizeof(attr)); - attr.prog_type = prog_type; - attr.insns = ptr_to_u64(insns); + bzero(&attr, sizeof(attr)); + attr.prog_type = prog_type; + attr.insns = ptr_to_u64(insns); attr.insn_cnt = prog_len / sizeof(struct bpf_insn), - attr.license = ptr_to_u64(license); - attr.log_buf = ptr_to_u64(NULL); - attr.log_size = 0; - attr.log_level = 0; - attr.kern_version = kern_version; + attr.license = ptr_to_u64(license); + attr.log_buf = ptr_to_u64(NULL); + attr.log_size = 0; + attr.log_level = 0; + attr.kern_version = kern_version; if (name) { memcpy(attr.prog_name, name, min(strlen(name), BPF_OBJ_NAME_LEN - 1)); } fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); - if (fd >= 0 || !buf || !buf_size) - return fd; + if (fd >= 0 || !buf || !buf_size) + return fd; - /* Try again with log */ - attr.log_buf = ptr_to_u64(buf); - attr.log_size = buf_size; - attr.log_level = 1; + /* Try again with log */ + attr.log_buf = ptr_to_u64(buf); + attr.log_size = buf_size; + attr.log_level = 1; return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } -int bpf_prog_load(enum bpf_prog_type prog_type, - const struct bpf_insn *insns, int prog_len, - const char *license, __u32 kern_version, - char *buf, int buf_size) +int ebpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int prog_len, + const char *license, __u32 kern_version, + char *buf, int buf_size) { - return bpf_prog_load_name(prog_type, NULL, insns, prog_len, license, - kern_version, buf, buf_size); + return ebpf_prog_load_name(prog_type, NULL, insns, prog_len, license, + kern_version, buf, buf_size); } -int bpf_obj_pin(int fd, const char *pathname) +int ebpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr; @@ -203,7 +203,7 @@ int bpf_obj_pin(int fd, const char *pathname) return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); } -int bpf_obj_get(const char *pathname) +int ebpf_obj_get(const char *pathname) { union bpf_attr attr; @@ -213,8 +213,8 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, - unsigned int flags) +int ebpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; @@ -227,7 +227,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } -int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +int ebpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; @@ -238,7 +238,7 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } -int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +int ebpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) { union bpf_attr attr; @@ -250,9 +250,9 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration) +int ebpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, + void *data_out, __u32 *size_out, __u32 *retval, + __u32 *duration) { union bpf_attr attr; int ret; @@ -274,7 +274,7 @@ int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, return ret; } -int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, +int 
ebpf_perf_event_open(struct perf_event_attr *attr, int pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, diff --git a/katran/lib/linux_includes/libbpf.h b/katran/lib/linux_includes/libbpf.h index c84d6dba2..81053361f 100644 --- a/katran/lib/linux_includes/libbpf.h +++ b/katran/lib/linux_includes/libbpf.h @@ -16,42 +16,42 @@ struct bpf_insn; -int bpf_create_map_node(enum bpf_map_type map_type, const char *name, +int ebpf_create_map_node(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags, int node); -int bpf_create_map_name(enum bpf_map_type map_type, const char *name, +int ebpf_create_map_name(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags); -int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, +int ebpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags); -int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, +int ebpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, __u32 map_flags, int node); -int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, +int ebpf_create_map_in_map(enum bpf_map_type map_type, int key_size, int inner_map_fd, int max_entries, __u32 map_flags); -int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); -int bpf_lookup_elem(int fd, void *key, void *value); -int bpf_delete_elem(int fd, void *key); -int bpf_get_next_key(int fd, void *key, void *next_key); +int ebpf_update_elem(int fd, void *key, void *value, unsigned long long flags); +int ebpf_lookup_elem(int fd, void *key, void *value); +int ebpf_delete_elem(int fd, void *key); +int ebpf_get_next_key(int fd, void *key, void *next_key); -int bpf_prog_load(enum bpf_prog_type prog_type, +int ebpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, const char *license, __u32 kern_version, char *buf, int buf_size); -int bpf_prog_load_name(enum bpf_prog_type prog_type, const char *name, +int ebpf_prog_load_name(enum bpf_prog_type prog_type, const char *name, const struct bpf_insn *insns, int insn_len, const char *license, __u32 kern_version, char *buf, int buf_size); -int bpf_obj_pin(int fd, const char *pathname); -int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, +int ebpf_obj_pin(int fd, const char *pathname); +int ebpf_obj_get(const char *pathname); +int ebpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, unsigned int flags); -int bpf_prog_detach(int target_fd, enum bpf_attach_type type); -int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type); -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, +int ebpf_prog_detach(int target_fd, enum bpf_attach_type type); +int ebpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type); +int ebpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, __u32 *duration); @@ -236,6 +236,6 @@ int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, .imm = 0 }) struct perf_event_attr; -int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, +int ebpf_perf_event_open(struct perf_event_attr *attr, int pid, int cpu, int group_fd, unsigned long flags); #endif diff --git 
a/katran/lib/testing/CMakeLists.txt b/katran/lib/testing/CMakeLists.txt index 940837586..a8b6a5ad3 100644 --- a/katran/lib/testing/CMakeLists.txt +++ b/katran/lib/testing/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(xdptester STATIC XdpTester.h XdpTester.cpp KatranTestFixtures.h + KatranOptionalTestFixtures.h ) target_link_libraries(xdptester diff --git a/katran/lib/testing/KatranOptionalTestFixtures.h b/katran/lib/testing/KatranOptionalTestFixtures.h new file mode 100644 index 000000000..c9a123a79 --- /dev/null +++ b/katran/lib/testing/KatranOptionalTestFixtures.h @@ -0,0 +1,59 @@ +// @nolint + +/* Copyright (C) 2018-present, Facebook, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#pragma once +#include +#include +#include + +namespace katran { +namespace testing { +/** + * see KatranTestFixtures.h on how to generate input and output data + */ +using TestFixture = std::vector>; +const TestFixture inputOptionalTestFixtures = { + //1 + { + // Ether(src="0x1", dst="0x2")/IP(src="192.168.1.1", dst="10.200.1.1")/UDP(sport=31337, dport=80)/("katran test pkt"*100) + "AgAAAAAAAQAAAAAACABFAAX4AAEAAEARp4LAqAEBCsgBAXppAFAF5Og2a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHR
lc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0a2F0cmFuIHRlc3QgcGt0", + "ICMPv4 packet too big. ICMP_TOOBIG_GENERATION and 4.17+ kernel is required" + }, + //2 + { + //Ether(src="0x1", dst="0x2")/IPv6(src="fc00:2::1", dst="fc00:1::1")/TCP(sport=31337, dport=80,flags="A")/("katran test pkt"*100) + "AgAAAAAAAQAAAAAAht1gAAAABfAGQPwAAAIAAAAAAAAAAAAAAAH8AAABAAAAAAAAAAAAAAABemkAUAAAAAAAAAAAUBAgAFN1AABrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3Q=", + "ICMPv6 packet too big. 
ICMP_TOOBIG_GENERATION and 4.17+ kernel is required" + }, +}; + +const TestFixture outputOptionalTestFixtures = { + //1 + { + "AQAAAAAAAgAAAAAACABFAABwAAAAAEABrRsKyAEBwKgBAQMEboQAAAXcRQAF+AABAABAEaeCwKgBAQrIAQF6aQBQBeToNmthdHJhbiB0ZXN0IHBrdGthdHJhbiB0ZXN0IHBrdGthdHJhbiB0ZXN0IHBrdGthdHJhbiB0ZXN0", + "XDP_TX" + }, + //2 + { + "AQAAAAAAAgAAAAAAht1gAAAAAQA6QPwAAAEAAAAAAAAAAAAAAAH8AAACAAAAAAAAAAAAAAABAgD3sgAABdxgAAAABfAGQPwAAAIAAAAAAAAAAAAAAAH8AAABAAAAAAAAAAAAAAABemkAUAAAAAAAAAAAUBAgAFN1AABrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdGVzdCBwa3RrYXRyYW4gdA==", + "XDP_TX" + }, +}; + +} +} diff --git a/katran/lib/testing/TARGETS b/katran/lib/testing/TARGETS index d557b2799..7e4df7a07 100644 --- a/katran/lib/testing/TARGETS +++ b/katran/lib/testing/TARGETS @@ -60,6 +60,7 @@ cpp_binary( "katran_tester.cpp", ], headers = [ + "KatranOptionalTestFixtures.h", "KatranTestFixtures.h", ], deps = [ diff --git a/katran/lib/testing/XdpTester.cpp b/katran/lib/testing/XdpTester.cpp index 67a292a2a..0335f7b3c 100644 --- a/katran/lib/testing/XdpTester.cpp +++ b/katran/lib/testing/XdpTester.cpp @@ -58,6 +58,18 @@ void XdpTester::printPcktBase64() { } } +void XdpTester::writePcapOutput(std::unique_ptr&& buf) { + if (config_.outputFileName.empty()) { + VLOG(2) << "no output file specified"; + return; + } + auto success = parser_.writePacket(std::move(buf)); + if (!success) { + LOG(INFO) << "failed to write pckt into output " + << "pcap file: " << config_.outputFileName; + } +} + void XdpTester::testPcktsFromPcap() { if (config_.inputFileName.empty() || config_.bpfProgFd < 0) { LOG(INFO) << "can't run pcap based tests. 
input pcap file or bpf prog fd " @@ -96,13 +108,7 @@ void XdpTester::testPcktsFromPcap() { } // adjust IOBuf so data data_end will acount for writen data buf->append(output_pckt_size); - if (!config_.outputFileName.empty()) { - auto success = parser_.writePacket(buf->cloneOne()); - if (!success) { - LOG(INFO) << "failed to write pckt #" << pckt_num << " into output " - << "pcap file: " << config_.outputFileName; - } - } + writePcapOutput(buf->cloneOne()); ++pckt_num; } } @@ -122,6 +128,7 @@ void XdpTester::testFromFixture() { for (int i = 0; i < config_.inputData.size(); i++) { auto buf = folly::IOBuf::create(kMaxXdpPcktSize); auto input_pckt = parser_.getPacketFromBase64(config_.inputData[i].first); + writePcapOutput(input_pckt->cloneOne()); auto res = adapter_.testXdpProg( config_.bpfProgFd, kTestRepeatCount, @@ -142,6 +149,7 @@ void XdpTester::testFromFixture() { } // adjust IOBuf so data data_end will acount for writen data buf->append(output_pckt_size); + writePcapOutput(buf->cloneOne()); if (ret_val_str != config_.outputData[i].second) { VLOG(2) << "value from test: " << ret_val_str << " expected: " << config_.outputData[i].second; @@ -161,6 +169,14 @@ void XdpTester::testFromFixture() { } } +void XdpTester::resetTestFixtures( + const std::vector<std::pair<std::string, std::string>>& inputData, + const std::vector<std::pair<std::string, std::string>>& outputData) { + // + config_.inputData = inputData; + config_.outputData = outputData; +} + void XdpTester::testPerfFromFixture(uint32_t repeat, const int position) { // for inputData format is int first_index{0}, last_index{0}; diff --git a/katran/lib/testing/XdpTester.h b/katran/lib/testing/XdpTester.h index 7ff6aba70..022c25e57 100644 --- a/katran/lib/testing/XdpTester.h +++ b/katran/lib/testing/XdpTester.h @@ -98,6 +98,15 @@ class XdpTester { */ void testFromFixture(); + /** + * @param vector new input fixtures + * @param vector new output fixtures + * helper function which sets test fixtures to new values + */ + void resetTestFixtures( + const std::vector<std::pair<std::string, std::string>>& inputData, + const std::vector<std::pair<std::string, std::string>>& outputData); + /** * @param int repeat how many time should we repeat the test * @param int position of the packet if fixtures vector. @@ -106,6 +115,13 @@ class XdpTester { */ void testPerfFromFixture(uint32_t repeat, const int position = -1); + /** + * @param IOBuf with packet data to write.
+ * + * helper function to write packet in pcap format to specified outputFileName + */ + void writePcapOutput(std::unique_ptr<folly::IOBuf>&& buf); + private: TesterConfig config_; PcapParser parser_; diff --git a/katran/lib/testing/katran_tester.cpp b/katran/lib/testing/katran_tester.cpp index 72130c14f..984d18e3f 100644 --- a/katran/lib/testing/katran_tester.cpp +++ b/katran/lib/testing/katran_tester.cpp @@ -22,6 +22,7 @@ #include #include "KatranTestFixtures.h" +#include "KatranOptionalTestFixtures.h" #include "XdpTester.h" #include "katran/lib/KatranLb.h" #include "katran/lib/KatranLbStructs.h" @@ -33,6 +34,7 @@ DEFINE_string(healtchecking_prog, "", "path to healthchecking bpf prog"); DEFINE_bool(print_base64, false, "print packets in base64 from pcap file"); DEFINE_bool(test_from_fixtures, false, "run tests on predefined dataset"); DEFINE_bool(perf_testing, false, "run perf tests on predefined dataset"); +DEFINE_bool(optional_tests, false, "run optional (kernel specific) tests"); DEFINE_int32(repeat, 1000000, "perf test runs for single packet"); DEFINE_int32(position, -1, "perf test runs for single packet"); @@ -143,6 +145,19 @@ void prepareLbData(katran::KatranLb& lb) { addReals(lb, vip, reals6); } +void prepareOptionalLbData(katran::KatranLb& /* unused */) { +} + +void testOptionalLbCounters(katran::KatranLb& lb) { + LOG(INFO) << "Testing optional counters' sanity"; + auto stats = lb.getIcmpTooBigStats(); + if (stats.v1 != 1 || stats.v2 != 1) { + VLOG(2) << "icmpV4 hits: " << stats.v1 << " icmpv6 hits: " << stats.v2; + LOG(INFO) << "icmp packet too big counter is incorrect"; + } + LOG(INFO) << "Testing of optional counters is complete"; +} + void testLbCounters(katran::KatranLb& lb) { katran::VipKey vip; vip.address = "10.200.1.1"; @@ -211,6 +226,15 @@ int main(int argc, char** argv) { } else if (FLAGS_test_from_fixtures) { tester.testFromFixture(); testLbCounters(lb); + if (FLAGS_optional_tests) { + LOG(INFO) << "Running optional tests. They could fail if requirements " + << "are not satisfied"; + tester.resetTestFixtures( + katran::testing::inputOptionalTestFixtures, + katran::testing::outputOptionalTestFixtures); + tester.testFromFixture(); + testOptionalLbCounters(lb); + } return 0; } else if (FLAGS_perf_testing) { tester.testPerfFromFixture(FLAGS_repeat, FLAGS_position);