bpf: Add initial fd-based API to attach tc BPF programs
[ Commit msg tbd ]

This work revamps the tc BPF ingress and egress side in order to implement
an fd-based API in bpf(2) for attaching and detaching BPF programs without
the detour of going through qdiscs and filters. This is also needed for tc
BPF link management, since links operate on fds, whereas the traditional tc
framework operates on qdiscs and filters, which are separate objects with
their own lifecycle management and do not fit into BPF links.
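
As a rough userspace sketch of the resulting flow, assuming the uapi
additions from this patch (target_ifindex, attach_priority and the
BPF_NET_INGRESS/BPF_NET_EGRESS attach types), and presuming the program is
a loaded tc BPF program of type BPF_PROG_TYPE_SCHED_CLS:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Minimal sketch, error handling omitted: attach prog_fd to the ingress
 * side of ifindex via the new fd-based bpf(2) API. attach_priority 0
 * presumably lets the kernel pick the lowest free priority, starting at 1
 * (cf. dev_sch_entry_prio_new() in the new header further down).
 */
static int sch_bpf_attach(int prog_fd, int ifindex)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_ifindex  = ifindex;         /* device instead of a container fd */
	attr.attach_bpf_fd   = prog_fd;         /* BPF program to attach */
	attr.attach_type     = BPF_NET_INGRESS; /* or BPF_NET_EGRESS */
	attr.attach_priority = 0;               /* 0: auto-allocate a priority */
	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

Detaching would presumably mirror this with BPF_PROG_DETACH and the priority
of the program to remove; see sch_prog_attach()/sch_prog_detach() in the new
sch_xgress.h header further down.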

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
borkmann committed Jul 22, 2022
1 parent f8d3da4 commit fafe94e
Showing 16 changed files with 746 additions and 127 deletions.
2 changes: 2 additions & 0 deletions MAINTAINERS
@@ -3776,6 +3776,8 @@ R: John Fastabend <john.fastabend@gmail.com>
 L:	bpf@vger.kernel.org
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	include/net/sch_xgress.h
+F:	kernel/bpf/net.c
 F:	net/core/filter.c
 F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
1 change: 1 addition & 0 deletions include/linux/bpf.h
@@ -1313,6 +1313,7 @@ struct bpf_prog_array_item {
 	union {
 		struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 		u64 bpf_cookie;
+		u32 bpf_priority;
 	};
 };

14 changes: 6 additions & 8 deletions include/linux/netdevice.h
@@ -1860,8 +1860,7 @@ enum netdev_ml_priv_type {
  *
  *	@rx_handler:		handler for received packets
  *	@rx_handler_data:	XXX: need comments on this one
- *	@miniq_ingress:		ingress/clsact qdisc specific data for
- *				ingress processing
+ *	@sch_ingress:		BPF/clsact qdisc specific data for ingress processing
  *	@ingress_queue:		XXX: need comments on this one
  *	@nf_hooks_ingress:	netfilter hooks executed for ingress packets
  *	@broadcast:		hw bcast address
@@ -1882,8 +1881,7 @@ enum netdev_ml_priv_type {
  *	@xps_maps:	all CPUs/RXQs maps for XPS device
  *
  *	@xps_maps:	XXX: need comments on this one
- *	@miniq_egress:		clsact qdisc specific data for
- *				egress processing
+ *	@sch_egress:		BPF/clsact qdisc specific data for egress processing
  *	@nf_hooks_egress:	netfilter hooks executed for egress packets
  *	@qdisc_hash:		qdisc hash table
  *	@watchdog_timeo:	Represents the timeout that is used by
@@ -2174,8 +2172,8 @@ struct net_device {
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;

-#ifdef CONFIG_NET_CLS_ACT
-	struct mini_Qdisc __rcu	*miniq_ingress;
+#ifdef CONFIG_NET_XGRESS
+	struct sch_entry __rcu	*sch_ingress;
 #endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
@@ -2203,8 +2201,8 @@ struct net_device {
 #ifdef CONFIG_XPS
 	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
 #endif
-#ifdef CONFIG_NET_CLS_ACT
-	struct mini_Qdisc __rcu	*miniq_egress;
+#ifdef CONFIG_NET_XGRESS
+	struct sch_entry __rcu	*sch_egress;
 #endif
 #ifdef CONFIG_NETFILTER_EGRESS
 	struct nf_hook_entries __rcu *nf_hooks_egress;
2 changes: 1 addition & 1 deletion include/linux/skbuff.h
@@ -1117,7 +1117,7 @@ struct sk_buff {
 	__u8			csum_level:2;
 	__u8			dst_pending_confirm:1;
 	__u8			mono_delivery_time:1;	/* See SKB_MONO_DELIVERY_TIME_MASK */
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
 #endif
2 changes: 1 addition & 1 deletion include/net/sch_generic.h
@@ -714,7 +714,7 @@ int skb_do_redirect(struct sk_buff *);

 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
 	return skb->tc_at_ingress;
 #else
 	return false;
187 changes: 187 additions & 0 deletions include/net/sch_xgress.h
@@ -0,0 +1,187 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2022 Isovalent */
#ifndef __NET_SCHED_XGRESS_H
#define __NET_SCHED_XGRESS_H

#include <linux/idr.h>
#include <linux/bpf.h>

#include <net/sch_generic.h>

#define SCH_MAX_ENTRIES		30
/* Adds 1 NULL entry and 1 tc entry. */
#define SCH_MAX			(SCH_MAX_ENTRIES + 2)
#define SCH_PRIO_RESERVED	INT_MAX

enum sch_entry_type {
	SCH_ENTRY_A,
	SCH_ENTRY_B,
};

struct sch_entry {
	struct bpf_prog_array_item items[SCH_MAX] ____cacheline_aligned;
	enum sch_entry_type type;
};

struct mini_Qdisc;

struct sch_entry_pair {
	struct rcu_head		rcu;
	struct idr		idr;
	struct mini_Qdisc	*miniq;
	struct sch_entry	a;
	struct sch_entry	b;
};

static inline void sch_set_ingress(struct sk_buff *skb, bool ingress)
{
#ifdef CONFIG_NET_XGRESS
	skb->tc_at_ingress = ingress;
#endif
}

#ifdef CONFIG_NET_XGRESS
unsigned int sch_cls_ingress(const void *pskb, const struct bpf_insn *null);
unsigned int sch_cls_egress(const void *pskb, const struct bpf_insn *null);

int sch_prog_attach_kern(struct net_device *dev, bool ingress);
int sch_prog_detach_kern(struct net_device *dev, bool ingress);

static inline void
dev_sch_entry_update(struct net_device *dev, struct sch_entry *entry,
		     bool ingress)
{
	ASSERT_RTNL();
	if (ingress)
		rcu_assign_pointer(dev->sch_ingress, entry);
	else
		rcu_assign_pointer(dev->sch_egress, entry);
}

static inline struct sch_entry_pair *dev_sch_entry_pair(struct sch_entry *entry)
{
	if (entry->type == SCH_ENTRY_A)
		return container_of(entry, struct sch_entry_pair, a);
	else
		return container_of(entry, struct sch_entry_pair, b);
}

static inline struct sch_entry *dev_sch_entry_peer(struct sch_entry *entry)
{
	if (entry->type == SCH_ENTRY_A)
		return &dev_sch_entry_pair(entry)->b;
	else
		return &dev_sch_entry_pair(entry)->a;
}

static inline struct sch_entry *dev_sch_entry_create(void)
{
	struct sch_entry_pair *pair = kzalloc(sizeof(*pair), GFP_KERNEL);

	if (pair) {
		pair->a.type = SCH_ENTRY_A;
		pair->b.type = SCH_ENTRY_B;
		idr_init(&pair->idr);
		return &pair->a;
	}
	return NULL;
}

static inline struct sch_entry *dev_sch_entry_fetch(struct net_device *dev,
						    bool ingress, bool *created)
{
	struct sch_entry *entry = ingress ?
		rcu_dereference(dev->sch_ingress) :
		rcu_dereference(dev->sch_egress);

	*created = false;
	if (!entry) {
		entry = dev_sch_entry_create();
		if (!entry)
			return NULL;
		*created = true;
	}
	return entry;
}

static inline void dev_sch_entry_clear(struct sch_entry *entry)
{
	memset(entry->items, 0, sizeof(entry->items));
}

static inline int dev_sch_entry_prio_new(struct sch_entry *entry, u32 prio,
					 struct bpf_prog *prog)
{
	struct sch_entry_pair *pair = dev_sch_entry_pair(entry);
	static const u32 prio_kern = SCH_PRIO_RESERVED;
	int ret;

	if (prio >= prio_kern)
		return prio;
	if (prio == 0)
		prio = 1;
	ret = idr_alloc_u32(&pair->idr, prog, &prio, prio_kern - 1,
			    GFP_KERNEL);
	return ret < 0 ? ret : prio;
}

static inline void dev_sch_entry_prio_set(struct sch_entry *entry, u32 prio,
					  struct bpf_prog *prog)
{
	struct sch_entry_pair *pair = dev_sch_entry_pair(entry);

	idr_replace(&pair->idr, prog, prio);
}

static inline void dev_sch_entry_prio_del(struct sch_entry *entry, u32 prio)
{
	struct sch_entry_pair *pair = dev_sch_entry_pair(entry);

	idr_remove(&pair->idr, prio);
}

static inline void dev_sch_entry_free(struct sch_entry *entry)
{
	struct sch_entry_pair *pair = dev_sch_entry_pair(entry);

	idr_destroy(&pair->idr);
	kfree_rcu(pair, rcu);
}

static inline u32 dev_sch_entry_total(struct sch_entry *entry)
{
	const struct bpf_prog_array_item *item;
	const struct bpf_prog *prog;
	u32 num = 0;

	item = &entry->items[0];
	while ((prog = READ_ONCE(item->prog))) {
		num++;
		item++;
	}
	return num;
}

int sch_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int sch_prog_detach(const union bpf_attr *attr);
int sch_prog_query(const union bpf_attr *attr,
		   union bpf_attr __user *uattr);
#else
static inline int sch_prog_attach(const union bpf_attr *attr,
				  struct bpf_prog *prog)
{
	return -EINVAL;
}

static inline int sch_prog_detach(const union bpf_attr *attr)
{
	return -EINVAL;
}

static inline int sch_prog_query(const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return -EINVAL;
}
#endif /* CONFIG_NET_XGRESS */
#endif /* __NET_SCHED_XGRESS_H */
17 changes: 14 additions & 3 deletions include/uapi/linux/bpf.h
@@ -998,6 +998,8 @@ enum bpf_attach_type {
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
+	BPF_NET_INGRESS,
+	BPF_NET_EGRESS,
 	__MAX_BPF_ATTACH_TYPE
 };

@@ -1372,14 +1374,20 @@ union bpf_attr {
 	};

 	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
-		__u32		target_fd;	/* container object to attach to */
+		union {
+			__u32	target_fd;	/* container object to attach to */
+			__u32	target_ifindex;	/* target ifindex */
+		};
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
-		__u32		replace_bpf_fd;	/* previously attached eBPF
+		union {
+			__u32	attach_priority;
+			__u32	replace_bpf_fd;	/* previously attached eBPF
 						 * program to replace if
 						 * BPF_F_REPLACE is used
 						 */
+		};
 	};

 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -1425,7 +1433,10 @@ union bpf_attr {
 	} info;

 	struct { /* anonymous struct used by BPF_PROG_QUERY command */
-		__u32		target_fd;	/* container object to query */
+		union {
+			__u32	target_fd;	/* container object to query */
+			__u32	target_ifindex;	/* target ifindex */
+		};
 		__u32		attach_type;
 		__u32		query_flags;
 		__u32		attach_flags;
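
Similarly, a hedged sketch for the BPF_PROG_QUERY side shown above. The
prog_ids/prog_cnt result fields are preexisting members of the query struct
that this diff does not touch, so their use here is an assumption based on
the usual BPF_PROG_QUERY semantics:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Minimal sketch: list the IDs of BPF programs attached to the ingress
 * side of ifindex. prog_cnt is the buffer capacity on input and is
 * presumably updated to the number of attached programs on output.
 */
static int sch_bpf_query(int ifindex)
{
	__u32 prog_ids[16] = { 0 };
	union bpf_attr attr;
	__u32 i;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_ifindex = ifindex;
	attr.query.attach_type = BPF_NET_INGRESS;
	attr.query.prog_ids = (__u64)(unsigned long)prog_ids;
	attr.query.prog_cnt = 16;
	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0)
		return -1;
	for (i = 0; i < attr.query.prog_cnt; i++)
		printf("prog id %u\n", prog_ids[i]);
	return 0;
}
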
1 change: 1 addition & 0 deletions kernel/bpf/Kconfig
@@ -31,6 +31,7 @@ config BPF_SYSCALL
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
 	select NET_SOCK_MSG if NET
+	select NET_XGRESS if NET
 	select PAGE_POOL if NET
 	default n
 	help
1 change: 1 addition & 0 deletions kernel/bpf/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += net.o
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
