
Commit c96dac8

aditighag authored and Martin KaFai Lau committed

bpf: udp: Implement batching for sockets iterator

Batch UDP sockets from the BPF iterator so that overlapping locking semantics are possible in BPF/kernel helpers executed from BPF programs. This allows the BPF socket destroy kfunc (introduced by follow-up patches) to be executed from BPF iterator programs.

Previously, BPF iterators acquired the sock lock and the sockets hash table bucket lock while executing BPF programs. This prevented BPF helpers that also acquire these locks from being executed from BPF iterators. With the batching approach, we acquire a bucket lock, batch all of the bucket's sockets, and then release the bucket lock. This enables BPF or kernel helpers to skip sock locking when invoked in the supported BPF contexts.

The batching logic is similar to the logic implemented in the TCP iterator:
https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
Link: https://lore.kernel.org/r/20230519225157.760788-6-aditi.ghag@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
1 parent e4fe1bf commit c96dac8
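For orientation, the struct bpf_iter__udp context and the DEFINE_BPF_ITER_FUNC(udp, ...) signature touched in the diff below are what an attached BPF program sees on each seq_show invocation, which this patch now runs against a pre-batched bucket instead of under the bucket lock. A minimal illustrative iter/udp program sketch follows; it is not part of this commit, and the program name dump_udp and the printed fields are placeholders chosen for the example.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only, not part of this commit: a minimal iter/udp
 * program using the existing bpf_iter__udp context; the output format
 * is arbitrary.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/udp")
int dump_udp(struct bpf_iter__udp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct udp_sock *udp_sk = ctx->udp_sk;

	/* The iterator also invokes the program with a NULL socket at the
	 * end of a dump; skip that call.
	 */
	if (!udp_sk)
		return 0;

	/* Emit the hash bucket index and the socket's bound port. */
	BPF_SEQ_PRINTF(seq, "bucket %d port %u\n", ctx->bucket,
		       udp_sk->inet.inet_num);
	return 0;
}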


net/ipv4/udp.c

Lines changed: 199 additions & 6 deletions
@@ -3155,6 +3155,143 @@ struct bpf_iter__udp {
 	int bucket __aligned(8);
 };
 
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	int offset;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done) {
+		state->bucket++;
+		iter->offset = 0;
+	}
+
+	udptable = udp_get_table_seq(seq, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	batch_sks = 0;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head)) {
+			iter->offset = 0;
+			continue;
+		}
+
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (iter->offset) {
+					--iter->offset;
+					continue;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				batch_sks++;
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (iter->end_sk)
+			break;
+
+		/* Reset the current bucket's offset before moving to the next bucket. */
+		iter->offset = 0;
+	}
+
+	/* All done: no batch made. */
+	if (!iter->end_sk)
+		return NULL;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto done;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* After allocating a larger batch, retry one more time to grab
+		 * the whole bucket.
+		 */
+		state->bucket--;
+		goto again;
+	}
+done:
+	return iter->batch[0];
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sock_put(iter->batch[iter->cur_sk++]);
+		++iter->offset;
+	}
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sk = iter->batch[iter->cur_sk];
+	else
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	if (*pos)
+		return bpf_iter_udp_batch(seq);
+
+	return SEQ_START_TOKEN;
+}
+
 static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 			     struct udp_sock *udp_sk, uid_t uid, int bucket)
 {
@@ -3175,18 +3312,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	int ret;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	lock_sock(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		ret = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	release_sock(sk);
+	return ret;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
 }
 
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_udp_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3197,12 +3353,15 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 		(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
 	}
 
-	udp_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_put_batch(iter);
+		iter->st_bucket_done = false;
+	}
 }
 
 static const struct seq_operations bpf_iter_udp_seq_ops = {
-	.start		= udp_seq_start,
-	.next		= udp_seq_next,
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
 	.stop		= bpf_iter_udp_seq_stop,
 	.show		= bpf_iter_udp_seq_show,
 };
@@ -3431,21 +3590,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_udp_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
-	return bpf_iter_init_seq_net(priv_data, aux);
+	struct bpf_udp_iter_state *iter = priv_data;
+	int ret;
+
+	ret = bpf_iter_init_seq_net(priv_data, aux);
+	if (ret)
+		return ret;
+
+	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+	if (ret)
+		bpf_iter_fini_seq_net(priv_data);
+
+	return ret;
 }
 
 static void bpf_iter_fini_udp(void *priv_data)
 {
+	struct bpf_udp_iter_state *iter = priv_data;
+
 	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
 }
 
 static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
-	.seq_priv_size		= sizeof(struct udp_iter_state),
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
 };
 
 static struct bpf_iter_reg udp_reg_info = {
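As a usage note rather than part of this change: the bpf_iter_udp_seq_start/next/stop/show operations wired up above are driven by a userspace read() on an iterator fd. Below is a hedged libbpf sketch, assuming a compiled BPF object named udp_iter.bpf.o that contains the dump_udp program from the earlier sketch; both names are placeholders.

// Illustrative userspace loader, not part of this commit. Assumes the
// placeholder object "udp_iter.bpf.o" with a program named "dump_udp".
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd, err = 1;

	obj = bpf_object__open_file("udp_iter.bpf.o", NULL);
	if (!obj)
		return 1;
	if (bpf_object__load(obj))
		goto out;

	prog = bpf_object__find_program_by_name(obj, "dump_udp");
	link = prog ? bpf_program__attach_iter(prog, NULL) : NULL;
	if (!link)
		goto out;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		goto out_link;

	/* Each read() drives start/next/show/stop; the kernel batches
	 * sockets bucket by bucket underneath.
	 */
	while ((n = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	err = n < 0;

	close(iter_fd);
out_link:
	bpf_link__destroy(link);
out:
	bpf_object__close(obj);
	return err;
}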
