
Commit e9db4ef

jrfastabend authored and borkmann committed
bpf: sockhash fix omitted bucket lock in sock_close
First, the sk_callback_lock was being used to protect both the sock callback hooks and the psock->maps list. This got overly convoluted after the addition of sockhash (in sockmap it made some sense because maps and callbacks were tightly coupled), so let's split out a specific lock for maps and only use the callback lock for its intended purpose. This fixes a couple of cases where we missed using the maps lock when it was in fact needed. It also makes the code easier to follow, because now we can put the locking closer to the actual code it is serializing.

Next, in sock_hash_delete_elem() the pattern was as follows,

  sock_hash_delete_elem()
     [...]
     spin_lock(bucket_lock)
     l = lookup_elem_raw()
     if (l)
        hlist_del_rcu()
        write_lock(sk_callback_lock)
         .... destroy psock ...
        write_unlock(sk_callback_lock)
     spin_unlock(bucket_lock)

The ordering is necessary because we only know the {p}sock after dereferencing the hash table, which we can't do unless we have the bucket lock held. Once we have the bucket lock and the psock element, it is deleted from the hashmap to ensure any other path doing a lookup will fail. Finally, the refcnt is decremented and, if zero, the psock is destroyed.

In parallel with the above (or when free'ing the map) a tcp close event may trigger tcp_close(), which at the moment omits the bucket lock altogether (oops!). The flow looks like this,

  bpf_tcp_close()
     [...]
     write_lock(sk_callback_lock)
     for each psock->maps // list of maps this sock is part of
        hlist_del_rcu(ref_hash_node);
        .... destroy psock ...
     write_unlock(sk_callback_lock)

Obviously, and demonstrated by syzbot, this is broken because we can have multiple threads deleting entries via hlist_del_rcu().

To fix this we might be tempted to wrap the hlist operation in a bucket lock, but that would create a lock inversion problem. In summary, to follow the locking rules the psock's maps list needs the sk_callback_lock (after this patch, maps_lock), but we need the bucket lock to do the hlist_del_rcu.

To resolve the lock inversion problem, pop the head of the maps list repeatedly and remove the reference until no more are left. If a delete happens in parallel from the BPF API that is OK as well, because it will do a similar action: look up the sock in the map/hash, delete it from the map/hash, and dec the refcnt. We check for this case before doing a destroy on the psock to ensure we don't have two threads tearing down a psock. The new logic is as follows,

  bpf_tcp_close()
     e = psock_map_pop(psock->maps) // done with map lock
     bucket_lock()                  // lock hash list bucket
     l = lookup_elem_raw(head, hash, key, key_size);
     if (l) { // only get here if element was not already removed
        hlist_del_rcu()
        ... destroy psock ...
     }
     bucket_unlock()

And finally, for all the above to work, add the missing locking around map operations per above. Then add RCU annotations and use rcu_dereference/rcu_assign_pointer to manage values relying on RCU so that the object is not free'd from sock_hash_free() while it is being referenced in bpf_tcp_close().

Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
Fixes: 8111038 ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
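To make the pop-then-relookup ordering above easier to follow, here is a rough userspace model of the pattern. It is only a sketch: the types and functions (elem, bucket, map_entry, psock, map_pop, close_entry) are made up for illustration and use pthread mutexes in place of the kernel's maps_lock and bucket lock; they are not the kernel structures themselves.

  /* Userspace model of the teardown pattern: pop one map entry under the
   * maps lock only, then take the bucket lock and re-lookup the element
   * before deleting it. Build with: cc -pthread model.c
   */
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct elem {                    /* hash-table element */
      struct elem *next;
      int key;
  };

  struct bucket {                  /* hash bucket: lock + chain */
      pthread_mutex_t lock;
      struct elem *head;
  };

  struct map_entry {               /* one "this sock is in that map" link */
      struct map_entry *next;
      struct bucket *b;
      struct elem *link;
  };

  struct psock {
      pthread_mutex_t maps_lock;
      struct map_entry *maps;
  };

  /* Pop one entry while holding only maps_lock (mirrors psock_map_pop);
   * no bucket lock is taken here, which is what avoids the inversion. */
  static struct map_entry *map_pop(struct psock *p)
  {
      struct map_entry *e;

      pthread_mutex_lock(&p->maps_lock);
      e = p->maps;
      if (e)
          p->maps = e->next;
      pthread_mutex_unlock(&p->maps_lock);
      return e;
  }

  /* Take the bucket lock and re-lookup before deleting; if another thread
   * already removed the element, the loop finds nothing and we skip it. */
  static void close_entry(struct map_entry *e)
  {
      struct elem **pp;

      pthread_mutex_lock(&e->b->lock);
      for (pp = &e->b->head; *pp; pp = &(*pp)->next) {
          if (*pp == e->link) {
              *pp = e->link->next;
              free(e->link);
              break;
          }
      }
      pthread_mutex_unlock(&e->b->lock);
  }

  int main(void)
  {
      struct bucket b;
      struct psock p;
      struct elem *l = calloc(1, sizeof(*l));
      struct map_entry *e = calloc(1, sizeof(*e));
      struct map_entry *popped;

      if (!l || !e)
          return 1;
      pthread_mutex_init(&b.lock, NULL);
      pthread_mutex_init(&p.maps_lock, NULL);
      b.head = l;
      e->b = &b;
      e->link = l;
      p.maps = e;

      while ((popped = map_pop(&p)) != NULL) {
          close_entry(popped);
          free(popped);
      }
      printf("bucket empty after close: %s\n", b.head ? "no" : "yes");
      return 0;
  }

The point mirrored here is that the close path never holds both locks in the "wrong" order: map_pop() takes only the maps lock, and the bucket lock is taken afterwards for the re-lookup, so a parallel delete from the other direction simply wins the race and the close path skips the already-removed element.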
1 parent 54fedb4 commit e9db4ef

File tree

1 file changed

+96 -49 lines changed


kernel/bpf/sockmap.c

Lines changed: 96 additions & 49 deletions
@@ -72,6 +72,7 @@ struct bpf_htab {
         u32 n_buckets;
         u32 elem_size;
         struct bpf_sock_progs progs;
+        struct rcu_head rcu;
 };
 
 struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
         struct list_head list;
         struct sock **entry;
-        struct htab_elem *hash_link;
-        struct bpf_htab *htab;
+        struct htab_elem __rcu *hash_link;
+        struct bpf_htab __rcu *htab;
 };
 
 struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
         struct bpf_prog *bpf_parse;
         struct bpf_prog *bpf_verdict;
         struct list_head maps;
+        spinlock_t maps_lock;
 
         /* Back reference used when sock callback trigger sockmap operations */
         struct sock *sock;
@@ -258,16 +260,54 @@ static void bpf_tcp_release(struct sock *sk)
         rcu_read_unlock();
 }
 
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+                                         u32 hash, void *key, u32 key_size)
+{
+        struct htab_elem *l;
+
+        hlist_for_each_entry_rcu(l, head, hash_node) {
+                if (l->hash == hash && !memcmp(&l->key, key, key_size))
+                        return l;
+        }
+
+        return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+        return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+        return &__select_bucket(htab, hash)->head;
+}
+
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
         atomic_dec(&htab->count);
         kfree_rcu(l, rcu);
 }
 
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+                                                  struct smap_psock *psock)
+{
+        struct smap_psock_map_entry *e;
+
+        spin_lock_bh(&psock->maps_lock);
+        e = list_first_entry_or_null(&psock->maps,
+                                     struct smap_psock_map_entry,
+                                     list);
+        if (e)
+                list_del(&e->list);
+        spin_unlock_bh(&psock->maps_lock);
+        return e;
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
         void (*close_fun)(struct sock *sk, long timeout);
-        struct smap_psock_map_entry *e, *tmp;
+        struct smap_psock_map_entry *e;
         struct sk_msg_buff *md, *mtmp;
         struct smap_psock *psock;
         struct sock *osk;
@@ -286,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
          */
         close_fun = psock->save_close;
 
-        write_lock_bh(&sk->sk_callback_lock);
         if (psock->cork) {
                 free_start_sg(psock->sock, psock->cork);
                 kfree(psock->cork);
@@ -299,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
                 kfree(md);
         }
 
-        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+        e = psock_map_pop(sk, psock);
+        while (e) {
                 if (e->entry) {
                         osk = cmpxchg(e->entry, sk, NULL);
                         if (osk == sk) {
-                                list_del(&e->list);
                                 smap_release_sock(psock, sk);
                         }
                 } else {
-                        hlist_del_rcu(&e->hash_link->hash_node);
-                        smap_release_sock(psock, e->hash_link->sk);
-                        free_htab_elem(e->htab, e->hash_link);
+                        struct htab_elem *link = rcu_dereference(e->hash_link);
+                        struct bpf_htab *htab = rcu_dereference(e->htab);
+                        struct hlist_head *head;
+                        struct htab_elem *l;
+                        struct bucket *b;
+
+                        b = __select_bucket(htab, link->hash);
+                        head = &b->head;
+                        raw_spin_lock_bh(&b->lock);
+                        l = lookup_elem_raw(head,
+                                            link->hash, link->key,
+                                            htab->map.key_size);
+                        /* If another thread deleted this object skip deletion.
+                         * The refcnt on psock may or may not be zero.
+                         */
+                        if (l) {
+                                hlist_del_rcu(&link->hash_node);
+                                smap_release_sock(psock, link->sk);
+                                free_htab_elem(htab, link);
+                        }
+                        raw_spin_unlock_bh(&b->lock);
                 }
+                e = psock_map_pop(sk, psock);
         }
-        write_unlock_bh(&sk->sk_callback_lock);
         rcu_read_unlock();
         close_fun(sk, timeout);
 }
@@ -1395,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
         if (refcount_dec_and_test(&psock->refcnt)) {
                 tcp_cleanup_ulp(sock);
+                write_lock_bh(&sock->sk_callback_lock);
                 smap_stop_sock(psock, sock);
+                write_unlock_bh(&sock->sk_callback_lock);
                 clear_bit(SMAP_TX_RUNNING, &psock->state);
                 rcu_assign_sk_user_data(sock, NULL);
                 call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1546,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
         INIT_LIST_HEAD(&psock->maps);
         INIT_LIST_HEAD(&psock->ingress);
         refcount_set(&psock->refcnt, 1);
+        spin_lock_init(&psock->maps_lock);
 
         rcu_assign_sk_user_data(sock, psock);
         sock_hold(sock);
@@ -1607,23 +1667,27 @@ static void smap_list_map_remove(struct smap_psock *psock,
 {
         struct smap_psock_map_entry *e, *tmp;
 
+        spin_lock_bh(&psock->maps_lock);
         list_for_each_entry_safe(e, tmp, &psock->maps, list) {
                 if (e->entry == entry)
                         list_del(&e->list);
         }
+        spin_unlock_bh(&psock->maps_lock);
 }
 
 static void smap_list_hash_remove(struct smap_psock *psock,
                                   struct htab_elem *hash_link)
 {
         struct smap_psock_map_entry *e, *tmp;
 
+        spin_lock_bh(&psock->maps_lock);
         list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-                struct htab_elem *c = e->hash_link;
+                struct htab_elem *c = rcu_dereference(e->hash_link);
 
                 if (c == hash_link)
                         list_del(&e->list);
         }
+        spin_unlock_bh(&psock->maps_lock);
 }
 
 static void sock_map_free(struct bpf_map *map)
@@ -1649,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map)
                 if (!sock)
                         continue;
 
-                write_lock_bh(&sock->sk_callback_lock);
                 psock = smap_psock_sk(sock);
                 /* This check handles a racing sock event that can get the
                  * sk_callback_lock before this case but after xchg happens
@@ -1660,7 +1723,6 @@ static void sock_map_free(struct bpf_map *map)
                         smap_list_map_remove(psock, &stab->sock_map[i]);
                         smap_release_sock(psock, sock);
                 }
-                write_unlock_bh(&sock->sk_callback_lock);
         }
         rcu_read_unlock();
 
@@ -1709,7 +1771,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
         if (!sock)
                 return -EINVAL;
 
-        write_lock_bh(&sock->sk_callback_lock);
         psock = smap_psock_sk(sock);
         if (!psock)
                 goto out;
@@ -1719,7 +1780,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
         smap_list_map_remove(psock, &stab->sock_map[k]);
         smap_release_sock(psock, sock);
 out:
-        write_unlock_bh(&sock->sk_callback_lock);
         return 0;
 }
 
@@ -1800,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
                 }
         }
 
-        write_lock_bh(&sock->sk_callback_lock);
         psock = smap_psock_sk(sock);
 
         /* 2. Do not allow inheriting programs if psock exists and has
@@ -1857,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
                 if (err)
                         goto out_free;
                 smap_init_progs(psock, verdict, parse);
+                write_lock_bh(&sock->sk_callback_lock);
                 smap_start_sock(psock, sock);
+                write_unlock_bh(&sock->sk_callback_lock);
         }
 
         /* 4. Place psock in sockmap for use and stop any programs on
@@ -1867,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
          */
         if (map_link) {
                 e->entry = map_link;
+                spin_lock_bh(&psock->maps_lock);
                 list_add_tail(&e->list, &psock->maps);
+                spin_unlock_bh(&psock->maps_lock);
         }
-        write_unlock_bh(&sock->sk_callback_lock);
         return err;
 out_free:
         smap_release_sock(psock, sock);
@@ -1880,7 +1942,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
         }
         if (tx_msg)
                 bpf_prog_put(tx_msg);
-        write_unlock_bh(&sock->sk_callback_lock);
         kfree(e);
         return err;
 }
@@ -1917,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
         if (osock) {
                 struct smap_psock *opsock = smap_psock_sk(osock);
 
-                write_lock_bh(&osock->sk_callback_lock);
                 smap_list_map_remove(opsock, &stab->sock_map[i]);
                 smap_release_sock(opsock, osock);
-                write_unlock_bh(&osock->sk_callback_lock);
         }
 out:
         return err;
@@ -2109,14 +2168,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
         return ERR_PTR(err);
 }
 
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
 {
-        return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+        struct bpf_htab *htab;
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-        return &__select_bucket(htab, hash)->head;
+        htab = container_of(rcu, struct bpf_htab, rcu);
+        bpf_map_area_free(htab->buckets);
+        kfree(htab);
 }
 
 static void sock_hash_free(struct bpf_map *map)
@@ -2135,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map)
          */
         rcu_read_lock();
         for (i = 0; i < htab->n_buckets; i++) {
-                struct hlist_head *head = select_bucket(htab, i);
+                struct bucket *b = __select_bucket(htab, i);
+                struct hlist_head *head;
                 struct hlist_node *n;
                 struct htab_elem *l;
 
+                raw_spin_lock_bh(&b->lock);
+                head = &b->head;
                 hlist_for_each_entry_safe(l, n, head, hash_node) {
                         struct sock *sock = l->sk;
                         struct smap_psock *psock;
 
                         hlist_del_rcu(&l->hash_node);
-                        write_lock_bh(&sock->sk_callback_lock);
                         psock = smap_psock_sk(sock);
                         /* This check handles a racing sock event that can get
                          * the sk_callback_lock before this case but after xchg
@@ -2155,13 +2215,12 @@ static void sock_hash_free(struct bpf_map *map)
                                 smap_list_hash_remove(psock, l);
                                 smap_release_sock(psock, sock);
                         }
-                        write_unlock_bh(&sock->sk_callback_lock);
-                        kfree(l);
+                        free_htab_elem(htab, l);
                 }
+                raw_spin_unlock_bh(&b->lock);
         }
         rcu_read_unlock();
-        bpf_map_area_free(htab->buckets);
-        kfree(htab);
+        call_rcu(&htab->rcu, __bpf_htab_free);
 }
 
 static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2188,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
         return l_new;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-                                         u32 hash, void *key, u32 key_size)
-{
-        struct htab_elem *l;
-
-        hlist_for_each_entry_rcu(l, head, hash_node) {
-                if (l->hash == hash && !memcmp(&l->key, key, key_size))
-                        return l;
-        }
-
-        return NULL;
-}
-
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
         return jhash(key, key_len, 0);
@@ -2320,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                 goto bucket_err;
         }
 
-        e->hash_link = l_new;
-        e->htab = container_of(map, struct bpf_htab, map);
+        rcu_assign_pointer(e->hash_link, l_new);
+        rcu_assign_pointer(e->htab,
+                           container_of(map, struct bpf_htab, map));
+        spin_lock_bh(&psock->maps_lock);
         list_add_tail(&e->list, &psock->maps);
+        spin_unlock_bh(&psock->maps_lock);
 
         /* add new element to the head of the list, so that
          * concurrent search will find it before old elem
@@ -2392,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
                 struct smap_psock *psock;
 
                 hlist_del_rcu(&l->hash_node);
-                write_lock_bh(&sock->sk_callback_lock);
                 psock = smap_psock_sk(sock);
                 /* This check handles a racing sock event that can get the
                  * sk_callback_lock before this case but after xchg happens
@@ -2403,7 +2451,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
                         smap_list_hash_remove(psock, l);
                         smap_release_sock(psock, sock);
                 }
-                write_unlock_bh(&sock->sk_callback_lock);
                 free_htab_elem(htab, l);
                 ret = 0;
         }
