Skip to content

Commit d9ccb18

Browse files
Omid Ehtemam-Haghighi authored and kuba-moo committed
ipv6: Fix soft lockups in fib6_select_path under high next hop churn
Soft lockups have been observed on a cluster of Linux-based edge routers located in a highly dynamic environment. Using the `bird` service, these routers continuously update BGP-advertised routes due to frequently changing nexthop destinations, while also managing significant IPv6 traffic. The lockups occur during the traversal of the multipath circular linked-list in the `fib6_select_path` function, particularly while iterating through the siblings in the list. The issue typically arises when the nodes of the linked list are unexpectedly deleted concurrently on a different core—indicated by their 'next' and 'previous' elements pointing back to the node itself and their reference count dropping to zero. This results in an infinite loop, leading to a soft lockup that triggers a system panic via the watchdog timer.

Apply RCU primitives in the problematic code sections to resolve the issue. Where necessary, update the references to fib6_siblings to annotate or use the RCU APIs.

Include a test script that reproduces the issue. The script periodically updates the routing table while generating a heavy load of outgoing IPv6 traffic through multiple iperf3 clients. It consistently induces infinite soft lockups within a couple of minutes.
Kernel log:

 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb
 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3
 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4
 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03
 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f
 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756
 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af
 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d
-- <IRQ stack> --
 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb
    [exception RIP: fib6_select_path+299]
    RIP: ffffffff8ddafe7b  RSP: ffffbd13003d37b8  RFLAGS: 00000287
    RAX: ffff975850b43600  RBX: ffff975850b40200  RCX: 0000000000000000
    RDX: 000000003fffffff  RSI: 0000000051d383e4  RDI: ffff975850b43618
    RBP: ffffbd13003d3800   R8: 0000000000000000   R9: ffff975850b40200
    R10: 0000000000000000  R11: 0000000000000000  R12: ffffbd13003d3830
    R13: ffff975850b436a8  R14: ffff975850b43600  R15: 0000000000000007
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c
10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c
11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5
12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47
13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0
14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274
15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474
16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615
17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec
18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3
19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9
20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice]
21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice]
22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice]
23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581
25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9
26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47
27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30
28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f
29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64
30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb

Fixes: 66f5d6c ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Reported-by: Adrian Oliver <kernel@aoliver.ca>
Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Ido Schimmel <idosch@idosch.org>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Simon Horman <horms@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menlosecurity.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent c4e39b8 commit d9ccb18

File tree

4 files changed

+297
-19
lines changed

4 files changed

+297
-19
lines changed

net/ipv6/ip6_fib.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,8 +1183,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
11831183
while (sibling) {
11841184
if (sibling->fib6_metric == rt->fib6_metric &&
11851185
rt6_qualify_for_ecmp(sibling)) {
1186-
list_add_tail(&rt->fib6_siblings,
1187-
&sibling->fib6_siblings);
1186+
list_add_tail_rcu(&rt->fib6_siblings,
1187+
&sibling->fib6_siblings);
11881188
break;
11891189
}
11901190
sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1245,7 +1245,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
12451245
fib6_siblings)
12461246
sibling->fib6_nsiblings--;
12471247
rt->fib6_nsiblings = 0;
1248-
list_del_init(&rt->fib6_siblings);
1248+
list_del_rcu(&rt->fib6_siblings);
12491249
rt6_multipath_rebalance(next_sibling);
12501250
return err;
12511251
}
@@ -1963,7 +1963,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
19631963
&rt->fib6_siblings, fib6_siblings)
19641964
sibling->fib6_nsiblings--;
19651965
rt->fib6_nsiblings = 0;
1966-
list_del_init(&rt->fib6_siblings);
1966+
list_del_rcu(&rt->fib6_siblings);
19671967
rt6_multipath_rebalance(next_sibling);
19681968
}
19691969

net/ipv6/route.c

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
416416
struct flowi6 *fl6, int oif, bool have_oif_match,
417417
const struct sk_buff *skb, int strict)
418418
{
419-
struct fib6_info *sibling, *next_sibling;
420419
struct fib6_info *match = res->f6i;
420+
struct fib6_info *sibling;
421421

422422
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
423423
goto out;
@@ -443,8 +443,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
443443
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
444444
goto out;
445445

446-
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
447-
fib6_siblings) {
446+
list_for_each_entry_rcu(sibling, &match->fib6_siblings,
447+
fib6_siblings) {
448448
const struct fib6_nh *nh = sibling->fib6_nh;
449449
int nh_upper_bound;
450450

@@ -5195,14 +5195,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
51955195
* nexthop. Since sibling routes are always added at the end of
51965196
* the list, find the first sibling of the last route appended
51975197
*/
5198+
rcu_read_lock();
5199+
51985200
if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5199-
rt = list_first_entry(&rt_last->fib6_siblings,
5200-
struct fib6_info,
5201-
fib6_siblings);
5201+
rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
5202+
struct fib6_info,
5203+
fib6_siblings);
52025204
}
52035205

52045206
if (rt)
52055207
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5208+
5209+
rcu_read_unlock();
52065210
}
52075211

52085212
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5547,17 +5551,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
55475551
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
55485552
&nexthop_len);
55495553
} else {
5550-
struct fib6_info *sibling, *next_sibling;
55515554
struct fib6_nh *nh = f6i->fib6_nh;
5555+
struct fib6_info *sibling;
55525556

55535557
nexthop_len = 0;
55545558
if (f6i->fib6_nsiblings) {
55555559
rt6_nh_nlmsg_size(nh, &nexthop_len);
55565560

5557-
list_for_each_entry_safe(sibling, next_sibling,
5558-
&f6i->fib6_siblings, fib6_siblings) {
5561+
rcu_read_lock();
5562+
5563+
list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
5564+
fib6_siblings) {
55595565
rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
55605566
}
5567+
5568+
rcu_read_unlock();
55615569
}
55625570
nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
55635571
}
@@ -5721,7 +5729,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
57215729
lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
57225730
goto nla_put_failure;
57235731
} else if (rt->fib6_nsiblings) {
5724-
struct fib6_info *sibling, *next_sibling;
5732+
struct fib6_info *sibling;
57255733
struct nlattr *mp;
57265734

57275735
mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5733,14 +5741,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
57335741
0) < 0)
57345742
goto nla_put_failure;
57355743

5736-
list_for_each_entry_safe(sibling, next_sibling,
5737-
&rt->fib6_siblings, fib6_siblings) {
5744+
rcu_read_lock();
5745+
5746+
list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
5747+
fib6_siblings) {
57385748
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
57395749
sibling->fib6_nh->fib_nh_weight,
5740-
AF_INET6, 0) < 0)
5750+
AF_INET6, 0) < 0) {
5751+
rcu_read_unlock();
5752+
57415753
goto nla_put_failure;
5754+
}
57425755
}
57435756

5757+
rcu_read_unlock();
5758+
57445759
nla_nest_end(skb, mp);
57455760
} else if (rt->nh) {
57465761
if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6177,7 +6192,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
61776192
err = -ENOBUFS;
61786193
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
61796194

6180-
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6195+
skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
61816196
if (!skb)
61826197
goto errout;
61836198

@@ -6190,7 +6205,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
61906205
goto errout;
61916206
}
61926207
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6193-
info->nlh, gfp_any());
6208+
info->nlh, GFP_ATOMIC);
61946209
return;
61956210
errout:
61966211
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);

tools/testing/selftests/net/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ TEST_PROGS += fdb_flush.sh
9696
TEST_PROGS += fq_band_pktlimit.sh
9797
TEST_PROGS += vlan_hw_filter.sh
9898
TEST_PROGS += bpf_offload.py
99+
TEST_PROGS += ipv6_route_update_soft_lockup.sh
99100

100101
# YNL files, must be before "include ..lib.mk"
101102
YNL_GEN_FILES := ncdevmem

0 commit comments

Comments
 (0)