Skip to content

Commit 7d3f3b4

Browse files
Delliran authored and kuba-moo committed
net: ipv4: Cache pmtu for all packet paths if multipath enabled
Check number of paths by fib_info_num_path(),
and update_or_create_fnhe() for every path.
The problem is that pmtu is cached only for the oif
that has received the icmp message "need to frag";
other oifs will still try to use the "default" iface mtu.

An example topology showing the problem:

                     |  host1
                +---------+
                |  dummy0 | 10.179.20.18/32  mtu9000
                +---------+
         +-----------+----------------+
    +---------+                      +---------+
    | ens17f0 |  10.179.2.141/31     | ens17f1 |  10.179.2.13/31
    +---------+                      +---------+
         |    (all here have mtu 9000)    |
      +------+                         +------+
      | ro1  |  10.179.2.140/31        | ro2  |  10.179.2.12/31
      +------+                         +------+
         |                                |
---------+------------+-------------------+------
                      |
                   +-----+
                   | ro3 | 10.10.10.10  mtu1500
                   +-----+
                      |
   ========================================
                some networks
   ========================================
                      |
                   +-----+
                   | eth0| 10.10.30.30  mtu9000
                   +-----+
                      |  host2

host1 has multipath enabled and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
        nexthop via 10.179.2.12 dev ens17f1 weight 1
        nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives at the ens17f1 iface an icmp packet from ro3 saying that ro3 mtu=1500.
And host1 caches it in the nexthop exceptions cache.

The problem is that it is cached only for the iface that has received the icmp,
and there is no way that ro3 will send an icmp msg to host1 via another path.

Host1 now has these routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
    cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
    cache

So when host1 tries again to reach host2 with mtu>1500,
if the packet flow is lucky enough to be hashed with oif=ens17f1 it's ok;
if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1,
until the lucky day when ro3 sends it through another flow to ens17f0.
Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 43271bb commit 7d3f3b4

File tree

2 files changed

+108
-17
lines changed

2 files changed

+108
-17
lines changed

net/ipv4/route.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
10271027
struct fib_nh_common *nhc;
10281028

10291029
fib_select_path(net, &res, fl4, NULL);
1030+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1031+
if (fib_info_num_path(res.fi) > 1) {
1032+
int nhsel;
1033+
1034+
for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
1035+
nhc = fib_info_nhc(res.fi, nhsel);
1036+
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1037+
jiffies + net->ipv4.ip_rt_mtu_expires);
1038+
}
1039+
rcu_read_unlock();
1040+
return;
1041+
}
1042+
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
10301043
nhc = FIB_RES_NHC(res);
10311044
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
10321045
jiffies + net->ipv4.ip_rt_mtu_expires);

tools/testing/selftests/net/pmtu.sh

Lines changed: 95 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,12 @@
197197
#
198198
# - pmtu_ipv6_route_change
199199
# Same as above but with IPv6
200+
#
201+
# - pmtu_ipv4_mp_exceptions
202+
# Use the same topology as in pmtu_ipv4, but add routeable addresses
203+
# on host A and B on lo reachable via both routers. Host A and B
204+
# addresses have multipath routes to each other, b_r1 mtu = 1500.
205+
# Check that PMTU exceptions are created for both paths.
200206

201207
source lib.sh
202208
source net_helper.sh
@@ -266,7 +272,8 @@ tests="
266272
list_flush_ipv4_exception ipv4: list and flush cached exceptions 1
267273
list_flush_ipv6_exception ipv6: list and flush cached exceptions 1
268274
pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1
269-
pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1"
275+
pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1
276+
pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1"
270277

271278
# Addressing and routing for tests with routers: four network segments, with
272279
# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
343350
tunnel6_b_addr="fd00:2::b"
344351
tunnel6_mask="64"
345352

353+
host4_a_addr="192.168.99.99"
354+
host4_b_addr="192.168.88.88"
355+
346356
dummy6_0_prefix="fc00:1000::"
347357
dummy6_1_prefix="fc00:1001::"
348358
dummy6_mask="64"
@@ -984,6 +994,52 @@ setup_ovs_bridge() {
984994
run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
985995
}
986996

997+
setup_multipath_new() {
998+
# Set up host A with multipath routes to host B host4_b_addr
999+
run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
1000+
run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
1001+
run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
1002+
run_cmd ${ns_a} ip nexthop add id 403 group 401/402
1003+
run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
1004+
1005+
# Set up host B with multipath routes to host A host4_a_addr
1006+
run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
1007+
run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
1008+
run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
1009+
run_cmd ${ns_b} ip nexthop add id 403 group 401/402
1010+
run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
1011+
}
1012+
1013+
setup_multipath_old() {
1014+
# Set up host A with multipath routes to host B host4_b_addr
1015+
run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
1016+
run_cmd ${ns_a} ip route add ${host4_b_addr} \
1017+
src ${host4_a_addr} \
1018+
nexthop via ${prefix4}.${a_r1}.2 weight 1 \
1019+
nexthop via ${prefix4}.${a_r2}.2 weight 1
1020+
1021+
# Set up host B with multipath routes to host A host4_a_addr
1022+
run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
1023+
run_cmd ${ns_b} ip route add ${host4_a_addr} \
1024+
src ${host4_b_addr} \
1025+
nexthop via ${prefix4}.${b_r1}.2 weight 1 \
1026+
nexthop via ${prefix4}.${b_r2}.2 weight 1
1027+
}
1028+
1029+
setup_multipath() {
1030+
if [ "$USE_NH" = "yes" ]; then
1031+
setup_multipath_new
1032+
else
1033+
setup_multipath_old
1034+
fi
1035+
1036+
# Set up routers with routes to dummies
1037+
run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
1038+
run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
1039+
run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
1040+
run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
1041+
}
1042+
9871043
setup() {
9881044
[ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
9891045

@@ -1076,23 +1132,15 @@ link_get_mtu() {
10761132
}
10771133

10781134
route_get_dst_exception() {
1079-
ns_cmd="${1}"
1080-
dst="${2}"
1081-
dsfield="${3}"
1135+
ns_cmd="${1}"; shift
10821136

1083-
if [ -z "${dsfield}" ]; then
1084-
dsfield=0
1085-
fi
1086-
1087-
${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
1137+
${ns_cmd} ip route get "$@"
10881138
}
10891139

10901140
route_get_dst_pmtu_from_exception() {
1091-
ns_cmd="${1}"
1092-
dst="${2}"
1093-
dsfield="${3}"
1141+
ns_cmd="${1}"; shift
10941142

1095-
mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
1143+
mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
10961144
}
10971145

10981146
check_pmtu_value() {
@@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
12351283
run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
12361284

12371285
# Check that exceptions have been created with the correct PMTU
1238-
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
1286+
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
12391287
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
12401288

1241-
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
1289+
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
12421290
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
12431291
}
12441292

@@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
12851333
UDP:"${dst2}":50000,tos="${dsfield}"
12861334

12871335
# Check that exceptions have been created with the correct PMTU
1288-
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
1336+
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
12891337
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
1290-
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
1338+
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
12911339
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
12921340
}
12931341

@@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() {
23292377
test_pmtu_ipvX_route_change 6
23302378
}
23312379

2380+
test_pmtu_ipv4_mp_exceptions() {
2381+
setup namespaces routing multipath || return $ksft_skip
2382+
2383+
trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
2384+
"${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
2385+
"${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
2386+
"${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
2387+
2388+
# Set up initial MTU values
2389+
mtu "${ns_a}" veth_A-R1 2000
2390+
mtu "${ns_r1}" veth_R1-A 2000
2391+
mtu "${ns_r1}" veth_R1-B 1500
2392+
mtu "${ns_b}" veth_B-R1 1500
2393+
2394+
mtu "${ns_a}" veth_A-R2 2000
2395+
mtu "${ns_r2}" veth_R2-A 2000
2396+
mtu "${ns_r2}" veth_R2-B 1500
2397+
mtu "${ns_b}" veth_B-R2 1500
2398+
2399+
# Ping and expect two nexthop exceptions for two routes
2400+
run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
2401+
2402+
# Check that exceptions have been created with the correct PMTU
2403+
pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
2404+
pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
2405+
2406+
check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
2407+
check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
2408+
}
2409+
23322410
usage() {
23332411
echo
23342412
echo "$0 [OPTIONS] [TEST]..."

0 commit comments

Comments
 (0)