Skip to content

Commit 6fcc062

Browse files
committed
Merge branch 'tcp-optimizations'
Eric Dumazet says:

====================
tcp: optimizations for linux-5.17

Mostly small improvements in this series.

The notable change is in "defer skb freeing after socket lock is released" in recvmsg() (and RX zerocopy).

The idea is to leave skb freeing to the BH handler whenever possible, or at least to perform the freeing outside of the socket lock section, for much improved performance.

This idea can probably be extended to other protocols.

Tests on a 100Gbit NIC.
Max throughput for one TCP_STREAM flow, over 10 runs.

MTU : 1500 (1428 bytes of TCP payload per MSS)
Before: 55 Gbit
After: 66 Gbit

MTU : 4096+ (4096 bytes of TCP payload, plus TCP/IPv6 headers)
Before: 82 Gbit
After: 95 Gbit
====================

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 3ad4b7c + 43f51df commit 6fcc062

File tree

16 files changed

+148
-94
lines changed

16 files changed

+148
-94
lines changed

include/linux/skbuff.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <linux/splice.h>
3737
#include <linux/in6.h>
3838
#include <linux/if_packet.h>
39+
#include <linux/llist.h>
3940
#include <net/flow.h>
4041
#include <net/page_pool.h>
4142
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -743,6 +744,7 @@ struct sk_buff {
743744
};
744745
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
745746
struct list_head list;
747+
struct llist_node ll_node;
746748
};
747749

748750
union {

include/linux/skmsg.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -507,12 +507,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
507507
return !!psock->saved_data_ready;
508508
}
509509

510-
static inline bool sk_is_tcp(const struct sock *sk)
511-
{
512-
return sk->sk_type == SOCK_STREAM &&
513-
sk->sk_protocol == IPPROTO_TCP;
514-
}
515-
516510
static inline bool sk_is_udp(const struct sock *sk)
517511
{
518512
return sk->sk_type == SOCK_DGRAM &&

include/net/ip6_checksum.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,9 @@ static inline void __tcp_v6_send_check(struct sk_buff *skb,
6565
{
6666
struct tcphdr *th = tcp_hdr(skb);
6767

68-
if (skb->ip_summed == CHECKSUM_PARTIAL) {
69-
th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
70-
skb->csum_start = skb_transport_header(skb) - skb->head;
71-
skb->csum_offset = offsetof(struct tcphdr, check);
72-
} else {
73-
th->check = tcp_v6_check(skb->len, saddr, daddr,
74-
csum_partial(th, th->doff << 2,
75-
skb->csum));
76-
}
68+
th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
69+
skb->csum_start = skb_transport_header(skb) - skb->head;
70+
skb->csum_offset = offsetof(struct tcphdr, check);
7771
}
7872

7973
static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb)

include/net/ipv6.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,9 @@ struct ipcm6_cookie {
345345
struct sockcm_cookie sockc;
346346
__s16 hlimit;
347347
__s16 tclass;
348+
__u16 gso_size;
348349
__s8 dontfrag;
349350
struct ipv6_txoptions *opt;
350-
__u16 gso_size;
351351
};
352352

353353
static inline void ipcm6_init(struct ipcm6_cookie *ipc6)

include/net/sock.h

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
#include <linux/indirect_call_wrapper.h>
6464
#include <linux/atomic.h>
6565
#include <linux/refcount.h>
66+
#include <linux/llist.h>
6667
#include <net/dst.h>
6768
#include <net/checksum.h>
6869
#include <net/tcp_states.h>
@@ -284,9 +285,7 @@ struct bpf_local_storage;
284285
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
285286
* @sk_no_check_rx: allow zero checksum in RX packets
286287
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
287-
* @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
288-
* @sk_route_forced_caps: static, forced route capabilities
289-
* (set in tcp_init_sock())
288+
* @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
290289
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
291290
* @sk_gso_max_size: Maximum GSO segment size to build
292291
* @sk_gso_max_segs: Maximum number of GSO segments
@@ -391,6 +390,11 @@ struct sock {
391390
#define sk_flags __sk_common.skc_flags
392391
#define sk_rxhash __sk_common.skc_rxhash
393392

393+
/* early demux fields */
394+
struct dst_entry *sk_rx_dst;
395+
int sk_rx_dst_ifindex;
396+
u32 sk_rx_dst_cookie;
397+
394398
socket_lock_t sk_lock;
395399
atomic_t sk_drops;
396400
int sk_rcvlowat;
@@ -410,6 +414,8 @@ struct sock {
410414
struct sk_buff *head;
411415
struct sk_buff *tail;
412416
} sk_backlog;
417+
struct llist_head defer_list;
418+
413419
#define sk_rmem_alloc sk_backlog.rmem_alloc
414420

415421
int sk_forward_alloc;
@@ -431,9 +437,6 @@ struct sock {
431437
#ifdef CONFIG_XFRM
432438
struct xfrm_policy __rcu *sk_policy[2];
433439
#endif
434-
struct dst_entry *sk_rx_dst;
435-
int sk_rx_dst_ifindex;
436-
u32 sk_rx_dst_cookie;
437440

438441
struct dst_entry __rcu *sk_dst_cache;
439442
atomic_t sk_omem_alloc;
@@ -460,8 +463,6 @@ struct sock {
460463
unsigned long sk_max_pacing_rate;
461464
struct page_frag sk_frag;
462465
netdev_features_t sk_route_caps;
463-
netdev_features_t sk_route_nocaps;
464-
netdev_features_t sk_route_forced_caps;
465466
int sk_gso_type;
466467
unsigned int sk_gso_max_size;
467468
gfp_t sk_allocation;
@@ -471,7 +472,7 @@ struct sock {
471472
* Because of non atomicity rules, all
472473
* changes are protected by socket lock.
473474
*/
474-
u8 sk_padding : 1,
475+
u8 sk_gso_disabled : 1,
475476
sk_kern_sock : 1,
476477
sk_no_check_tx : 1,
477478
sk_no_check_rx : 1,
@@ -493,6 +494,7 @@ struct sock {
493494
u16 sk_busy_poll_budget;
494495
#endif
495496
spinlock_t sk_peer_lock;
497+
int sk_bind_phc;
496498
struct pid *sk_peer_pid;
497499
const struct cred *sk_peer_cred;
498500

@@ -502,7 +504,6 @@ struct sock {
502504
seqlock_t sk_stamp_seq;
503505
#endif
504506
u16 sk_tsflags;
505-
int sk_bind_phc;
506507
u8 sk_shutdown;
507508
u32 sk_tskey;
508509
atomic_t sk_zckey;
@@ -1022,12 +1023,18 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
10221023

10231024
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
10241025

1026+
INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb));
1027+
INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb));
1028+
10251029
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
10261030
{
10271031
if (sk_memalloc_socks() && skb_pfmemalloc(skb))
10281032
return __sk_backlog_rcv(sk, skb);
10291033

1030-
return sk->sk_backlog_rcv(sk, skb);
1034+
return INDIRECT_CALL_INET(sk->sk_backlog_rcv,
1035+
tcp_v6_do_rcv,
1036+
tcp_v4_do_rcv,
1037+
sk, skb);
10311038
}
10321039

10331040
static inline void sk_incoming_cpu_update(struct sock *sk)
@@ -1210,7 +1217,9 @@ struct proto {
12101217
unsigned int inuse_idx;
12111218
#endif
12121219

1220+
#if IS_ENABLED(CONFIG_MPTCP)
12131221
int (*forward_alloc_get)(const struct sock *sk);
1222+
#endif
12141223

12151224
bool (*stream_memory_free)(const struct sock *sk, int wake);
12161225
bool (*sock_is_readable)(struct sock *sk);
@@ -1299,10 +1308,11 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int
12991308

13001309
static inline int sk_forward_alloc_get(const struct sock *sk)
13011310
{
1302-
if (!sk->sk_prot->forward_alloc_get)
1303-
return sk->sk_forward_alloc;
1304-
1305-
return sk->sk_prot->forward_alloc_get(sk);
1311+
#if IS_ENABLED(CONFIG_MPTCP)
1312+
if (sk->sk_prot->forward_alloc_get)
1313+
return sk->sk_prot->forward_alloc_get(sk);
1314+
#endif
1315+
return sk->sk_forward_alloc;
13061316
}
13071317

13081318
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
@@ -2124,10 +2134,10 @@ static inline bool sk_can_gso(const struct sock *sk)
21242134

21252135
void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
21262136

2127-
static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
2137+
static inline void sk_gso_disable(struct sock *sk)
21282138
{
2129-
sk->sk_route_nocaps |= flags;
2130-
sk->sk_route_caps &= ~flags;
2139+
sk->sk_gso_disabled = 1;
2140+
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
21312141
}
21322142

21332143
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
@@ -2638,6 +2648,11 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
26382648
&skb_shinfo(skb)->tskey);
26392649
}
26402650

2651+
static inline bool sk_is_tcp(const struct sock *sk)
2652+
{
2653+
return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
2654+
}
2655+
26412656
/**
26422657
* sk_eat_skb - Release a skb if it is no longer needed
26432658
* @sk: socket to eat this skb from

include/net/tcp.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1368,6 +1368,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
13681368
}
13691369

13701370
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
1371+
1372+
void __sk_defer_free_flush(struct sock *sk);
1373+
1374+
static inline void sk_defer_free_flush(struct sock *sk)
1375+
{
1376+
if (llist_empty(&sk->defer_list))
1377+
return;
1378+
__sk_defer_free_flush(sk);
1379+
}
1380+
13711381
int tcp_filter(struct sock *sk, struct sk_buff *skb);
13721382
void tcp_set_state(struct sock *sk, int state);
13731383
void tcp_done(struct sock *sk);
@@ -2172,9 +2182,13 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
21722182
u16 segs_in;
21732183

21742184
segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
2175-
tp->segs_in += segs_in;
2185+
2186+
/* We update these fields while other threads might
2187+
* read them from tcp_get_info()
2188+
*/
2189+
WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in);
21762190
if (skb->len > tcp_hdrlen(skb))
2177-
tp->data_segs_in += segs_in;
2191+
WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in);
21782192
}
21792193

21802194
/*

net/core/skbuff.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4849,8 +4849,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
48494849
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
48504850
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
48514851
serr->ee.ee_data = skb_shinfo(skb)->tskey;
4852-
if (sk->sk_protocol == IPPROTO_TCP &&
4853-
sk->sk_type == SOCK_STREAM)
4852+
if (sk_is_tcp(sk))
48544853
serr->ee.ee_data -= sk->sk_tskey;
48554854
}
48564855

@@ -4919,8 +4918,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
49194918
if (tsonly) {
49204919
#ifdef CONFIG_INET
49214920
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
4922-
sk->sk_protocol == IPPROTO_TCP &&
4923-
sk->sk_type == SOCK_STREAM) {
4921+
sk_is_tcp(sk)) {
49244922
skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
49254923
ack_skb);
49264924
opt_stats = true;

net/core/sock.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327327
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
328328

329329
noreclaim_flag = memalloc_noreclaim_save();
330-
ret = sk->sk_backlog_rcv(sk, skb);
330+
ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
331+
tcp_v6_do_rcv,
332+
tcp_v4_do_rcv,
333+
sk, skb);
331334
memalloc_noreclaim_restore(noreclaim_flag);
332335

333336
return ret;
@@ -872,8 +875,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
872875

873876
if (val & SOF_TIMESTAMPING_OPT_ID &&
874877
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
875-
if (sk->sk_protocol == IPPROTO_TCP &&
876-
sk->sk_type == SOCK_STREAM) {
878+
if (sk_is_tcp(sk)) {
877879
if ((1 << sk->sk_state) &
878880
(TCPF_CLOSE | TCPF_LISTEN))
879881
return -EINVAL;
@@ -1370,8 +1372,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
13701372

13711373
case SO_ZEROCOPY:
13721374
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1373-
if (!((sk->sk_type == SOCK_STREAM &&
1374-
sk->sk_protocol == IPPROTO_TCP) ||
1375+
if (!(sk_is_tcp(sk) ||
13751376
(sk->sk_type == SOCK_DGRAM &&
13761377
sk->sk_protocol == IPPROTO_UDP)))
13771378
ret = -ENOTSUPP;
@@ -2246,10 +2247,13 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
22462247
u32 max_segs = 1;
22472248

22482249
sk_dst_set(sk, dst);
2249-
sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2250+
sk->sk_route_caps = dst->dev->features;
2251+
if (sk_is_tcp(sk))
2252+
sk->sk_route_caps |= NETIF_F_GSO;
22502253
if (sk->sk_route_caps & NETIF_F_GSO)
22512254
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2252-
sk->sk_route_caps &= ~sk->sk_route_nocaps;
2255+
if (unlikely(sk->sk_gso_disabled))
2256+
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
22532257
if (sk_can_gso(sk)) {
22542258
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
22552259
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;

0 commit comments

Comments
 (0)