Skip to content
/ linux Public

Commit b4d5e97

Browse files
edumazet authored and Sasha Levin committed
tcp: defer regular ACK while processing socket backlog
[ Upstream commit 133c4c0 ] This idea came after a particular workload requested the quickack attribute set on routes, and a performance drop was noticed for large bulk transfers. For high throughput flows, it is best to use one cpu running the user thread issuing socket system calls, and a separate cpu to process incoming packets from BH context. (With TSO/GRO, bottleneck is usually the 'user' cpu) Problem is the user thread can spend a lot of time while holding the socket lock, forcing BH handler to queue most of incoming packets in the socket backlog. Whenever the user thread releases the socket lock, it must first process all accumulated packets in the backlog, potentially adding latency spikes. Due to flood mitigation, having too many packets in the backlog increases chance of unexpected drops. Backlog processing unfortunately shifts a fair amount of cpu cycles from the BH cpu to the 'user' cpu, thus reducing max throughput. This patch takes advantage of the backlog processing, and the fact that ACK are mostly cumulative. The idea is to detect we are in the backlog processing and defer all eligible ACK into a single one, sent from tcp_release_cb(). This saves cpu cycles on both sides, and network resources. Performance of a single TCP flow on a 200Gbit NIC: - Throughput is increased by 20% (100Gbit -> 120Gbit). - Number of generated ACK per second shrinks from 240,000 to 40,000. - Number of backlog drops per second shrinks from 230 to 0. 
Benchmark context: - Regular netperf TCP_STREAM (no zerocopy) - Intel(R) Xeon(R) Platinum 8481C (Sapphire Rapids) - MAX_SKB_FRAGS = 17 (~60KB per GRO packet) This feature is guarded by a new sysctl, and enabled by default: /proc/sys/net/ipv4/tcp_backlog_ack_defer Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Dave Taht <dave.taht@gmail.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Stable-dep-of: 87b0891 ("inet: move icmp_global_{credit,stamp} to a separate cache line") Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 22023ff commit b4d5e97

File tree

7 files changed

+38
-7
lines changed

7 files changed

+38
-7
lines changed

Documentation/networking/ip-sysctl.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
745745

746746
Default : 44
747747

748+
tcp_backlog_ack_defer - BOOLEAN
749+
If set, user thread processing socket backlog tries sending
750+
one ACK for the whole queue. This helps to avoid potential
751+
long latencies at end of a TCP socket syscall.
752+
753+
Default : true
754+
748755
tcp_slow_start_after_idle - BOOLEAN
749756
If set, provide RFC2861 behavior and time out the congestion
750757
window after an idle period. An idle period is defined at

include/linux/tcp.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -471,15 +471,17 @@ enum tsq_enum {
471471
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
472472
* tcp_v{4|6}_mtu_reduced()
473473
*/
474+
TCP_ACK_DEFERRED, /* TX pure ack is deferred */
474475
};
475476

476477
enum tsq_flags {
477-
TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
478-
TSQF_QUEUED = (1UL << TSQ_QUEUED),
479-
TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
480-
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
481-
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
482-
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
478+
TSQF_THROTTLED = BIT(TSQ_THROTTLED),
479+
TSQF_QUEUED = BIT(TSQ_QUEUED),
480+
TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED),
481+
TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED),
482+
TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED),
483+
TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED),
484+
TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED),
483485
};
484486

485487
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)

include/net/netns/ipv4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ struct netns_ipv4 {
135135
u8 sysctl_tcp_syncookies;
136136
u8 sysctl_tcp_migrate_req;
137137
u8 sysctl_tcp_comp_sack_nr;
138+
u8 sysctl_tcp_backlog_ack_defer;
138139
int sysctl_tcp_reordering;
139140
u8 sysctl_tcp_retries1;
140141
u8 sysctl_tcp_retries2;

net/ipv4/sysctl_net_ipv4.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
13661366
.proc_handler = proc_dou8vec_minmax,
13671367
.extra1 = SYSCTL_ZERO,
13681368
},
1369+
{
1370+
.procname = "tcp_backlog_ack_defer",
1371+
.data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
1372+
.maxlen = sizeof(u8),
1373+
.mode = 0644,
1374+
.proc_handler = proc_dou8vec_minmax,
1375+
.extra1 = SYSCTL_ZERO,
1376+
.extra2 = SYSCTL_ONE,
1377+
},
13691378
{
13701379
.procname = "tcp_reflect_tos",
13711380
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,

net/ipv4/tcp_input.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5678,6 +5678,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
56785678
tcp_in_quickack_mode(sk) ||
56795679
/* Protocol state mandates a one-time immediate ACK */
56805680
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
5681+
/* If we are running from __release_sock() in user context,
5682+
* Defer the ack until tcp_release_cb().
5683+
*/
5684+
if (sock_owned_by_user_nocheck(sk) &&
5685+
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
5686+
set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
5687+
return;
5688+
}
56815689
send_now:
56825690
tcp_send_ack(sk);
56835691
return;

net/ipv4/tcp_ipv4.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3279,6 +3279,7 @@ static int __net_init tcp_sk_init(struct net *net)
32793279
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
32803280
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
32813281
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3282+
net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
32823283
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
32833284
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
32843285
atomic_set(&net->ipv4.tfo_active_disable_times, 0);

net/ipv4/tcp_output.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1083,7 +1083,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
10831083
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
10841084
TCPF_WRITE_TIMER_DEFERRED | \
10851085
TCPF_DELACK_TIMER_DEFERRED | \
1086-
TCPF_MTU_REDUCED_DEFERRED)
1086+
TCPF_MTU_REDUCED_DEFERRED | \
1087+
TCPF_ACK_DEFERRED)
10871088
/**
10881089
* tcp_release_cb - tcp release_sock() callback
10891090
* @sk: socket
@@ -1130,6 +1131,8 @@ void tcp_release_cb(struct sock *sk)
11301131
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
11311132
__sock_put(sk);
11321133
}
1134+
if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
1135+
tcp_send_ack(sk);
11331136
}
11341137
EXPORT_SYMBOL(tcp_release_cb);
11351138

0 commit comments

Comments
 (0)