Merge branch 'tcp-preempt'
Eric Dumazet says:

====================
net: make TCP preemptible

Most of the TCP stack assumed it was running from a BH handler.

This is great for most things, as TCP behavior is very sensitive
to scheduling artifacts.

However, the prequeue and backlog processing are problematic,
as they need to be flushed with BHs blocked.

To cope with modern needs, TCP sockets have large sk_rcvbuf values,
on the order of 16 MB, and soon 32 MB.
This means the backlog can hold thousands of packets, and things
like TCP coalescing or collapsing on that many packets can
lead to insane latency spikes, since BHs are blocked for too long.

It is time to make UDP/TCP stacks preemptible.

Note that the fast path still runs from the BH handler.

v2: Added "tcp: make tcp_sendmsg() aware of socket backlog"
    to reduce latency problems of large sends.

v3: Fixed a typo in tcp_cdg.c
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
davem330 committed May 2, 2016
2 parents 5e59c83 + d41a69f commit 570d632
Showing 20 changed files with 150 additions and 157 deletions.
11 changes: 11 additions & 0 deletions include/net/sock.h
@@ -926,6 +926,17 @@ void sk_stream_kill_queues(struct sock *sk);
 void sk_set_memalloc(struct sock *sk);
 void sk_clear_memalloc(struct sock *sk);
 
+void __sk_flush_backlog(struct sock *sk);
+
+static inline bool sk_flush_backlog(struct sock *sk)
+{
+        if (unlikely(READ_ONCE(sk->sk_backlog.tail))) {
+                __sk_flush_backlog(sk);
+                return true;
+        }
+        return false;
+}
+
 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
 
 struct request_sock_ops;
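The helper pair is meant for a thread that already owns the socket via lock_sock(): sk_flush_backlog() does a cheap lockless peek at sk_backlog.tail and only falls into __sk_flush_backlog() (which takes the slock and drains the queue) when there is actually something to process. A minimal usage sketch follows; example_send_loop() and the payload step are hypothetical, only lock_sock()/release_sock()/sk_flush_backlog() are real kernel interfaces.

#include <net/sock.h>

/* Hypothetical caller (not part of this commit) illustrating the intended
 * pattern: while this thread owns the socket, incoming segments are queued
 * on sk->sk_backlog by the softirq handler; flushing that queue from time
 * to time keeps it, and therefore latency, bounded.
 */
static int example_send_loop(struct sock *sk, int chunks)
{
        int sent = 0;

        lock_sock(sk);
        while (chunks--) {
                /* ... queue one chunk of payload on the write queue ... */
                sent++;

                /* Cheap lockless check; drains the backlog only when it is
                 * non-empty.  tcp_sendmsg() (see the hunk further below)
                 * reacts to a flush by jumping to its new "restart" label
                 * so the MSS is re-evaluated.
                 */
                sk_flush_backlog(sk);
        }
        release_sock(sk);
        return sent;
}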
29 changes: 15 additions & 14 deletions net/core/sock.c
@@ -2019,33 +2019,27 @@ static void __release_sock(struct sock *sk)
         __releases(&sk->sk_lock.slock)
         __acquires(&sk->sk_lock.slock)
 {
-        struct sk_buff *skb = sk->sk_backlog.head;
+        struct sk_buff *skb, *next;
 
-        do {
+        while ((skb = sk->sk_backlog.head) != NULL) {
                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
-                bh_unlock_sock(sk);
 
-                do {
-                        struct sk_buff *next = skb->next;
+                spin_unlock_bh(&sk->sk_lock.slock);
 
+                do {
+                        next = skb->next;
                         prefetch(next);
                         WARN_ON_ONCE(skb_dst_is_noref(skb));
                         skb->next = NULL;
                         sk_backlog_rcv(sk, skb);
 
-                        /*
-                         * We are in process context here with softirqs
-                         * disabled, use cond_resched_softirq() to preempt.
-                         * This is safe to do because we've taken the backlog
-                         * queue private:
-                         */
-                        cond_resched_softirq();
+                        cond_resched();
 
                         skb = next;
                 } while (skb != NULL);
 
-                bh_lock_sock(sk);
-        } while ((skb = sk->sk_backlog.head) != NULL);
+                spin_lock_bh(&sk->sk_lock.slock);
+        }
 
         /*
          * Doing the zeroing here guarantee we can not loop forever
@@ -2054,6 +2048,13 @@ static void __release_sock(struct sock *sk)
         sk->sk_backlog.len = 0;
 }
 
+void __sk_flush_backlog(struct sock *sk)
+{
+        spin_lock_bh(&sk->sk_lock.slock);
+        __release_sock(sk);
+        spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 /**
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk: sock to wait on
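Note the locking shift inside the loop: bh_unlock_sock() only dropped sk_lock.slock and left bottom halves disabled for the whole drain, which is why the old code needed cond_resched_softirq(); spin_unlock_bh() re-enables BHs as well, so each sk_backlog_rcv() call now runs in ordinary preemptible process context and plain cond_resched() suffices. The caller still owns the socket throughout, so segments arriving while the detached queue is being processed are simply appended to the (now empty) backlog and picked up on the next loop iteration.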
2 changes: 1 addition & 1 deletion net/dccp/input.c
@@ -359,7 +359,7 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
                 goto discard;
         }
 
-        __DCCP_INC_STATS(DCCP_MIB_INERRS);
+        DCCP_INC_STATS(DCCP_MIB_INERRS);
 discard:
         __kfree_skb(skb);
         return 0;
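The same substitution recurs in the dccp and tcp hunks below: the double-underscore statistics helpers use the non-preempt-safe per-CPU increment and therefore assume the caller cannot be preempted (BH context, or a spinlock-held region), while the plain helpers may be used from process context, which is where backlog processing can now run after this series. Roughly, paraphrasing the SNMP helpers of that era rather than quoting include/net/snmp.h:

/* Paraphrased sketch, not the literal kernel source: the __ flavor relies
 * on the caller already being non-preemptible, the plain flavor is safe
 * from preemptible (process) context.
 */
#define __SNMP_INC_STATS(mib, field)    __this_cpu_inc(mib->mibs[field])
#define SNMP_INC_STATS(mib, field)      this_cpu_inc(mib->mibs[field])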
4 changes: 2 additions & 2 deletions net/dccp/ipv4.c
@@ -533,8 +533,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
         bh_unlock_sock(ctl_sk);
 
         if (net_xmit_eval(err) == 0) {
-                __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-                __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+                DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+                DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
         }
 out:
         dst_release(dst);
4 changes: 2 additions & 2 deletions net/dccp/ipv6.c
@@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
         if (!IS_ERR(dst)) {
                 skb_dst_set(skb, dst);
                 ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
-                __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-                __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+                DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+                DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
                 return;
         }
 
2 changes: 1 addition & 1 deletion net/dccp/options.c
@@ -253,7 +253,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
         return 0;
 
 out_invalid_option:
-        __DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
+        DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
         rc = DCCP_RESET_CODE_OPTION_ERROR;
 out_featneg_failed:
         DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
14 changes: 7 additions & 7 deletions net/ipv4/tcp.c
@@ -1136,11 +1136,12 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
         /* This should be in poll */
         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-        mss_now = tcp_send_mss(sk, &size_goal, flags);
-
         /* Ok commence sending. */
         copied = 0;
 
+restart:
+        mss_now = tcp_send_mss(sk, &size_goal, flags);
+
         err = -EPIPE;
         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                 goto out_err;
@@ -1166,6 +1167,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
                         if (!sk_stream_memory_free(sk))
                                 goto wait_for_sndbuf;
 
+                        if (sk_flush_backlog(sk))
+                                goto restart;
+
                         skb = sk_stream_alloc_skb(sk,
                                                   select_size(sk, sg),
                                                   sk->sk_allocation,
@@ -1449,12 +1453,8 @@ static void tcp_prequeue_process(struct sock *sk)
 
         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
 
-        /* RX process wants to run with disabled BHs, though it is not
-         * necessary */
-        local_bh_disable();
         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                 sk_backlog_rcv(sk, skb);
-        local_bh_enable();
 
         /* Clear memory counter. */
         tp->ucopy.memory = 0;
@@ -3095,7 +3095,7 @@ void tcp_done(struct sock *sk)
         struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
 
         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
-                __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+                TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
         tcp_set_state(sk, TCP_CLOSE);
         tcp_clear_xmit_timers(sk);
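A flushed backlog means incoming segments, ACKs included, were processed while tcp_sendmsg() still owned the socket, so per-connection state such as the MSS may have changed; that is why the mss_now/size_goal computation moves under the new restart label and the send loop jumps back to it whenever sk_flush_backlog() returns true.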
20 changes: 10 additions & 10 deletions net/ipv4/tcp_cdg.c
@@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 
                 ca->last_ack = now_us;
                 if (after(now_us, ca->round_start + base_owd)) {
-                        __NET_INC_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTTRAINDETECT);
-                        __NET_ADD_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTTRAINCWND,
-                                        tp->snd_cwnd);
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+                        NET_ADD_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTTRAINCWND,
+                                      tp->snd_cwnd);
                         tp->snd_ssthresh = tp->snd_cwnd;
                         return;
                 }
@@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
                                          125U);
 
                 if (ca->rtt.min > thresh) {
-                        __NET_INC_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTDELAYDETECT);
-                        __NET_ADD_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTDELAYCWND,
-                                        tp->snd_cwnd);
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+                        NET_ADD_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTDELAYCWND,
+                                      tp->snd_cwnd);
                         tp->snd_ssthresh = tp->snd_cwnd;
                 }
         }
20 changes: 10 additions & 10 deletions net/ipv4/tcp_cubic.c
@@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay)
                 ca->last_ack = now;
                 if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
                         ca->found |= HYSTART_ACK_TRAIN;
-                        __NET_INC_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTTRAINDETECT);
-                        __NET_ADD_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTTRAINCWND,
-                                        tp->snd_cwnd);
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+                        NET_ADD_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTTRAINCWND,
+                                      tp->snd_cwnd);
                         tp->snd_ssthresh = tp->snd_cwnd;
                 }
         }
@@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay)
                 if (ca->curr_rtt > ca->delay_min +
                     HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
                         ca->found |= HYSTART_DELAY;
-                        __NET_INC_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTDELAYDETECT);
-                        __NET_ADD_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPHYSTARTDELAYCWND,
-                                        tp->snd_cwnd);
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+                        NET_ADD_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPHYSTARTDELAYCWND,
+                                      tp->snd_cwnd);
                         tp->snd_ssthresh = tp->snd_cwnd;
                 }
         }
12 changes: 6 additions & 6 deletions net/ipv4/tcp_fastopen.c
@@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
                 spin_lock(&fastopenq->lock);
                 req1 = fastopenq->rskq_rst_head;
                 if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
-                        spin_unlock(&fastopenq->lock);
                         __NET_INC_STATS(sock_net(sk),
                                         LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+                        spin_unlock(&fastopenq->lock);
                         return false;
                 }
                 fastopenq->rskq_rst_head = req1->dl_next;
@@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
         struct sock *child;
 
         if (foc->len == 0) /* Client requests a cookie */
-                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
 
         if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
               (syn_data || foc->len >= 0) &&
@@ -311,13 +311,13 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
                 child = tcp_fastopen_create_child(sk, skb, dst, req);
                 if (child) {
                         foc->len = -1;
-                        __NET_INC_STATS(sock_net(sk),
-                                        LINUX_MIB_TCPFASTOPENPASSIVE);
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPFASTOPENPASSIVE);
                         return child;
                 }
-                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
         } else if (foc->len > 0) /* Client presents an invalid cookie */
-                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 
         valid_foc.exp = foc->exp;
         *foc = valid_foc;
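Two different treatments appear in this file: in tcp_fastopen_queue_check() the overflow counter keeps the __NET_INC_STATS() form and is merely moved before spin_unlock(&fastopenq->lock), so it still runs with preemption disabled under the spinlock; the tcp_try_fastopen() counters can now be reached from preemptible context and therefore switch to the plain NET_INC_STATS().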