Skip to content

Commit afb8301

Browse files
soheilhy authored and davem330 committed
tcp: schedule EPOLLOUT after a partial sendmsg
For EPOLLET, applications must call sendmsg until they get EAGAIN.
Otherwise, there is no guarantee that EPOLLOUT is sent if there was
a failure upon memory allocation.

As a result on high-speed NICs, userspace observes multiple small
sendmsgs after a partial sendmsg until EAGAIN, since TCP can send
1-2 TSOs in between two sendmsg syscalls:

    // One large partial send due to memory allocation failure.
    sendmsg(20MB)   = 2MB
    // Many small sends until EAGAIN.
    sendmsg(18MB)   = 64KB
    sendmsg(17.9MB) = 128KB
    sendmsg(17.8MB) = 64KB
    ...
    sendmsg(...)    = EAGAIN
    // At this point, userspace can assume an EPOLLOUT.

To fix this, set the SOCK_NOSPACE on all partial sendmsg scenarios
to guarantee that we send EPOLLOUT after partial sendmsg.

After this commit userspace can assume that it will receive an EPOLLOUT
after the first partial sendmsg. This EPOLLOUT will benefit from
sk_stream_write_space() logic delaying the EPOLLOUT until significant
space is available in write queue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 8ba3c9d commit afb8301

File tree

1 file changed

+9
-11
lines changed

1 file changed

+9
-11
lines changed

net/ipv4/tcp.c

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
10041004
!tcp_skb_can_collapse_to(skb)) {
10051005
new_segment:
10061006
if (!sk_stream_memory_free(sk))
1007-
goto wait_for_sndbuf;
1007+
goto wait_for_space;
10081008

10091009
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
10101010
tcp_rtx_and_write_queues_empty(sk));
10111011
if (!skb)
1012-
goto wait_for_memory;
1012+
goto wait_for_space;
10131013

10141014
#ifdef CONFIG_TLS_DEVICE
10151015
skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
@@ -1028,7 +1028,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
10281028
goto new_segment;
10291029
}
10301030
if (!sk_wmem_schedule(sk, copy))
1031-
goto wait_for_memory;
1031+
goto wait_for_space;
10321032

10331033
if (can_coalesce) {
10341034
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1069,9 +1069,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
10691069
tcp_push_one(sk, mss_now);
10701070
continue;
10711071

1072-
wait_for_sndbuf:
1072+
wait_for_space:
10731073
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1074-
wait_for_memory:
10751074
tcp_push(sk, flags & ~MSG_MORE, mss_now,
10761075
TCP_NAGLE_PUSH, size_goal);
10771076

@@ -1282,7 +1281,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
12821281

12831282
new_segment:
12841283
if (!sk_stream_memory_free(sk))
1285-
goto wait_for_sndbuf;
1284+
goto wait_for_space;
12861285

12871286
if (unlikely(process_backlog >= 16)) {
12881287
process_backlog = 0;
@@ -1293,7 +1292,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
12931292
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
12941293
first_skb);
12951294
if (!skb)
1296-
goto wait_for_memory;
1295+
goto wait_for_space;
12971296

12981297
process_backlog++;
12991298
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1326,7 +1325,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13261325
struct page_frag *pfrag = sk_page_frag(sk);
13271326

13281327
if (!sk_page_frag_refill(sk, pfrag))
1329-
goto wait_for_memory;
1328+
goto wait_for_space;
13301329

13311330
if (!skb_can_coalesce(skb, i, pfrag->page,
13321331
pfrag->offset)) {
@@ -1340,7 +1339,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13401339
copy = min_t(int, copy, pfrag->size - pfrag->offset);
13411340

13421341
if (!sk_wmem_schedule(sk, copy))
1343-
goto wait_for_memory;
1342+
goto wait_for_space;
13441343

13451344
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
13461345
pfrag->page,
@@ -1393,9 +1392,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13931392
tcp_push_one(sk, mss_now);
13941393
continue;
13951394

1396-
wait_for_sndbuf:
1395+
wait_for_space:
13971396
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1398-
wait_for_memory:
13991397
if (copied)
14001398
tcp_push(sk, flags & ~MSG_MORE, mss_now,
14011399
TCP_NAGLE_PUSH, size_goal);

0 commit comments

Comments
 (0)