Skip to content

Commit dcd2cf5

Browse files
Dust Li, davem330
authored and committed
net/smc: add autocorking support
This patch adds autocorking support for SMC, which can improve throughput for small messages by 3x or more. The main idea is borrowed from TCP autocorking, with some RDMA-specific modifications: 1. The first message should never be corked, to make sure we don't add extra latency. 2. If we have posted any Tx WRs to the NIC that have not completed, cork the new messages until: a) we receive the CQE for the last Tx WR, or b) we have corked enough messages on the connection. 3. Try to push the corked data out when we receive the CQE of the last Tx WR, to prevent the corked messages from hanging in the send queue. Both SMC autocorking and TCP autocorking check the TX completion to decide whether we should cork or not. The difference is that when we get an SMC Tx WR completion, the data has been confirmed by the RNIC, while a TCP TX completion just tells us the data has been sent out by the local NIC. Add an atomic variable tx_pushing in smc_connection to make sure only one sender pushes at a time, letting it cork more and save CDC slots. SMC autocorking should not add extra latency since the first message will always be sent out immediately. The qperf tcp_bw test shows more than a 4x increase for small message sizes with Mellanox ConnectX-4 Lx, with the same result in other throughput benchmarks like sockperf/netperf. The qperf tcp_lat test shows SMC autocorking has not increased ping-pong latency.
Test command: client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \ -t 30 -vu tcp_{bw|lat} server: smc_run taskset -c 1 qperf === Bandwidth ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 0.578 MB/s 2.392 MB/s(313.57%) 2.647 MB/s(357.72%) 2 1.159 MB/s 4.780 MB/s(312.53%) 5.153 MB/s(344.71%) 4 2.283 MB/s 10.266 MB/s(349.77%) 10.363 MB/s(354.02%) 8 4.668 MB/s 19.040 MB/s(307.86%) 21.215 MB/s(354.45%) 16 9.147 MB/s 38.904 MB/s(325.31%) 41.740 MB/s(356.32%) 32 18.369 MB/s 79.587 MB/s(333.25%) 82.392 MB/s(348.52%) 64 36.562 MB/s 148.668 MB/s(306.61%) 161.564 MB/s(341.89%) 128 72.961 MB/s 274.913 MB/s(276.80%) 325.363 MB/s(345.94%) 256 144.705 MB/s 512.059 MB/s(253.86%) 633.743 MB/s(337.96%) 512 288.873 MB/s 884.977 MB/s(206.35%) 1250.681 MB/s(332.95%) 1024 574.180 MB/s 1337.736 MB/s(132.98%) 2246.121 MB/s(291.19%) 2048 1095.192 MB/s 1865.952 MB/s( 70.38%) 2057.767 MB/s( 87.89%) 4096 2066.157 MB/s 2380.337 MB/s( 15.21%) 2173.983 MB/s( 5.22%) 8192 3717.198 MB/s 2733.073 MB/s(-26.47%) 3491.223 MB/s( -6.08%) 16384 4742.221 MB/s 2958.693 MB/s(-37.61%) 4637.692 MB/s( -2.20%) 32768 5349.550 MB/s 3061.285 MB/s(-42.77%) 5385.796 MB/s( 0.68%) 65536 5162.919 MB/s 3731.408 MB/s(-27.73%) 5223.890 MB/s( 1.18%) ==== Latency ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 10.540 us 11.938 us( 13.26%) 10.573 us( 0.31%) 2 10.996 us 11.992 us( 9.06%) 10.269 us( -6.61%) 4 10.229 us 11.687 us( 14.25%) 10.240 us( 0.11%) 8 10.203 us 11.653 us( 14.21%) 10.402 us( 1.95%) 16 10.530 us 11.313 us( 7.44%) 10.599 us( 0.66%) 32 10.241 us 11.586 us( 13.13%) 10.223 us( -0.18%) 64 10.693 us 11.652 us( 8.97%) 10.251 us( -4.13%) 128 10.597 us 11.579 us( 9.27%) 10.494 us( -0.97%) 256 10.409 us 11.957 us( 14.87%) 10.710 us( 2.89%) 512 11.088 us 12.505 us( 12.78%) 10.547 us( -4.88%) 1024 11.240 us 12.255 us( 9.03%) 10.787 us( -4.03%) 2048 11.485 us 16.970 us( 47.76%) 11.256 us( -1.99%) 4096 12.077 us 13.948 us( 15.49%) 12.230 us( 1.27%) 8192 13.683 us 16.693 us( 22.00%) 13.786 us( 
0.75%) 16384 16.470 us 23.615 us( 43.38%) 16.459 us( -0.07%) 32768 22.540 us 40.966 us( 81.75%) 23.284 us( 3.30%) 65536 34.192 us 73.003 us(113.51%) 34.233 us( 0.12%) With SMC autocorking support, we can achieve better throughput than TCP in most message sizes without any latency trade-off. Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 462791b commit dcd2cf5

File tree

3 files changed

+105
-15
lines changed

3 files changed

+105
-15
lines changed

net/smc/smc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
3030
* devices
3131
*/
32+
#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
3233

3334
extern struct proto smc_proto;
3435
extern struct proto smc_proto6;
@@ -192,6 +193,7 @@ struct smc_connection {
192193
* - dec on polled tx cqe
193194
*/
194195
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
196+
atomic_t tx_pushing; /* nr_threads trying tx push */
195197
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
196198
u32 tx_off; /* base offset in peer rmb */
197199

net/smc/smc_cdc.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
4848
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
4949
}
5050

51-
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52-
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53-
wake_up(&conn->cdc_pend_tx_wq);
51+
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
52+
/* If this is the last pending WR complete, we must push to
53+
* prevent hang when autocork enabled.
54+
*/
55+
smc_tx_sndbuf_nonempty(conn);
56+
if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
57+
wake_up(&conn->cdc_pend_tx_wq);
58+
}
5459
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
5560

5661
smc_tx_sndbuf_nonfull(smc);

net/smc/smc_tx.c

Lines changed: 95 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
131131
return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
132132
}
133133

134+
/* If we have pending CDC messages, do not send:
135+
* Because CQE of this CDC message will happen shortly, it gives
136+
* a chance to coalesce future sendmsg() payload into one RDMA Write,
137+
* without need for a timer, and with no latency trade off.
138+
* Algorithm here:
139+
* 1. First message should never cork
140+
* 2. If we have pending Tx CDC messages, wait for the first CDC
141+
* message's completion
142+
* 3. Don't cork too much data in a single RDMA Write to prevent burst
143+
* traffic, total corked message should not exceed sendbuf/2
144+
*/
145+
static bool smc_should_autocork(struct smc_sock *smc)
146+
{
147+
struct smc_connection *conn = &smc->conn;
148+
int corking_size;
149+
150+
corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE,
151+
conn->sndbuf_desc->len >> 1);
152+
153+
if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
154+
smc_tx_prepared_sends(conn) > corking_size)
155+
return false;
156+
return true;
157+
}
158+
159+
static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
160+
{
161+
struct smc_connection *conn = &smc->conn;
162+
163+
if (smc_should_autocork(smc))
164+
return true;
165+
166+
/* for a corked socket defer the RDMA writes if
167+
* sndbuf_space is still available. The applications
168+
* should know how/when to uncork it.
169+
*/
170+
if ((msg->msg_flags & MSG_MORE ||
171+
smc_tx_is_corked(smc) ||
172+
msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
173+
atomic_read(&conn->sndbuf_space))
174+
return true;
175+
176+
return false;
177+
}
178+
134179
/* sndbuf producer: main API called by socket layer.
135180
* called under sock lock.
136181
*/
@@ -235,13 +280,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
235280
*/
236281
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
237282
conn->urg_tx_pend = true;
238-
/* for a corked socket defer the RDMA writes if
239-
* sndbuf_space is still available. The applications
240-
* should known how/when to uncork it.
283+
/* If we need to cork, do nothing and wait for the next
284+
* sendmsg() call or push on tx completion
241285
*/
242-
if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
243-
msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
244-
atomic_read(&conn->sndbuf_space)))
286+
if (!smc_tx_should_cork(smc, msg))
245287
smc_tx_sndbuf_nonempty(conn);
246288

247289
trace_smc_tx_sendmsg(smc, copylen);
@@ -589,24 +631,65 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
589631
return rc;
590632
}
591633

592-
int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
634+
static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
593635
{
594-
int rc;
636+
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
637+
int rc = 0;
638+
639+
/* No data in the send queue */
640+
if (unlikely(smc_tx_prepared_sends(conn) <= 0))
641+
goto out;
642+
643+
/* Peer doesn't have RMBE space */
644+
if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
645+
SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
646+
goto out;
647+
}
595648

596649
if (conn->killed ||
597-
conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
598-
return -EPIPE; /* connection being aborted */
650+
conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
651+
rc = -EPIPE; /* connection being aborted */
652+
goto out;
653+
}
599654
if (conn->lgr->is_smcd)
600655
rc = smcd_tx_sndbuf_nonempty(conn);
601656
else
602657
rc = smcr_tx_sndbuf_nonempty(conn);
603658

604659
if (!rc) {
605660
/* trigger socket release if connection is closing */
606-
struct smc_sock *smc = container_of(conn, struct smc_sock,
607-
conn);
608661
smc_close_wake_tx_prepared(smc);
609662
}
663+
664+
out:
665+
return rc;
666+
}
667+
668+
int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
669+
{
670+
int rc;
671+
672+
/* This makes sure only one can send simultaneously to prevent wasting
673+
* of CPU and CDC slot.
674+
* Record whether someone has tried to push while we are pushing.
675+
*/
676+
if (atomic_inc_return(&conn->tx_pushing) > 1)
677+
return 0;
678+
679+
again:
680+
atomic_set(&conn->tx_pushing, 1);
681+
smp_wmb(); /* Make sure tx_pushing is 1 before real send */
682+
rc = __smc_tx_sndbuf_nonempty(conn);
683+
684+
/* We need to check whether someone else has added some data into
685+
* the send queue and tried to push but failed after the atomic_set()
686+
* when we are pushing.
687+
* If so, we need to push again to prevent those data hang in the send
688+
* queue.
689+
*/
690+
if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
691+
goto again;
692+
610693
return rc;
611694
}
612695

0 commit comments

Comments
 (0)