Skip to content

Commit 9b3e446

Browse files
committed
Merge tag 'mlx5-updates-2022-02-14' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2022-02-14 mlx5 TX routines improvements 1) From Aya and Tariq, first 3 patches, Use the Max size of the TX descriptor as advertised by the device and not the fixed value of 16 that the driver always assumed, this is not a bug fix as all existing devices have Max value larger than 16, but the series is necessary for future-proofing the driver. 2) TX Synchronization improvements from Maxim, last 12 patches Maxim Mikityanskiy Says: ======================= mlx5e: Synchronize ndo_select_queue with configuration changes The kernel can call ndo_select_queue at any time, and there is no direct way to block it. The implementation of ndo_select_queue in mlx5e expects the parameters to be consistent and may crash (invalid pointer, division by zero) if they aren't. There were attempts to partially fix some of the most frequent crashes, see commit 846d6da ("net/mlx5e: Fix division by 0 in mlx5e_select_queue") and commit 84c8a87 ("net/mlx5e: Fix division by 0 in mlx5e_select_queue for representors"). However, they don't address the issue completely. This series introduces the proper synchronization mechanism between mlx5e configuration and TX data path: 1. txq2sq updates are synchronized properly with ndo_start_xmit (mlx5e_xmit). The TX queue is stopped when its configuration is being updated, and memory barriers ensure the changes are visible before restarting. 2. The set of parameters needed for mlx5e_select_queue is reduced, and synchronization using RCU is implemented. This way, changes are atomic, and the state in mlx5e_select_queue is always consistent. 3. A few optimizations are applied to the new implementation of mlx5e_select_queue. ======================= ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents d0b78ab + 71753b8 commit 9b3e446

File tree

17 files changed

+480
-232
lines changed

17 files changed

+480
-232
lines changed

drivers/net/ethernet/mellanox/mlx5/core/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \
2828
en_selftest.o en/port.o en/monitor_stats.o en/health.o \
2929
en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
3030
en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
31-
en/qos.o en/trap.o en/fs_tt_redirect.o
31+
en/qos.o en/trap.o en/fs_tt_redirect.o en/selq.o
3232

3333
#
3434
# Netdev extra

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
#include "lib/hv_vhca.h"
6060
#include "lib/clock.h"
6161
#include "en/rx_res.h"
62+
#include "en/selq.h"
6263

6364
extern const struct net_device_ops mlx5e_netdev_ops;
6465
struct page_pool;
@@ -172,8 +173,9 @@ struct page_pool;
172173
#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\
173174
ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_ALIGNMENT)
174175

175-
#define MLX5E_MAX_KLM_PER_WQE \
176-
MLX5E_KLM_ENTRIES_PER_WQE(MLX5E_TX_MPW_MAX_NUM_DS << MLX5_MKEY_BSF_OCTO_SIZE)
176+
#define MLX5E_MAX_KLM_PER_WQE(mdev) \
177+
MLX5E_KLM_ENTRIES_PER_WQE(mlx5e_get_sw_max_sq_mpw_wqebbs(mlx5e_get_max_sq_wqebbs(mdev)) \
178+
<< MLX5_MKEY_BSF_OCTO_SIZE)
177179

178180
#define MLX5E_MSG_LEVEL NETIF_MSG_LINK
179181

@@ -221,6 +223,32 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
221223
min_t(int, mlx5_comp_vectors_count(mdev), MLX5E_MAX_NUM_CHANNELS);
222224
}
223225

226+
/* The maximum WQE size can be retrieved by max_wqe_sz_sq in
227+
* bytes units. Driver hardens the limitation to 1KB (16
228+
* WQEBBs), unless firmware capability is stricter.
229+
*/
230+
static inline u16 mlx5e_get_max_sq_wqebbs(struct mlx5_core_dev *mdev)
231+
{
232+
return min_t(u16, MLX5_SEND_WQE_MAX_WQEBBS,
233+
MLX5_CAP_GEN(mdev, max_wqe_sz_sq) / MLX5_SEND_WQE_BB);
234+
}
235+
236+
static inline u16 mlx5e_get_sw_max_sq_mpw_wqebbs(u16 max_sq_wqebbs)
237+
{
238+
/* The return value will be multiplied by MLX5_SEND_WQEBB_NUM_DS.
239+
* Since max_sq_wqebbs may be up to MLX5_SEND_WQE_MAX_WQEBBS == 16,
240+
* see mlx5e_get_max_sq_wqebbs(), the multiplication (16 * 4 == 64)
241+
* overflows the 6-bit DS field of Ctrl Segment. Use a bound lower
242+
* than MLX5_SEND_WQE_MAX_WQEBBS to let a full-session WQE be
243+
* cache-aligned.
244+
*/
245+
#if L1_CACHE_BYTES < 128
246+
return min_t(u16, max_sq_wqebbs, MLX5_SEND_WQE_MAX_WQEBBS - 1);
247+
#else
248+
return min_t(u16, max_sq_wqebbs, MLX5_SEND_WQE_MAX_WQEBBS - 2);
249+
#endif
250+
}
251+
224252
struct mlx5e_tx_wqe {
225253
struct mlx5_wqe_ctrl_seg ctrl;
226254
struct mlx5_wqe_eth_seg eth;
@@ -427,12 +455,12 @@ struct mlx5e_txqsq {
427455
struct netdev_queue *txq;
428456
u32 sqn;
429457
u16 stop_room;
458+
u16 max_sq_mpw_wqebbs;
430459
u8 min_inline_mode;
431460
struct device *pdev;
432461
__be32 mkey_be;
433462
unsigned long state;
434463
unsigned int hw_mtu;
435-
struct hwtstamp_config *tstamp;
436464
struct mlx5_clock *clock;
437465
struct net_device *netdev;
438466
struct mlx5_core_dev *mdev;
@@ -446,6 +474,7 @@ struct mlx5e_txqsq {
446474
struct work_struct recover_work;
447475
struct mlx5e_ptpsq *ptpsq;
448476
cqe_ts_to_ns ptp_cyc2time;
477+
u16 max_sq_wqebbs;
449478
} ____cacheline_aligned_in_smp;
450479

451480
struct mlx5e_dma_info {
@@ -540,13 +569,16 @@ struct mlx5e_xdpsq {
540569
u32 sqn;
541570
struct device *pdev;
542571
__be32 mkey_be;
572+
u16 stop_room;
573+
u16 max_sq_mpw_wqebbs;
543574
u8 min_inline_mode;
544575
unsigned long state;
545576
unsigned int hw_mtu;
546577

547578
/* control path */
548579
struct mlx5_wq_ctrl wq_ctrl;
549580
struct mlx5e_channel *channel;
581+
u16 max_sq_wqebbs;
550582
} ____cacheline_aligned_in_smp;
551583

552584
struct mlx5e_ktls_resync_resp;
@@ -575,6 +607,7 @@ struct mlx5e_icosq {
575607
/* control path */
576608
struct mlx5_wq_ctrl wq_ctrl;
577609
struct mlx5e_channel *channel;
610+
u16 max_sq_wqebbs;
578611

579612
struct work_struct recover_work;
580613
} ____cacheline_aligned_in_smp;
@@ -876,9 +909,8 @@ struct mlx5e_trap;
876909

877910
struct mlx5e_priv {
878911
/* priv data path fields - start */
912+
struct mlx5e_selq selq;
879913
struct mlx5e_txqsq **txq2sq;
880-
int **channel_tc2realtxq;
881-
int port_ptp_tc2realtxq[MLX5E_MAX_NUM_TC];
882914
#ifdef CONFIG_MLX5_CORE_EN_DCB
883915
struct mlx5e_dcbx_dp dcbx_dp;
884916
#endif
@@ -921,7 +953,6 @@ struct mlx5e_priv {
921953
u16 drop_rq_q_counter;
922954
struct notifier_block events_nb;
923955
struct notifier_block blocking_events_nb;
924-
int num_tc_x_num_ch;
925956

926957
struct udp_tunnel_nic_info nic_info;
927958
#ifdef CONFIG_MLX5_CORE_EN_DCB

drivers/net/ethernet/mellanox/mlx5/core/en/params.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,13 +196,13 @@ u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *par
196196
u16 stop_room;
197197

198198
stop_room = mlx5e_tls_get_stop_room(mdev, params);
199-
stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
199+
stop_room += mlx5e_stop_room_for_max_wqe(mdev);
200200
if (is_mpwqe)
201201
/* A MPWQE can take up to the maximum-sized WQE + all the normal
202202
* stop room can be taken if a new packet breaks the active
203203
* MPWQE session and allocates its WQEs right away.
204204
*/
205-
stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
205+
stop_room += mlx5e_stop_room_for_max_wqe(mdev);
206206

207207
return stop_room;
208208
}
@@ -717,7 +717,7 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
717717
int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
718718
u32 wqebbs;
719719

720-
max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE;
720+
max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE(mdev);
721721
max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
722722
max_num_of_umr_per_wqe = max_hd_per_wqe / max_klm_per_umr;
723723
rest = max_hd_per_wqe % max_klm_per_umr;
@@ -774,10 +774,10 @@ static void mlx5e_build_async_icosq_param(struct mlx5_core_dev *mdev,
774774
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
775775

776776
mlx5e_build_sq_param_common(mdev, param);
777-
param->stop_room = mlx5e_stop_room_for_wqe(1); /* for XSK NOP */
777+
param->stop_room = mlx5e_stop_room_for_wqe(mdev, 1); /* for XSK NOP */
778778
param->is_tls = mlx5e_accel_is_ktls_rx(mdev);
779779
if (param->is_tls)
780-
param->stop_room += mlx5e_stop_room_for_wqe(1); /* for TLS RX resync NOP */
780+
param->stop_room += mlx5e_stop_room_for_wqe(mdev, 1); /* for TLS RX resync NOP */
781781
MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(mdev, reg_umr_sq));
782782
MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
783783
mlx5e_build_ico_cq_param(mdev, log_wq_size, &param->cqp);

drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix,
195195
int node;
196196

197197
sq->pdev = c->pdev;
198-
sq->tstamp = c->tstamp;
199198
sq->clock = &mdev->clock;
200199
sq->mkey_be = c->mkey_be;
201200
sq->netdev = c->netdev;
@@ -449,7 +448,7 @@ static void mlx5e_ptp_build_sq_param(struct mlx5_core_dev *mdev,
449448

450449
wq = MLX5_ADDR_OF(sqc, sqc, wq);
451450
MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
452-
param->stop_room = mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
451+
param->stop_room = mlx5e_stop_room_for_max_wqe(mdev);
453452
mlx5e_build_tx_cq_param(mdev, params, &param->cqp);
454453
}
455454

drivers/net/ethernet/mellanox/mlx5/core/en/qos.c

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ static int mlx5e_find_unused_qos_qid(struct mlx5e_priv *priv)
5050

5151
struct mlx5e_qos_node {
5252
struct hlist_node hnode;
53-
struct rcu_head rcu;
5453
struct mlx5e_qos_node *parent;
5554
u64 rate;
5655
u32 bw_share;
@@ -132,7 +131,11 @@ static void mlx5e_sw_node_delete(struct mlx5e_priv *priv, struct mlx5e_qos_node
132131
__clear_bit(node->qid, priv->htb.qos_used_qids);
133132
mlx5e_update_tx_netdev_queues(priv);
134133
}
135-
kfree_rcu(node, rcu);
134+
/* Make sure this qid is no longer selected by mlx5e_select_queue, so
135+
* that mlx5e_reactivate_qos_sq can safely restart the netdev TX queue.
136+
*/
137+
synchronize_net();
138+
kfree(node);
136139
}
137140

138141
/* TX datapath API */
@@ -273,10 +276,18 @@ static int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs
273276
static void mlx5e_activate_qos_sq(struct mlx5e_priv *priv, struct mlx5e_qos_node *node)
274277
{
275278
struct mlx5e_txqsq *sq;
279+
u16 qid;
276280

277281
sq = mlx5e_get_qos_sq(priv, node->qid);
278282

279-
WRITE_ONCE(priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, node->qid)], sq);
283+
qid = mlx5e_qid_from_qos(&priv->channels, node->qid);
284+
285+
/* If it's a new queue, it will be marked as started at this point.
286+
* Stop it before updating txq2sq.
287+
*/
288+
mlx5e_tx_disable_queue(netdev_get_tx_queue(priv->netdev, qid));
289+
290+
priv->txq2sq[qid] = sq;
280291

281292
/* Make the change to txq2sq visible before the queue is started.
282293
* As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE,
@@ -299,8 +310,13 @@ static void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid)
299310
qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid);
300311
mlx5e_deactivate_txqsq(sq);
301312

302-
/* The queue is disabled, no synchronization with datapath is needed. */
303313
priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL;
314+
315+
/* Make the change to txq2sq visible before the queue is started again.
316+
* As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE,
317+
* which pairs with this barrier.
318+
*/
319+
smp_wmb();
304320
}
305321

306322
static void mlx5e_close_qos_sq(struct mlx5e_priv *priv, u16 qid)
@@ -485,9 +501,11 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
485501

486502
opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
487503
if (opened) {
504+
mlx5e_selq_prepare(&priv->selq, &priv->channels.params, true);
505+
488506
err = mlx5e_qos_alloc_queues(priv, &priv->channels);
489507
if (err)
490-
return err;
508+
goto err_cancel_selq;
491509
}
492510

493511
root = mlx5e_sw_node_create_root(priv);
@@ -508,6 +526,9 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
508526
*/
509527
smp_store_release(&priv->htb.maj_id, htb_maj_id);
510528

529+
if (opened)
530+
mlx5e_selq_apply(&priv->selq);
531+
511532
return 0;
512533

513534
err_sw_node_delete:
@@ -516,6 +537,8 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
516537
err_free_queues:
517538
if (opened)
518539
mlx5e_qos_close_all_queues(&priv->channels);
540+
err_cancel_selq:
541+
mlx5e_selq_cancel(&priv->selq);
519542
return err;
520543
}
521544

@@ -526,8 +549,15 @@ int mlx5e_htb_root_del(struct mlx5e_priv *priv)
526549

527550
qos_dbg(priv->mdev, "TC_HTB_DESTROY\n");
528551

552+
/* Wait until real_num_tx_queues is updated for mlx5e_select_queue,
553+
* so that we can safely switch to its non-HTB non-PTP fastpath.
554+
*/
555+
synchronize_net();
556+
557+
mlx5e_selq_prepare(&priv->selq, &priv->channels.params, false);
558+
mlx5e_selq_apply(&priv->selq);
559+
529560
WRITE_ONCE(priv->htb.maj_id, 0);
530-
synchronize_rcu(); /* Sync with mlx5e_select_htb_queue and TX data path. */
531561

532562
root = mlx5e_sw_node_find(priv, MLX5E_HTB_CLASSID_ROOT);
533563
if (!root) {

0 commit comments

Comments
 (0)