Skip to content

Commit 7d91126

Browse files
Eran Ben Elisha authored and davem330 committed
net/mlx5e: Add tx timeout support for mlx5e tx reporter
With this patch, the ndo_tx_timeout callback will be redirected to the tx reporter in order to detect a tx timeout error and report it to the devlink health. (The watchdog detects tx timeouts, but the driver verifies that the issue still exists before launching any recover method). In addition, recovery from a tx timeout in case of a lost interrupt was added to the tx reporter recover method. The tx timeout recovery from a lost interrupt is not a new feature in the driver; this patch reorganizes the functionality and moves it to the tx reporter recovery flow. tx timeout example: (with auto_recover set to false, if set to true, the manual recover and diagnose sections are irrelevant) $cat /sys/kernel/debug/tracing/trace ... devlink_health_report: bus_name=pci dev_name=0000:00:09.0 driver_name=mlx5_core reporter_name=tx: TX timeout on queue: 0, SQ: 0x8a, CQ: 0x35, SQ Cons: 0x2 SQ Prod: 0x2, usecs since last trans: 14912000 $devlink health show pci/0000:00:09.0: name tx state healthy #err 1 #recover 0 last_dump_ts N/A parameters: grace_period 500 auto_recover false $devlink health diagnose pci/0000:00:09.0 reporter tx -j -p { "SQs": [ { "sqn": 138, "HW state": 1, "stopped": true },{ "sqn": 142, "HW state": 1, "stopped": false } ] } $devlink health diagnose pci/0000:00:09.0 reporter tx SQs: sqn: 138 HW state: 1 stopped: true sqn: 142 HW state: 1 stopped: false $devlink health recover pci/0000:00:09 reporter tx $devlink health show pci/0000:00:09.0: name tx state healthy #err 1 #recover 1 last_dump_ts N/A parameters: grace_period 500 auto_recover false Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Reviewed-by: Moshe Shemesh <moshe@mellanox.com> Acked-by: Saeed Mahameed <saeedm@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent de8650a commit 7d91126

File tree

3 files changed

+55
-37
lines changed

3 files changed

+55
-37
lines changed

drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@
1010
int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
1111
void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
1212
void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
13+
int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq);
1314

1415
#endif

drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,44 @@ void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq)
126126
&err_ctx);
127127
}
128128

129+
static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq)
130+
{
131+
struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
132+
u32 eqe_count;
133+
int ret;
134+
135+
netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
136+
eq->core.eqn, eq->core.cons_index, eq->core.irqn);
137+
138+
eqe_count = mlx5_eq_poll_irq_disabled(eq);
139+
ret = eqe_count ? true : false;
140+
if (!eqe_count) {
141+
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
142+
return ret;
143+
}
144+
145+
netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n",
146+
eqe_count, eq->core.eqn);
147+
sq->channel->stats->eq_rearm++;
148+
return ret;
149+
}
150+
151+
int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq)
152+
{
153+
char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
154+
struct mlx5e_tx_err_ctx err_ctx;
155+
156+
err_ctx.sq = sq;
157+
err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
158+
sprintf(err_str,
159+
"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
160+
sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
161+
jiffies_to_usecs(jiffies - sq->txq->trans_start));
162+
163+
return devlink_health_report(sq->channel->priv->tx_reporter, err_str,
164+
&err_ctx);
165+
}
166+
129167
/* state lock cannot be grabbed within this function.
130168
* It can cause a dead lock or a read-after-free.
131169
*/

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4116,31 +4116,13 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb,
41164116
return features;
41174117
}
41184118

4119-
static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
4120-
struct mlx5e_txqsq *sq)
4121-
{
4122-
struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
4123-
u32 eqe_count;
4124-
4125-
netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
4126-
eq->core.eqn, eq->core.cons_index, eq->core.irqn);
4127-
4128-
eqe_count = mlx5_eq_poll_irq_disabled(eq);
4129-
if (!eqe_count)
4130-
return false;
4131-
4132-
netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn);
4133-
sq->channel->stats->eq_rearm++;
4134-
return true;
4135-
}
4136-
41374119
static void mlx5e_tx_timeout_work(struct work_struct *work)
41384120
{
41394121
struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
41404122
tx_timeout_work);
4141-
struct net_device *dev = priv->netdev;
4142-
bool reopen_channels = false;
4143-
int i, err;
4123+
bool report_failed = false;
4124+
int err;
4125+
int i;
41444126

41454127
rtnl_lock();
41464128
mutex_lock(&priv->state_lock);
@@ -4149,31 +4131,22 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
41494131
goto unlock;
41504132

41514133
for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
4152-
struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
4134+
struct netdev_queue *dev_queue =
4135+
netdev_get_tx_queue(priv->netdev, i);
41534136
struct mlx5e_txqsq *sq = priv->txq2sq[i];
41544137

41554138
if (!netif_xmit_stopped(dev_queue))
41564139
continue;
41574140

4158-
netdev_err(dev,
4159-
"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
4160-
i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
4161-
jiffies_to_usecs(jiffies - dev_queue->trans_start));
4162-
4163-
/* If we recover a lost interrupt, most likely TX timeout will
4164-
* be resolved, skip reopening channels
4165-
*/
4166-
if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
4167-
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
4168-
reopen_channels = true;
4169-
}
4141+
if (mlx5e_tx_reporter_timeout(sq))
4142+
report_failed = true;
41704143
}
41714144

4172-
if (!reopen_channels)
4145+
if (!report_failed)
41734146
goto unlock;
41744147

4175-
mlx5e_close_locked(dev);
4176-
err = mlx5e_open_locked(dev);
4148+
mlx5e_close_locked(priv->netdev);
4149+
err = mlx5e_open_locked(priv->netdev);
41774150
if (err)
41784151
netdev_err(priv->netdev,
41794152
"mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n",
@@ -4189,6 +4162,12 @@ static void mlx5e_tx_timeout(struct net_device *dev)
41894162
struct mlx5e_priv *priv = netdev_priv(dev);
41904163

41914164
netdev_err(dev, "TX timeout detected\n");
4165+
4166+
if (IS_ERR_OR_NULL(priv->tx_reporter)) {
4167+
netdev_err_once(priv->netdev, "tx timeout will not be handled, no valid tx reporter\n");
4168+
return;
4169+
}
4170+
41924171
queue_work(priv->wq, &priv->tx_timeout_work);
41934172
}
41944173

0 commit comments

Comments
 (0)