Skip to content

Commit 06d8971

Browse files
mosheshemesh2 authored and gregkh committed
net/mlx5: Fix lockdep assertion on sync reset unload event
[ Upstream commit 902a8bc ]

Fix lockdep assertion triggered during sync reset unload event. When the sync reset flow is initiated using the devlink reload fw_activate option, the PF already holds the devlink lock while handling unload event. In this case, delegate sync reset unload event handling back to the devlink callback process to avoid double-locking and resolve the lockdep warning.

Kernel log:
WARNING: CPU: 9 PID: 1578 at devl_assert_locked+0x31/0x40
[...]
Call Trace:
<TASK>
mlx5_unload_one_devl_locked+0x2c/0xc0 [mlx5_core]
mlx5_sync_reset_unload_event+0xaf/0x2f0 [mlx5_core]
process_one_work+0x222/0x640
worker_thread+0x199/0x350
kthread+0x10b/0x230
? __pfx_worker_thread+0x10/0x10
? __pfx_kthread+0x10/0x10
ret_from_fork+0x8e/0x100
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>

Fixes: 7a9770f ("net/mlx5: Handle sync reset unload event")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250825143435.598584-7-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent d0ac078 commit 06d8971

File tree

3 files changed

+69
-54
lines changed

3 files changed

+69
-54
lines changed

drivers/net/ethernet/mellanox/mlx5/core/devlink.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli
107107
if (err)
108108
return err;
109109

110-
mlx5_unload_one_devl_locked(dev, false);
110+
mlx5_sync_reset_unload_flow(dev, true);
111111
err = mlx5_health_wait_pci_up(dev);
112112
if (err)
113113
NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset");

drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c

Lines changed: 67 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,8 @@ enum {
1212
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
1313
MLX5_FW_RESET_FLAGS_PENDING_COMP,
1414
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
15-
MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
15+
MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED,
16+
MLX5_FW_RESET_FLAGS_UNLOAD_EVENT,
1617
};
1718

1819
struct mlx5_fw_reset {
@@ -219,7 +220,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
219220
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
220221
}
221222

222-
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded)
223+
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
223224
{
224225
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
225226
struct devlink *devlink = priv_to_devlink(dev);
@@ -228,8 +229,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload
228229
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
229230
complete(&fw_reset->done);
230231
} else {
231-
if (!unloaded)
232-
mlx5_unload_one(dev, false);
232+
mlx5_sync_reset_unload_flow(dev, false);
233233
if (mlx5_health_wait_pci_up(dev))
234234
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
235235
else
@@ -272,7 +272,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work)
272272

273273
mlx5_sync_reset_clear_reset_requested(dev, false);
274274
mlx5_enter_error_state(dev, true);
275-
mlx5_fw_reset_complete_reload(dev, false);
275+
mlx5_fw_reset_complete_reload(dev);
276276
}
277277

278278
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
@@ -586,6 +586,65 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method)
586586
return err;
587587
}
588588

589+
void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked)
590+
{
591+
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
592+
unsigned long timeout;
593+
int poll_freq = 20;
594+
bool reset_action;
595+
u8 rst_state;
596+
int err;
597+
598+
if (locked)
599+
mlx5_unload_one_devl_locked(dev, false);
600+
else
601+
mlx5_unload_one(dev, false);
602+
603+
if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags))
604+
return;
605+
606+
mlx5_set_fw_rst_ack(dev);
607+
mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
608+
609+
reset_action = false;
610+
timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
611+
do {
612+
rst_state = mlx5_get_fw_rst_state(dev);
613+
if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
614+
rst_state == MLX5_FW_RST_STATE_IDLE) {
615+
reset_action = true;
616+
break;
617+
}
618+
if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) {
619+
mlx5_core_info(dev, "Sync Reset Drop mode ack\n");
620+
mlx5_set_fw_rst_ack(dev);
621+
poll_freq = 1000;
622+
}
623+
msleep(poll_freq);
624+
} while (!time_after(jiffies, timeout));
625+
626+
if (!reset_action) {
627+
mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
628+
rst_state);
629+
fw_reset->ret = -ETIMEDOUT;
630+
goto done;
631+
}
632+
633+
mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n",
634+
rst_state);
635+
if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
636+
err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
637+
if (err) {
638+
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n",
639+
err);
640+
fw_reset->ret = err;
641+
}
642+
}
643+
644+
done:
645+
clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
646+
}
647+
589648
static void mlx5_sync_reset_now_event(struct work_struct *work)
590649
{
591650
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
@@ -613,17 +672,13 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
613672
mlx5_enter_error_state(dev, true);
614673
done:
615674
fw_reset->ret = err;
616-
mlx5_fw_reset_complete_reload(dev, false);
675+
mlx5_fw_reset_complete_reload(dev);
617676
}
618677

619678
static void mlx5_sync_reset_unload_event(struct work_struct *work)
620679
{
621680
struct mlx5_fw_reset *fw_reset;
622681
struct mlx5_core_dev *dev;
623-
unsigned long timeout;
624-
int poll_freq = 20;
625-
bool reset_action;
626-
u8 rst_state;
627682
int err;
628683

629684
fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
@@ -632,6 +687,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
632687
if (mlx5_sync_reset_clear_reset_requested(dev, false))
633688
return;
634689

690+
set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
635691
mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");
636692

637693
err = mlx5_cmd_fast_teardown_hca(dev);
@@ -640,49 +696,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
640696
else
641697
mlx5_enter_error_state(dev, true);
642698

643-
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags))
644-
mlx5_unload_one_devl_locked(dev, false);
645-
else
646-
mlx5_unload_one(dev, false);
647-
648-
mlx5_set_fw_rst_ack(dev);
649-
mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
650-
651-
reset_action = false;
652-
timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
653-
do {
654-
rst_state = mlx5_get_fw_rst_state(dev);
655-
if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
656-
rst_state == MLX5_FW_RST_STATE_IDLE) {
657-
reset_action = true;
658-
break;
659-
}
660-
if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) {
661-
mlx5_core_info(dev, "Sync Reset Drop mode ack\n");
662-
mlx5_set_fw_rst_ack(dev);
663-
poll_freq = 1000;
664-
}
665-
msleep(poll_freq);
666-
} while (!time_after(jiffies, timeout));
667-
668-
if (!reset_action) {
669-
mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
670-
rst_state);
671-
fw_reset->ret = -ETIMEDOUT;
672-
goto done;
673-
}
674-
675-
mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state);
676-
if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
677-
err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
678-
if (err) {
679-
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err);
680-
fw_reset->ret = err;
681-
}
682-
}
683-
684-
done:
685-
mlx5_fw_reset_complete_reload(dev, true);
699+
mlx5_fw_reset_complete_reload(dev);
686700
}
687701

688702
static void mlx5_sync_reset_abort_event(struct work_struct *work)

drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
1212
int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
1313

1414
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
15+
void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked);
1516
int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
1617
struct netlink_ext_ack *extack);
1718
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);

0 commit comments

Comments
 (0)