Skip to content

Commit 51d138c

Browse files
shayshyi authored and Saeed Mahameed committed
net/mlx5: Fix health error state handling
Currently, when we discover a fatal error, we queue a work item that waits for a lock before putting the device into the error state. Meanwhile, FW commands are still being processed and time out. This can block the driver for a few minutes before the work manages to take the lock and enter the error state. Set the device to the error state before queueing the health work, so that FW commands are not processed while the work is waiting for the lock. Fixes: c1d4d2e ("net/mlx5: Avoid calling sleeping function by the health poll thread") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
1 parent 65ba859 commit 51d138c

File tree

1 file changed

+14
-8
lines changed
  • drivers/net/ethernet/mellanox/mlx5/core

1 file changed

+14
-8
lines changed

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,16 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
190190
return true;
191191
}
192192

193+
/*
 * enter_error_state - mark the device as failed and notify listeners.
 * @dev:   mlx5 core device to update.
 * @force: when true, set the error state even if the fatal sensors
 *         report nothing.
 *
 * If mlx5_health_check_fatal_sensors(dev) returns non-zero, or @force is
 * true, dev->state is set to MLX5_DEVICE_STATE_INTERNAL_ERROR and
 * mlx5_cmd_flush(dev) is called. In every case, MLX5_DEV_EVENT_SYS_ERROR is
 * then passed to mlx5_notifier_call_chain() on dev->priv.events.
 *
 * This helper takes no lock itself.
 * NOTE(review): the "protected state setting" remark below appears to come
 * from mlx5_enter_error_state(), which seems to hold dev->intf_state_mutex
 * around the call - confirm what other callers rely on.
 */
static void enter_error_state(struct mlx5_core_dev *dev, bool force)
194+
{
195+
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
196+
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
197+
mlx5_cmd_flush(dev);
198+
}
199+
200+
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
201+
}
202+
193203
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
194204
{
195205
bool err_detected = false;
@@ -208,12 +218,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
208218
goto unlock;
209219
}
210220

211-
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
212-
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
213-
mlx5_cmd_flush(dev);
214-
}
215-
216-
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
221+
enter_error_state(dev, force);
217222
unlock:
218223
mutex_unlock(&dev->intf_state_mutex);
219224
}
@@ -613,7 +618,7 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
613618
priv = container_of(health, struct mlx5_priv, health);
614619
dev = container_of(priv, struct mlx5_core_dev, priv);
615620

616-
mlx5_enter_error_state(dev, false);
621+
enter_error_state(dev, false);
617622
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
618623
if (mlx5_health_try_recover(dev))
619624
mlx5_core_err(dev, "health recovery failed\n");
@@ -707,8 +712,9 @@ static void poll_health(struct timer_list *t)
707712
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
708713
dev->priv.health.fatal_error = fatal_error;
709714
print_health_info(dev);
715+
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
710716
mlx5_trigger_health_work(dev);
711-
goto out;
717+
return;
712718
}
713719

714720
count = ioread32be(health->health_counter);

0 commit comments

Comments
 (0)