Skip to content

Commit baf4e79

Browse files
Lijo Lazar authored and Sasha Levin committed
drm/amdgpu: Fix error handling in slot reset
[ Upstream commit b57c4ec ]

If the device has not recovered after the slot reset is called, the function jumps to the `out` label for error handling. There it could make a decision based on an uninitialized hive pointer, which could result in accessing an uninitialized list. Initialize the list and the hive properly so that the error path handles the situation correctly and also releases the reset domain lock, which is acquired during the error_detected callback.

Fixes: 732c6ce ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset")
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Ce Sun <cesun102@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit bb71362182e59caa227e4192da5a612b09349696)
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent f85294c commit baf4e79

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7203,6 +7203,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
72037203
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
72047204

72057205
memset(&reset_context, 0, sizeof(reset_context));
7206+
INIT_LIST_HEAD(&device_list);
7207+
hive = amdgpu_get_xgmi_hive(adev);
7208+
if (hive) {
7209+
mutex_lock(&hive->hive_lock);
7210+
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
7211+
list_add_tail(&tmp_adev->reset_list, &device_list);
7212+
} else {
7213+
list_add_tail(&adev->reset_list, &device_list);
7214+
}
72067215

72077216
if (adev->pcie_reset_ctx.swus)
72087217
link_dev = adev->pcie_reset_ctx.swus;
@@ -7243,19 +7252,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
72437252
reset_context.reset_req_dev = adev;
72447253
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
72457254
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
7246-
INIT_LIST_HEAD(&device_list);
72477255

7248-
hive = amdgpu_get_xgmi_hive(adev);
72497256
if (hive) {
7250-
mutex_lock(&hive->hive_lock);
72517257
reset_context.hive = hive;
7252-
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7258+
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
72537259
tmp_adev->pcie_reset_ctx.in_link_reset = true;
7254-
list_add_tail(&tmp_adev->reset_list, &device_list);
7255-
}
72567260
} else {
72577261
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
7258-
list_add_tail(&adev->reset_list, &device_list);
72597262
}
72607263

72617264
r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);

0 commit comments

Comments
 (0)