Skip to content

Commit 087a3e1

Browse files
ChristianKoenigAMDalexdeucher
authored andcommitted
drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2"
Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the HW in an active state and is an unbalanced use of the IP callbacks. Using the IP callbacks like this can lead to memory leaks, double free and imbalanced reference counters. Leaving the HW in an active state can lead to DMA accesses to memory now freed by the driver. Both is a complete no-go for driver unload so completely revert the workaround for now. This reverts commit f5c7e77. Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 8a1f7fd commit 087a3e1

File tree

4 files changed

+1
-65
lines changed

4 files changed

+1
-65
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5267,7 +5267,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52675267
struct amdgpu_device *tmp_adev = NULL;
52685268
bool need_full_reset, skip_hw_reset, vram_lost = false;
52695269
int r = 0;
5270-
bool gpu_reset_for_dev_remove = 0;
52715270

52725271
/* Try reset handler method first */
52735272
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5287,10 +5286,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52875286
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
52885287
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
52895288

5290-
gpu_reset_for_dev_remove =
5291-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5292-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5293-
52945289
/*
52955290
* ASIC reset has to be done on all XGMI hive nodes ASAP
52965291
* to allow proper links negotiation in FW (within 1 sec)
@@ -5333,18 +5328,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
53335328
amdgpu_ras_intr_cleared();
53345329
}
53355330

5336-
/* Since the mode1 reset affects base ip blocks, the
5337-
* phase1 ip blocks need to be resumed. Otherwise there
5338-
* will be a BIOS signature error and the psp bootloader
5339-
* can't load kdb on the next amdgpu install.
5340-
*/
5341-
if (gpu_reset_for_dev_remove) {
5342-
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5343-
amdgpu_device_ip_resume_phase1(tmp_adev);
5344-
5345-
goto end;
5346-
}
5347-
53485331
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
53495332
if (need_full_reset) {
53505333
/* post card */
@@ -5581,11 +5564,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
55815564
int i, r = 0;
55825565
bool need_emergency_restart = false;
55835566
bool audio_suspended = false;
5584-
bool gpu_reset_for_dev_remove = false;
5585-
5586-
gpu_reset_for_dev_remove =
5587-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5588-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
55895567

55905568
/*
55915569
* Special case: RAS triggered and full reset isn't supported
@@ -5623,7 +5601,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56235601
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
56245602
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
56255603
list_add_tail(&tmp_adev->reset_list, &device_list);
5626-
if (gpu_reset_for_dev_remove && adev->shutdown)
5604+
if (adev->shutdown)
56275605
tmp_adev->shutdown = true;
56285606
}
56295607
if (!list_is_first(&adev->reset_list, &device_list))
@@ -5708,10 +5686,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57085686

57095687
retry: /* Rest of adevs pre asic reset from XGMI hive. */
57105688
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5711-
if (gpu_reset_for_dev_remove) {
5712-
/* Workaroud for ASICs need to disable SMC first */
5713-
amdgpu_device_smu_fini_early(tmp_adev);
5714-
}
57155689
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
57165690
/*TODO Should we stop ?*/
57175691
if (r) {
@@ -5743,9 +5717,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57435717
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
57445718
if (r && r == -EAGAIN)
57455719
goto retry;
5746-
5747-
if (!r && gpu_reset_for_dev_remove)
5748-
goto recover_end;
57495720
}
57505721

57515722
skip_hw_reset:
@@ -5801,7 +5772,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
58015772
amdgpu_ras_set_error_query_ready(tmp_adev, true);
58025773
}
58035774

5804-
recover_end:
58055775
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
58065776
reset_list);
58075777
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2337,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
23372337
pm_runtime_forbid(dev->dev);
23382338
}
23392339

2340-
if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
2341-
!amdgpu_sriov_vf(adev)) {
2342-
bool need_to_reset_gpu = false;
2343-
2344-
if (adev->gmc.xgmi.num_physical_nodes > 1) {
2345-
struct amdgpu_hive_info *hive;
2346-
2347-
hive = amdgpu_get_xgmi_hive(adev);
2348-
if (hive->device_remove_count == 0)
2349-
need_to_reset_gpu = true;
2350-
hive->device_remove_count++;
2351-
amdgpu_put_xgmi_hive(hive);
2352-
} else {
2353-
need_to_reset_gpu = true;
2354-
}
2355-
2356-
/* Workaround for ASICs need to reset SMU.
2357-
* Called only when the first device is removed.
2358-
*/
2359-
if (need_to_reset_gpu) {
2360-
struct amdgpu_reset_context reset_context;
2361-
2362-
adev->shutdown = true;
2363-
memset(&reset_context, 0, sizeof(reset_context));
2364-
reset_context.method = AMD_RESET_METHOD_NONE;
2365-
reset_context.reset_req_dev = adev;
2366-
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2367-
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
2368-
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
2369-
}
2370-
}
2371-
23722340
amdgpu_driver_unload_kms(dev);
23732341

23742342
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ enum AMDGPU_RESET_FLAGS {
3232

3333
AMDGPU_NEED_FULL_RESET = 0,
3434
AMDGPU_SKIP_HW_RESET = 1,
35-
AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,
3635
};
3736

3837
struct amdgpu_reset_context {

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ struct amdgpu_hive_info {
4343
} pstate;
4444

4545
struct amdgpu_reset_domain *reset_domain;
46-
uint32_t device_remove_count;
4746
atomic_t ras_recovery;
4847
};
4948

0 commit comments

Comments
 (0)