Skip to content

Commit bac38ca

Browse files
jokim-amd authored and alexdeucher committed
drm/amdkfd: implement per queue sdma reset for gfx 9.4+
To reset hung SDMA queues on GFX 9.4+ for the GFX9 family, a soft reset must be issued through SMU. Since soft resets will reset an entire SDMA engine, use a common KGD call to do the reset, as the KGD will handle avoiding a reset of in-flight GFX and paging queues on that engine.

In addition, create a common call for all reset types to simplify the handling of module parameter settings that block GPU resets.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 057fef2 commit bac38ca

12 files changed

+171
-25
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,4 +193,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
193193
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
194194
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
195195
.hqd_reset = kgd_gfx_v9_hqd_reset,
196+
.hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
196197
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -419,5 +419,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
419419
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
420420
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
421421
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
422-
.hqd_reset = kgd_gfx_v9_hqd_reset
422+
.hqd_reset = kgd_gfx_v9_hqd_reset,
423+
.hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
423424
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -509,6 +509,17 @@ static uint32_t kgd_gfx_v9_4_3_clear_address_watch(struct amdgpu_device *adev,
509509
return 0;
510510
}
511511

512+
/*
 * Report the doorbell programmed for one SDMA RLC queue on GC 9.4.3.
 *
 * @adev:   device whose SDMA registers are read
 * @engine: SDMA engine index, passed to get_sdma_rlc_reg_offset()
 * @queue:  queue index on that engine, passed to get_sdma_rlc_reg_offset()
 *
 * Reads the queue's CONTEXT_STATUS and DOORBELL_OFFSET registers.
 * Returns the DOORBELL_OFFSET register value shifted right by two when the
 * SELECTED field of CONTEXT_STATUS is set, and 0 otherwise.
 *
 * NOTE(review): the >> 2 presumably converts a byte offset into a dword
 * doorbell index - confirm against the register spec and the caller.
 */
static uint32_t kgd_gfx_v9_4_3_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
513+
int engine, int queue)
514+
{
515+
uint32_t reg_offset = get_sdma_rlc_reg_offset(adev, engine, queue);
516+
uint32_t status = RREG32(regSDMA_RLC0_CONTEXT_STATUS + reg_offset);
517+
uint32_t doorbell_off = RREG32(regSDMA_RLC0_DOORBELL_OFFSET + reg_offset);
518+
bool is_active = !!REG_GET_FIELD(status, SDMA_RLC0_CONTEXT_STATUS, SELECTED);
519+
520+
/* A queue whose SELECTED bit is clear is reported as doorbell 0. */
return is_active ? doorbell_off >> 2 : 0;
521+
}
522+
512523
const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
513524
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
514525
.set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
@@ -543,5 +554,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
543554
.set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
544555
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
545556
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
546-
.hqd_reset = kgd_gfx_v9_hqd_reset
557+
.hqd_reset = kgd_gfx_v9_hqd_reset,
558+
.hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell
547559
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1084,6 +1084,12 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
10841084
return 0;
10851085
}
10861086

1087+
/*
 * GFX10 implementation of the hqd_sdma_get_doorbell hook.
 *
 * Stub: reads no hardware state, ignores @adev, @engine and @queue, and
 * always returns 0.
 *
 * NOTE(review): 0 presumably tells the caller that no active SDMA queue
 * doorbell is reported on this ASIC family - confirm against the caller.
 */
uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
1088+
int engine, int queue)
1089+
{
1090+
return 0;
1091+
}
1092+
10871093
const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
10881094
.program_sh_mem_settings = kgd_program_sh_mem_settings,
10891095
.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
@@ -1112,5 +1118,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
11121118
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
11131119
.program_trap_handler_settings = program_trap_handler_settings,
11141120
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
1115-
.hqd_reset = kgd_gfx_v10_hqd_reset
1121+
.hqd_reset = kgd_gfx_v10_hqd_reset,
1122+
.hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
11161123
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,3 +65,5 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
6565
uint32_t queue_id,
6666
uint32_t inst,
6767
unsigned int utimeout);
68+
/*
 * GFX10 hqd_sdma_get_doorbell hook: takes the device plus an SDMA engine
 * and queue index and returns a 32-bit doorbell value.
 */
uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
69+
int engine, int queue);

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -682,5 +682,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
682682
.set_address_watch = kgd_gfx_v10_set_address_watch,
683683
.clear_address_watch = kgd_gfx_v10_clear_address_watch,
684684
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
685-
.hqd_reset = kgd_gfx_v10_hqd_reset
685+
.hqd_reset = kgd_gfx_v10_hqd_reset,
686+
.hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
686687
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -800,6 +800,12 @@ static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev,
800800
return 0;
801801
}
802802

803+
/*
 * GFX11 implementation of the hqd_sdma_get_doorbell hook.
 *
 * Stub: reads no hardware state, ignores @adev, @engine and @queue, and
 * always returns 0.
 *
 * NOTE(review): 0 presumably tells the caller that no active SDMA queue
 * doorbell is reported on this ASIC family - confirm against the caller.
 */
static uint32_t kgd_gfx_v11_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
804+
int engine, int queue)
805+
{
806+
return 0;
807+
}
808+
803809
const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
804810
.program_sh_mem_settings = program_sh_mem_settings_v11,
805811
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -824,5 +830,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
824830
.set_address_watch = kgd_gfx_v11_set_address_watch,
825831
.clear_address_watch = kgd_gfx_v11_clear_address_watch,
826832
.hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr,
827-
.hqd_reset = kgd_gfx_v11_hqd_reset
833+
.hqd_reset = kgd_gfx_v11_hqd_reset,
834+
.hqd_sdma_get_doorbell = kgd_gfx_v11_hqd_sdma_get_doorbell
828835
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -361,6 +361,12 @@ static uint32_t kgd_gfx_v12_clear_address_watch(struct amdgpu_device *adev,
361361
return 0;
362362
}
363363

364+
/*
 * GFX12 implementation of the hqd_sdma_get_doorbell hook.
 *
 * Stub: reads no hardware state, ignores @adev, @engine and @queue, and
 * always returns 0.
 *
 * NOTE(review): 0 presumably tells the caller that no active SDMA queue
 * doorbell is reported on this ASIC family - confirm against the caller.
 */
static uint32_t kgd_gfx_v12_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
365+
int engine, int queue)
366+
{
367+
return 0;
368+
}
369+
364370
const struct kfd2kgd_calls gfx_v12_kfd2kgd = {
365371
.init_interrupts = init_interrupts_v12,
366372
.hqd_dump = hqd_dump_v12,
@@ -374,4 +380,5 @@ const struct kfd2kgd_calls gfx_v12_kfd2kgd = {
374380
.set_wave_launch_mode = kgd_gfx_v12_set_wave_launch_mode,
375381
.set_address_watch = kgd_gfx_v12_set_address_watch,
376382
.clear_address_watch = kgd_gfx_v12_clear_address_watch,
383+
.hqd_sdma_get_doorbell = kgd_gfx_v12_hqd_sdma_get_doorbell
377384
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1131,9 +1131,6 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
11311131
uint32_t low, high;
11321132
uint64_t queue_addr = 0;
11331133

1134-
if (!amdgpu_gpu_recovery)
1135-
return 0;
1136-
11371134
kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
11381135
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
11391136

@@ -1182,9 +1179,6 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
11821179
uint32_t low, high, pipe_reset_data = 0;
11831180
uint64_t queue_addr = 0;
11841181

1185-
if (!amdgpu_gpu_recovery)
1186-
return 0;
1187-
11881182
kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
11891183
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
11901184

@@ -1229,6 +1223,13 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
12291223
return queue_addr;
12301224
}
12311225

1226+
/*
 * Generic GFX9 implementation of the hqd_sdma_get_doorbell hook.
 *
 * Stub: reads no hardware state, ignores @adev, @engine and @queue, and
 * always returns 0. Non-static because it is declared in the gfx_v9 header
 * and installed in more than one kfd2kgd_calls table.
 *
 * NOTE(review): 0 presumably tells the caller that no active SDMA queue
 * doorbell is reported for these ASICs - confirm against the caller.
 */
uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
1227+
int engine, int queue)
1228+
1229+
{
1230+
return 0;
1231+
}
1232+
12321233
const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
12331234
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
12341235
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -1258,5 +1259,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
12581259
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
12591260
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
12601261
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
1261-
.hqd_reset = kgd_gfx_v9_hqd_reset
1262+
.hqd_reset = kgd_gfx_v9_hqd_reset,
1263+
.hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
12621264
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -111,3 +111,5 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
111111
uint32_t queue_id,
112112
uint32_t inst,
113113
unsigned int utimeout);
114+
/*
 * GFX9 hqd_sdma_get_doorbell hook: takes the device plus an SDMA engine
 * and queue index and returns a 32-bit doorbell value.
 */
uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
115+
int engine, int queue);

0 commit comments

Comments
 (0)