Skip to content

Commit 5993345

Browse files
author
Thomas Hellström
committed
drm/xe: Block exec and rebind worker while evicting for suspend / hibernate
When the xe pm_notifier evicts for suspend / hibernate, there might be racing tasks trying to re-validate again. This can lead to suspend taking excessive time or getting stuck in a live-lock. This behaviour becomes much worse with the fix that actually makes re-validation bring back bos to VRAM rather than letting them remain in TT. Prevent that by having exec and the rebind worker wait for a completion that is set to block by the pm_notifier before suspend and is signaled by the pm_notifier after resume / wakeup. It's probably still possible to craft malicious applications that block suspending. More work is pending to fix that. v3: - Avoid wait_for_completion() in the kernel worker since it could potentially cause work item flushes from freezable processes to wait forever. Instead terminate the rebind workers if needed and re-launch at resume. (Matt Auld) v4: - Fix some bad naming and leftover debug printouts. - Fix kerneldoc. - Use drmm_mutex_init() for the xe->rebind_resume_lock (Matt Auld). - Rework the interface of xe_vm_rebind_resume_worker (Matt Auld). Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4288 Fixes: c6a4d46 ("drm/xe: evict user memory in PM notifier") Cc: Matthew Auld <matthew.auld@intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: <stable@vger.kernel.org> # v6.16+ Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://lore.kernel.org/r/20250904160715.2613-4-thomas.hellstrom@linux.intel.com
1 parent ebd546f commit 5993345

File tree

6 files changed

+88
-1
lines changed

6 files changed

+88
-1
lines changed

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,12 @@ struct xe_device {
507507

508508
/** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
509509
struct notifier_block pm_notifier;
510+
/** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */
511+
struct completion pm_block;
512+
/** @rebind_resume_list: List of wq items to kick on resume. */
513+
struct list_head rebind_resume_list;
514+
/** @rebind_resume_lock: Lock to protect the rebind_resume_list */
515+
struct mutex rebind_resume_lock;
510516

511517
/** @pmt: Support the PMT driver callback interface */
512518
struct {

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
238238
goto err_unlock_list;
239239
}
240240

241+
/*
242+
* It's OK to block interruptible here with the vm lock held, since
243+
* on task freezing during suspend / hibernate, the call will
244+
* return -ERESTARTSYS and the IOCTL will be rerun.
245+
*/
246+
err = wait_for_completion_interruptible(&xe->pm_block);
247+
if (err)
248+
goto err_unlock_list;
249+
241250
vm_exec.vm = &vm->gpuvm;
242251
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
243252
if (xe_vm_in_lr_mode(vm)) {

drivers/gpu/drm/xe/xe_pm.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "xe_pxp.h"
2626
#include "xe_sriov_vf_ccs.h"
2727
#include "xe_trace.h"
28+
#include "xe_vm.h"
2829
#include "xe_wa.h"
2930

3031
/**
@@ -301,6 +302,19 @@ static u32 vram_threshold_value(struct xe_device *xe)
301302
return DEFAULT_VRAM_THRESHOLD;
302303
}
303304

305+
static void xe_pm_wake_rebind_workers(struct xe_device *xe)
306+
{
307+
struct xe_vm *vm, *next;
308+
309+
mutex_lock(&xe->rebind_resume_lock);
310+
list_for_each_entry_safe(vm, next, &xe->rebind_resume_list,
311+
preempt.pm_activate_link) {
312+
list_del_init(&vm->preempt.pm_activate_link);
313+
xe_vm_resume_rebind_worker(vm);
314+
}
315+
mutex_unlock(&xe->rebind_resume_lock);
316+
}
317+
304318
static int xe_pm_notifier_callback(struct notifier_block *nb,
305319
unsigned long action, void *data)
306320
{
@@ -310,6 +324,7 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
310324
switch (action) {
311325
case PM_HIBERNATION_PREPARE:
312326
case PM_SUSPEND_PREPARE:
327+
reinit_completion(&xe->pm_block);
313328
xe_pm_runtime_get(xe);
314329
err = xe_bo_evict_all_user(xe);
315330
if (err)
@@ -326,6 +341,8 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
326341
break;
327342
case PM_POST_HIBERNATION:
328343
case PM_POST_SUSPEND:
344+
complete_all(&xe->pm_block);
345+
xe_pm_wake_rebind_workers(xe);
329346
xe_bo_notifier_unprepare_all_pinned(xe);
330347
xe_pm_runtime_put(xe);
331348
break;
@@ -352,6 +369,14 @@ int xe_pm_init(struct xe_device *xe)
352369
if (err)
353370
return err;
354371

372+
err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock);
373+
if (err)
374+
goto err_unregister;
375+
376+
init_completion(&xe->pm_block);
377+
complete_all(&xe->pm_block);
378+
INIT_LIST_HEAD(&xe->rebind_resume_list);
379+
355380
/* For now suspend/resume is only allowed with GuC */
356381
if (!xe_device_uc_enabled(xe))
357382
return 0;

drivers/gpu/drm/xe/xe_vm.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
343343
list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
344344
&vm->rebind_list);
345345

346+
if (!try_wait_for_completion(&vm->xe->pm_block))
347+
return -EAGAIN;
348+
346349
ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
347350
if (ret)
348351
return ret;
@@ -429,6 +432,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
429432
return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
430433
}
431434

435+
static bool vm_suspend_rebind_worker(struct xe_vm *vm)
436+
{
437+
struct xe_device *xe = vm->xe;
438+
bool ret = false;
439+
440+
mutex_lock(&xe->rebind_resume_lock);
441+
if (!try_wait_for_completion(&vm->xe->pm_block)) {
442+
ret = true;
443+
list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
444+
}
445+
mutex_unlock(&xe->rebind_resume_lock);
446+
447+
return ret;
448+
}
449+
450+
/**
451+
* xe_vm_resume_rebind_worker() - Resume the rebind worker.
452+
* @vm: The vm whose preempt worker to resume.
453+
*
454+
* Resume a preempt worker that was previously suspended by
455+
* vm_suspend_rebind_worker().
456+
*/
457+
void xe_vm_resume_rebind_worker(struct xe_vm *vm)
458+
{
459+
queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
460+
}
461+
432462
static void preempt_rebind_work_func(struct work_struct *w)
433463
{
434464
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -452,6 +482,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
452482
}
453483

454484
retry:
485+
if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
486+
up_write(&vm->lock);
487+
return;
488+
}
489+
455490
if (xe_vm_userptr_check_repin(vm)) {
456491
err = xe_vm_userptr_pin(vm);
457492
if (err)
@@ -1470,6 +1505,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
14701505
if (flags & XE_VM_FLAG_LR_MODE) {
14711506
INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
14721507
xe_pm_runtime_get_noresume(xe);
1508+
INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
14731509
}
14741510

14751511
err = xe_svm_init(vm);
@@ -1649,8 +1685,12 @@ void xe_vm_close_and_put(struct xe_vm *vm)
16491685
xe_assert(xe, !vm->preempt.num_exec_queues);
16501686

16511687
xe_vm_close(vm);
1652-
if (xe_vm_in_preempt_fence_mode(vm))
1688+
if (xe_vm_in_preempt_fence_mode(vm)) {
1689+
mutex_lock(&xe->rebind_resume_lock);
1690+
list_del_init(&vm->preempt.pm_activate_link);
1691+
mutex_unlock(&xe->rebind_resume_lock);
16531692
flush_work(&vm->preempt.rebind_work);
1693+
}
16541694
if (xe_vm_in_fault_mode(vm))
16551695
xe_svm_close(vm);
16561696

drivers/gpu/drm/xe/xe_vm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
271271
struct xe_exec_queue *q, u64 addr,
272272
enum xe_cache_level cache_lvl);
273273

274+
void xe_vm_resume_rebind_worker(struct xe_vm *vm);
275+
274276
/**
275277
* xe_vm_resv() - Return's the vm's reservation object
276278
* @vm: The vm

drivers/gpu/drm/xe/xe_vm_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,11 @@ struct xe_vm {
282282
* BOs
283283
*/
284284
struct work_struct rebind_work;
285+
/**
286+
* @preempt.pm_activate_link: Link to list of rebind workers to be
287+
* kicked on resume.
288+
*/
289+
struct list_head pm_activate_link;
285290
} preempt;
286291

287292
/** @um: unified memory state */

0 commit comments

Comments
 (0)