Skip to content

Commit c6a4d46

Browse files
committed
drm/xe: evict user memory in PM notifier
In the case of VRAM we might need to allocate large amounts of GFP_KERNEL memory on suspend; however, doing that directly in the driver .suspend()/.prepare() callbacks is not advisable (no swap, for example). To improve on this we can instead hook up to the PM notifier framework, which is invoked at an earlier stage. We effectively call the evict routine twice, where the notifier will hopefully have cleared out most, if not all, of the user memory by the time we call it a second time when entering the .suspend() callback. For S4 we also get the added benefit of allocating the system pages before the hibernation image size is calculated, which looks more sensible. Note that the .suspend() hook is still responsible for dealing with all the pinned memory. Improving that is left to another patch. Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1181 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4288 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4566 Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Link: https://lore.kernel.org/r/20250416150913.434369-6-matthew.auld@intel.com
1 parent fa59771 commit c6a4d46

File tree

6 files changed

+84
-24
lines changed

6 files changed

+84
-24
lines changed

drivers/gpu/drm/xe/xe_bo_evict.c

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,25 +47,17 @@ static int xe_bo_apply_to_pinned(struct xe_device *xe,
4747
}
4848

4949
/**
50-
* xe_bo_evict_all - evict all BOs from VRAM
51-
*
50+
* xe_bo_evict_all_user - evict all non-pinned user BOs from VRAM
5251
* @xe: xe device
5352
*
54-
* Evict non-pinned user BOs first (via GPU), evict pinned external BOs next
55-
* (via GPU), wait for evictions, and finally evict pinned kernel BOs via CPU.
56-
* All eviction magic done via TTM calls.
53+
* Evict non-pinned user BOs (via GPU).
5754
*
5855
* Evict == move VRAM BOs to temporary (typically system) memory.
59-
*
60-
* This function should be called before the device goes into a suspend state
61-
* where the VRAM loses power.
6256
*/
63-
int xe_bo_evict_all(struct xe_device *xe)
57+
int xe_bo_evict_all_user(struct xe_device *xe)
6458
{
6559
struct ttm_device *bdev = &xe->ttm;
66-
struct xe_tile *tile;
6760
u32 mem_type;
68-
u8 id;
6961
int ret;
7062

7163
/* User memory */
@@ -91,9 +83,34 @@ int xe_bo_evict_all(struct xe_device *xe)
9183
}
9284
}
9385

94-
ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.external,
95-
&xe->pinned.late.external,
96-
xe_bo_evict_pinned);
86+
return 0;
87+
}
88+
89+
/**
90+
* xe_bo_evict_all - evict all BOs from VRAM
91+
* @xe: xe device
92+
*
93+
* Evict non-pinned user BOs first (via GPU), evict pinned external BOs next
94+
* (via GPU), wait for evictions, and finally evict pinned kernel BOs via CPU.
95+
* All eviction magic done via TTM calls.
96+
*
97+
* Evict == move VRAM BOs to temporary (typically system) memory.
98+
*
99+
* This function should be called before the device goes into a suspend state
100+
* where the VRAM loses power.
101+
*/
102+
int xe_bo_evict_all(struct xe_device *xe)
103+
{
104+
struct xe_tile *tile;
105+
u8 id;
106+
int ret;
107+
108+
ret = xe_bo_evict_all_user(xe);
109+
if (ret)
110+
return ret;
111+
112+
ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,
113+
&xe->pinned.late.evicted, xe_bo_evict_pinned);
97114

98115
if (!ret)
99116
ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,

drivers/gpu/drm/xe/xe_bo_evict.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
struct xe_device;
1010

1111
int xe_bo_evict_all(struct xe_device *xe);
12+
int xe_bo_evict_all_user(struct xe_device *xe);
1213
int xe_bo_restore_early(struct xe_device *xe);
1314
int xe_bo_restore_late(struct xe_device *xe);
1415

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,9 @@ struct xe_device {
522522
struct mutex lock;
523523
} d3cold;
524524

525+
/** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
526+
struct notifier_block pm_notifier;
527+
525528
/** @pmt: Support the PMT driver callback interface */
526529
struct {
527530
/** @pmt.lock: protect access for telemetry data */

drivers/gpu/drm/xe/xe_pci.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -742,7 +742,7 @@ static void xe_pci_remove(struct pci_dev *pdev)
742742
return;
743743

744744
xe_device_remove(xe);
745-
xe_pm_runtime_fini(xe);
745+
xe_pm_fini(xe);
746746
}
747747

748748
/*

drivers/gpu/drm/xe/xe_pm.c

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,29 @@ static u32 vram_threshold_value(struct xe_device *xe)
286286
return DEFAULT_VRAM_THRESHOLD;
287287
}
288288

289+
static int xe_pm_notifier_callback(struct notifier_block *nb,
290+
unsigned long action, void *data)
291+
{
292+
struct xe_device *xe = container_of(nb, struct xe_device, pm_notifier);
293+
int err = 0;
294+
295+
switch (action) {
296+
case PM_HIBERNATION_PREPARE:
297+
case PM_SUSPEND_PREPARE:
298+
xe_pm_runtime_get(xe);
299+
err = xe_bo_evict_all_user(xe);
300+
xe_pm_runtime_put(xe);
301+
if (err)
302+
drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err);
303+
break;
304+
}
305+
306+
if (err)
307+
return NOTIFY_BAD;
308+
309+
return NOTIFY_DONE;
310+
}
311+
289312
/**
290313
* xe_pm_init - Initialize Xe Power Management
291314
* @xe: xe device instance
@@ -299,6 +322,11 @@ int xe_pm_init(struct xe_device *xe)
299322
u32 vram_threshold;
300323
int err;
301324

325+
xe->pm_notifier.notifier_call = xe_pm_notifier_callback;
326+
err = register_pm_notifier(&xe->pm_notifier);
327+
if (err)
328+
return err;
329+
302330
/* For now suspend/resume is only allowed with GuC */
303331
if (!xe_device_uc_enabled(xe))
304332
return 0;
@@ -308,31 +336,42 @@ int xe_pm_init(struct xe_device *xe)
308336
if (xe->d3cold.capable) {
309337
err = xe_device_sysfs_init(xe);
310338
if (err)
311-
return err;
339+
goto err_unregister;
312340

313341
vram_threshold = vram_threshold_value(xe);
314342
err = xe_pm_set_vram_threshold(xe, vram_threshold);
315343
if (err)
316-
return err;
344+
goto err_unregister;
317345
}
318346

319347
xe_pm_runtime_init(xe);
320-
321348
return 0;
349+
350+
err_unregister:
351+
unregister_pm_notifier(&xe->pm_notifier);
352+
return err;
322353
}
323354

324-
/**
325-
* xe_pm_runtime_fini - Finalize Runtime PM
326-
* @xe: xe device instance
327-
*/
328-
void xe_pm_runtime_fini(struct xe_device *xe)
355+
static void xe_pm_runtime_fini(struct xe_device *xe)
329356
{
330357
struct device *dev = xe->drm.dev;
331358

332359
pm_runtime_get_sync(dev);
333360
pm_runtime_forbid(dev);
334361
}
335362

363+
/**
364+
* xe_pm_fini - Finalize PM
365+
* @xe: xe device instance
366+
*/
367+
void xe_pm_fini(struct xe_device *xe)
368+
{
369+
if (xe_device_uc_enabled(xe))
370+
xe_pm_runtime_fini(xe);
371+
372+
unregister_pm_notifier(&xe->pm_notifier);
373+
}
374+
336375
static void xe_pm_write_callback_task(struct xe_device *xe,
337376
struct task_struct *task)
338377
{

drivers/gpu/drm/xe/xe_pm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ int xe_pm_resume(struct xe_device *xe);
1717

1818
int xe_pm_init_early(struct xe_device *xe);
1919
int xe_pm_init(struct xe_device *xe);
20-
void xe_pm_runtime_fini(struct xe_device *xe);
20+
void xe_pm_fini(struct xe_device *xe);
2121
bool xe_pm_runtime_suspended(struct xe_device *xe);
2222
int xe_pm_runtime_suspend(struct xe_device *xe);
2323
int xe_pm_runtime_resume(struct xe_device *xe);

0 commit comments

Comments
 (0)