Commit ae5fbbd

dceraolo authored and rodrigovivi committed
drm/xe: Fix error handling if PXP fails to start
Since the PXP start comes after __xe_exec_queue_init() has completed, we need to clean up what was done in that function in case of a PXP start error.

__xe_exec_queue_init calls the submission backend init() function, so we need to introduce an opposite for that. Unfortunately, while we already have a fini() function pointer, it performs other operations in addition to cleaning up what was done by the init(). Therefore, for clarity, the existing fini() has been renamed to destroy(), while a new fini() has been added to only clean up what was done by the init(), with the latter being called by the former (via xe_exec_queue_fini).

Fixes: 72d4796 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues")
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com
(cherry picked from commit 6266673)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent ff89a4d commit ae5fbbd
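For orientation before the diffs, here is a minimal, self-contained sketch of the layering this commit establishes: init() and fini() are strict opposites, while destroy() is the full-teardown entry point that funnels through the common fini path. The names below (queue_ops, queue_fini, backend_*, queue_create) are invented stand-ins for the xe types, not actual driver code.

/* Hedged sketch of the init/fini/destroy split; all names here are
 * invented stand-ins, not the real xe structures. */
#include <stdio.h>

struct queue;

struct queue_ops {
	int  (*init)(struct queue *q);    /* backend setup */
	void (*fini)(struct queue *q);    /* strict inverse of init() */
	void (*destroy)(struct queue *q); /* full teardown; must reach fini() */
};

struct queue {
	const struct queue_ops *ops;
};

/* Mirrors xe_exec_queue_fini(): common cleanup that also undoes init(). */
static void queue_fini(struct queue *q)
{
	q->ops->fini(q);
	/* ...put LRCs, free common state... */
}

static int backend_init(struct queue *q) { (void)q; puts("init"); return 0; }
static void backend_fini(struct queue *q) { (void)q; puts("fini (undo init)"); }

/* Per the new contract, destroy() must go through the common fini path,
 * which in turn calls the backend fini(). */
static void backend_destroy(struct queue *q)
{
	puts("destroy");
	queue_fini(q);
}

static const struct queue_ops ops = {
	.init = backend_init,
	.fini = backend_fini,
	.destroy = backend_destroy,
};

/* Mirrors the fixed error unwind: a failure after init() (the PXP start
 * in the real code) unwinds through fini() before freeing. */
static struct queue *queue_create(int pxp_fails)
{
	static struct queue q = { .ops = &ops }; /* static for brevity */

	if (q.ops->init(&q))
		goto err_post_alloc;
	if (pxp_fails)
		goto err_post_init;	/* the new label added by this commit */
	return &q;

err_post_init:
	queue_fini(&q);
err_post_alloc:
	/* ...free common state... */
	return NULL;
}

int main(void)
{
	queue_create(1);	/* prints: init, destroy is never reached, fini runs once */
	return 0;
}

The invariant: every queue that survives __xe_exec_queue_init() leaves through fini() exactly once, whether via the new err_post_init label or via destroy().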

6 files changed: +72 −41 lines

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 15 additions & 7 deletions

@@ -151,6 +151,16 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q)
 	return err;
 }
 
+static void __xe_exec_queue_fini(struct xe_exec_queue *q)
+{
+	int i;
+
+	q->ops->fini(q);
+
+	for (i = 0; i < q->width; ++i)
+		xe_lrc_put(q->lrc[i]);
+}
+
 struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
 					   u32 logical_mask, u16 width,
 					   struct xe_hw_engine *hwe, u32 flags,
@@ -181,11 +191,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v
 	if (xe_exec_queue_uses_pxp(q)) {
 		err = xe_pxp_exec_queue_add(xe->pxp, q);
 		if (err)
-			goto err_post_alloc;
+			goto err_post_init;
 	}
 
 	return q;
 
+err_post_init:
+	__xe_exec_queue_fini(q);
 err_post_alloc:
 	__xe_exec_queue_free(q);
 	return ERR_PTR(err);
@@ -283,13 +295,11 @@ void xe_exec_queue_destroy(struct kref *ref)
 		xe_exec_queue_put(eq);
 	}
 
-	q->ops->fini(q);
+	q->ops->destroy(q);
 }
 
 void xe_exec_queue_fini(struct xe_exec_queue *q)
 {
-	int i;
-
 	/*
 	 * Before releasing our ref to lrc and xef, accumulate our run ticks
 	 * and wakeup any waiters.
@@ -298,9 +308,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
 	if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal))
 		wake_up_var(&q->xef->exec_queue.pending_removal);
 
-	for (i = 0; i < q->width; ++i)
-		xe_lrc_put(q->lrc[i]);
-
+	__xe_exec_queue_fini(q);
 	__xe_exec_queue_free(q);
 }

drivers/gpu/drm/xe/xe_exec_queue_types.h

Lines changed: 7 additions & 1 deletion

@@ -166,8 +166,14 @@ struct xe_exec_queue_ops {
 	int (*init)(struct xe_exec_queue *q);
 	/** @kill: Kill inflight submissions for backend */
 	void (*kill)(struct xe_exec_queue *q);
-	/** @fini: Fini exec queue for submission backend */
+	/** @fini: Undoes the init() for submission backend */
 	void (*fini)(struct xe_exec_queue *q);
+	/**
+	 * @destroy: Destroy exec queue for submission backend. The backend
+	 * function must call xe_exec_queue_fini() (which will in turn call the
+	 * fini() backend function) to ensure the queue is properly cleaned up.
+	 */
+	void (*destroy)(struct xe_exec_queue *q);
 	/** @set_priority: Set priority for exec queue */
 	int (*set_priority)(struct xe_exec_queue *q,
 			    enum xe_exec_queue_priority priority);

drivers/gpu/drm/xe/xe_execlist.c

Lines changed: 16 additions & 9 deletions

@@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q)
 	return err;
 }
 
-static void execlist_exec_queue_fini_async(struct work_struct *w)
+static void execlist_exec_queue_fini(struct xe_exec_queue *q)
+{
+	struct xe_execlist_exec_queue *exl = q->execlist;
+
+	drm_sched_entity_fini(&exl->entity);
+	drm_sched_fini(&exl->sched);
+
+	kfree(exl);
+}
+
+static void execlist_exec_queue_destroy_async(struct work_struct *w)
 {
 	struct xe_execlist_exec_queue *ee =
-		container_of(w, struct xe_execlist_exec_queue, fini_async);
+		container_of(w, struct xe_execlist_exec_queue, destroy_async);
 	struct xe_exec_queue *q = ee->q;
 	struct xe_execlist_exec_queue *exl = q->execlist;
 	struct xe_device *xe = gt_to_xe(q->gt);
@@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w)
 	list_del(&exl->active_link);
 	spin_unlock_irqrestore(&exl->port->lock, flags);
 
-	drm_sched_entity_fini(&exl->entity);
-	drm_sched_fini(&exl->sched);
-	kfree(exl);
-
 	xe_exec_queue_fini(q);
 }
 
@@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q)
 	/* NIY */
 }
 
-static void execlist_exec_queue_fini(struct xe_exec_queue *q)
+static void execlist_exec_queue_destroy(struct xe_exec_queue *q)
 {
-	INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async);
-	queue_work(system_unbound_wq, &q->execlist->fini_async);
+	INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async);
+	queue_work(system_unbound_wq, &q->execlist->destroy_async);
 }
 
 static int execlist_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
 	.init = execlist_exec_queue_init,
 	.kill = execlist_exec_queue_kill,
 	.fini = execlist_exec_queue_fini,
+	.destroy = execlist_exec_queue_destroy,
 	.set_priority = execlist_exec_queue_set_priority,
 	.set_timeslice = execlist_exec_queue_set_timeslice,
 	.set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,

drivers/gpu/drm/xe/xe_execlist_types.h

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ struct xe_execlist_exec_queue {
 
 	bool has_run;
 
-	struct work_struct fini_async;
+	struct work_struct destroy_async;
 
 	enum xe_exec_queue_priority active_priority;
 	struct list_head active_link;

drivers/gpu/drm/xe/xe_guc_exec_queue_types.h

Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ struct xe_guc_exec_queue {
 	struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
 	/** @lr_tdr: long running TDR worker */
 	struct work_struct lr_tdr;
-	/** @fini_async: do final fini async from this worker */
-	struct work_struct fini_async;
+	/** @destroy_async: do final destroy async from this worker */
+	struct work_struct destroy_async;
 	/** @resume_time: time of last resume */
 	u64 resume_time;
 	/** @state: GuC specific state for this xe_exec_queue */

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 31 additions & 21 deletions

@@ -1277,48 +1277,57 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	return DRM_GPU_SCHED_STAT_NO_HANG;
 }
 
-static void __guc_exec_queue_fini_async(struct work_struct *w)
+static void guc_exec_queue_fini(struct xe_exec_queue *q)
+{
+	struct xe_guc_exec_queue *ge = q->guc;
+	struct xe_guc *guc = exec_queue_to_guc(q);
+
+	release_guc_id(guc, q);
+	xe_sched_entity_fini(&ge->entity);
+	xe_sched_fini(&ge->sched);
+
+	/*
+	 * RCU free due sched being exported via DRM scheduler fences
+	 * (timeline name).
+	 */
+	kfree_rcu(ge, rcu);
+}
+
+static void __guc_exec_queue_destroy_async(struct work_struct *w)
 {
 	struct xe_guc_exec_queue *ge =
-		container_of(w, struct xe_guc_exec_queue, fini_async);
+		container_of(w, struct xe_guc_exec_queue, destroy_async);
 	struct xe_exec_queue *q = ge->q;
 	struct xe_guc *guc = exec_queue_to_guc(q);
 
 	xe_pm_runtime_get(guc_to_xe(guc));
 	trace_xe_exec_queue_destroy(q);
 
-	release_guc_id(guc, q);
 	if (xe_exec_queue_is_lr(q))
 		cancel_work_sync(&ge->lr_tdr);
 	/* Confirm no work left behind accessing device structures */
 	cancel_delayed_work_sync(&ge->sched.base.work_tdr);
-	xe_sched_entity_fini(&ge->entity);
-	xe_sched_fini(&ge->sched);
 
-	/*
-	 * RCU free due sched being exported via DRM scheduler fences
-	 * (timeline name).
-	 */
-	kfree_rcu(ge, rcu);
 	xe_exec_queue_fini(q);
+
 	xe_pm_runtime_put(guc_to_xe(guc));
 }
 
-static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
+static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
 {
 	struct xe_guc *guc = exec_queue_to_guc(q);
 	struct xe_device *xe = guc_to_xe(guc);
 
-	INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
+	INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
 
 	/* We must block on kernel engines so slabs are empty on driver unload */
 	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
-		__guc_exec_queue_fini_async(&q->guc->fini_async);
+		__guc_exec_queue_destroy_async(&q->guc->destroy_async);
 	else
-		queue_work(xe->destroy_wq, &q->guc->fini_async);
+		queue_work(xe->destroy_wq, &q->guc->destroy_async);
 }
 
-static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
+static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q)
 {
 	/*
 	 * Might be done from within the GPU scheduler, need to do async as we
@@ -1327,7 +1336,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
 	 * this we and don't really care when everything is fini'd, just that it
 	 * is.
 	 */
-	guc_exec_queue_fini_async(q);
+	guc_exec_queue_destroy_async(q);
 }
 
 static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
@@ -1341,7 +1350,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
 	if (exec_queue_registered(q))
 		disable_scheduling_deregister(guc, q);
 	else
-		__guc_exec_queue_fini(guc, q);
+		__guc_exec_queue_destroy(guc, q);
 }
 
 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
@@ -1574,14 +1583,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
 #define STATIC_MSG_CLEANUP 0
 #define STATIC_MSG_SUSPEND 1
 #define STATIC_MSG_RESUME 2
-static void guc_exec_queue_fini(struct xe_exec_queue *q)
+static void guc_exec_queue_destroy(struct xe_exec_queue *q)
 {
 	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
 
 	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
 		guc_exec_queue_add_msg(q, msg, CLEANUP);
 	else
-		__guc_exec_queue_fini(exec_queue_to_guc(q), q);
+		__guc_exec_queue_destroy(exec_queue_to_guc(q), q);
 }
 
 static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -1711,6 +1720,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
 	.init = guc_exec_queue_init,
 	.kill = guc_exec_queue_kill,
 	.fini = guc_exec_queue_fini,
+	.destroy = guc_exec_queue_destroy,
 	.set_priority = guc_exec_queue_set_priority,
 	.set_timeslice = guc_exec_queue_set_timeslice,
 	.set_preempt_timeout = guc_exec_queue_set_preempt_timeout,
@@ -1732,7 +1742,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
 		if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
 			xe_exec_queue_put(q);
 		else if (exec_queue_destroyed(q))
-			__guc_exec_queue_fini(guc, q);
+			__guc_exec_queue_destroy(guc, q);
 	}
 	if (q->guc->suspend_pending) {
 		set_exec_queue_suspended(q);
@@ -1989,7 +1999,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
 	if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
 		xe_exec_queue_put(q);
 	else
-		__guc_exec_queue_fini(guc, q);
+		__guc_exec_queue_destroy(guc, q);
 }
 
 int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
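As the comment in __guc_exec_queue_destroy() notes, teardown may be invoked from inside the GPU scheduler, which cannot fini itself (circular dependency), so destroy() only queues a work item. Below is a hedged sketch of that deferred-destroy pattern, using invented toy_* names rather than the real xe structures.

/* Hedged illustration of the deferred-destroy pattern above: when teardown
 * can be triggered from inside the scheduler itself, freeing the scheduler
 * synchronously would deadlock, so destroy() just queues a work item.
 * toy_queue and the toy_* functions are invented for this sketch. */
#include <linux/workqueue.h>
#include <linux/slab.h>

struct toy_queue {
	struct work_struct destroy_async;	/* mirrors ge->destroy_async */
};

static void toy_destroy_worker(struct work_struct *w)
{
	struct toy_queue *q = container_of(w, struct toy_queue, destroy_async);

	/* Safe (process) context: cancel pending work, fini the scheduler,
	 * then free the backend state. */
	kfree(q);
}

static void toy_queue_destroy(struct toy_queue *q)
{
	INIT_WORK(&q->destroy_async, toy_destroy_worker);
	queue_work(system_unbound_wq, &q->destroy_async);
}

The real driver additionally runs the worker synchronously for EXEC_QUEUE_FLAG_PERMANENT queues, so that slabs are empty on driver unload.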
