Skip to content

Commit

Permalink
drm/i915: Don't touch fence->error when resetting an innocent request
Browse files Browse the repository at this point in the history
If the request has been completed before the reset took effect, we don't
need to mark it up as being a victim. Touching fence->error after the
fence has been signaled is detected by dma_fence_set_error() and
triggers a BUG:

[  231.743133] kernel BUG at ./include/linux/dma-fence.h:434!
[  231.743156] invalid opcode: 0000 [FireflyTeam#1] SMP KASAN
[  231.743172] Modules linked in: i915 drm_kms_helper drm iptable_nat nf_nat_ipv4 nf_nat x86_pkg_temp_thermal iosf_mbi i2c_algo_bit cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea fb font fbdev [last unloaded: drm]
[  231.743221] CPU: 2 PID: 20 Comm: kworker/2:0 Tainted: G     U          4.13.0-rc1+ rockchip-linux#52
[  231.743236] Hardware name: Hewlett-Packard HP EliteBook 8460p/161C, BIOS 68SCF Ver. F.01 03/11/2011
[  231.743363] Workqueue: events_long i915_hangcheck_elapsed [i915]
[  231.743382] task: ffff8801f42e9780 task.stack: ffff8801f42f8000
[  231.743489] RIP: 0010:i915_gem_reset_engine+0x45a/0x460 [i915]
[  231.743505] RSP: 0018:ffff8801f42ff770 EFLAGS: 00010202
[  231.743521] RAX: 0000000000000007 RBX: ffff8801bf6b1880 RCX: ffffffffa02881a6
[  231.743537] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff8801bf6b18c8
[  231.743551] RBP: ffff8801f42ff7c8 R08: 0000000000000001 R09: 0000000000000000
[  231.743566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801edb02d00
[  231.743581] R13: ffff8801e19d4200 R14: 000000000000001d R15: ffff8801ce2a4000
[  231.743599] FS:  0000000000000000(0000) GS:ffff8801f5a80000(0000) knlGS:0000000000000000
[  231.743614] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  231.743629] CR2: 00007f0ebd1add10 CR3: 0000000002621000 CR4: 00000000000406e0
[  231.743643] Call Trace:
[  231.743752]  i915_gem_reset+0x6c/0x150 [i915]
[  231.743853]  i915_reset+0x175/0x210 [i915]
[  231.743958]  i915_reset_device+0x33b/0x350 [i915]
[  231.744061]  ? valleyview_pipestat_irq_handler+0xe0/0xe0 [i915]
[  231.744081]  ? trace_hardirqs_off_caller+0x70/0x110
[  231.744102]  ? _raw_spin_unlock_irqrestore+0x46/0x50
[  231.744120]  ? find_held_lock+0x119/0x150
[  231.744138]  ? mark_lock+0x6d/0x850
[  231.744241]  ? gen8_gt_irq_ack+0x1f0/0x1f0 [i915]
[  231.744262]  ? work_on_cpu_safe+0x60/0x60
[  231.744284]  ? rcu_read_lock_sched_held+0x57/0xa0
[  231.744400]  ? gen6_read32+0x2ba/0x320 [i915]
[  231.744506]  i915_handle_error+0x382/0x5f0 [i915]
[  231.744611]  ? gen6_rps_reset_ei+0x20/0x20 [i915]
[  231.744630]  ? vsnprintf+0x128/0x8e0
[  231.744649]  ? pointer+0x6b0/0x6b0
[  231.744667]  ? debug_check_no_locks_freed+0x1a0/0x1a0
[  231.744688]  ? scnprintf+0x92/0xe0
[  231.744706]  ? snprintf+0xb0/0xb0
[  231.744820]  hangcheck_declare_hang+0x15a/0x1a0 [i915]
[  231.744932]  ? engine_stuck+0x440/0x440 [i915]
[  231.744951]  ? rcu_read_lock_sched_held+0x57/0xa0
[  231.745062]  ? gen6_read32+0x2ba/0x320 [i915]
[  231.745173]  ? gen6_read16+0x320/0x320 [i915]
[  231.745284]  ? intel_engine_get_active_head+0x91/0x170 [i915]
[  231.745401]  i915_hangcheck_elapsed+0x3d8/0x400 [i915]
[  231.745424]  process_one_work+0x3e8/0xac0
[  231.745444]  ? pwq_dec_nr_in_flight+0x110/0x110
[  231.745464]  ? do_raw_spin_lock+0x8e/0x120
[  231.745484]  worker_thread+0x8d/0x720
[  231.745506]  kthread+0x19e/0x1f0
[  231.745524]  ? process_one_work+0xac0/0xac0
[  231.745541]  ? kthread_create_on_node+0xa0/0xa0
[  231.745560]  ret_from_fork+0x27/0x40
[  231.745581] Code: 8b 7d c8 e8 49 0d 02 e1 49 8b 7f 38 48 8b 75 b8 48 83 c7 10 e8 b8 89 be e1 e9 95 fc ff ff 4c 89 e7 e8 4b b9 ff ff e9 30 ff ff ff <0f> 0b 0f 1f 40 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fe
[  231.745767] RIP: i915_gem_reset_engine+0x45a/0x460 [i915] RSP: ffff8801f42ff770

At first glance this looks to be related to commit c64992e
("drm/i915: Look for active requests earlier in the reset path"), but it
could easily happen before as well. On the other hand, we no longer
logged victims due to the active_request being dropped earlier.

v2: Be trickier to unwind the incomplete request as we cannot rely on
request retirement for the lockless per-engine reset.
v3: Reprobe the active request at the time of the reset.

Reported-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Fixes: c64992e ("drm/i915: Look for active requests earlier in the reset path")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170721123238.16428-15-chris@chris-wilson.co.uk
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> #v1
Reviewed-by: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
  • Loading branch information
ickle authored and danvet committed Jul 27, 2017
1 parent 6492ca7 commit d1d1ebf
Showing 1 changed file with 34 additions and 19 deletions.
53 changes: 34 additions & 19 deletions drivers/gpu/drm/i915/i915_gem.c
Original file line number Diff line number Diff line change
Expand Up @@ -2854,11 +2854,9 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
if (engine->irq_seqno_barrier)
engine->irq_seqno_barrier(engine);

if (engine_stalled(engine)) {
request = i915_gem_find_active_request(engine);
if (request && request->fence.error == -EIO)
request = ERR_PTR(-EIO); /* Previous reset failed! */
}
request = i915_gem_find_active_request(engine);
if (request && request->fence.error == -EIO)
request = ERR_PTR(-EIO); /* Previous reset failed! */

return request;
}
Expand Down Expand Up @@ -2927,12 +2925,11 @@ static void engine_skip_context(struct drm_i915_gem_request *request)
spin_unlock_irqrestore(&engine->timeline->lock, flags);
}

/* Returns true if the request was guilty of hang */
static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
/* Returns the request if it was guilty of the hang */
static struct drm_i915_gem_request *
i915_gem_reset_request(struct intel_engine_cs *engine,
struct drm_i915_gem_request *request)
{
/* Read once and return the resolution */
const bool guilty = !i915_gem_request_completed(request);

/* The guilty request will get skipped on a hung engine.
*
* Users of client default contexts do not rely on logical
Expand All @@ -2954,29 +2951,47 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
* subsequent hangs.
*/

if (guilty) {
if (engine_stalled(engine)) {
i915_gem_context_mark_guilty(request->ctx);
skip_request(request);

/* If this context is now banned, skip all pending requests. */
if (i915_gem_context_is_banned(request->ctx))
engine_skip_context(request);
} else {
i915_gem_context_mark_innocent(request->ctx);
dma_fence_set_error(&request->fence, -EAGAIN);
/*
* Since this is not the hung engine, it may have advanced
* since the hang declaration. Double check by refinding
* the active request at the time of the reset.
*/
request = i915_gem_find_active_request(engine);
if (request) {
i915_gem_context_mark_innocent(request->ctx);
dma_fence_set_error(&request->fence, -EAGAIN);

/* Rewind the engine to replay the incomplete rq */
spin_lock_irq(&engine->timeline->lock);
request = list_prev_entry(request, link);
if (&request->link == &engine->timeline->requests)
request = NULL;
spin_unlock_irq(&engine->timeline->lock);
}
}

return guilty;
return request;
}

void i915_gem_reset_engine(struct intel_engine_cs *engine,
struct drm_i915_gem_request *request)
{
engine->irq_posted = 0;

if (request && i915_gem_reset_request(request)) {
if (request)
request = i915_gem_reset_request(engine, request);

if (request) {
DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
engine->name, request->global_seqno);

/* If this context is now banned, skip all pending requests. */
if (i915_gem_context_is_banned(request->ctx))
engine_skip_context(request);
}

/* Setup the CS to resume from the breadcrumb of the hung request */
Expand Down

0 comments on commit d1d1ebf

Please sign in to comment.