Skip to content

Commit bda3002

Browse files
committed
drm/i915: Improve record of hung engines in error state
Between the events which trigger engine and GPU resets and the capture of the error state, we lose information on which engine triggered the reset. Improve this by passing the hung engine mask down to error capture.

The result is that the list of engines in the user-visible "GPU HANG: ecode <gen>:<engines>:<ecode>, <process>" message is now a list of hung engines and not just active engines. Most importantly, the displayed process is now the one which was actually hung.

v2:
 * Stub prototype. (Chris)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20201104134743.916027-1-tvrtko.ursulin@linux.intel.com
1 parent ad18fa0 commit bda3002

File tree

5 files changed

+34
-17
lines changed

5 files changed

+34
-17
lines changed

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
30373037
if (!cap->error->gt->engine)
30383038
goto err_gt;
30393039

3040+
cap->error->gt->engine->hung = true;
3041+
30403042
return cap;
30413043

30423044
err_gt:

drivers/gpu/drm/i915/gt/intel_reset.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
12511251
engine_mask &= gt->info.engine_mask;
12521252

12531253
if (flags & I915_ERROR_CAPTURE) {
1254-
i915_capture_error_state(gt->i915);
1254+
i915_capture_error_state(gt, engine_mask);
12551255
intel_gt_clear_error_registers(gt, engine_mask);
12561256
}
12571257

drivers/gpu/drm/i915/i915_debugfs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
725725

726726
gpu = NULL;
727727
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
728-
gpu = i915_gpu_coredump(i915);
728+
gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
729729
if (IS_ERR(gpu))
730730
return PTR_ERR(gpu);
731731

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
570570
ee->vm_info.pp_dir_base);
571571
}
572572
}
573+
err_printf(m, " hung: %u\n", ee->hung);
573574
err_printf(m, " engine reset count: %u\n", ee->reset_count);
574575

575576
for (n = 0; n < ee->num_ports; n++) {
@@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine,
14561457

14571458
static void
14581459
gt_record_engines(struct intel_gt_coredump *gt,
1460+
intel_engine_mask_t engine_mask,
14591461
struct i915_vma_compress *compress)
14601462
{
14611463
struct intel_engine_cs *engine;
@@ -1471,6 +1473,8 @@ gt_record_engines(struct intel_gt_coredump *gt,
14711473
if (!ee)
14721474
continue;
14731475

1476+
ee->hung = engine->mask & engine_mask;
1477+
14741478
gt->simulated |= ee->simulated;
14751479
if (ee->simulated) {
14761480
kfree(ee);
@@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error)
16631667
for (gt = error->gt; gt; gt = gt->next) {
16641668
struct intel_engine_coredump *cs;
16651669

1666-
if (gt->engine && !first)
1667-
first = gt->engine;
1668-
1669-
for (cs = gt->engine; cs; cs = cs->next)
1670-
engines |= cs->engine->mask;
1670+
for (cs = gt->engine; cs; cs = cs->next) {
1671+
if (cs->hung) {
1672+
engines |= cs->engine->mask;
1673+
if (!first)
1674+
first = cs;
1675+
}
1676+
}
16711677
}
16721678

16731679
len = scnprintf(error->error_msg, sizeof(error->error_msg),
@@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt,
17811787
kfree(compress);
17821788
}
17831789

1784-
struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
1790+
struct i915_gpu_coredump *
1791+
i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
17851792
{
1793+
struct drm_i915_private *i915 = gt->i915;
17861794
struct i915_gpu_coredump *error;
17871795

17881796
/* Check if GPU capture has been disabled */
@@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
17941802
if (!error)
17951803
return ERR_PTR(-ENOMEM);
17961804

1797-
error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
1805+
error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
17981806
if (error->gt) {
17991807
struct i915_vma_compress *compress;
18001808

@@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
18061814
}
18071815

18081816
gt_record_info(error->gt);
1809-
gt_record_engines(error->gt, compress);
1817+
gt_record_engines(error->gt, engine_mask, compress);
18101818

18111819
if (INTEL_INFO(i915)->has_gt_uc)
18121820
error->gt->uc = gt_record_uc(error->gt, compress);
@@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error)
18531861

18541862
/**
18551863
* i915_capture_error_state - capture an error record for later analysis
1856-
* @i915: i915 device
1864+
* @gt: intel_gt which originated the hang
1865+
* @engine_mask: hung engines
1866+
*
18571867
*
18581868
* Should be called when an error is detected (either a hang or an error
18591869
* interrupt) to capture error state from the time of the error. Fills
18601870
* out a structure which becomes available in debugfs for user level tools
18611871
* to pick up.
18621872
*/
1863-
void i915_capture_error_state(struct drm_i915_private *i915)
1873+
void i915_capture_error_state(struct intel_gt *gt,
1874+
intel_engine_mask_t engine_mask)
18641875
{
18651876
struct i915_gpu_coredump *error;
18661877

1867-
error = i915_gpu_coredump(i915);
1878+
error = i915_gpu_coredump(gt, engine_mask);
18681879
if (IS_ERR(error)) {
1869-
cmpxchg(&i915->gpu_error.first_error, NULL, error);
1880+
cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
18701881
return;
18711882
}
18721883

drivers/gpu/drm/i915/i915_gpu_error.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ struct i915_request_coredump {
5959
struct intel_engine_coredump {
6060
const struct intel_engine_cs *engine;
6161

62+
bool hung;
6263
bool simulated;
6364
u32 reset_count;
6465

@@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
218219
__printf(2, 3)
219220
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
220221

221-
struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
222-
void i915_capture_error_state(struct drm_i915_private *i915);
222+
struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
223+
intel_engine_mask_t engine_mask);
224+
void i915_capture_error_state(struct intel_gt *gt,
225+
intel_engine_mask_t engine_mask);
223226

224227
struct i915_gpu_coredump *
225228
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
@@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err);
271274

272275
#else
273276

274-
static inline void i915_capture_error_state(struct drm_i915_private *i915)
277+
static inline void
278+
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask)
275279
{
276280
}
277281

0 commit comments

Comments
 (0)