Skip to content

Commit a7ffcea

Browse files
committed
drm/xe/guc: Enable extended CAT error reporting
On newer HW (Xe2 onwards + PVC) it is possible to get extra information when a CAT error occurs, specifically a dword reporting the error type. To enable this extra reporting, we need to opt in with the GuC, which is done via a specific per-VF feature opt-in H2G.

On platforms where the HW does not support the extra reporting, the GuC will set the type to 0xdeadbeef, so we can keep the code simple and opt in to the feature on every platform and then just discard the data if it is invalid.

Note that on native/PF we're guaranteed that the opt-in is available because we don't support any GuC old enough to not have it, but if we're a VF we might be running on a non-XE PF with an older GuC, so we need to handle that case. We can re-use the invalid type above to handle this scenario the same way as if the feature was not supported in HW.

Given that this patch is the first user of the guc_buf_cache on native and VF, it also extends that feature to non-PF use-cases.

v2: simpler print for the error type (John), rebase
v3: use guc_buf_cache instead of new alloc, simpler doc (Michal)

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Nirmoy Das <nirmoy.das@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com> #v1
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250625205405.1653212-3-daniele.ceraolospurio@intel.com
1 parent c038bdb commit a7ffcea

File tree

6 files changed

+98
-3
lines changed

6 files changed

+98
-3
lines changed

drivers/gpu/drm/xe/abi/guc_actions_abi.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ enum xe_guc_action {
142142
XE_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
143143
XE_GUC_ACTION_SET_DEVICE_ENGINE_ACTIVITY_BUFFER = 0x550C,
144144
XE_GUC_ACTION_SET_FUNCTION_ENGINE_ACTIVITY_BUFFER = 0x550D,
145+
XE_GUC_ACTION_OPT_IN_FEATURE_KLV = 0x550E,
145146
XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR = 0x6000,
146147
XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002,
147148
XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003,
@@ -271,4 +272,7 @@ enum xe_guc_g2g_type {
271272
#define XE_G2G_DEREGISTER_TILE REG_GENMASK(15, 12)
272273
#define XE_G2G_DEREGISTER_TYPE REG_GENMASK(11, 8)
273274

275+
/* invalid type for XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR */
276+
#define XE_GUC_CAT_ERR_TYPE_INVALID 0xdeadbeef
277+
274278
#endif

drivers/gpu/drm/xe/abi/guc_klvs_abi.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* +===+=======+==============================================================+
1717
* | 0 | 31:16 | **KEY** - KLV key identifier |
1818
* | | | - `GuC Self Config KLVs`_ |
19+
* | | | - `GuC Opt In Feature KLVs`_ |
1920
* | | | - `GuC VGT Policy KLVs`_ |
2021
* | | | - `GuC VF Configuration KLVs`_ |
2122
* | | | |
@@ -124,6 +125,20 @@ enum {
124125
GUC_CONTEXT_POLICIES_KLV_NUM_IDS = 5,
125126
};
126127

128+
/**
129+
* DOC: GuC Opt In Feature KLVs
130+
*
131+
* `GuC KLV`_ keys available for use with OPT_IN_FEATURE_KLV
132+
*
133+
* _`GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE` : 0x4001
134+
* Adds an extra dword to the XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR G2H
135+
* containing the type of the CAT error. On HW that does not support
136+
* reporting the CAT error type, the extra dword is set to 0xdeadbeef.
137+
*/
138+
139+
#define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_KEY 0x4001
140+
#define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_LEN 0u
141+
127142
/**
128143
* DOC: GuC VGT Policy KLVs
129144
*

drivers/gpu/drm/xe/xe_guc.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "xe_guc_db_mgr.h"
3030
#include "xe_guc_engine_activity.h"
3131
#include "xe_guc_hwconfig.h"
32+
#include "xe_guc_klv_helpers.h"
3233
#include "xe_guc_log.h"
3334
#include "xe_guc_pc.h"
3435
#include "xe_guc_relay.h"
@@ -570,6 +571,57 @@ static int guc_g2g_start(struct xe_guc *guc)
570571
return err;
571572
}
572573

574+
static int __guc_opt_in_features_enable(struct xe_guc *guc, u64 addr, u32 num_dwords)
575+
{
576+
u32 action[] = {
577+
XE_GUC_ACTION_OPT_IN_FEATURE_KLV,
578+
lower_32_bits(addr),
579+
upper_32_bits(addr),
580+
num_dwords
581+
};
582+
583+
return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
584+
}
585+
586+
#define OPT_IN_MAX_DWORDS 16
587+
int xe_guc_opt_in_features_enable(struct xe_guc *guc)
588+
{
589+
struct xe_device *xe = guc_to_xe(guc);
590+
CLASS(xe_guc_buf, buf)(&guc->buf, OPT_IN_MAX_DWORDS);
591+
u32 count = 0;
592+
u32 *klvs;
593+
int ret;
594+
595+
if (!xe_guc_buf_is_valid(buf))
596+
return -ENOBUFS;
597+
598+
klvs = xe_guc_buf_cpu_ptr(buf);
599+
600+
/*
601+
* The extra CAT error type opt-in was added in GuC v70.17.0, which maps
602+
* to compatibility version v1.7.0.
603+
* Note that the GuC allows enabling this KLV even on platforms that do
604+
* not support the extra type; in such case the returned type variable
605+
* will be set to a known invalid value which we can check against.
606+
*/
607+
if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 7, 0))
608+
klvs[count++] = PREP_GUC_KLV_TAG(OPT_IN_FEATURE_EXT_CAT_ERR_TYPE);
609+
610+
if (count) {
611+
xe_assert(xe, count <= OPT_IN_MAX_DWORDS);
612+
613+
ret = __guc_opt_in_features_enable(guc, xe_guc_buf_flush(buf), count);
614+
if (ret < 0) {
615+
xe_gt_err(guc_to_gt(guc),
616+
"failed to enable GuC opt-in features: %pe\n",
617+
ERR_PTR(ret));
618+
return ret;
619+
}
620+
}
621+
622+
return 0;
623+
}
624+
573625
static void guc_fini_hw(void *arg)
574626
{
575627
struct xe_guc *guc = arg;
@@ -789,6 +841,10 @@ int xe_guc_post_load_init(struct xe_guc *guc)
789841

790842
xe_guc_ads_populate_post_load(&guc->ads);
791843

844+
ret = xe_guc_opt_in_features_enable(guc);
845+
if (ret)
846+
return ret;
847+
792848
if (xe_guc_g2g_wanted(guc_to_xe(guc))) {
793849
ret = guc_g2g_start(guc);
794850
if (ret)

drivers/gpu/drm/xe/xe_guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ int xe_guc_reset(struct xe_guc *guc);
3434
int xe_guc_upload(struct xe_guc *guc);
3535
int xe_guc_min_load_for_hwconfig(struct xe_guc *guc);
3636
int xe_guc_enable_communication(struct xe_guc *guc);
37+
int xe_guc_opt_in_features_enable(struct xe_guc *guc);
3738
int xe_guc_suspend(struct xe_guc *guc);
3839
void xe_guc_notify(struct xe_guc *guc);
3940
int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,12 +2092,16 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
20922092
struct xe_gt *gt = guc_to_gt(guc);
20932093
struct xe_exec_queue *q;
20942094
u32 guc_id;
2095+
u32 type = XE_GUC_CAT_ERR_TYPE_INVALID;
20952096

2096-
if (unlikely(len < 1))
2097+
if (unlikely(!len || len > 2))
20972098
return -EPROTO;
20982099

20992100
guc_id = msg[0];
21002101

2102+
if (len == 2)
2103+
type = msg[1];
2104+
21012105
if (guc_id == GUC_ID_UNKNOWN) {
21022106
/*
21032107
* GuC uses GUC_ID_UNKNOWN if it can not map the CAT fault to any PF/VF
@@ -2111,8 +2115,19 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
21112115
if (unlikely(!q))
21122116
return -EPROTO;
21132117

2114-
xe_gt_dbg(gt, "Engine memory cat error: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
2115-
xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
2118+
/*
2119+
* The type is HW-defined and changes based on platform, so we don't
2120+
* decode it in the kernel and only check if it is valid.
2121+
* See bspec 54047 and 72187 for details.
2122+
*/
2123+
if (type != XE_GUC_CAT_ERR_TYPE_INVALID)
2124+
xe_gt_dbg(gt,
2125+
"Engine memory CAT error [%u]: class=%s, logical_mask: 0x%x, guc_id=%d",
2126+
type, xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
2127+
else
2128+
xe_gt_dbg(gt,
2129+
"Engine memory CAT error: class=%s, logical_mask: 0x%x, guc_id=%d",
2130+
xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
21162131

21172132
trace_xe_exec_queue_memory_cat_error(q);
21182133

drivers/gpu/drm/xe/xe_uc.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ static int vf_uc_load_hw(struct xe_uc *uc)
162162

163163
uc->guc.submission_state.enabled = true;
164164

165+
err = xe_guc_opt_in_features_enable(&uc->guc);
166+
if (err)
167+
return err;
168+
165169
err = xe_gt_record_default_lrcs(uc_to_gt(uc));
166170
if (err)
167171
return err;

0 commit comments

Comments
 (0)