Skip to content

Commit 82b98ca

Browse files
committed
drm/xe: Add WA BB to capture active context utilization
Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the context, but only gets updated when the context switches out. In order to check how long a context has been active before it switches out, two things are required: (1) Determine if the context is running: To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in the LRC. The value chosen is 1 since 0 is the initial value when the LRC is initialized. During a query, we just check for this value to determine if the context is active. If the context switched out, it would overwrite this location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as the last part of the context restore, so reusing this LRC location will not clobber anything. (2) Calculate the time that the context has been active for: The CTX_TIMESTAMP ticks only when the context is active. If a context is active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific engine instance. Since we do not know which instance the context is running on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and store it in the PPHWSP. Using the above 2 instructions in a WA BB, capture active context utilization. v2: (Matt Brost) - This breaks TDR, fix it by saving the CTX_TIMESTAMP register "drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value" - Drop tile from LRC if using gt "drm/xe: Save the gt pointer in LRC and drop the tile" v3: - Remove helpers for bb_per_ctx_ptr (Matt) - Add define for context active value (Matt) - Use 64 bit CTX TIMESTAMP for platforms that support it. For platforms that don't, live with the rare race. 
(Matt, Lucas) - Convert engine id to hwe and get the MMIO value (Lucas) - Correct commit message on when WA BB runs (Lucas) v4: - s/GRAPHICS_VER(...)/xe->info.has_64bit_timestamp/ (Matt) - Drop support for active utilization on a VF (CI failure) - In xe_lrc_init ensure the lrc value is 0 to begin with (CI regression) v5: - Minor checkpatch fix - Squash into previous commit and make TDR use 32-bit time - Update code comment to match commit msg Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4532 Suggested-by: Lucas De Marchi <lucas.demarchi@intel.com> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com> Link: https://lore.kernel.org/r/20250509161159.2173069-8-umesh.nerlige.ramappa@intel.com
1 parent 741d3ef commit 82b98ca

File tree

11 files changed

+203
-16
lines changed

11 files changed

+203
-16
lines changed

drivers/gpu/drm/xe/regs/xe_engine_regs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@
4343
#define XEHPC_BCS8_RING_BASE 0x3ee000
4444
#define GSCCS_RING_BASE 0x11a000
4545

46+
#define ENGINE_ID(base) XE_REG((base) + 0x8c)
47+
#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
48+
#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
49+
4650
#define RING_TAIL(base) XE_REG((base) + 0x30)
4751
#define TAIL_ADDR REG_GENMASK(20, 3)
4852

@@ -154,6 +158,7 @@
154158
#define STOP_RING REG_BIT(8)
155159

156160
#define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8)
161+
#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac)
157162
#define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc)
158163

159164
#define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)

drivers/gpu/drm/xe/regs/xe_lrc_layout.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
#define CTX_RING_TAIL (0x06 + 1)
1212
#define CTX_RING_START (0x08 + 1)
1313
#define CTX_RING_CTL (0x0a + 1)
14+
#define CTX_BB_PER_CTX_PTR (0x12 + 1)
1415
#define CTX_TIMESTAMP (0x22 + 1)
16+
#define CTX_TIMESTAMP_UDW (0x24 + 1)
1517
#define CTX_INDIRECT_RING_STATE (0x26 + 1)
1618
#define CTX_PDP0_UDW (0x30 + 1)
1719
#define CTX_PDP0_LDW (0x32 + 1)

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,8 @@ struct xe_device {
334334
u8 has_sriov:1;
335335
/** @info.has_usm: Device has unified shared memory support */
336336
u8 has_usm:1;
337+
/** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
338+
u8 has_64bit_timestamp:1;
337339
/** @info.is_dgfx: is discrete device */
338340
u8 is_dgfx:1;
339341
/** @info.needs_scratch: needs scratch page for oob prefetch to work */

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -829,7 +829,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
829829
{
830830
struct xe_device *xe = gt_to_xe(q->gt);
831831
struct xe_lrc *lrc;
832-
u32 old_ts, new_ts;
832+
u64 old_ts, new_ts;
833833
int idx;
834834

835835
/*

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -967,7 +967,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
967967
return xe_sched_invalidate_job(job, 2);
968968
}
969969

970-
ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
970+
ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(q->lrc[0]));
971971
ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
972972

973973
/*

drivers/gpu/drm/xe/xe_lrc.c

Lines changed: 178 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "xe_hw_fence.h"
2525
#include "xe_map.h"
2626
#include "xe_memirq.h"
27+
#include "xe_mmio.h"
2728
#include "xe_sriov.h"
2829
#include "xe_trace_lrc.h"
2930
#include "xe_vm.h"
@@ -654,6 +655,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
654655
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
655656
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
656657
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
658+
#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
657659

658660
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
659661
{
@@ -697,11 +699,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
697699
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
698700
}
699701

702+
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
703+
{
704+
return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
705+
}
706+
700707
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
701708
{
702709
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
703710
}
704711

712+
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
713+
{
714+
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
715+
}
716+
705717
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
706718
{
707719
/* Indirect ring state page is at the very end of LRC */
@@ -729,8 +741,10 @@ DECL_MAP_ADDR_HELPERS(regs)
729741
DECL_MAP_ADDR_HELPERS(start_seqno)
730742
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
731743
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
744+
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
732745
DECL_MAP_ADDR_HELPERS(parallel)
733746
DECL_MAP_ADDR_HELPERS(indirect_ring)
747+
DECL_MAP_ADDR_HELPERS(engine_id)
734748

735749
#undef DECL_MAP_ADDR_HELPERS
736750

@@ -745,19 +759,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
745759
return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
746760
}
747761

762+
/**
763+
* xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
764+
* @lrc: Pointer to the lrc.
765+
*
766+
* Returns: ctx timestamp udw GGTT address
767+
*/
768+
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
769+
{
770+
return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
771+
}
772+
748773
/**
749774
* xe_lrc_ctx_timestamp() - Read ctx timestamp value
750775
* @lrc: Pointer to the lrc.
751776
*
752777
* Returns: ctx timestamp value
753778
*/
754-
u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
779+
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
755780
{
756781
struct xe_device *xe = lrc_to_xe(lrc);
757782
struct iosys_map map;
783+
u32 ldw, udw = 0;
758784

759785
map = __xe_lrc_ctx_timestamp_map(lrc);
760-
return xe_map_read32(xe, &map);
786+
ldw = xe_map_read32(xe, &map);
787+
788+
if (xe->info.has_64bit_timestamp) {
789+
map = __xe_lrc_ctx_timestamp_udw_map(lrc);
790+
udw = xe_map_read32(xe, &map);
791+
}
792+
793+
return (u64)udw << 32 | ldw;
761794
}
762795

763796
/**
@@ -880,6 +913,65 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
880913
xe_bo_unpin(lrc->bo);
881914
xe_bo_unlock(lrc->bo);
882915
xe_bo_put(lrc->bo);
916+
xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
917+
}
918+
919+
/*
920+
* xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
921+
* context run ticks.
922+
* @lrc: Pointer to the lrc.
923+
*
924+
* Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
925+
* context, but only gets updated when the context switches out. In order to
926+
* check how long a context has been active before it switches out, two things
927+
* are required:
928+
*
929+
* (1) Determine if the context is running:
930+
* To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
931+
* the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
932+
* initialized. During a query, we just check for this value to determine if the
933+
* context is active. If the context switched out, it would overwrite this
934+
* location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
935+
* the last part of context restore, so reusing this LRC location will not
936+
* clobber anything.
937+
*
938+
* (2) Calculate the time that the context has been active for:
939+
* The CTX_TIMESTAMP ticks only when the context is active. If a context is
940+
* active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
941+
* While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
942+
* engine instance. Since we do not know which instance the context is running
943+
* on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
944+
* store it in the PPHSWP.
945+
*/
946+
#define CONTEXT_ACTIVE 1ULL
947+
static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
948+
{
949+
u32 *cmd;
950+
951+
cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
952+
953+
*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
954+
*cmd++ = ENGINE_ID(0).addr;
955+
*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
956+
*cmd++ = 0;
957+
958+
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
959+
*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
960+
*cmd++ = 0;
961+
*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
962+
963+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
964+
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
965+
*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
966+
*cmd++ = 0;
967+
*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
968+
}
969+
970+
*cmd++ = MI_BATCH_BUFFER_END;
971+
972+
xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
973+
xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
974+
883975
}
884976

885977
#define PVC_CTX_ASID (0x2e + 1)
@@ -921,10 +1013,17 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
9211013
if (IS_ERR(lrc->bo))
9221014
return PTR_ERR(lrc->bo);
9231015

1016+
lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
1017+
ttm_bo_type_kernel,
1018+
bo_flags);
1019+
if (IS_ERR(lrc->bb_per_ctx_bo)) {
1020+
err = PTR_ERR(lrc->bb_per_ctx_bo);
1021+
goto err_lrc_finish;
1022+
}
1023+
9241024
lrc->size = lrc_size;
9251025
lrc->ring.size = ring_size;
9261026
lrc->ring.tail = 0;
927-
lrc->ctx_timestamp = 0;
9281027

9291028
xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
9301029
hwe->fence_irq, hwe->name);
@@ -997,7 +1096,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
9971096
xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
9981097
_MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
9991098

1099+
lrc->ctx_timestamp = 0;
10001100
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1101+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1102+
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
10011103

10021104
if (xe->info.has_asid && vm)
10031105
xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
@@ -1026,6 +1128,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
10261128
map = __xe_lrc_start_seqno_map(lrc);
10271129
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
10281130

1131+
xe_lrc_setup_utilization(lrc);
1132+
10291133
return 0;
10301134

10311135
err_lrc_finish:
@@ -1245,6 +1349,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
12451349
return __xe_lrc_parallel_map(lrc);
12461350
}
12471351

1352+
/**
1353+
* xe_lrc_engine_id() - Read engine id value
1354+
* @lrc: Pointer to the lrc.
1355+
*
1356+
* Returns: engine id value
1357+
*/
1358+
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1359+
{
1360+
struct xe_device *xe = lrc_to_xe(lrc);
1361+
struct iosys_map map;
1362+
1363+
map = __xe_lrc_engine_id_map(lrc);
1364+
return xe_map_read32(xe, &map);
1365+
}
1366+
12481367
static int instr_dw(u32 cmd_header)
12491368
{
12501369
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
@@ -1692,7 +1811,7 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
16921811
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
16931812
snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
16941813
snapshot->lrc_snapshot = NULL;
1695-
snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
1814+
snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
16961815
snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
16971816
return snapshot;
16981817
}
@@ -1792,22 +1911,74 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
17921911
kfree(snapshot);
17931912
}
17941913

1914+
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1915+
{
1916+
u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1917+
u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1918+
struct xe_hw_engine *hwe;
1919+
u64 val;
1920+
1921+
hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1922+
if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1923+
"Unexpected engine class:instance %d:%d for context utilization\n",
1924+
class, instance))
1925+
return -1;
1926+
1927+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1928+
val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1929+
RING_CTX_TIMESTAMP(hwe->mmio_base));
1930+
else
1931+
val = xe_mmio_read32(&hwe->gt->mmio,
1932+
RING_CTX_TIMESTAMP(hwe->mmio_base));
1933+
1934+
*reg_ctx_ts = val;
1935+
1936+
return 0;
1937+
}
1938+
17951939
/**
17961940
* xe_lrc_update_timestamp() - Update ctx timestamp
17971941
* @lrc: Pointer to the lrc.
17981942
* @old_ts: Old timestamp value
17991943
*
18001944
* Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
1801-
* update saved value.
1945+
* update saved value. With support for active contexts, the calculation may be
1946+
* slightly racy, so follow a read-again logic to ensure that the context is
1947+
* still active before returning the right timestamp.
18021948
*
18031949
* Returns: New ctx timestamp value
18041950
*/
1805-
u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
1951+
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
18061952
{
1953+
u64 lrc_ts, reg_ts;
1954+
u32 engine_id;
1955+
18071956
*old_ts = lrc->ctx_timestamp;
18081957

1809-
lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
1958+
lrc_ts = xe_lrc_ctx_timestamp(lrc);
1959+
/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1960+
if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1961+
lrc->ctx_timestamp = lrc_ts;
1962+
goto done;
1963+
}
1964+
1965+
if (lrc_ts == CONTEXT_ACTIVE) {
1966+
engine_id = xe_lrc_engine_id(lrc);
1967+
if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
1968+
lrc->ctx_timestamp = reg_ts;
1969+
1970+
/* read lrc again to ensure context is still active */
1971+
lrc_ts = xe_lrc_ctx_timestamp(lrc);
1972+
}
1973+
1974+
/*
1975+
* If context switched out, just use the lrc_ts. Note that this needs to
1976+
* be a separate if condition.
1977+
*/
1978+
if (lrc_ts != CONTEXT_ACTIVE)
1979+
lrc->ctx_timestamp = lrc_ts;
18101980

1981+
done:
18111982
trace_xe_lrc_update_timestamp(lrc, *old_ts);
18121983

18131984
return lrc->ctx_timestamp;

drivers/gpu/drm/xe/xe_lrc.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer
120120
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot);
121121

122122
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc);
123-
u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc);
123+
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc);
124+
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc);
124125
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc);
125126
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc);
126127

@@ -136,6 +137,6 @@ u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc);
136137
*
137138
* Returns the current LRC timestamp
138139
*/
139-
u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts);
140+
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts);
140141

141142
#endif

drivers/gpu/drm/xe/xe_lrc_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ struct xe_lrc {
5252
struct xe_hw_fence_ctx fence_ctx;
5353

5454
/** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
55-
u32 ctx_timestamp;
55+
u64 ctx_timestamp;
56+
57+
/** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
58+
struct xe_bo *bb_per_ctx_bo;
5659
};
5760

5861
struct xe_lrc_snapshot;

0 commit comments

Comments
 (0)