Skip to content

Commit 25961ba

Browse files
srishanm and alexdeucher
authored and committed
drm/amdgpu/gfx10: Add cleaner shader for GFX10.1.10
This commit adds the cleaner shader microcode for GFX10.1.10 GPUs. The cleaner shader is a piece of GPU code that is used to clear or initialize certain GPU resources, such as Local Data Share (LDS), Vector General Purpose Registers (VGPRs), and Scalar General Purpose Registers (SGPRs). Clearing these resources is important for ensuring data isolation between different workloads running on the GPU. Without the cleaner shader, residual data from a previous workload could potentially be accessed by a subsequent workload, leading to data leaks and incorrect computation results. The cleaner shader microcode is represented as an array of 32-bit words (`gfx_10_1_10_cleaner_shader_hex`). This array is the binary representation of the cleaner shader code, which is written in a low-level GPU instruction set. When the cleaner shader feature is enabled, the AMDGPU driver loads this array into a specific location in the GPU memory. The GPU then reads this memory location to fetch and execute the cleaner shader instructions. The cleaner shader is executed automatically by the GPU at the end of each workload, before the next workload starts. This ensures that all GPU resources are in a clean state before the start of each workload. This addition is part of the cleaner shader feature implementation. The cleaner shader feature helps resource utilization by cleaning up GPU resources after they are used. It also enhances security and reliability by preventing data leaks between workloads. Cc: Christian König <christian.koenig@amd.com> Cc: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com> Suggested-by: Alex Deucher <alexander.deucher@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 0489339 commit 25961ba

File tree

3 files changed

+175
-0
lines changed

3 files changed

+175
-0
lines changed

drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4794,6 +4794,20 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
47944794
break;
47954795
}
47964796
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
4797+
case IP_VERSION(10, 1, 10):
4798+
adev->gfx.cleaner_shader_ptr = gfx_10_1_10_cleaner_shader_hex;
4799+
adev->gfx.cleaner_shader_size = sizeof(gfx_10_1_10_cleaner_shader_hex);
4800+
if (adev->gfx.me_fw_version >= 101 &&
4801+
adev->gfx.pfp_fw_version >= 158 &&
4802+
adev->gfx.mec_fw_version >= 152) {
4803+
adev->gfx.enable_cleaner_shader = true;
4804+
r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size);
4805+
if (r) {
4806+
adev->gfx.enable_cleaner_shader = false;
4807+
dev_err(adev->dev, "Failed to initialize cleaner shader\n");
4808+
}
4809+
}
4810+
break;
47974811
case IP_VERSION(10, 3, 0):
47984812
case IP_VERSION(10, 3, 2):
47994813
case IP_VERSION(10, 3, 4):

drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,41 @@
2121
* OTHER DEALINGS IN THE SOFTWARE.
2222
*/
2323

24+
/*
 * Cleaner shader microcode for GC IP version 10.1.10 (gfx_10_1_10).
 *
 * Pre-assembled GPU machine code stored as 32-bit words: 31 rows of two
 * words each, i.e. 62 dwords (248 bytes). gfx_v10_0_sw_init() stores a
 * pointer to this table in adev->gfx.cleaner_shader_ptr and derives
 * adev->gfx.cleaner_shader_size from sizeof(), so the element count is
 * part of what the driver consumes.
 *
 * Per the commit description, the shader clears LDS, VGPRs and SGPRs so
 * that no residual data is visible to the next workload.
 *
 * The words are opaque binary: do not reorder or edit them by hand; change
 * the shader assembly source and re-assemble instead.
 */
25+
static const u32 gfx_10_1_10_cleaner_shader_hex[] = {
26+
0xb0804004, 0xbf8a0000,
27+
0xbf068100, 0xbf840023,
28+
0xbe8203b8, 0xbefc0380,
29+
0x7e008480, 0x7e028480,
30+
0x7e048480, 0x7e068480,
31+
0x7e088480, 0x7e0a8480,
32+
0x7e0c8480, 0x7e0e8480,
33+
0xbefc0302, 0x80828802,
34+
0xbf84fff5, 0xbe8203ff,
35+
0x80000000, 0x87020102,
36+
0xbf840012, 0xbefe03c1,
37+
0xbeff03c1, 0xd7650001,
38+
0x0001007f, 0xd7660001,
39+
0x0002027e, 0x16020288,
40+
0xbe8203bf, 0xbefc03c1,
41+
0xd9382000, 0x00020201,
42+
0xd9386040, 0x00040401,
43+
0xd70f6a01, 0x000202ff,
44+
0x00000400, 0x80828102,
45+
0xbf84fff7, 0xbefc03ff,
46+
0x00000068, 0xbe803080,
47+
0xbe813080, 0xbe823080,
48+
0xbe833080, 0x80fc847c,
49+
0xbf84fffa, 0xbeea0480,
50+
0xbeec0480, 0xbeee0480,
51+
0xbef00480, 0xbef20480,
52+
0xbef40480, 0xbef60480,
53+
0xbef80480, 0xbefa0480,
54+
0xbf810000, 0xbf9f0000,
55+
0xbf9f0000, 0xbf9f0000,
56+
0xbf9f0000, 0xbf9f0000,
57+
};
58+
2459
/* Define the cleaner shader gfx_10_3_0 */
2560
static const u32 gfx_10_3_0_cleaner_shader_hex[] = {
2661
0xb0804004, 0xbf8a0000,
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/* SPDX-License-Identifier: MIT */
2+
/*
3+
* Copyright 2025 Advanced Micro Devices, Inc.
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining a
6+
* copy of this software and associated documentation files (the "Software"),
7+
* to deal in the Software without restriction, including without limitation
8+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9+
* and/or sell copies of the Software, and to permit persons to whom the
10+
* Software is furnished to do so, subject to the following conditions:
11+
*
12+
* The above copyright notice and this permission notice shall be included in
13+
* all copies or substantial portions of the Software.
14+
*
15+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18+
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19+
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20+
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21+
* OTHER DEALINGS IN THE SOFTWARE.
22+
*/
23+
24+
// This shader cleans LDS, SGPRs and VGPRs. It is the first 64 dwords (256 bytes) of the 256-dword cleaner shader.
25+
26+
// GFX10.1 : Clear SGPRs, VGPRs and LDS
27+
// Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
28+
// Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
29+
// Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
30+
// It takes 2 workgroups to use all of LDS: one on each CU of the WGP
31+
// Each wave clears SGPRs 0 - 107
32+
// Each wave clears VGPRs 0 - 63
33+
// The first wave of the workgroup clears its 64KB of LDS
34+
// The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
35+
// before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared.
36+
37+
38+
shader main
  asic(GFX10.1)
  type(CS)
  wave_size(32)
// Note: original source code from SQ team

//
// Create 32 waves in a threadgroup (CS waves)
// Each allocates 64 VGPRs
// The workgroup allocates all of LDS (64 KB)
//
// Takes about 2500 clocks to run.
//   (theoretical fastest = 1024 clks VGPR + 640 clks LDS, roughly 1660 clks)
//
// Flow:
//   1. Barrier so every wave of the workgroup is resident before any wave ends.
//   2. If s0 != 1, skip straight to the SGPR clear.
//   3. Clear VGPRs v0..v63, eight registers per loop iteration.
//   4. If this wave is the first wave of the workgroup, clear 64 KB of LDS.
//   5. Clear SGPRs s0..s107, vcc and ttmp0..ttmp15, then end the program.
//
  S_BARRIER
  s_cmp_eq_u32    s0, 1                            // scc = (s0 == 1); FW sets COMPUTE_USER_DATA_0 (s0) to 1 to request the VGPR and LDS clear
  s_cbranch_scc0  label_0023                       // s0 != 1: skip the VGPR/LDS clear and only clean SGPRs

  s_mov_b32       s2, 0x00000038                   // Next M0 value (56); loop runs 64/8 = 8 times (unrolled for performance)
  s_mov_b32       m0, 0                            // First pass uses M0 = 0, i.e. v0..v7
  //
  // CLEAR VGPRs
  //
label_0005:
  v_movreld_b32   v0, 0                            // v[M0 + 0] = 0
  v_movreld_b32   v1, 0
  v_movreld_b32   v2, 0
  v_movreld_b32   v3, 0
  v_movreld_b32   v4, 0
  v_movreld_b32   v5, 0
  v_movreld_b32   v6, 0
  v_movreld_b32   v7, 0                            // v[M0 + 7] = 0
  s_mov_b32       m0, s2                           // M0 sequence: 0, 56, 48, ..., 8
  s_sub_u32       s2, s2, 8                        // scc = borrow; set once s2 would go below zero
  s_cbranch_scc0  label_0005                       // Repeat until the subtract borrows
  //
  s_mov_b32       s2, 0x80000000                   // Bit31 is first_wave
  // src1 is s1, matching the assembled word 0x87020102 in
  // gfx_10_1_10_cleaner_shader_hex. The previous listing used s0, which is
  // known to equal 1 at this point, so bit 31 could never be set and the
  // LDS clear below would be unreachable.
  // NOTE(review): presumably s1 carries the tg_size (first_wave) term because
  // s0 holds COMPUTE_USER_DATA_0 and only COMPUTE_PGM_RSRC2.tg_size_en is set
  // in the ucode - confirm against the dispatch setup.
  s_and_b32       s2, s2, s1                       // scc = (first_wave bit set)
  s_cbranch_scc0  label_0023                       // Only the first wave of the ThreadGroup/WorkGroup clears LDS
  // CLEAR LDS
  //
  s_mov_b32       exec_lo, 0xffffffff
  s_mov_b32       exec_hi, 0xffffffff
  // V1 = thread-ID within the wave. The original note said 0..63; with
  // wave_size(32) only lanes 0..31 exist, which is also what the 0x400 byte
  // stride per iteration below assumes - TODO confirm.
  v_mbcnt_lo_u32_b32  v1, exec_hi, 0
  v_mbcnt_hi_u32_b32  v1, exec_lo, v1
  v_mul_u32_u24   v1, 0x00000008, v1               // * 8, so each thread addresses one double-dword (8 bytes)
  s_mov_b32       s2, 0x0000003f                   // 64 loop iterations (63 down to 0)
  s_mov_b32       m0, 0xffffffff
  // Clear all of LDS space
  // Each first wave of a WorkGroup clears a 64 KB block

label_001F:
  ds_write2_b64   v1, v[2:3], v[2:3] offset1:32              // Two 8-byte stores: byte offsets 0 and 256
  ds_write2_b64   v1, v[4:5], v[4:5] offset0:64 offset1:96   // Two 8-byte stores: byte offsets 512 and 768
  v_add_co_u32    v1, vcc, 0x00000400, v1          // Advance 1 KB per iteration
  s_sub_u32       s2, s2, 1
  s_cbranch_scc0  label_001F                       // Repeat until the subtract borrows

  //
  // CLEAR SGPRs
  //
label_0023:
  s_mov_b32       m0, 0x00000068                   // Start at s104; loop runs 108/4 = 27 times (unrolled for performance)
label_sgpr_loop:
  s_movreld_b32   s0, 0                            // s[M0 + 0] = 0
  s_movreld_b32   s1, 0
  s_movreld_b32   s2, 0
  s_movreld_b32   s3, 0                            // s[M0 + 3] = 0
  s_sub_u32       m0, m0, 4
  s_cbranch_scc0  label_sgpr_loop                  // Repeat until the subtract borrows (after s0..s3 are cleared)

  // Clear vcc
  s_mov_b64       vcc, 0
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_lo, 0   //clear flat scratch lo SGPR
  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_hi, 0   //clear flat scratch hi SGPR
  s_mov_b64       ttmp0, 0                         // Clear ttmp0 and ttmp1
  s_mov_b64       ttmp2, 0                         // Clear ttmp2 and ttmp3
  s_mov_b64       ttmp4, 0                         // Clear ttmp4 and ttmp5
  s_mov_b64       ttmp6, 0                         // Clear ttmp6 and ttmp7
  s_mov_b64       ttmp8, 0                         // Clear ttmp8 and ttmp9
  s_mov_b64       ttmp10, 0                        // Clear ttmp10 and ttmp11
  s_mov_b64       ttmp12, 0                        // Clear ttmp12 and ttmp13
  s_mov_b64       ttmp14, 0                        // Clear ttmp14 and ttmp15

  s_endpgm

end
125+
126+

0 commit comments

Comments
 (0)