Skip to content

Commit 733aab5

Browse files
committed
EDAC/amd64: Add support for family 0x19, models 0x90-9f devices
JIRA: https://issues.redhat.com/browse/RHEL-10022 Tested: by AMD commit 12f230c Author: Muralidhara M K <muralidhara.mk@amd.com> Date: Thu Nov 2 11:42:25 2023 +0000 EDAC/amd64: Add support for family 0x19, models 0x90-9f devices AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support is enabled by default. APU models have a single Data Fabric (DF) per Package. Each DF is visible to the OS in the same way as chiplet-based systems like Zen2 CPUs and later. However, the Unified Memory Controllers (UMCs) are arranged in the same way as GPU-based MI200 devices rather than CPU-based systems. Use the existing gpu_ops for heterogeneous systems to support enumeration of nodes and memory topology with a few fixups. [ bp: Massage comments. ] Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
1 parent 62a51f2 commit 733aab5

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

drivers/edac/amd64_edac.c

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,15 +1001,23 @@ static struct local_node_map {
10011001
#define LNTM_NODE_COUNT GENMASK(27, 16)
10021002
#define LNTM_BASE_NODE_ID GENMASK(11, 0)
10031003

1004-
static int gpu_get_node_map(void)
1004+
static int gpu_get_node_map(struct amd64_pvt *pvt)
10051005
{
10061006
struct pci_dev *pdev;
10071007
int ret;
10081008
u32 tmp;
10091009

10101010
/*
1011-
* Node ID 0 is reserved for CPUs.
1012-
* Therefore, a non-zero Node ID means we've already cached the values.
1011+
* Mapping of nodes from hardware-provided AMD Node ID to a
1012+
* Linux logical one is applicable for MI200 models. Therefore,
1013+
* return early for other heterogeneous systems.
1014+
*/
1015+
if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
1016+
return 0;
1017+
1018+
/*
1019+
* Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID
1020+
* means the values have already been cached.
10131021
*/
10141022
if (gpu_node_map.base_node_id)
10151023
return 0;
@@ -3856,7 +3864,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
38563864

38573865
dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
38583866
dimm->edac_mode = EDAC_SECDED;
3859-
dimm->mtype = MEM_HBM2;
3867+
dimm->mtype = pvt->dram_type;
38603868
dimm->dtype = DEV_X16;
38613869
dimm->grain = 64;
38623870
}
@@ -3885,7 +3893,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
38853893
return true;
38863894
}
38873895

3888-
static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
3896+
static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel)
38893897
{
38903898
/*
38913899
* On CPUs, there is one channel per UMC, so UMC numbering equals
@@ -3898,13 +3906,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
38983906
* On GPU nodes channels are selected in 3rd nibble
38993907
* HBM chX[3:0]= [Y ]5X[3:0]000;
39003908
* HBM chX[7:4]= [Y+1]5X[3:0]000
3909+
*
3910+
* On MI300 APU nodes, same as GPU nodes but channels are selected
3911+
* in the base address of 0x90000
39013912
*/
39023913
umc *= 2;
39033914

39043915
if (channel >= 4)
39053916
umc++;
39063917

3907-
return 0x50000 + (umc << 20) + ((channel % 4) << 12);
3918+
return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
39083919
}
39093920

39103921
static void gpu_read_mc_regs(struct amd64_pvt *pvt)
@@ -3915,7 +3926,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt)
39153926

39163927
/* Read registers from each UMC */
39173928
for_each_umc(i) {
3918-
umc_base = gpu_get_umc_base(i, 0);
3929+
umc_base = gpu_get_umc_base(pvt, i, 0);
39193930
umc = &pvt->umc[i];
39203931

39213932
amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg);
@@ -3932,15 +3943,15 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
39323943

39333944
for_each_umc(umc) {
39343945
for_each_chip_select(cs, umc, pvt) {
3935-
base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR;
3946+
base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR;
39363947
base = &pvt->csels[umc].csbases[cs];
39373948

39383949
if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) {
39393950
edac_dbg(0, " DCSB%d[%d]=0x%08x reg: 0x%x\n",
39403951
umc, cs, *base, base_reg);
39413952
}
39423953

3943-
mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK;
3954+
mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK;
39443955
mask = &pvt->csels[umc].csmasks[cs];
39453956

39463957
if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) {
@@ -3965,7 +3976,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
39653976
{
39663977
int ret;
39673978

3968-
ret = gpu_get_node_map();
3979+
ret = gpu_get_node_map(pvt);
39693980
if (ret)
39703981
return ret;
39713982

@@ -4129,6 +4140,8 @@ static int per_family_init(struct amd64_pvt *pvt)
41294140
if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) {
41304141
pvt->ctl_name = "MI200";
41314142
pvt->max_mcs = 4;
4143+
pvt->dram_type = MEM_HBM2;
4144+
pvt->gpu_umc_base = 0x50000;
41324145
pvt->ops = &gpu_ops;
41334146
} else {
41344147
pvt->ctl_name = "F19h_M30h";
@@ -4143,6 +4156,13 @@ static int per_family_init(struct amd64_pvt *pvt)
41434156
pvt->ctl_name = "F19h_M70h";
41444157
pvt->flags.zn_regs_v2 = 1;
41454158
break;
4159+
case 0x90 ... 0x9f:
4160+
pvt->ctl_name = "F19h_M90h";
4161+
pvt->max_mcs = 4;
4162+
pvt->dram_type = MEM_HBM3;
4163+
pvt->gpu_umc_base = 0x90000;
4164+
pvt->ops = &gpu_ops;
4165+
break;
41464166
case 0xa0 ... 0xaf:
41474167
pvt->ctl_name = "F19h_MA0h";
41484168
pvt->max_mcs = 12;
@@ -4181,23 +4201,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
41814201
NULL
41824202
};
41834203

4204+
/*
4205+
* For heterogeneous and APU models (pvt->ops == &gpu_ops) the sizes of the
4206+
* EDAC CHIP_SELECT and CHANNEL layers are swapped to fit into the layers:
* layer 0 gets pvt->max_mcs and any non-zero layer gets pvt->csels[0].b_cnt;
* all other models use the opposite assignment.
4207+
*/
4208+
static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
4209+
{
4210+
bool is_gpu = (pvt->ops == &gpu_ops);
4211+
4212+
if (!layer)
4213+
return is_gpu ? pvt->max_mcs
4214+
: pvt->csels[0].b_cnt;
4215+
else
4216+
return is_gpu ? pvt->csels[0].b_cnt
4217+
: pvt->max_mcs;
4218+
}
4219+
41844220
static int init_one_instance(struct amd64_pvt *pvt)
41854221
{
41864222
struct mem_ctl_info *mci = NULL;
41874223
struct edac_mc_layer layers[2];
41884224
int ret = -ENOMEM;
41894225

4190-
/*
4191-
* For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should
4192-
* be swapped to fit into the layers.
4193-
*/
41944226
layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
4195-
layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
4196-
pvt->max_mcs : pvt->csels[0].b_cnt;
4227+
layers[0].size = get_layer_size(pvt, 0);
41974228
layers[0].is_virt_csrow = true;
41984229
layers[1].type = EDAC_MC_LAYER_CHANNEL;
4199-
layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
4200-
pvt->csels[0].b_cnt : pvt->max_mcs;
4230+
layers[1].size = get_layer_size(pvt, 1);
42014231
layers[1].is_virt_csrow = false;
42024232

42034233
mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);

drivers/edac/amd64_edac.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ struct amd64_pvt {
363363
u32 dct_sel_lo; /* DRAM Controller Select Low */
364364
u32 dct_sel_hi; /* DRAM Controller Select High */
365365
u32 online_spare; /* On-Line spare Reg */
366+
u32 gpu_umc_base; /* Base address used for channel selection on GPUs */
366367

367368
/* x4, x8, or x16 syndromes in use */
368369
u8 ecc_sym_sz;

0 commit comments

Comments
 (0)