Skip to content

Commit 12f230c

Browse files
Muralidhara M Kbp3tk0v
authored and committed
EDAC/amd64: Add support for family 0x19, models 0x90-9f devices
AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support is enabled by default. APU models have a single Data Fabric (DF) per Package. Each DF is visible to the OS in the same way as chiplet-based systems like Zen2 CPUs and later. However, the Unified Memory Controllers (UMCs) are arranged in the same way as GPU-based MI200 devices rather than CPU-based systems. Use the existing gpu_ops for heterogeneous systems to support enumeration of nodes and memory topology with few fixups. [ bp: Massage comments. ] Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com
1 parent 9a5f580 commit 12f230c

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

drivers/edac/amd64_edac.c

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -996,15 +996,23 @@ static struct local_node_map {
996996
#define LNTM_NODE_COUNT GENMASK(27, 16)
997997
#define LNTM_BASE_NODE_ID GENMASK(11, 0)
998998

999-
static int gpu_get_node_map(void)
999+
static int gpu_get_node_map(struct amd64_pvt *pvt)
10001000
{
10011001
struct pci_dev *pdev;
10021002
int ret;
10031003
u32 tmp;
10041004

10051005
/*
1006-
* Node ID 0 is reserved for CPUs.
1007-
* Therefore, a non-zero Node ID means we've already cached the values.
1006+
* Mapping of nodes from hardware-provided AMD Node ID to a
1007+
* Linux logical one is applicable for MI200 models. Therefore,
1008+
* return early for other heterogeneous systems.
1009+
*/
1010+
if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
1011+
return 0;
1012+
1013+
/*
1014+
* Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID
1015+
* means the values have been already cached.
10081016
*/
10091017
if (gpu_node_map.base_node_id)
10101018
return 0;
@@ -3851,7 +3859,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
38513859

38523860
dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
38533861
dimm->edac_mode = EDAC_SECDED;
3854-
dimm->mtype = MEM_HBM2;
3862+
dimm->mtype = pvt->dram_type;
38553863
dimm->dtype = DEV_X16;
38563864
dimm->grain = 64;
38573865
}
@@ -3880,7 +3888,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
38803888
return true;
38813889
}
38823890

3883-
static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
3891+
static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel)
38843892
{
38853893
/*
38863894
* On CPUs, there is one channel per UMC, so UMC numbering equals
@@ -3893,13 +3901,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
38933901
* On GPU nodes channels are selected in 3rd nibble
38943902
* HBM chX[3:0]= [Y ]5X[3:0]000;
38953903
* HBM chX[7:4]= [Y+1]5X[3:0]000
3904+
*
3905+
* On MI300 APU nodes, same as GPU nodes but channels are selected
3906+
* in the base address of 0x90000
38963907
*/
38973908
umc *= 2;
38983909

38993910
if (channel >= 4)
39003911
umc++;
39013912

3902-
return 0x50000 + (umc << 20) + ((channel % 4) << 12);
3913+
return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
39033914
}
39043915

39053916
static void gpu_read_mc_regs(struct amd64_pvt *pvt)
@@ -3910,7 +3921,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt)
39103921

39113922
/* Read registers from each UMC */
39123923
for_each_umc(i) {
3913-
umc_base = gpu_get_umc_base(i, 0);
3924+
umc_base = gpu_get_umc_base(pvt, i, 0);
39143925
umc = &pvt->umc[i];
39153926

39163927
amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg);
@@ -3927,15 +3938,15 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
39273938

39283939
for_each_umc(umc) {
39293940
for_each_chip_select(cs, umc, pvt) {
3930-
base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR;
3941+
base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR;
39313942
base = &pvt->csels[umc].csbases[cs];
39323943

39333944
if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) {
39343945
edac_dbg(0, " DCSB%d[%d]=0x%08x reg: 0x%x\n",
39353946
umc, cs, *base, base_reg);
39363947
}
39373948

3938-
mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK;
3949+
mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK;
39393950
mask = &pvt->csels[umc].csmasks[cs];
39403951

39413952
if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) {
@@ -3960,7 +3971,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
39603971
{
39613972
int ret;
39623973

3963-
ret = gpu_get_node_map();
3974+
ret = gpu_get_node_map(pvt);
39643975
if (ret)
39653976
return ret;
39663977

@@ -4125,6 +4136,8 @@ static int per_family_init(struct amd64_pvt *pvt)
41254136
if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) {
41264137
pvt->ctl_name = "MI200";
41274138
pvt->max_mcs = 4;
4139+
pvt->dram_type = MEM_HBM2;
4140+
pvt->gpu_umc_base = 0x50000;
41284141
pvt->ops = &gpu_ops;
41294142
} else {
41304143
pvt->ctl_name = "F19h_M30h";
@@ -4142,6 +4155,13 @@ static int per_family_init(struct amd64_pvt *pvt)
41424155
pvt->ctl_name = "F19h_M70h";
41434156
pvt->flags.zn_regs_v2 = 1;
41444157
break;
4158+
case 0x90 ... 0x9f:
4159+
pvt->ctl_name = "F19h_M90h";
4160+
pvt->max_mcs = 4;
4161+
pvt->dram_type = MEM_HBM3;
4162+
pvt->gpu_umc_base = 0x90000;
4163+
pvt->ops = &gpu_ops;
4164+
break;
41454165
case 0xa0 ... 0xaf:
41464166
pvt->ctl_name = "F19h_MA0h";
41474167
pvt->max_mcs = 12;
@@ -4180,23 +4200,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
41804200
NULL
41814201
};
41824202

4203+
/*
4204+
* For heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers
4205+
* should be swapped to fit into the layers.
4206+
*/
4207+
static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
4208+
{
4209+
bool is_gpu = (pvt->ops == &gpu_ops);
4210+
4211+
if (!layer)
4212+
return is_gpu ? pvt->max_mcs
4213+
: pvt->csels[0].b_cnt;
4214+
else
4215+
return is_gpu ? pvt->csels[0].b_cnt
4216+
: pvt->max_mcs;
4217+
}
4218+
41834219
static int init_one_instance(struct amd64_pvt *pvt)
41844220
{
41854221
struct mem_ctl_info *mci = NULL;
41864222
struct edac_mc_layer layers[2];
41874223
int ret = -ENOMEM;
41884224

4189-
/*
4190-
* For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should
4191-
* be swapped to fit into the layers.
4192-
*/
41934225
layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
4194-
layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
4195-
pvt->max_mcs : pvt->csels[0].b_cnt;
4226+
layers[0].size = get_layer_size(pvt, 0);
41964227
layers[0].is_virt_csrow = true;
41974228
layers[1].type = EDAC_MC_LAYER_CHANNEL;
4198-
layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
4199-
pvt->csels[0].b_cnt : pvt->max_mcs;
4229+
layers[1].size = get_layer_size(pvt, 1);
42004230
layers[1].is_virt_csrow = false;
42014231

42024232
mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);

drivers/edac/amd64_edac.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ struct amd64_pvt {
362362
u32 dct_sel_lo; /* DRAM Controller Select Low */
363363
u32 dct_sel_hi; /* DRAM Controller Select High */
364364
u32 online_spare; /* On-Line spare Reg */
365+
u32 gpu_umc_base; /* Base address used for channel selection on GPUs */
365366

366367
/* x4, x8, or x16 syndromes in use */
367368
u8 ecc_sym_sz;

0 commit comments

Comments
 (0)