Skip to content

Commit 0b21f21

Browse files
committed
Merge tag 'edac_for_4.20_2' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp
Pull more EDAC updates from Borislav Petkov: "The second part of the EDAC pile which contains the ADXL user and a build fix which addresses a not-so-sensical .config but fixes randconfig builds people do: - skx_edac: Address translation for NVDIMMs (Tony Luck and Qiuxu Zhuo) - ACPI_ADXL build fix" [ I don't think "sensical" is a word, particularly when used in the context of actually meaning "nonsensical", but I like it - Linus ] * tag 'edac_for_4.20_2' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: EDAC, skx: Fix randconfig builds EDAC, skx_edac: Add address translation for non-volatile DIMMs
2 parents 54480aa + a324e93 commit 0b21f21

File tree

3 files changed

+186
-13
lines changed

3 files changed

+186
-13
lines changed

drivers/edac/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ config EDAC_SKX
234234
depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
235235
depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
236236
select DMI
237+
select ACPI_ADXL if ACPI
237238
help
238239
Support for error detection and correction on the Intel
239240
Skylake server Integrated Memory Controllers. If your

drivers/edac/skx_edac.c

Lines changed: 180 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <linux/bitmap.h>
2727
#include <linux/math64.h>
2828
#include <linux/mod_devicetable.h>
29+
#include <linux/adxl.h>
2930
#include <acpi/nfit.h>
3031
#include <asm/cpu_device_id.h>
3132
#include <asm/intel-family.h>
@@ -35,6 +36,7 @@
3536
#include "edac_module.h"
3637

3738
#define EDAC_MOD_STR "skx_edac"
39+
#define MSG_SIZE 1024
3840

3941
/*
4042
* Debug macros
@@ -54,6 +56,29 @@
5456
static LIST_HEAD(skx_edac_list);
5557

5658
static u64 skx_tolm, skx_tohm;
59+
static char *skx_msg;
60+
static unsigned int nvdimm_count;
61+
62+
enum {
63+
INDEX_SOCKET,
64+
INDEX_MEMCTRL,
65+
INDEX_CHANNEL,
66+
INDEX_DIMM,
67+
INDEX_MAX
68+
};
69+
70+
static const char * const component_names[] = {
71+
[INDEX_SOCKET] = "ProcessorSocketId",
72+
[INDEX_MEMCTRL] = "MemoryControllerId",
73+
[INDEX_CHANNEL] = "ChannelId",
74+
[INDEX_DIMM] = "DimmSlotId",
75+
};
76+
77+
static int component_indices[ARRAY_SIZE(component_names)];
78+
static int adxl_component_count;
79+
static const char * const *adxl_component_names;
80+
static u64 *adxl_values;
81+
static char *adxl_msg;
5782

5883
#define NUM_IMC 2 /* memory controllers per socket */
5984
#define NUM_CHANNELS 3 /* channels per memory controller */
@@ -393,6 +418,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
393418
u16 flags;
394419
u64 size = 0;
395420

421+
nvdimm_count++;
422+
396423
dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
397424
imc->src_id, 0);
398425

@@ -941,12 +968,46 @@ static void teardown_skx_debug(void)
941968
}
942969
#endif /*CONFIG_EDAC_DEBUG*/
943970

971+
static bool skx_adxl_decode(struct decoded_addr *res)
972+
973+
{
974+
int i, len = 0;
975+
976+
if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
977+
res->addr < BIT_ULL(32))) {
978+
edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
979+
return false;
980+
}
981+
982+
if (adxl_decode(res->addr, adxl_values)) {
983+
edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
984+
return false;
985+
}
986+
987+
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
988+
res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
989+
res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
990+
res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
991+
992+
for (i = 0; i < adxl_component_count; i++) {
993+
if (adxl_values[i] == ~0x0ull)
994+
continue;
995+
996+
len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
997+
adxl_component_names[i], adxl_values[i]);
998+
if (MSG_SIZE - len <= 0)
999+
break;
1000+
}
1001+
1002+
return true;
1003+
}
1004+
9441005
static void skx_mce_output_error(struct mem_ctl_info *mci,
9451006
const struct mce *m,
9461007
struct decoded_addr *res)
9471008
{
9481009
enum hw_event_mc_err_type tp_event;
949-
char *type, *optype, msg[256];
1010+
char *type, *optype;
9501011
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
9511012
bool overflow = GET_BITFIELD(m->status, 62, 62);
9521013
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -1007,22 +1068,47 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
10071068
break;
10081069
}
10091070
}
1071+
if (adxl_component_count) {
1072+
snprintf(skx_msg, MSG_SIZE, "%s%s err_code:%04x:%04x %s",
1073+
overflow ? " OVERFLOW" : "",
1074+
(uncorrected_error && recoverable) ? " recoverable" : "",
1075+
mscod, errcode, adxl_msg);
1076+
} else {
1077+
snprintf(skx_msg, MSG_SIZE,
1078+
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
1079+
overflow ? " OVERFLOW" : "",
1080+
(uncorrected_error && recoverable) ? " recoverable" : "",
1081+
mscod, errcode,
1082+
res->socket, res->imc, res->rank,
1083+
res->bank_group, res->bank_address, res->row, res->column);
1084+
}
10101085

1011-
snprintf(msg, sizeof(msg),
1012-
"%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
1013-
overflow ? " OVERFLOW" : "",
1014-
(uncorrected_error && recoverable) ? " recoverable" : "",
1015-
mscod, errcode,
1016-
res->socket, res->imc, res->rank,
1017-
res->bank_group, res->bank_address, res->row, res->column);
1018-
1019-
edac_dbg(0, "%s\n", msg);
1086+
edac_dbg(0, "%s\n", skx_msg);
10201087

10211088
/* Call the helper to output message */
10221089
edac_mc_handle_error(tp_event, mci, core_err_cnt,
10231090
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
10241091
res->channel, res->dimm, -1,
1025-
optype, msg);
1092+
optype, skx_msg);
1093+
}
1094+
1095+
static struct mem_ctl_info *get_mci(int src_id, int lmc)
1096+
{
1097+
struct skx_dev *d;
1098+
1099+
if (lmc > NUM_IMC - 1) {
1100+
skx_printk(KERN_ERR, "Bad lmc %d\n", lmc);
1101+
return NULL;
1102+
}
1103+
1104+
list_for_each_entry(d, &skx_edac_list, list) {
1105+
if (d->imc[0].src_id == src_id)
1106+
return d->imc[lmc].mci;
1107+
}
1108+
1109+
skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc);
1110+
1111+
return NULL;
10261112
}
10271113

10281114
static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@@ -1040,10 +1126,23 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
10401126
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
10411127
return NOTIFY_DONE;
10421128

1129+
memset(&res, 0, sizeof(res));
10431130
res.addr = mce->addr;
1044-
if (!skx_decode(&res))
1131+
1132+
if (adxl_component_count) {
1133+
if (!skx_adxl_decode(&res))
1134+
return NOTIFY_DONE;
1135+
1136+
mci = get_mci(res.socket, res.imc);
1137+
} else {
1138+
if (!skx_decode(&res))
1139+
return NOTIFY_DONE;
1140+
1141+
mci = res.dev->imc[res.imc].mci;
1142+
}
1143+
1144+
if (!mci)
10451145
return NOTIFY_DONE;
1046-
mci = res.dev->imc[res.imc].mci;
10471146

10481147
if (mce->mcgstatus & MCG_STATUS_MCIP)
10491148
type = "Exception";
@@ -1094,6 +1193,62 @@ static void skx_remove(void)
10941193
}
10951194
}
10961195

1196+
/*
 * skx_adxl_get() - set up firmware-assisted (ADXL) address translation.
 *
 * Fetches the component name list from the firmware, records where each
 * entry of component_names[] sits in that list, and allocates the value and
 * message buffers used by skx_adxl_decode().
 *
 * On success adxl_component_count is non-zero, which is what the error
 * handling path checks to choose the ADXL decoder. On any failure
 * (no firmware support, a required component missing, out of memory)
 * adxl_component_count stays zero and adxl_values/adxl_msg are NULL, so a
 * later skx_adxl_put() is harmless.
 */
static void __init skx_adxl_get(void)
{
	const char * const *names;
	int i, j, count = 0;

	names = adxl_get_component_names();
	if (!names) {
		skx_printk(KERN_NOTICE, "No firmware support for address translation.");
		skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
		return;
	}

	/* Every component this driver relies on must be offered by firmware. */
	for (i = 0; i < INDEX_MAX; i++) {
		for (j = 0; names[j]; j++) {
			if (!strcmp(component_names[i], names[j])) {
				component_indices[i] = j;
				break;
			}
		}

		/* Inner loop hit the NULL terminator: no match was found. */
		if (!names[j])
			goto err;
	}

	while (names[count])
		count++;

	adxl_values = kcalloc(count, sizeof(*adxl_values), GFP_KERNEL);
	if (!adxl_values)
		return;

	adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
	if (!adxl_msg) {
		kfree(adxl_values);
		/* skx_adxl_put() frees this pointer again; avoid a double free. */
		adxl_values = NULL;
		return;
	}

	/* Publish the ADXL state only once both buffers exist. */
	adxl_component_names = names;
	adxl_component_count = count;

	return;
err:
	skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
		   component_names[i]);
	for (j = 0; names[j]; j++)
		skx_printk(KERN_CONT, "%s ", names[j]);
	skx_printk(KERN_CONT, "\n");
}
1245+
1246+
/*
 * skx_adxl_put() - release the buffers allocated by skx_adxl_get().
 *
 * Both pointers are handed straight to kfree(); no other state is touched.
 */
static void __exit skx_adxl_put(void)
{
	kfree(adxl_msg);
	kfree(adxl_values);
}
1251+
10971252
/*
10981253
* skx_init:
10991254
* make sure we are running on the correct cpu model
@@ -1158,6 +1313,15 @@ static int __init skx_init(void)
11581313
}
11591314
}
11601315

1316+
skx_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
1317+
if (!skx_msg) {
1318+
rc = -ENOMEM;
1319+
goto fail;
1320+
}
1321+
1322+
if (nvdimm_count)
1323+
skx_adxl_get();
1324+
11611325
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
11621326
opstate_init();
11631327

@@ -1176,6 +1340,9 @@ static void __exit skx_exit(void)
11761340
edac_dbg(2, "\n");
11771341
mce_unregister_decode_chain(&skx_mce_dec);
11781342
skx_remove();
1343+
if (nvdimm_count)
1344+
skx_adxl_put();
1345+
kfree(skx_msg);
11791346
teardown_skx_debug();
11801347
}
11811348

include/linux/adxl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@
77
#ifndef _LINUX_ADXL_H
88
#define _LINUX_ADXL_H
99

10+
#ifdef CONFIG_ACPI_ADXL
1011
const char * const *adxl_get_component_names(void);
1112
int adxl_decode(u64 addr, u64 component_values[]);
13+
#else
14+
static inline const char * const *adxl_get_component_names(void) { return NULL; }
15+
static inline int adxl_decode(u64 addr, u64 component_values[]) { return -EOPNOTSUPP; }
16+
#endif
1217

1318
#endif /* _LINUX_ADXL_H */

0 commit comments

Comments
 (0)