Skip to content

Commit ef5055c

Browse files
committed
EDAC/skx_common: Enable EDAC support for the "near" memory
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2165650
Tested: tested with the EMR machine we have in the lab

commit 6e8746c
Author: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri Jan 13 11:27:58 2023 +0800

    EDAC/skx_common: Enable EDAC support for the "near" memory

    The current {skx,i10nm}_edac drivers lack EDAC support for decoding errors from the 1st level memory (the fast "near" memory used as cache) of the 2-level memory system.

    Introduce a helper function skx_error_in_mem() and call it at the beginning of skx_mce_check_error() to check whether errors are from memory. As long as the errors are from memory (either the 1-level memory system or the 2-level memory system), decode the errors.

    Reported-and-tested-by: Youquan Song <youquan.song@intel.com>
    Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
    Signed-off-by: Tony Luck <tony.luck@intel.com>
    Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com

Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
1 parent e52fe10 commit ef5055c

File tree

2 files changed

+36
-6
lines changed

2 files changed

+36
-6
lines changed

drivers/edac/skx_common.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -629,12 +629,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m)
629629
if (!skx_mem_cfg_2lm)
630630
return false;
631631

632-
errcode = GET_BITFIELD(m->status, 0, 15);
632+
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
633633

634-
if ((errcode & 0xef80) != 0x280)
635-
return false;
634+
return errcode == MCACOD_EXT_MEM_ERR;
635+
}
636636

637-
return true;
637+
static bool skx_error_in_mem(const struct mce *m)
638+
{
639+
u32 errcode;
640+
641+
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
642+
643+
return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
638644
}
639645

640646
int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
@@ -648,8 +654,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
648654
if (mce->kflags & MCE_HANDLED_CEC)
649655
return NOTIFY_DONE;
650656

651-
/* ignore unless this is memory related with an address */
652-
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
657+
/* Ignore unless this is memory related with an address */
658+
if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
653659
return NOTIFY_DONE;
654660

655661
memset(&res, 0, sizeof(res));

drivers/edac/skx_common.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,30 @@
5252
#define IS_DIMM_PRESENT(r) GET_BITFIELD(r, 15, 15)
5353
#define IS_NVDIMM_PRESENT(r, i) GET_BITFIELD(r, i, i)
5454

55+
/*
56+
* According to Intel Architecture spec vol 3B,
57+
* Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
58+
* memory errors should fit one of these masks:
59+
* 000f 0000 1mmm cccc (binary)
60+
* 000f 0010 1mmm cccc (binary) [RAM used as cache]
61+
* where:
62+
* f = Correction Report Filtering Bit. If 1, subsequent errors
63+
* won't be shown
64+
* mmm = error type
65+
* cccc = channel
66+
*/
67+
#define MCACOD_MEM_ERR_MASK 0xef80
68+
/*
69+
* Errors from either the memory of the 1-level memory system or the
70+
* 2nd level memory (the slow "far" memory) of the 2-level memory system.
71+
*/
72+
#define MCACOD_MEM_CTL_ERR 0x80
73+
/*
74+
* Errors from the 1st level memory (the fast "near" memory as cache)
75+
* of the 2-level memory system.
76+
*/
77+
#define MCACOD_EXT_MEM_ERR 0x280
78+
5579
/*
5680
* Each cpu socket contains some pci devices that provide global
5781
* information, and also some that are local to each of the two

0 commit comments

Comments
 (0)