Skip to content

Commit ea149b3

Browse files
Andi Kleen, H. Peter Anvin
authored and committed
x86, mce: add basic error injection infrastructure
Allow user programs to write mce records into /dev/mcelog. When they do that a fake machine check is triggered to test the machine check code. This uses the MCE MSR wrappers added earlier. The implementation is straightforward. There is a struct mce record per CPU and the MCE MSR accesses get data from there if there is valid data injected there. This makes it possible to test the machine check code relatively realistically because only the lowest layer of hardware access is intercepted. The test suite and injector are available at git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
1 parent 5f8c1a5 commit ea149b3

File tree

5 files changed

+176
-1
lines changed

5 files changed

+176
-1
lines changed

arch/x86/Kconfig

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,14 @@ config X86_MCE_THRESHOLD
835835
bool
836836
default y
837837

838+
config X86_MCE_INJECT
839+
depends on X86_NEW_MCE
840+
tristate "Machine check injector support"
841+
---help---
842+
Provide support for injecting machine checks for testing purposes.
843+
If you don't know what a machine check is and you don't do kernel
844+
QA it is safe to say n.
845+
838846
config X86_MCE_NONFATAL
839847
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
840848
depends on X86_OLD_MCE

arch/x86/include/asm/mce.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
141141

142142
extern int mce_notify_user(void);
143143

144+
DECLARE_PER_CPU(struct mce, injectm);
145+
extern struct file_operations mce_chrdev_ops;
146+
144147
#ifdef CONFIG_X86_MCE
145148
extern void mcheck_init(struct cpuinfo_x86 *c);
146149
#else

arch/x86/kernel/cpu/mcheck/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o
77
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
88
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
99
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10+
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Machine check injection support.
3+
* Copyright 2008 Intel Corporation.
4+
*
5+
* This program is free software; you can redistribute it and/or
6+
* modify it under the terms of the GNU General Public License
7+
* as published by the Free Software Foundation; version 2
8+
* of the License.
9+
*
10+
* Authors:
11+
* Andi Kleen
12+
* Ying Huang
13+
*/
14+
#include <linux/module.h>
15+
#include <linux/timer.h>
16+
#include <linux/kernel.h>
17+
#include <linux/string.h>
18+
#include <linux/fs.h>
19+
#include <linux/smp.h>
20+
#include <asm/uaccess.h>
21+
#include <asm/mce.h>
22+
23+
/* Update fake mce registers on current CPU. */
24+
static void inject_mce(struct mce *m)
25+
{
26+
struct mce *i = &per_cpu(injectm, m->cpu);
27+
28+
/* Make sure noone reads partially written injectm */
29+
i->finished = 0;
30+
mb();
31+
m->finished = 0;
32+
/* First set the fields after finished */
33+
i->cpu = m->cpu;
34+
mb();
35+
/* Now write record in order, finished last (except above) */
36+
memcpy(i, m, sizeof(struct mce));
37+
/* Finally activate it */
38+
mb();
39+
i->finished = 1;
40+
}
41+
42+
/*
 * One queued injection request: allocated in mce_write(), handed to
 * raise_mce() through the timer's data argument and freed there.
 */
struct delayed_mce {
43+
struct timer_list timer; /* fires raise_mce() on the target CPU */
44+
struct mce m; /* copy of the record written by user space */
45+
};
46+
47+
/* Inject mce on current CPU */
48+
static void raise_mce(unsigned long data)
49+
{
50+
struct delayed_mce *dm = (struct delayed_mce *)data;
51+
struct mce *m = &dm->m;
52+
int cpu = m->cpu;
53+
54+
inject_mce(m);
55+
if (m->status & MCI_STATUS_UC) {
56+
struct pt_regs regs;
57+
memset(&regs, 0, sizeof(struct pt_regs));
58+
regs.ip = m->ip;
59+
regs.cs = m->cs;
60+
printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61+
do_machine_check(&regs, 0);
62+
printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63+
} else {
64+
mce_banks_t b;
65+
memset(&b, 0xff, sizeof(mce_banks_t));
66+
printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67+
machine_check_poll(0, &b);
68+
mce_notify_user();
69+
printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70+
cpu);
71+
}
72+
kfree(dm);
73+
}
74+
75+
/* Error injection interface */
76+
static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77+
size_t usize, loff_t *off)
78+
{
79+
struct delayed_mce *dm;
80+
struct mce m;
81+
82+
if (!capable(CAP_SYS_ADMIN))
83+
return -EPERM;
84+
/*
85+
* There are some cases where real MSR reads could slip
86+
* through.
87+
*/
88+
if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89+
return -EIO;
90+
91+
if ((unsigned long)usize > sizeof(struct mce))
92+
usize = sizeof(struct mce);
93+
if (copy_from_user(&m, ubuf, usize))
94+
return -EFAULT;
95+
96+
if (m.cpu >= NR_CPUS || !cpu_online(m.cpu))
97+
return -EINVAL;
98+
99+
dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100+
if (!dm)
101+
return -ENOMEM;
102+
103+
/*
104+
* Need to give user space some time to set everything up,
105+
* so do it a jiffie or two later everywhere.
106+
* Should we use a hrtimer here for better synchronization?
107+
*/
108+
memcpy(&dm->m, &m, sizeof(struct mce));
109+
setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110+
dm->timer.expires = jiffies + 2;
111+
add_timer_on(&dm->timer, m.cpu);
112+
return usize;
113+
}
114+
115+
static int inject_init(void)
116+
{
117+
printk(KERN_INFO "Machine check injector initialized\n");
118+
mce_chrdev_ops.write = mce_write;
119+
return 0;
120+
}
121+
122+
module_init(inject_init);
123+
/* Cannot tolerate unloading currently because we cannot
124+
* guarantee all openers of mce_chrdev will get a reference to us.
125+
*/
126+
MODULE_LICENSE("GPL");

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ void mce_setup(struct mce *m)
9898
rdtscll(m->tsc);
9999
}
100100

101+
DEFINE_PER_CPU(struct mce, injectm);
102+
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
103+
101104
/*
102105
* Lockless MCE logging infrastructure.
103106
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -194,16 +197,46 @@ static void mce_panic(char *msg, struct mce *backup, u64 start)
194197
panic(msg);
195198
}
196199

200+
/* Support code for software error injection */
201+
202+
/*
 * Map an MSR number to the byte offset of the struct mce field that stands
 * in for that register during software error injection.
 *
 * The per-bank STATUS/ADDR/MISC MSRs are matched against the bank number
 * stored in this CPU's injectm record. rip_msr and MSR_IA32_MCG_STATUS
 * are matched directly.
 *
 * Returns the field offset, or -1 if the MSR has no counterpart in the
 * injected record.
 */
static int msr_to_offset(u32 msr)
{
	/* Access the member of the per-CPU object, not a pasted symbol name */
	unsigned bank = __get_cpu_var(injectm).bank;

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}
217+
197218
/* MSR access wrappers used for error injection */
198219
/*
 * Read an MSR on behalf of the machine check code.
 *
 * While this CPU's injectm record is active (finished != 0) the value is
 * taken from the corresponding field of that record, or 0 when the MSR has
 * no field. Otherwise the real MSR is read.
 */
static u64 mce_rdmsrl(u32 msr)
{
	u64 val;
	int off;

	if (!__get_cpu_var(injectm).finished) {
		rdmsrl(msr, val);
		return val;
	}

	off = msr_to_offset(msr);
	return off < 0 ? 0 : *(u64 *)((char *)&__get_cpu_var(injectm) + off);
}
204231

205232
/*
 * Write an MSR on behalf of the machine check code.
 *
 * While this CPU's injectm record is active (finished != 0) the value is
 * stored in the corresponding field of that record and the hardware is
 * never touched. Writes to MSRs without a field are dropped. Otherwise
 * the real MSR is written.
 */
static void mce_wrmsrl(u32 msr, u64 v)
{
	int off;

	if (!__get_cpu_var(injectm).finished) {
		wrmsrl(msr, v);
		return;
	}

	off = msr_to_offset(msr);
	if (off < 0)
		return;
	*(u64 *)((char *)&__get_cpu_var(injectm) + off) = v;
}
209242

@@ -296,6 +329,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
296329
* exceptions.
297330
*/
298331
}
332+
EXPORT_SYMBOL_GPL(machine_check_poll);
299333

300334
/*
301335
* The actual machine check handler. This only handles real
@@ -468,6 +502,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
468502
out2:
469503
atomic_dec(&mce_entry);
470504
}
505+
EXPORT_SYMBOL_GPL(do_machine_check);
471506

472507
#ifdef CONFIG_X86_MCE_INTEL
473508
/***
@@ -568,6 +603,7 @@ int mce_notify_user(void)
568603
}
569604
return 0;
570605
}
606+
EXPORT_SYMBOL_GPL(mce_notify_user);
571607

572608
/*
573609
* Initialize Machine Checks for a CPU.
@@ -904,13 +940,14 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
904940
}
905941
}
906942

907-
static const struct file_operations mce_chrdev_ops = {
943+
struct file_operations mce_chrdev_ops = {
908944
.open = mce_open,
909945
.release = mce_release,
910946
.read = mce_read,
911947
.poll = mce_poll,
912948
.unlocked_ioctl = mce_ioctl,
913949
};
950+
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
914951

915952
static struct miscdevice mce_log_device = {
916953
MISC_MCELOG_MINOR,

0 commit comments

Comments
 (0)