
Commit 9a22bf6

kiryl authored and hansendc committed
x86/traps: Add #VE support for TDX guest
Virtualization Exceptions (#VE) are delivered to TDX guests due to specific guest actions which may happen in either user space or the kernel:

 * Specific instructions (WBINVD, for example)
 * Specific MSR accesses
 * Specific CPUID leaf accesses
 * Access to specific guest physical addresses

Syscall entry code has a critical window where the kernel stack is not yet set up. Any exception in this window leads to hard-to-debug issues and can be exploited for privilege escalation. Exceptions in the NMI entry code also cause issues: returning from the exception handler with IRET will re-enable NMIs, and a nested NMI will corrupt the NMI stack.

For these reasons, the kernel avoids #VEs during the syscall gap and the NMI entry code. Entry code paths do not access TD-shared memory or MMIO regions, and do not use #VE-triggering MSRs, instructions, or CPUID leaves that might generate #VE. The VMM can remove memory from a TD at any point, but access to unaccepted (or missing) private memory leads to VM termination, not to #VE.

Similarly to page faults and breakpoints, #VEs are allowed in NMI handlers once the kernel is ready to deal with nested NMIs.

During #VE delivery, all interrupts, including NMIs, are blocked until TDGETVEINFO is called. This prevents #VE nesting until the kernel reads the VE info.

TDGETVEINFO retrieves the #VE info from the TDX module, which also clears the "#VE valid" flag. This must be done before anything else because any #VE that occurs while the valid flag is set is escalated to a #DF by the TDX module, which will result in an oops.

Virtual NMIs are inhibited if the #VE valid flag is set. NMIs will not be delivered until TDGETVEINFO is called.

For now, convert unhandled #VEs (everything, until later in this series) so that they appear just like a #GP, by calling ve_raise_fault() directly. ve_raise_fault() is similar to the #GP handler and is responsible for sending SIGSEGV to userspace, calling die_addr() in the kernel, and notifying debuggers and other die-chain users.

Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220405232939.73860-8-kirill.shutemov@linux.intel.com
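For illustration only (not part of this commit): with this patch applied, a #VE triggered from user space surfaces as SIGSEGV. A minimal user-space sketch, assuming the program runs inside a TDX guest and that CPUID leaf 0x40000000 (the hypervisor range) is configured by the TDX module to cause #VE:

	/* Hypothetical demo: run inside a TDX guest with this patch applied. */
	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* Hypervisor-range CPUID leaves are assumed here to be
		 * #VE-inducing under TDX; at this point in the series the
		 * unhandled #VE is converted to SIGSEGV by ve_raise_fault(). */
		__cpuid(0x40000000, eax, ebx, ecx, edx);

		/* Only reached if the leaf was actually handled (e.g. by a
		 * later patch in this series that forwards CPUID to the VMM). */
		printf("eax=%#x ebx=%#x ecx=%#x edx=%#x\n", eax, ebx, ecx, edx);
		return 0;
	}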
1 parent 775acc8 commit 9a22bf6

File tree

5 files changed: +152 lines, -0 lines


arch/x86/coco/tdx/tdx.c

Lines changed: 38 additions & 0 deletions
@@ -10,6 +10,7 @@

 /* TDX module Call Leaf IDs */
 #define TDX_GET_INFO			1
+#define TDX_GET_VEINFO			3

 /*
  * Wrapper for standard use of __tdx_hypercall with no output aside from
@@ -73,6 +74,43 @@ static u64 get_cc_mask(void)
 	return BIT_ULL(gpa_width - 1);
 }

+void tdx_get_ve_info(struct ve_info *ve)
+{
+	struct tdx_module_output out;
+
+	/*
+	 * Called during #VE handling to retrieve the #VE info from the
+	 * TDX module.
+	 *
+	 * This has to be called early in #VE handling. A "nested" #VE which
+	 * occurs before this will raise a #DF and is not recoverable.
+	 *
+	 * The call retrieves the #VE info from the TDX module, which also
+	 * clears the "#VE valid" flag. This must be done before anything else
+	 * because any #VE that occurs while the valid flag is set will lead
+	 * to #DF.
+	 *
+	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
+	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
+	 */
+	tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+
+	/* Transfer the output parameters */
+	ve->exit_reason = out.rcx;
+	ve->exit_qual   = out.rdx;
+	ve->gla         = out.r8;
+	ve->gpa         = out.r9;
+	ve->instr_len   = lower_32_bits(out.r10);
+	ve->instr_info  = upper_32_bits(out.r10);
+}
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
+{
+	pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+
+	return false;
+}
+
 void __init tdx_early_init(void)
 {
 	u64 cc_mask;
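tdx_handle_virt_exception() is deliberately left as a stub here. Later patches in this series grow it into a dispatcher over ve->exit_reason; a minimal sketch of that shape, where the EXIT_REASON_* constants come from <asm/vmx.h> and handle_halt()/read_msr() are illustrative helper names, not part of this commit:

	/* Sketch only: the eventual shape of the #VE dispatcher. */
	bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
	{
		switch (ve->exit_reason) {
		case EXIT_REASON_HLT:
			return handle_halt();		/* hypothetical helper */
		case EXIT_REASON_MSR_READ:
			return read_msr(regs);		/* hypothetical helper */
		default:
			pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
			return false;
		}
	}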

arch/x86/include/asm/idtentry.h

Lines changed: 4 additions & 0 deletions
@@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER,	exc_xen_hypervisor_callback);
 DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER,	exc_xen_unknown_trap);
 #endif

+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE,		exc_virtualization_exception);
+#endif
+
 /* Device interrupts common/spurious */
 DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER,	common_interrupt);
 #ifdef CONFIG_X86_LOCAL_APIC

arch/x86/include/asm/tdx.h

Lines changed: 21 additions & 0 deletions
@@ -6,6 +6,7 @@
 #include <linux/bits.h>
 #include <linux/init.h>
 #include <linux/bits.h>
+#include <asm/ptrace.h>

 #define TDX_CPUID_LEAF_ID	0x21
 #define TDX_IDENT		"IntelTDX    "
@@ -56,6 +57,22 @@ struct tdx_hypercall_args {
 	u64 r15;
 };

+/*
+ * Used by the #VE exception handler to gather the #VE exception
+ * info from the TDX module. This is a software only structure
+ * and not part of the TDX module/VMM ABI.
+ */
+struct ve_info {
+	u64 exit_reason;
+	u64 exit_qual;
+	/* Guest Linear (virtual) Address */
+	u64 gla;
+	/* Guest Physical Address */
+	u64 gpa;
+	u32 instr_len;
+	u32 instr_info;
+};
+
 #ifdef CONFIG_INTEL_TDX_GUEST

 void __init tdx_early_init(void);
@@ -70,6 +87,10 @@ u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags);
 /* Called from __tdx_hypercall() for unrecoverable failure */
 void __tdx_hypercall_failed(void);

+void tdx_get_ve_info(struct ve_info *ve);
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
 #else

 static inline void tdx_early_init(void) { };
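The instr_len field exists so that a handler which successfully emulates the faulting operation can advance past the instruction that triggered the #VE. A minimal sketch of that pattern (the helper name is hypothetical; later patches in the series open-code this):

	/* Sketch only: skip the instruction that caused the #VE, using the
	 * instruction length reported by the TDX module in the #VE info. */
	static void ve_skip_instruction(struct pt_regs *regs, struct ve_info *ve)
	{
		regs->ip += ve->instr_len;
	}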

arch/x86/kernel/idt.c

Lines changed: 3 additions & 0 deletions
@@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = {
	 */
	INTG(X86_TRAP_PF,		asm_exc_page_fault),
 #endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+	INTG(X86_TRAP_VE,		asm_exc_virtualization_exception),
+#endif
 };

 /*

arch/x86/kernel/traps.c

Lines changed: 86 additions & 0 deletions
@@ -62,6 +62,7 @@
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
 #include <asm/vdso.h>
+#include <asm/tdx.h>

 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -1348,6 +1349,91 @@ DEFINE_IDTENTRY(exc_device_not_available)
 	}
 }

+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+	if (user_mode(regs)) {
+		gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+		return;
+	}
+
+	if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+		return;
+
+	die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ *  * Specific instructions (WBINVD, for example)
+ *  * Specific MSR accesses
+ *  * Specific CPUID leaf accesses
+ *  * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted.
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard to debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues. Returning from the exception handler with
+ * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory,
+ * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
+ * that might generate #VE. VMM can remove memory from TD at any point,
+ * but access to unaccepted (or missing) private memory leads to VM
+ * termination, not to #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
+ * the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (fault
+ * exception) is delivered to the guest which will result in an oops.
+ *
+ * The entry code has been audited carefully for following these
+ * expectations. Changes in the entry code have to be audited for
+ * correctness vs. this aspect. Similarly to #PF, #VE in these places
+ * will expose kernel to privilege escalation or may lead to random
+ * crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+	struct ve_info ve;
+
+	/*
+	 * NMIs/Machine-checks/Interrupts will be in a disabled state
+	 * till TDGETVEINFO TDCALL is executed. This ensures that VE
+	 * info cannot be overwritten by a nested #VE.
+	 */
+	tdx_get_ve_info(&ve);
+
+	cond_local_irq_enable(regs);
+
+	/*
+	 * If tdx_handle_virt_exception() could not process
+	 * it successfully, treat it as #GP(0) and handle it.
+	 */
+	if (!tdx_handle_virt_exception(regs, &ve))
+		ve_raise_fault(regs, 0);
+
+	cond_local_irq_disable(regs);
+}
+
+#endif
+
 #ifdef CONFIG_X86_32
 DEFINE_IDTENTRY_SW(iret_error)
 {
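Because unhandled kernel-mode #VEs funnel through gp_try_fixup_and_notify(), they show up on the die chain as DIE_GPF events carrying trapnr == X86_TRAP_VE and the "VE fault" string. A minimal sketch of an observer, assuming the standard die-notifier API:

	/* Sketch only: watch for unhandled kernel-mode #VEs via the die chain. */
	#include <linux/kdebug.h>
	#include <linux/notifier.h>
	#include <asm/trapnr.h>

	static int ve_die_notify(struct notifier_block *nb, unsigned long action,
				 void *data)
	{
		struct die_args *args = data;

		/* ve_raise_fault() reports kernel #VEs as DIE_GPF. */
		if (action == DIE_GPF && args->trapnr == X86_TRAP_VE)
			pr_warn("VE fault observed at ip=%lx\n", args->regs->ip);

		return NOTIFY_DONE;
	}

	static struct notifier_block ve_die_nb = {
		.notifier_call = ve_die_notify,
	};

	/* registered elsewhere with register_die_notifier(&ve_die_nb) */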
