Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvm: switch to vm just to run guest code #10216

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions pkg/ring0/defs_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

package ring0

import (
"gvisor.dev/gvisor/pkg/sentry/arch"
)

var (
// VirtualAddressBits is the number of bits available in the virtual
// address space.
Expand Down Expand Up @@ -144,6 +148,17 @@ type CPUArchState struct {
hasXSAVE bool
hasXSAVEOPT bool
hasFSGSBASE bool

SwitchOptsNeedIRET uint64
SwitchOptsRegs *arch.Registers
SwitchOptsFPU *byte
SwitchOptsUserCR3 uint64
SwitchOptsVector Vector
SwitchOptsFaultAddr uintptr
SwitchOptsErrorCode uintptr
SwitchOptsErrorType uintptr
SwitchOptsInterrupted uint64
SwitchOptsStack [256]byte
}

// ErrorCode returns the last error code.
Expand Down Expand Up @@ -180,6 +195,10 @@ func (c *CPU) FaultAddr() uintptr {
return c.faultAddr
}

// KernelCR3 computes the CR3 value of the kernel page tables for the given
// PCID and stores it in c.kernelCR3. Despite the getter-like name it returns
// nothing.
//
// NOTE(review): the first argument to PageTables.CR3 is passed as true;
// presumably this is the no-flush flag -- confirm against PageTables.CR3.
func (c *CPU) KernelCR3(kernelPCID uint16) {
	cr3 := c.kernel.PageTables.CR3(true, kernelPCID)
	c.kernelCR3 = uintptr(cr3)
}

// SwitchArchOpts are embedded in SwitchOpts.
type SwitchArchOpts struct {
// UserPCID indicates that the application PCID to be used on switch,
Expand Down
3 changes: 3 additions & 0 deletions pkg/ring0/entry_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ func start()
// wrapper function rather than the function itself. We must reference from
// assembly to get the ABI0 (i.e., primary) address.
func AddrOfStart() uintptr
// AddrOfDoSwitchToUserLoop returns the address of doSwitchToUserLoop. It is
// generated in entry_amd64.s with ADDR_OF_FUNC so that the address of the
// assembly function itself is obtained.
func AddrOfDoSwitchToUserLoop() uintptr

// doSwitchToUserLoop is implemented in entry_amd64.s. It takes no Go
// arguments; the assembly addresses the CPU structure through the SI
// register.
func doSwitchToUserLoop()

// Exception stubs.
func divideByZero()
Expand Down
117 changes: 114 additions & 3 deletions pkg/ring0/entry_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
#include "textflag.h"

// CPU offsets.
#define CPU_REGISTERS 72 // +checkoffset . CPU.registers
#define CPU_FPU_STATE 288 // +checkoffset . CPU.floatingPointState
#define CPU_REGISTERS 400 // +checkoffset . CPU.registers
#define CPU_FPU_STATE 616 // +checkoffset . CPU.floatingPointState
#define CPU_ARCH_STATE 16 // +checkoffset . CPU.CPUArchState
#define CPU_ERROR_CODE CPU_ARCH_STATE+0 // +checkoffset . CPUArchState.errorCode
#define CPU_ERROR_TYPE CPU_ARCH_STATE+8 // +checkoffset . CPUArchState.errorType
Expand All @@ -27,7 +27,16 @@
#define CPU_APP_GS_BASE CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.appGsBase
#define CPU_HAS_XSAVE CPU_ARCH_STATE+48 // +checkoffset . CPUArchState.hasXSAVE
#define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+49 // +checkoffset . CPUArchState.hasXSAVEOPT
#define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE
#define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE //
// Offsets of the CPUArchState.SwitchOpts* fields, expressed relative to the
// start of the CPU structure. doSwitchToUserLoop applies them to the CPU
// pointer it keeps in SI.
#define CPU_SWITCH_OPTS_NEED_IRET CPU_ARCH_STATE+56 // +checkoffset . CPUArchState.SwitchOptsNeedIRET
#define CPU_SWITCH_OPTS_REGS CPU_ARCH_STATE+64 // +checkoffset . CPUArchState.SwitchOptsRegs
#define CPU_SWITCH_OPTS_FPU CPU_ARCH_STATE+72 // +checkoffset . CPUArchState.SwitchOptsFPU
#define CPU_SWITCH_OPTS_USER_CR3 CPU_ARCH_STATE+80 // +checkoffset . CPUArchState.SwitchOptsUserCR3
#define CPU_SWITCH_OPTS_VECTOR CPU_ARCH_STATE+88 // +checkoffset . CPUArchState.SwitchOptsVector
#define CPU_SWITCH_OPTS_FAULT_ADDR CPU_ARCH_STATE+96 // +checkoffset . CPUArchState.SwitchOptsFaultAddr
#define CPU_SWITCH_OPTS_ERROR_CODE CPU_ARCH_STATE+104 // +checkoffset . CPUArchState.SwitchOptsErrorCode
#define CPU_SWITCH_OPTS_ERROR_TYPE CPU_ARCH_STATE+112 // +checkoffset . CPUArchState.SwitchOptsErrorType
#define CPU_SWITCH_OPTS_INT CPU_ARCH_STATE+120 // +checkoffset . CPUArchState.SwitchOptsInterrupted

#define ENTRY_SCRATCH0 256 // +checkoffset . kernelEntry.scratch0
#define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop
Expand Down Expand Up @@ -214,6 +223,108 @@ TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0
MOVQ AX, 0(SP)
RET

// doSwitchToUserLoop is declared in entry_amd64.go. See kernel_amd64.go.
//
// SI is used as the CPU pointer throughout: every CPU_* offset below is
// applied to SI. NOTE(review): the code that loads SI before entry is not
// visible here -- confirm the caller's contract.
//
// Flow:
//  1. If the low byte of CPUArchState.SwitchOptsInterrupted is 1, skip the
//     switch and store vector 20 in SwitchOptsVector.
//  2. Otherwise restore the floating point state from SwitchOptsFPU, update
//     the application GS base if it differs from the cached value, and call
//     sysret() or iret() depending on SwitchOptsNeedIRET.
//  3. Store the returned vector, CR2, the error code and the error type in
//     the SwitchOpts* fields, then save the floating point state back to
//     SwitchOptsFPU.
//  4. Load -1 into AX and execute SYSCALL.
//
// The 16-byte frame currently only holds the CPU pointer at 0(SP); the saves
// of MXCSR and the x87 control word that the frame was sized for are
// commented out below.
TEXT ·doSwitchToUserLoop(SB),NOSPLIT,$16
// We are passed pointers to heap objects, but do not store them in our
// local frame.
// NO_LOCAL_POINTERS

// Keep the CPU pointer in the frame; it is reloaded after sysret/iret.
MOVQ SI, 0(SP)
// MXCSR and the x87 control word are the only floating point state
// that is callee-save and thus we must save.
// NOTE(review): both saves are currently disabled.
//STMXCSR mxcsr-0(SP)
//FSTCW cw-8(SP)

// Only the low byte of the 64-bit SwitchOptsInterrupted field is tested.
CMPB CPU_SWITCH_OPTS_INT(SI), $1
JE interrupted

// Restore application floating point state.
// MOVQ cpu+0(FP), SI
MOVQ CPU_SWITCH_OPTS_FPU(SI), DI
MOVB ·hasXSAVE(SB), BX
TESTB BX, BX
JZ no_xrstor
// Use xrstor to restore all available fp state.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
JMP fprestore_done
no_xrstor:
// Fall back to fxrstor if xsave is not available.
FXRSTOR64 0(DI)
fprestore_done:

// Set application GS.
// R8 holds the SwitchOptsRegs pointer from here until the sysret/iret call.
MOVQ CPU_SWITCH_OPTS_REGS(SI), R8
SWAP_GS()
MOVQ PTRACE_GS_BASE(R8), AX
// Skip the write when the requested GS base equals the cached one.
CMPQ AX, CPU_APP_GS_BASE(SI)
JE skip_gs
MOVQ AX, CPU_APP_GS_BASE(SI)
// The new base is pushed before the call and popped afterwards; presumably
// writeGS reads it as its stack argument -- confirm against writeGS.
PUSHQ AX
CALL ·writeGS(SB)
POPQ AX
skip_gs:
// Call sysret() or iret().
MOVQ CPU_SWITCH_OPTS_USER_CR3(SI), CX
MOVQ CPU_SWITCH_OPTS_NEED_IRET(SI), R9
// Reserve 32 bytes: three 8-byte arguments plus the slot at 24(SP) that is
// read back as the returned vector.
ADDQ $-32, SP
MOVQ SI, 0(SP) // cpu
MOVQ R8, 8(SP) // regs
MOVQ CX, 16(SP) // userCR3
// A non-zero SwitchOptsNeedIRET selects iret(); zero selects sysret().
TESTQ R9, R9
JNZ do_iret
CALL ·sysret(SB)
JMP done_sysret_or_iret
do_iret:
CALL ·iret(SB)
done_sysret_or_iret:
MOVQ 24(SP), AX // vector
ADDQ $32, SP
// Reload the CPU pointer stored at function entry.
MOVQ 0(SP), SI
MOVQ AX, CPU_SWITCH_OPTS_VECTOR(SI)

// Copy CR2 and the CPU's error code/type into the SwitchOpts* fields.
BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX
MOVQ BX, CPU_SWITCH_OPTS_FAULT_ADDR(SI)
MOVQ CPU_ERROR_CODE(SI), BX
MOVQ BX, CPU_SWITCH_OPTS_ERROR_CODE(SI)
MOVQ CPU_ERROR_TYPE(SI), BX
MOVQ BX, CPU_SWITCH_OPTS_ERROR_TYPE(SI)

// Save application floating point state.
MOVQ CPU_SWITCH_OPTS_FPU(SI), DI
MOVB ·hasXSAVE(SB), BX
MOVB ·hasXSAVEOPT(SB), CX
TESTB BX, BX
JZ no_xsave
// Use xsave/xsaveopt to save all extended state.
MOVL $XCR0_EAX, AX
MOVL $XCR0_EDX, DX
TESTB CX, CX
JZ no_xsaveopt
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
JMP fpsave_done
no_xsaveopt:
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
JMP fpsave_done
no_xsave:
FXSAVE64 0(DI)
fpsave_done:


// Execute SYSCALL with AX = -1. NOTE(review): the effect depends on the
// syscall entry point, which is not visible here -- presumably this hands
// control back to the host; confirm.
MOVQ $0xffffffffffffffff, AX
SYSCALL
// NOTE(review): nothing separates SYSCALL from the label below, so if
// SYSCALL returns to the next instruction, execution falls through into the
// interrupted path and jumps back to fpsave_done.
interrupted:
// Report vector 20 without switching. The floating point state was not
// restored on this path, so the save sequence is skipped as well.
MOVQ $20, CPU_SWITCH_OPTS_VECTOR(SI)
JMP fpsave_done

// Not reached: the JMP above is unconditional and no label precedes RET.
RET

// Emits AddrOfDoSwitchToUserLoop, declared in entry_amd64.go.
ADDR_OF_FUNC(·AddrOfDoSwitchToUserLoop(SB), ·doSwitchToUserLoop(SB));

// See kernel_amd64.go.
//
// The 16-byte frame size is for the saved values of MXCSR and the x87 control
Expand Down
5 changes: 5 additions & 0 deletions pkg/ring0/kernel_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ func (c *CPU) StackTop() uint64 {
return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
}

// SwitchOptsStackTop returns the address 32 bytes below the end of
// c.SwitchOptsStack.
//
// The size is taken from the array itself rather than repeated as a literal,
// so the result stays correct if SwitchOptsStack is resized (StackTop does
// the same for c.stack).
//
// NOTE(review): the 32-byte gap presumably matches the 32-byte call frame
// that doSwitchToUserLoop builds in entry_amd64.s -- confirm. Unlike
// StackTop, the address is not passed through kernelAddr.
//
//go:nosplit
func (c *CPU) SwitchOptsStackTop() uint64 {
	base := reflect.ValueOf(&c.SwitchOptsStack[0]).Pointer()
	return uint64(base) + uint64(len(c.SwitchOptsStack)) - 32
}

// IDT returns the CPU's IDT base and limit.
//
//go:nosplit
Expand Down
13 changes: 7 additions & 6 deletions pkg/ring0/x86.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ const (
_EFER_LMA = 0x400
_EFER_NX = 0x800

_MSR_STAR = 0xc0000081
_MSR_LSTAR = 0xc0000082
_MSR_CSTAR = 0xc0000083
_MSR_SYSCALL_MASK = 0xc0000084
_MSR_PLATFORM_INFO = 0xce
_MSR_MISC_FEATURES = 0x140
_MSR_STAR = 0xc0000081
_MSR_LSTAR = 0xc0000082
_MSR_CSTAR = 0xc0000083
_MSR_SYSCALL_MASK = 0xc0000084
_MSR_PLATFORM_INFO = 0xce
_MSR_MISC_FEATURES = 0x140
_MSR_IA32_SPEC_CTRL = 0x48

_PLATFORM_INFO_CPUID_FAULT = 1 << 31

Expand Down
4 changes: 2 additions & 2 deletions pkg/sentry/platform/kvm/address_space.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.Fil
}

// See bluepill_allocator.go.
bluepill(as.pageTables.Allocator.(*allocator).cpu)
// bluepill(as.pageTables.Allocator.(*allocator).cpu)

// Perform the mapping.
prev := as.mapLocked(addr, hostMapEntry{
Expand Down Expand Up @@ -216,7 +216,7 @@ func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
// See above & bluepill_allocator.go.
as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
bluepill(as.pageTables.Allocator.(*allocator).cpu)
//bluepill(as.pageTables.Allocator.(*allocator).cpu)

if prev := as.unmapLocked(addr, length); prev {
// Invalidate all active vCPUs.
Expand Down
12 changes: 6 additions & 6 deletions pkg/sentry/platform/kvm/bluepill_allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ func newAllocator() *allocator {
//go:nosplit
func (a *allocator) NewPTEs() *pagetables.PTEs {
// Allocate from the wrapped allocator, then bluepill again if a vCPU is
// attached (per the "escapes" note on the call).
ptes := a.base.NewPTEs() // escapes: bluepill below.
if a.cpu != nil {
bluepill(a.cpu)
}
// NOTE(review): the commented-out block below duplicates the live check
// directly above; keep only one of the two once it is decided whether the
// bluepill is still required here.
//if a.cpu != nil {
// bluepill(a.cpu)
//}
return ptes
}

Expand Down Expand Up @@ -87,9 +87,9 @@ func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
//go:nosplit
func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
// Release through the wrapped allocator, then bluepill again if a vCPU is
// attached (per the "escapes" note on the call).
a.base.FreePTEs(ptes) // escapes: bluepill below.
if a.cpu != nil {
bluepill(a.cpu)
}
// NOTE(review): the commented-out block below duplicates the live check
// directly above; keep only one of the two once it is decided whether the
// bluepill is still required here.
//if a.cpu != nil {
// bluepill(a.cpu)
//}
}

// Recycle implements pagetables.Allocator.Recycle.
Expand Down
Loading
Loading