From 3aeaf18bc5d932921e37760e69e25de455a37ab3 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Sat, 18 Apr 2026 11:24:30 -0700 Subject: [PATCH 1/2] refactor(vm): move VFIO-related fields to struct Machine Signed-off-by: Changyuan Lyu --- alioth/src/board/board.rs | 14 -------------- alioth/src/vm/vm.rs | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/alioth/src/board/board.rs b/alioth/src/board/board.rs index fee407e8..74d5ec5e 100644 --- a/alioth/src/board/board.rs +++ b/alioth/src/board/board.rs @@ -19,8 +19,6 @@ mod aarch64; #[path = "board_x86_64/board_x86_64.rs"] mod x86_64; -#[cfg(target_os = "linux")] -use std::collections::HashMap; use std::ffi::CStr; use std::sync::Arc; use std::thread::JoinHandle; @@ -52,10 +50,6 @@ use crate::loader::{Executable, InitState, Payload, linux}; use crate::mem::mapped::ArcMemPages; use crate::mem::{MemBackend, MemConfig, MemRegion, MemRegionType, Memory}; use crate::pci::bus::PciBus; -#[cfg(target_os = "linux")] -use crate::vfio::container::Container; -#[cfg(target_os = "linux")] -use crate::vfio::iommu::Ioas; #[cfg(target_arch = "aarch64")] use self::aarch64::ArchBoard; @@ -224,10 +218,6 @@ where pub pci_bus: PciBus, #[cfg(target_arch = "x86_64")] pub fw_cfg: Mutex>>>, - #[cfg(target_os = "linux")] - pub vfio_ioases: Mutex, Arc>>, - #[cfg(target_os = "linux")] - pub vfio_containers: Mutex, Arc>>, mp_sync: Mutex, cond_var: Condvar, @@ -262,10 +252,6 @@ where pci_bus: PciBus::new(), #[cfg(target_arch = "x86_64")] fw_cfg: Mutex::new(None), - #[cfg(target_os = "linux")] - vfio_ioases: Mutex::new(HashMap::new()), - #[cfg(target_os = "linux")] - vfio_containers: Mutex::new(HashMap::new()), mp_sync: Mutex::new(MpSync { state: BoardState::Paused, diff --git a/alioth/src/vm/vm.rs b/alioth/src/vm/vm.rs index 95ba96e0..006a488d 100644 --- a/alioth/src/vm/vm.rs +++ b/alioth/src/vm/vm.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +#[cfg(target_os = "linux")] +use std::collections::HashMap; #[cfg(target_os = "linux")] use std::path::Path; use std::sync::Arc; @@ -107,8 +109,14 @@ where H: Hypervisor, { board: Arc>, + #[cfg(target_os = "linux")] iommu: Mutex>>, + #[cfg(target_os = "linux")] + pub vfio_ioases: Mutex, Arc>>, + #[cfg(target_os = "linux")] + pub vfio_containers: Mutex, Arc>>, + event_rx: Receiver, _event_tx: Sender, } @@ -151,6 +159,10 @@ where _event_tx: event_tx, #[cfg(target_os = "linux")] iommu: Mutex::new(None), + #[cfg(target_os = "linux")] + vfio_ioases: Mutex::new(HashMap::new()), + #[cfg(target_os = "linux")] + vfio_containers: Mutex::new(HashMap::new()), }; Ok(vm) @@ -311,7 +323,7 @@ where const DEFAULT_NAME: &str = "default"; pub fn add_vfio_ioas(&self, param: IoasParam) -> Result, Error> { - let mut ioases = self.board.vfio_ioases.lock(); + let mut ioases = self.vfio_ioases.lock(); if ioases.contains_key(&param.name) { return error::AlreadyExists { name: param.name }.fail(); } @@ -337,7 +349,7 @@ where fn get_ioas(&self, name: Option<&str>) -> Result> { let ioas_name = name.unwrap_or(Self::DEFAULT_NAME); - if let Some(ioas) = self.board.vfio_ioases.lock().get(ioas_name) { + if let Some(ioas) = self.vfio_ioases.lock().get(ioas_name) { return Ok(ioas.clone()); }; if name.is_none() { @@ -367,7 +379,7 @@ where } pub fn add_vfio_container(&self, param: ContainerParam) -> Result, Error> { - let mut containers = self.board.vfio_containers.lock(); + let mut containers = self.vfio_containers.lock(); if containers.contains_key(&param.name) { return error::AlreadyExists { name: param.name }.fail(); } @@ -387,7 +399,7 @@ where fn get_container(&self, name: Option<&str>) -> Result> { let container_name = name.unwrap_or(Self::DEFAULT_NAME); - if let Some(container) = self.board.vfio_containers.lock().get(container_name) { + if let Some(container) = self.vfio_containers.lock().get(container_name) { return Ok(container.clone()); } if name.is_none() { From 
cdcc4d99617e28077832828e65ffeb7fe6e4edd0 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Sat, 18 Apr 2026 22:49:55 -0700 Subject: [PATCH 2/2] refactor(cpu): move VCPU thread orchestration to mod cpu Signed-off-by: Changyuan Lyu --- alioth/src/board/board.rs | 326 +---------------- alioth/src/board/board_aarch64.rs | 34 +- alioth/src/board/board_x86_64/board_x86_64.rs | 105 +----- alioth/src/board/board_x86_64/sev.rs | 130 +------ alioth/src/board/board_x86_64/tdx.rs | 87 +---- alioth/src/cpu/cpu.rs | 333 ++++++++++++++++++ alioth/src/cpu/cpu_aarch64.rs | 48 +++ alioth/src/cpu/cpu_x86_64/cpu_x86_64.rs | 111 ++++++ alioth/src/cpu/cpu_x86_64/sev.rs | 155 ++++++++ alioth/src/cpu/cpu_x86_64/tdx.rs | 106 ++++++ alioth/src/lib.rs | 2 + alioth/src/vm/vm.rs | 195 ++++++---- 12 files changed, 900 insertions(+), 732 deletions(-) create mode 100644 alioth/src/cpu/cpu.rs create mode 100644 alioth/src/cpu/cpu_aarch64.rs create mode 100644 alioth/src/cpu/cpu_x86_64/cpu_x86_64.rs create mode 100644 alioth/src/cpu/cpu_x86_64/sev.rs create mode 100644 alioth/src/cpu/cpu_x86_64/tdx.rs diff --git a/alioth/src/board/board.rs b/alioth/src/board/board.rs index 74d5ec5e..3b59360d 100644 --- a/alioth/src/board/board.rs +++ b/alioth/src/board/board.rs @@ -21,14 +21,14 @@ mod x86_64; use std::ffi::CStr; use std::sync::Arc; -use std::thread::JoinHandle; -use flume::Sender; use libc::{MAP_PRIVATE, MAP_SHARED}; -use parking_lot::{Condvar, Mutex, RwLock, RwLockReadGuard}; +#[cfg(target_arch = "x86_64")] +use parking_lot::Mutex; +use parking_lot::RwLock; use serde::Deserialize; use serde_aco::Help; -use snafu::{ResultExt, Snafu}; +use snafu::Snafu; #[cfg(target_arch = "x86_64")] use crate::arch::cpuid::CpuidIn; @@ -43,10 +43,8 @@ use crate::device::MmioDev; #[cfg(target_arch = "x86_64")] use crate::device::fw_cfg::FwCfg; use crate::errors::{DebugTrace, trace_error}; -use crate::hv::{Coco, Hypervisor, Vcpu, Vm, VmConfig, VmEntry, VmExit}; -#[cfg(target_arch = "x86_64")] -use crate::loader::xen; 
-use crate::loader::{Executable, InitState, Payload, linux}; +use crate::hv::{Coco, Hypervisor, Vm, VmConfig}; +use crate::loader::Payload; use crate::mem::mapped::ArcMemPages; use crate::mem::{MemBackend, MemConfig, MemRegion, MemRegionType, Memory}; use crate::pci::bus::PciBus; @@ -64,44 +62,13 @@ pub enum Error { HvError { source: Box }, #[snafu(display("Failed to access guest memory"), context(false))] Memory { source: Box }, - #[snafu(display("Failed to load payload"), context(false))] - Loader { source: Box }, #[snafu(display("Invalid CPU topology"))] InvalidCpuTopology, - #[snafu(display("Failed to create VCPU-{index}"))] - CreateVcpu { - index: u16, - source: Box, - }, - #[snafu(display("Failed to run VCPU-{index}"))] - RunVcpu { - index: u16, - source: Box, - }, - #[snafu(display("Failed to stop VCPU-{index}"))] - StopVcpu { - index: u16, - source: Box, - }, - #[snafu(display("Failed to reset PCI devices"))] - ResetPci { source: Box }, - #[snafu(display("Failed to configure firmware"))] + #[snafu(display("Failed to configure fw_cfg device"))] FwCfg { error: std::io::Error }, - #[snafu(display("Missing payload"))] - MissingPayload, - #[snafu(display("Failed to notify the VMM thread"))] - NotifyVmm, - #[snafu(display("Another VCPU thread has signaled failure"))] - PeerFailure, - #[snafu(display("Unexpected state: {state:?}, want {want:?}"))] - UnexpectedState { state: BoardState, want: BoardState }, #[cfg(target_arch = "x86_64")] #[snafu(display("Missing CPUID leaf {leaf:x?}"))] MissingCpuid { leaf: CpuidIn }, - #[snafu(display("Firmware error"), context(false))] - Firmware { source: Box }, - #[snafu(display("Unknown firmware metadata"))] - UnknownFirmwareMetadata, } type Result = std::result::Result; @@ -166,21 +133,6 @@ impl CpuConfig { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BoardState { - Paused, - Running, - Shutdown, - RebootPending, -} - -#[derive(Debug)] -struct MpSync { - state: BoardState, - fatal: bool, - count: u16, -} - pub 
const PCIE_MMIO_64_SIZE: u64 = 1 << 40; #[derive(Debug, Default, PartialEq, Eq, Deserialize)] @@ -200,16 +152,12 @@ impl BoardConfig { } } -type VcpuGuard<'a> = RwLockReadGuard<'a, Vec>; -type VcpuHandle = JoinHandle>; - pub struct Board where V: Vm, { pub vm: V, pub memory: Memory, - pub vcpus: Arc>>, pub arch: ArchBoard, pub config: BoardConfig, pub payload: RwLock>, @@ -218,9 +166,6 @@ where pub pci_bus: PciBus, #[cfg(target_arch = "x86_64")] pub fw_cfg: Mutex>>>, - - mp_sync: Mutex, - cond_var: Condvar, } impl Board @@ -246,19 +191,11 @@ where arch, config, payload: RwLock::new(None), - vcpus: Arc::new(RwLock::new(Vec::new())), io_devs: RwLock::new(Vec::new()), mmio_devs: RwLock::new(Vec::new()), pci_bus: PciBus::new(), #[cfg(target_arch = "x86_64")] fw_cfg: Mutex::new(None), - - mp_sync: Mutex::new(MpSync { - state: BoardState::Paused, - count: 0, - fatal: false, - }), - cond_var: Condvar::new(), }; board.coco_init(vm_memory)?; @@ -266,74 +203,6 @@ where Ok(board) } - pub fn boot(&self) -> Result<()> { - self.resume() - } - - pub fn resume(&self) -> Result<()> { - let mut mp_sync = self.mp_sync.lock(); - if mp_sync.state == BoardState::Paused { - mp_sync.state = BoardState::Running; - } else { - return error::UnexpectedState { - state: mp_sync.state, - want: BoardState::Paused, - } - .fail(); - } - self.cond_var.notify_all(); - Ok(()) - } - - pub fn pause(&self) -> Result<()> { - let vcpus = self.vcpus.read(); - let mut mp_sync = self.mp_sync.lock(); - if mp_sync.state != BoardState::Running { - return error::UnexpectedState { - state: mp_sync.state, - want: BoardState::Running, - } - .fail(); - } - mp_sync.state = BoardState::Paused; - self.stop_other_vcpus(None, &vcpus)?; - Ok(()) - } - - fn load_payload(&self, vcpu: &mut V::Vcpu) -> Result { - let payload = self.payload.read(); - let Some(payload) = payload.as_ref() else { - return error::MissingPayload.fail(); - }; - - if let Some(fw) = payload.firmware.as_ref() { - return self.setup_firmware(fw, payload, 
vcpu); - } - - let Some(exec) = &payload.executable else { - return error::MissingPayload.fail(); - }; - let mem_regions = self.memory.mem_region_entries(); - let init_state = match exec { - Executable::Linux(image) => linux::load( - &self.memory.ram_bus(), - &mem_regions, - image.as_ref(), - payload.cmdline.as_deref(), - payload.initramfs.as_deref(), - ), - #[cfg(target_arch = "x86_64")] - Executable::Pvh(image) => xen::load( - &self.memory.ram_bus(), - &mem_regions, - image.as_ref(), - payload.cmdline.as_deref(), - payload.initramfs.as_deref(), - ), - }?; - Ok(init_state) - } - fn add_pci_devs(&self) -> Result<()> { #[cfg(target_arch = "x86_64")] self.memory @@ -361,182 +230,15 @@ where Ok(()) } - fn vcpu_loop(&self, vcpu: &mut ::Vcpu, index: u16) -> Result { - let mut vm_entry = VmEntry::None; - loop { - let vm_exit = vcpu.run(vm_entry).context(error::RunVcpu { index })?; - vm_entry = match vm_exit { - #[cfg(target_arch = "x86_64")] - VmExit::Io { port, write, size } => self.memory.handle_io(port, write, size)?, - VmExit::Mmio { addr, write, size } => self.memory.handle_mmio(addr, write, size)?, - VmExit::Shutdown => break Ok(BoardState::Shutdown), - VmExit::Reboot => break Ok(BoardState::RebootPending), - VmExit::Paused => break Ok(BoardState::Paused), - VmExit::Interrupted => { - let mp_sync = self.mp_sync.lock(); - match mp_sync.state { - BoardState::Shutdown => VmEntry::Shutdown, - BoardState::RebootPending => VmEntry::Reboot, - BoardState::Paused => VmEntry::Pause, - BoardState::Running => VmEntry::None, - } - } - VmExit::ConvertMemory { gpa, size, private } => { - self.memory.mark_private_memory(gpa, size, private)?; - VmEntry::None - } - }; - } - } - - fn sync_vcpus(&self, vcpus: &VcpuGuard) -> Result<()> { - let mut mp_sync = self.mp_sync.lock(); - if mp_sync.fatal { - return error::PeerFailure.fail(); - } - - mp_sync.count += 1; - if mp_sync.count == vcpus.len() as u16 { - mp_sync.count = 0; - self.cond_var.notify_all(); - } else { - 
self.cond_var.wait(&mut mp_sync) - } - - if mp_sync.fatal { - return error::PeerFailure.fail(); - } - - Ok(()) - } - - fn notify_vmm(&self, index: u16, event_tx: &Sender) -> Result<()> { - if event_tx.send(index).is_err() { - error::NotifyVmm.fail() - } else { - Ok(()) + pub(crate) fn init_devices(&self) -> Result<()> { + self.create_ram()?; + for (port, dev) in self.io_devs.read().iter() { + self.memory.add_io_dev(*port, dev.clone())?; } - } - - fn boot_init_sync(&self, index: u16, vcpu: &mut V::Vcpu) -> Result<()> { - let vcpus = self.vcpus.read(); - if index == 0 { - self.create_ram()?; - for (port, dev) in self.io_devs.read().iter() { - self.memory.add_io_dev(*port, dev.clone())?; - } - for (addr, dev) in self.mmio_devs.read().iter() { - self.memory.add_mmio_dev(*addr, dev.clone())?; - } - self.add_pci_devs()?; - let init_state = self.load_payload(vcpu)?; - self.init_boot_vcpu(vcpu, &init_state)?; - self.create_firmware_data(&init_state)?; - } - self.init_ap(index, vcpu, &vcpus)?; - self.coco_finalize(index, &vcpus)?; - self.sync_vcpus(&vcpus) - } - - fn stop_other_vcpus(&self, current: Option, vcpus: &VcpuGuard) -> Result<()> { - for (index, handle) in vcpus.iter().enumerate() { - let index = index as u16; - if let Some(current) = current { - if current == index { - continue; - } - log::info!("VCPU-{current}: stopping VCPU-{index}"); - } else { - log::info!("Stopping VCPU-{index}"); - } - let identity = self.encode_cpu_identity(index); - self.vm - .stop_vcpu(identity, handle) - .context(error::StopVcpu { index })?; - } - Ok(()) - } - - fn run_vcpu_inner(&self, index: u16, event_tx: &Sender) -> Result<(), Error> { - let mut vcpu = self.create_vcpu(index)?; - self.notify_vmm(index, event_tx)?; - self.init_vcpu(index, &mut vcpu)?; - - 'reboot: loop { - let mut mp_sync = self.mp_sync.lock(); - loop { - match mp_sync.state { - BoardState::Paused => self.cond_var.wait(&mut mp_sync), - BoardState::Running => break, - BoardState::Shutdown => break 'reboot Ok(()), - 
BoardState::RebootPending => mp_sync.state = BoardState::Running, - } - } - drop(mp_sync); - - self.boot_init_sync(index, &mut vcpu)?; - - let request = 'pause: loop { - let request = self.vcpu_loop(&mut vcpu, index); - - let vcpus = self.vcpus.read(); - let mut mp_sync = self.mp_sync.lock(); - if mp_sync.state == BoardState::Running { - mp_sync.state = match request { - Ok(BoardState::RebootPending) => BoardState::RebootPending, - Ok(BoardState::Paused) => BoardState::Paused, - _ => BoardState::Shutdown, - }; - log::trace!("VCPU-{index}: change state to {:?}", mp_sync.state); - self.stop_other_vcpus(Some(index), &vcpus)?; - } - loop { - match mp_sync.state { - BoardState::Running => break, - BoardState::Paused => self.cond_var.wait(&mut mp_sync), - BoardState::RebootPending | BoardState::Shutdown => break 'pause request, - } - } - }; - - if index == 0 { - self.pci_bus.segment.reset().context(error::ResetPci)?; - self.memory.reset()?; - } - self.reset_vcpu(index, &mut vcpu)?; - - request?; - - let vcpus = self.vcpus.read(); - self.sync_vcpus(&vcpus)?; - } - } - - fn create_vcpu(&self, index: u16) -> Result { - let identity = self.encode_cpu_identity(index); - let vcpu = self - .vm - .create_vcpu(index, identity) - .context(error::CreateVcpu { index })?; - Ok(vcpu) - } - - pub fn run_vcpu(&self, index: u16, event_tx: Sender) -> Result<(), Error> { - let ret = self.run_vcpu_inner(index, &event_tx); - - let _ = self.notify_vmm(index, &event_tx); - - if matches!(ret, Ok(_) | Err(Error::PeerFailure { .. 
})) { - return Ok(()); - } - - log::warn!("VCPU-{index} reported error {ret:?}, unblocking other VCPUs..."); - let mut mp_sync = self.mp_sync.lock(); - mp_sync.fatal = true; - if mp_sync.count > 0 { - self.cond_var.notify_all(); + for (addr, dev) in self.mmio_devs.read().iter() { + self.memory.add_mmio_dev(*addr, dev.clone())?; } - ret + self.add_pci_devs() } fn create_ram_pages( diff --git a/alioth/src/board/board_aarch64.rs b/alioth/src/board/board_aarch64.rs index a37498bc..af0fedf6 100644 --- a/alioth/src/board/board_aarch64.rs +++ b/alioth/src/board/board_aarch64.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::collections::HashMap; -use std::path::Path; use std::sync::Arc; use crate::arch::layout::{ @@ -24,10 +23,10 @@ use crate::arch::layout::{ RAM_32_SIZE, RAM_32_START, }; use crate::arch::reg::MpidrEl1; -use crate::board::{Board, BoardConfig, CpuTopology, PCIE_MMIO_64_SIZE, Result, VcpuGuard}; +use crate::board::{Board, BoardConfig, CpuTopology, PCIE_MMIO_64_SIZE, Result}; use crate::firmware::dt::{DeviceTree, Node, PropVal}; -use crate::hv::{GicV2, GicV2m, GicV3, Hypervisor, Its, Vcpu, Vm}; -use crate::loader::{Executable, InitState, Payload}; +use crate::hv::{GicV2, GicV2m, GicV3, Hypervisor, Its, Vm}; +use crate::loader::{Executable, InitState}; use crate::mem::{MemRegion, MemRegionType}; enum Gic @@ -108,29 +107,6 @@ where encode_mpidr(&self.config.cpu.topology, index).0 } - pub fn setup_firmware(&self, _: &Path, _: &Payload, _: &V::Vcpu) -> Result { - unimplemented!() - } - - pub fn init_ap(&self, _id: u16, _vcpu: &mut V::Vcpu, _vcpus: &VcpuGuard) -> Result<()> { - Ok(()) - } - - pub fn init_boot_vcpu(&self, vcpu: &mut V::Vcpu, init_state: &InitState) -> Result<()> { - vcpu.set_regs(&init_state.regs)?; - vcpu.set_sregs(&init_state.sregs)?; - Ok(()) - } - - pub fn init_vcpu(&self, index: u16, vcpu: &mut V::Vcpu) -> Result<()> { - self.reset_vcpu(index, vcpu) - } - - pub fn reset_vcpu(&self, index: u16, vcpu: &mut V::Vcpu) -> Result<()> { 
- vcpu.reset(index == 0)?; - Ok(()) - } - pub fn create_ram(&self) -> Result<()> { let mem_size = self.config.mem.size; let memory = &self.memory; @@ -158,10 +134,6 @@ where Ok(()) } - pub fn coco_finalize(&self, _id: u16, _vcpus: &VcpuGuard) -> Result<()> { - Ok(()) - } - pub fn arch_init(&self) -> Result<()> { match &self.arch.gic { Gic::V2(v2) => v2.init(), diff --git a/alioth/src/board/board_x86_64/board_x86_64.rs b/alioth/src/board/board_x86_64/board_x86_64.rs index 385f525e..e6a0d5be 100644 --- a/alioth/src/board/board_x86_64/board_x86_64.rs +++ b/alioth/src/board/board_x86_64/board_x86_64.rs @@ -18,7 +18,6 @@ mod tdx; use std::arch::x86_64::{__cpuid, CpuidResult}; use std::collections::HashMap; use std::mem::{offset_of, size_of, size_of_val}; -use std::path::Path; use std::sync::Arc; use std::sync::atomic::{AtomicU32, AtomicU64}; @@ -31,8 +30,7 @@ use crate::arch::layout::{ BIOS_DATA_END, EBDA_END, EBDA_START, IOAPIC_START, MEM_64_START, PORT_ACPI_RESET, PORT_ACPI_SLEEP_CONTROL, PORT_ACPI_TIMER, RAM_32_SIZE, }; -use crate::arch::msr::{MiscEnable, Msr}; -use crate::board::{Board, BoardConfig, CpuTopology, PCIE_MMIO_64_SIZE, Result, VcpuGuard, error}; +use crate::board::{Board, BoardConfig, CpuTopology, PCIE_MMIO_64_SIZE, Result, error}; use crate::device::ioapic::IoApic; use crate::firmware::acpi::bindings::{ AcpiTableFadt, AcpiTableHeader, AcpiTableRsdp, AcpiTableXsdt3, @@ -41,9 +39,8 @@ use crate::firmware::acpi::reg::{AcpiPmTimer, FadtReset, FadtSleepControl}; use crate::firmware::acpi::{ AcpiTable, create_fadt, create_madt, create_mcfg, create_rsdp, create_xsdt, }; -use crate::hv::{Coco, Hypervisor, Vcpu, Vm}; -use crate::loader::{Executable, InitState, Payload, firmware}; -use crate::mem::mapped::ArcMemPages; +use crate::hv::{Coco, Hypervisor, Vm}; +use crate::loader::{Executable, InitState, Payload}; use crate::mem::{MemRange, MemRegion, MemRegionEntry, MemRegionType}; use crate::utils::wrapping_sum; @@ -51,9 +48,9 @@ pub struct ArchBoard where V: Vm, { 
- cpuids: HashMap, - sev_ap_eip: AtomicU32, - tdx_hob: AtomicU64, + pub(crate) cpuids: HashMap, + pub(crate) sev_ap_eip: AtomicU32, + pub(crate) tdx_hob: AtomicU64, pub(crate) io_apic: Arc>, } @@ -176,7 +173,7 @@ where encode_x2apic_id(&self.config.cpu.topology, index) as u64 } - fn setup_fw_cfg(&self, payload: &Payload) -> Result<()> { + pub(crate) fn setup_fw_cfg(&self, payload: &Payload) -> Result<()> { let Some(dev) = &*self.fw_cfg.lock() else { return Ok(()); }; @@ -193,79 +190,6 @@ where Ok(()) } - fn setup_coco(&self, fw: &mut ArcMemPages, vcpu: &V::Vcpu) -> Result<()> { - let Some(coco) = &self.config.coco else { - return Ok(()); - }; - match coco { - Coco::AmdSev { policy } => self.setup_sev(fw, *policy), - Coco::AmdSnp { .. } => self.setup_snp(fw), - Coco::IntelTdx { .. } => self.setup_tdx(fw, vcpu), - } - } - - pub fn setup_firmware( - &self, - fw: &Path, - payload: &Payload, - vcpu: &V::Vcpu, - ) -> Result { - let (init_state, mut rom) = firmware::load(&self.memory, fw)?; - self.setup_coco(&mut rom, vcpu)?; - self.setup_fw_cfg(payload)?; - Ok(init_state) - } - - pub fn init_ap(&self, index: u16, vcpu: &mut V::Vcpu, vcpus: &VcpuGuard) -> Result<()> { - let Some(coco) = &self.config.coco else { - return Ok(()); - }; - self.sync_vcpus(vcpus)?; - if index == 0 { - return Ok(()); - } - match coco { - Coco::AmdSev { policy } => { - if policy.es() { - self.sev_init_ap(vcpu)?; - } - } - Coco::AmdSnp { .. } => self.sev_init_ap(vcpu)?, - Coco::IntelTdx { .. } => self.tdx_init_ap(vcpu)?, - } - Ok(()) - } - - pub fn init_boot_vcpu(&self, vcpu: &mut V::Vcpu, init_state: &InitState) -> Result<()> { - if matches!(self.config.coco, Some(Coco::IntelTdx { .. 
})) { - return Ok(()); - } - vcpu.set_sregs(&init_state.sregs, &init_state.seg_regs, &init_state.dt_regs)?; - vcpu.set_msrs(&init_state.msrs)?; - vcpu.set_regs(&init_state.regs)?; - Ok(()) - } - - pub fn init_vcpu(&self, index: u16, vcpu: &mut V::Vcpu) -> Result<()> { - let mut cpuids = self.arch.cpuids.clone(); - let apic_id = self.encode_cpu_identity(index) as u32; - for (in_, out) in &mut cpuids { - if in_.func == 0x1 { - out.ebx &= 0x00ff_ffff; - out.ebx |= apic_id << 24; - } else if in_.func == 0xb || in_.func == 0x1f || in_.func == 0x80000026 { - out.edx = apic_id; - } - } - vcpu.set_cpuids(cpuids)?; - vcpu.set_msrs(&[(Msr::MISC_ENABLE, MiscEnable::FAST_STRINGS.bits())])?; - Ok(()) - } - - pub fn reset_vcpu(&self, _index: u16, _vcpu: &mut V::Vcpu) -> Result<()> { - Ok(()) - } - pub fn create_ram(&self) -> Result<()> { let config = &self.config; let memory = &self.memory; @@ -324,21 +248,6 @@ where Ok(()) } - pub fn coco_finalize(&self, index: u16, vcpus: &VcpuGuard) -> Result<()> { - let Some(coco) = &self.config.coco else { - return Ok(()); - }; - self.sync_vcpus(vcpus)?; - if index != 0 { - return Ok(()); - }; - match coco { - Coco::AmdSev { policy } => self.sev_finalize(*policy), - Coco::AmdSnp { .. } => self.snp_finalize(), - Coco::IntelTdx { .. 
} => self.tdx_finalize(), - } - } - fn patch_dsdt(&self, data: &mut [u8; 352]) { let pcie_mmio_64_start = self.config.pcie_mmio_64_start(); let pcei_mmio_64_max = pcie_mmio_64_start - 1 + PCIE_MMIO_64_SIZE; diff --git a/alioth/src/board/board_x86_64/sev.rs b/alioth/src/board/board_x86_64/sev.rs index 98fc6e91..75dd9a32 100644 --- a/alioth/src/board/board_x86_64/sev.rs +++ b/alioth/src/board/board_x86_64/sev.rs @@ -14,24 +14,15 @@ use std::arch::x86_64::{__cpuid, CpuidResult}; use std::collections::HashMap; -use std::iter::zip; use std::sync::Arc; -use std::sync::atomic::Ordering; - -use zerocopy::FromZeros; use crate::arch::cpuid::{ Cpuid1Ecx, Cpuid7Index0Ebx, Cpuid7Index0Edx, CpuidExt1fEAx, CpuidExt1fEbx, CpuidExt8Ebx, CpuidExt21EAx, CpuidIn, }; -use crate::arch::layout::MEM_64_START; -use crate::arch::reg::{Reg, SegAccess, SegReg, SegRegVal}; -use crate::arch::sev::{SevPolicy, SnpPageType, SnpPolicy}; +use crate::arch::sev::{SevPolicy, SnpPolicy}; use crate::board::{Board, Result, error}; -use crate::firmware::ovmf::sev::{ - SevDescType, SevMetadataDesc, SnpCpuidFunc, SnpCpuidInfo, parse_desc, parse_sev_ap_eip, -}; -use crate::hv::{Coco, Vcpu, Vm, VmMemory}; +use crate::hv::{Coco, Vm, VmMemory}; use crate::mem::mapped::ArcMemPages; use crate::mem::{self, LayoutChanged, MarkPrivateMemory}; @@ -117,123 +108,6 @@ impl Board where V: Vm, { - fn fill_snp_cpuid(&self, entries: &mut [SnpCpuidFunc]) { - for ((in_, out), dst) in zip(self.arch.cpuids.iter(), entries.iter_mut()) { - dst.eax_in = in_.func; - dst.ecx_in = in_.index.unwrap_or(0); - dst.eax = out.eax; - dst.ebx = out.ebx; - dst.ecx = out.ecx; - dst.edx = out.edx; - if dst.eax_in == 0xd && (dst.ecx_in == 0x0 || dst.ecx_in == 0x1) { - dst.ebx = 0x240; - dst.xcr0_in = 1; - dst.xss_in = 0; - } - } - } - - fn parse_sev_api_eip(&self, data: &[u8]) -> Result<()> { - let ap_eip = parse_sev_ap_eip(data)?; - self.arch.sev_ap_eip.store(ap_eip, Ordering::Release); - Ok(()) - } - - fn update_snp_desc(&self, desc: 
&SevMetadataDesc) -> Result<()> { - let mut cpuid_table = SnpCpuidInfo::new_zeroed(); - let ram_bus = self.memory.ram_bus(); - let ram = ram_bus.lock_layout(); - let page_type = match desc.type_ { - SevDescType::SNP_DESC_MEM => SnpPageType::UNMEASURED, - SevDescType::SNP_SECRETS => SnpPageType::SECRETS, - SevDescType::CPUID => { - assert!(desc.len as usize >= size_of::()); - assert!(cpuid_table.entries.len() >= self.arch.cpuids.len()); - cpuid_table.count = self.arch.cpuids.len() as u32; - self.fill_snp_cpuid(&mut cpuid_table.entries); - ram.write_t(desc.base as _, &cpuid_table)?; - SnpPageType::CPUID - } - _ => SnpPageType::ZERO, - }; - let range_ref = ram.get_slice::(desc.base as u64, desc.len as u64)?; - let bytes = - unsafe { std::slice::from_raw_parts_mut(range_ref.as_ptr() as _, range_ref.len()) }; - self.memory - .mark_private_memory(desc.base as _, desc.len as _, true)?; - let ret = self.vm.snp_launch_update(bytes, desc.base as _, page_type); - if ret.is_err() && desc.type_ == SevDescType::CPUID { - let updated_cpuid: SnpCpuidInfo = ram.read_t(desc.base as _)?; - for (set, got) in zip(&cpuid_table.entries, &updated_cpuid.entries) { - if set != got { - log::error!("set {set:#x?}, but firmware expects {got:#x?}"); - } - } - } - ret?; - Ok(()) - } - - pub(crate) fn setup_sev(&self, fw: &mut ArcMemPages, policy: SevPolicy) -> Result<()> { - self.memory.register_encrypted_pages(fw)?; - - let data = fw.as_slice_mut(); - if policy.es() { - self.parse_sev_api_eip(data)?; - } - self.vm.sev_launch_update_data(data)?; - Ok(()) - } - - pub(crate) fn setup_snp(&self, fw: &mut ArcMemPages) -> Result<()> { - self.memory.register_encrypted_pages(fw)?; - - let data = fw.as_slice_mut(); - self.parse_sev_api_eip(data)?; - for desc in parse_desc(data)? 
{ - self.update_snp_desc(desc)?; - } - let fw_gpa = MEM_64_START - data.len() as u64; - self.memory - .mark_private_memory(fw_gpa, data.len() as _, true)?; - self.vm - .snp_launch_update(data, fw_gpa, SnpPageType::NORMAL)?; - Ok(()) - } - - pub(crate) fn sev_finalize(&self, policy: SevPolicy) -> Result<()> { - if policy.es() { - self.vm.sev_launch_update_vmsa()?; - } - self.vm.sev_launch_measure()?; - self.vm.sev_launch_finish()?; - Ok(()) - } - - pub(crate) fn snp_finalize(&self) -> Result<()> { - self.vm.snp_launch_finish()?; - Ok(()) - } - - pub(crate) fn sev_init_ap(&self, vcpu: &mut V::Vcpu) -> Result<()> { - let eip = self.arch.sev_ap_eip.load(Ordering::Acquire); - vcpu.set_regs(&[(Reg::Rip, eip as u64 & 0xffff)])?; - vcpu.set_sregs( - &[], - &[( - SegReg::Cs, - SegRegVal { - selector: 0xf000, - base: eip as u64 & 0xffff_0000, - limit: 0xffff, - access: SegAccess(0x9b), - }, - )], - &[], - )?; - Ok(()) - } - pub(crate) fn sev_init(&self, policy: SevPolicy, memory: Arc) -> Result<()> { self.vm.sev_launch_start(policy)?; let encrypt_pages = Box::new(EncryptPages { memory }); diff --git a/alioth/src/board/board_x86_64/tdx.rs b/alioth/src/board/board_x86_64/tdx.rs index bab2276f..f1e054be 100644 --- a/alioth/src/board/board_x86_64/tdx.rs +++ b/alioth/src/board/board_x86_64/tdx.rs @@ -13,15 +13,11 @@ // limitations under the License. 
use std::sync::Arc; -use std::sync::atomic::Ordering; -use crate::arch::layout::MEM_64_START; use crate::arch::tdx::TdAttr; -use crate::board::{Board, Result, error}; -use crate::firmware::ovmf::tdx::{TdvfSectionAttr, TdvfSectionType, create_hob, parse_entries}; -use crate::hv::{Vcpu, Vm, VmMemory}; +use crate::board::{Board, Result}; +use crate::hv::{Vm, VmMemory}; use crate::mem::MarkPrivateMemory; -use crate::mem::mapped::ArcMemPages; impl Board where @@ -33,83 +29,4 @@ where self.memory.register_change_callback(mark_private_memory)?; Ok(()) } - - pub(crate) fn create_hob(&self, dst: &mut [u8], mut accepted: Vec<(u64, u64)>) -> Result { - let hob_phys = self.arch.tdx_hob.load(Ordering::Relaxed); - let mut entries = self.memory.mem_region_entries(); - create_hob(dst, hob_phys, &mut entries, &mut accepted)?; - Ok(hob_phys) - } - - pub(crate) fn setup_tdx(&self, fw: &mut ArcMemPages, vcpu: &V::Vcpu) -> Result<()> { - let data = fw.as_slice(); - let entries = parse_entries(data)?; - - let fw_gpa = MEM_64_START - data.len() as u64; - self.memory - .mark_private_memory(fw_gpa, data.len() as _, true)?; - - let mut accepted = Vec::new(); - let mut hob_ram = None; - for entry in entries { - match entry.r#type { - TdvfSectionType::TD_HOB => { - let p = ArcMemPages::from_anonymous(entry.size as usize, None, None)?; - hob_ram = Some(p); - let tdx_hob = &self.arch.tdx_hob; - tdx_hob.store(entry.address, Ordering::Relaxed); - accepted.push((entry.address, entry.size)); - } - TdvfSectionType::TEMP_MEM => { - accepted.push((entry.address, entry.size)); - } - _ => {} - }; - } - - let Some(hob_ram) = &mut hob_ram else { - return error::MissingPayload.fail(); - }; - let hob_phys = self.create_hob(hob_ram.as_slice_mut(), accepted)?; - - vcpu.tdx_init_vcpu(hob_phys)?; - - for entry in entries { - let tmp_mem; - let region = match entry.r#type { - TdvfSectionType::TD_HOB => hob_ram.as_slice(), - TdvfSectionType::TEMP_MEM => { - tmp_mem = ArcMemPages::from_anonymous(entry.size as 
usize, None, None)?; - tmp_mem.as_slice() - } - TdvfSectionType::BFV | TdvfSectionType::CFV => { - let start = entry.data_offset as usize; - let end = start + entry.size as usize; - let Some(d) = data.get(start..end) else { - return error::MissingPayload.fail(); - }; - d - } - t => { - log::error!("Unknown entry type: {t:x?}"); - return error::UnknownFirmwareMetadata.fail(); - } - }; - let measure = entry.attributes.contains(TdvfSectionAttr::MR_EXTEND); - vcpu.tdx_init_mem_region(region, entry.address, measure)?; - } - - Ok(()) - } - - pub(crate) fn tdx_init_ap(&self, vcpu: &mut V::Vcpu) -> Result<()> { - let hob = self.arch.tdx_hob.load(Ordering::Relaxed); - vcpu.tdx_init_vcpu(hob)?; - Ok(()) - } - - pub(crate) fn tdx_finalize(&self) -> Result<()> { - self.vm.tdx_finalize_vm()?; - Ok(()) - } } diff --git a/alioth/src/cpu/cpu.rs b/alioth/src/cpu/cpu.rs new file mode 100644 index 00000000..202befc2 --- /dev/null +++ b/alioth/src/cpu/cpu.rs @@ -0,0 +1,333 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#[cfg(target_arch = "aarch64")] +#[path = "cpu_aarch64.rs"] +mod aarch64; +#[cfg(target_arch = "x86_64")] +#[path = "cpu_x86_64/cpu_x86_64.rs"] +mod x86_64; + +use std::sync::Arc; +use std::thread::JoinHandle; + +use flume::Sender; +use parking_lot::{Condvar, Mutex, RwLock}; +use snafu::{ResultExt, Snafu}; + +use crate::board::Board; +use crate::errors::{DebugTrace, trace_error}; +use crate::hv::{Vcpu, Vm, VmEntry, VmExit}; +#[cfg(target_arch = "x86_64")] +use crate::loader::xen; +use crate::loader::{Executable, InitState, linux}; + +#[trace_error] +#[derive(Snafu, DebugTrace)] +#[snafu(module, context(suffix(false)))] +pub enum Error { + #[snafu(display("Hypervisor internal error"), context(false))] + HvError { source: Box }, + #[snafu(display("Failed to configure guest memory"), context(false))] + Memory { source: Box }, + #[snafu(display("Failed to setup board"), context(false))] + Board { source: Box }, + #[snafu(display("Failed to reset PCI devices"))] + ResetPci { source: Box }, + #[snafu(display("Firmware error"), context(false))] + Firmware { source: Box }, + #[snafu(display("Unknown firmware metadata"))] + UnknownFirmwareMetadata, + #[snafu(display("Missing payload"))] + MissingPayload, + #[snafu(display("Failed to load payload"), context(false))] + Loader { source: Box }, + #[snafu(display("Failed to notify the VMM thread"))] + NotifyVmm, + #[snafu(display("Another VCPU thread has signaled failure"))] + PeerFailure, +} + +pub type Result = std::result::Result; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum State { + Paused, + Running, + Shutdown, + RebootPending, +} + +pub(crate) struct MpSync { + pub(crate) state: State, + fatal: bool, + count: u16, +} + +pub struct VcpuHandle { + pub thread: JoinHandle>, +} + +pub struct Context { + pub(crate) board: Board, + pub(crate) vcpus: RwLock>, + + pub(crate) sync: Mutex, + pub(crate) cond: Condvar, +} + +impl Context { + pub fn new(board: Board) -> Self { + Self { + board, + vcpus: 
RwLock::new(Vec::new()), + sync: Mutex::new(MpSync { + state: State::Paused, + fatal: false, + count: 0, + }), + cond: Condvar::new(), + } + } +} + +struct VcpuThread { + ctx: Arc>, + index: u16, + event_tx: Sender, + vcpu: ::Vcpu, +} + +fn notify_vmm(event_tx: &Sender, index: u16) -> Result<()> { + if event_tx.send(index).is_err() { + error::NotifyVmm.fail() + } else { + Ok(()) + } +} + +impl VcpuThread { + pub fn new(index: u16, ctx: Arc>, event_tx: Sender) -> Result { + let identity = ctx.board.encode_cpu_identity(index); + let vcpu = ctx.board.vm.create_vcpu(index, identity)?; + + Ok(Self { + ctx, + index, + event_tx, + vcpu, + }) + } + + fn notify_vmm(&self) -> Result<()> { + notify_vmm(&self.event_tx, self.index) + } + + fn sync_vcpus(&self, vcpus: &[VcpuHandle]) -> Result<()> { + let mut sync = self.ctx.sync.lock(); + if sync.fatal { + return error::PeerFailure.fail(); + } + + sync.count += 1; + if sync.count == vcpus.len() as u16 { + sync.count = 0; + self.ctx.cond.notify_all(); + } else { + self.ctx.cond.wait(&mut sync) + } + + if sync.fatal { + return error::PeerFailure.fail(); + } + + Ok(()) + } + + fn load_payload(&self) -> Result { + let payload = self.ctx.board.payload.read(); + let Some(payload) = payload.as_ref() else { + return error::MissingPayload.fail(); + }; + + if let Some(fw) = payload.firmware.as_ref() { + return self.setup_firmware(fw, payload); + } + + let Some(exec) = &payload.executable else { + return error::MissingPayload.fail(); + }; + let mem_regions = self.ctx.board.memory.mem_region_entries(); + let init_state = match exec { + Executable::Linux(image) => linux::load( + &self.ctx.board.memory.ram_bus(), + &mem_regions, + image.as_ref(), + payload.cmdline.as_deref(), + payload.initramfs.as_deref(), + ), + #[cfg(target_arch = "x86_64")] + Executable::Pvh(image) => xen::load( + &self.ctx.board.memory.ram_bus(), + &mem_regions, + image.as_ref(), + payload.cmdline.as_deref(), + payload.initramfs.as_deref(), + ), + }?; + Ok(init_state) + 
} + + fn boot_init_sync(&mut self) -> Result<()> { + let ctx = self.ctx.clone(); + let vcpus = ctx.vcpus.read(); + if self.index == 0 { + self.ctx.board.init_devices()?; + let init_state = self.load_payload()?; + self.init_boot_vcpu(&init_state)?; + self.ctx.board.create_firmware_data(&init_state)?; + } + self.init_ap(&vcpus)?; + self.coco_finalize(&vcpus)?; + self.sync_vcpus(&vcpus) + } + + fn vcpu_loop(&mut self) -> Result { + let mut vm_entry = VmEntry::None; + loop { + let vm_exit = self.vcpu.run(vm_entry)?; + let memory = &self.ctx.board.memory; + vm_entry = match vm_exit { + #[cfg(target_arch = "x86_64")] + VmExit::Io { port, write, size } => memory.handle_io(port, write, size)?, + VmExit::Mmio { addr, write, size } => memory.handle_mmio(addr, write, size)?, + VmExit::Shutdown => break Ok(State::Shutdown), + VmExit::Reboot => break Ok(State::RebootPending), + VmExit::Paused => break Ok(State::Paused), + VmExit::Interrupted => { + let state = self.ctx.sync.lock(); + match state.state { + State::Shutdown => VmEntry::Shutdown, + State::RebootPending => VmEntry::Reboot, + State::Paused => VmEntry::Pause, + State::Running => VmEntry::None, + } + } + VmExit::ConvertMemory { gpa, size, private } => { + memory.mark_private_memory(gpa, size, private)?; + VmEntry::None + } + }; + } + } + + fn run(&mut self) -> Result<()> { + self.init_vcpu()?; + + 'reboot: loop { + let mut sync = self.ctx.sync.lock(); + loop { + match sync.state { + State::Paused => self.ctx.cond.wait(&mut sync), + State::Running => break, + State::Shutdown => break 'reboot Ok(()), + State::RebootPending => sync.state = State::Running, + } + } + drop(sync); + + self.boot_init_sync()?; + + let request = 'pause: loop { + let request = self.vcpu_loop(); + + let vcpus = self.ctx.vcpus.read(); + let mut sync = self.ctx.sync.lock(); + if sync.state == State::Running { + sync.state = match request { + Ok(State::RebootPending) => State::RebootPending, + Ok(State::Paused) => State::Paused, + _ => 
State::Shutdown, + }; + log::trace!("VCPU-{}: change state to {:?}", self.index, sync.state); + stop_vcpus(&self.ctx.board, Some(self.index), &vcpus)?; + } + loop { + match sync.state { + State::Paused => self.ctx.cond.wait(&mut sync), + State::Running => break, + State::RebootPending | State::Shutdown => break 'pause request, + } + } + }; + + if self.index == 0 { + let board = &self.ctx.board; + board.pci_bus.segment.reset().context(error::ResetPci)?; + board.memory.reset()?; + } + self.reset_vcpu()?; + + request?; + + let vcpus = self.ctx.vcpus.read(); + self.sync_vcpus(&vcpus)?; + } + } +} + +fn vcpu_thread_(index: u16, ctx: Arc>, event_tx: Sender) -> Result<()> { + let mut thread = VcpuThread::new(index, ctx, event_tx)?; + thread.notify_vmm()?; + thread.run() +} + +pub fn vcpu_thread(index: u16, ctx: Arc>, event_tx: Sender) -> Result<()> { + let ret = vcpu_thread_(index, ctx.clone(), event_tx.clone()); + + let _ = notify_vmm(&event_tx, index); + + if matches!(ret, Ok(_) | Err(Error::PeerFailure { .. 
})) { + return Ok(()); + } + + log::warn!("VCPU-{index} reported error {ret:?}, unblocking other VCPUs..."); + let mut sync = ctx.sync.lock(); + sync.fatal = true; + if sync.count > 0 { + ctx.cond.notify_all(); + } + ret +} + +pub fn stop_vcpus( + board: &Board, + current: Option, + vcpus: &[VcpuHandle], +) -> Result<()> { + for (index, handle) in vcpus.iter().enumerate() { + let index = index as u16; + if let Some(current) = current { + if current == index { + continue; + } + log::info!("VCPU-{current}: stopping VCPU-{index}"); + } else { + log::info!("Stopping VCPU-{index}"); + } + let identity = board.encode_cpu_identity(index); + board.vm.stop_vcpu(identity, &handle.thread)?; + } + Ok(()) +} diff --git a/alioth/src/cpu/cpu_aarch64.rs b/alioth/src/cpu/cpu_aarch64.rs new file mode 100644 index 00000000..d5bf74ed --- /dev/null +++ b/alioth/src/cpu/cpu_aarch64.rs @@ -0,0 +1,48 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::path::Path; + +use crate::cpu::{Result, VcpuHandle, VcpuThread}; +use crate::hv::{Vcpu, Vm}; +use crate::loader::{InitState, Payload}; + +impl VcpuThread { + pub(crate) fn init_vcpu(&mut self) -> Result<()> { + self.reset_vcpu() + } + + pub(crate) fn init_boot_vcpu(&mut self, init: &InitState) -> Result<()> { + self.vcpu.set_regs(&init.regs)?; + self.vcpu.set_sregs(&init.sregs)?; + Ok(()) + } + + pub(crate) fn init_ap(&mut self, _: &[VcpuHandle]) -> Result<()> { + Ok(()) + } + + pub(crate) fn coco_finalize(&self, _: &[VcpuHandle]) -> Result<()> { + Ok(()) + } + + pub(crate) fn setup_firmware(&self, _: &Path, _: &Payload) -> Result { + unimplemented!() + } + + pub(crate) fn reset_vcpu(&mut self) -> Result<()> { + self.vcpu.reset(self.index == 0)?; + Ok(()) + } +} diff --git a/alioth/src/cpu/cpu_x86_64/cpu_x86_64.rs b/alioth/src/cpu/cpu_x86_64/cpu_x86_64.rs new file mode 100644 index 00000000..441b222f --- /dev/null +++ b/alioth/src/cpu/cpu_x86_64/cpu_x86_64.rs @@ -0,0 +1,111 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod sev; +mod tdx; + +use std::path::Path; + +use crate::arch::msr::{MiscEnable, Msr}; +use crate::cpu::{Result, VcpuHandle, VcpuThread}; +use crate::hv::{Coco, Vcpu, Vm}; +use crate::loader::{InitState, Payload, firmware}; +use crate::mem::mapped::ArcMemPages; + +impl VcpuThread { + pub(crate) fn init_vcpu(&mut self) -> Result<()> { + let apic_id = self.ctx.board.encode_cpu_identity(self.index) as u32; + let mut cpuids = self.ctx.board.arch.cpuids.clone(); + for (in_, out) in &mut cpuids { + if in_.func == 0x1 { + out.ebx &= 0x00ff_ffff; + out.ebx |= apic_id << 24; + } else if in_.func == 0xb || in_.func == 0x1f || in_.func == 0x80000026 { + out.edx = apic_id; + } + } + self.vcpu.set_cpuids(cpuids)?; + + let msrs = [(Msr::MISC_ENABLE, MiscEnable::FAST_STRINGS.bits())]; + self.vcpu.set_msrs(&msrs)?; + Ok(()) + } + + pub(crate) fn init_boot_vcpu(&mut self, init: &InitState) -> Result<()> { + if matches!(self.ctx.board.config.coco, Some(Coco::IntelTdx { .. })) { + return Ok(()); + } + self.vcpu + .set_sregs(&init.sregs, &init.seg_regs, &init.dt_regs)?; + self.vcpu.set_regs(&init.regs)?; + Ok(()) + } + + pub(crate) fn init_ap(&mut self, vcpus: &[VcpuHandle]) -> Result<()> { + let Some(coco) = &self.ctx.board.config.coco else { + return Ok(()); + }; + self.sync_vcpus(vcpus)?; + if self.index == 0 { + return Ok(()); + } + match coco { + Coco::AmdSev { policy } => { + if policy.es() { + self.sev_init_ap()?; + } + } + Coco::AmdSnp { .. } => self.sev_init_ap()?, + Coco::IntelTdx { .. } => self.tdx_init_ap()?, + } + Ok(()) + } + + pub(crate) fn setup_coco(&self, fw: &mut ArcMemPages) -> Result<()> { + let Some(coco) = &self.ctx.board.config.coco else { + return Ok(()); + }; + match coco { + Coco::AmdSev { policy } => self.setup_sev(fw, *policy), + Coco::AmdSnp { .. } => self.setup_snp(fw), + Coco::IntelTdx { .. 
} => self.setup_tdx(fw), + } + } + + pub(crate) fn coco_finalize(&self, vcpus: &[VcpuHandle]) -> Result<()> { + let Some(coco) = &self.ctx.board.config.coco else { + return Ok(()); + }; + self.sync_vcpus(vcpus)?; + if self.index != 0 { + return Ok(()); + }; + match coco { + Coco::AmdSev { policy } => self.sev_finalize(*policy), + Coco::AmdSnp { .. } => self.snp_finalize(), + Coco::IntelTdx { .. } => self.tdx_finalize(), + } + } + + pub(crate) fn setup_firmware(&self, fw: &Path, payload: &Payload) -> Result { + let (init_state, mut rom) = firmware::load(&self.ctx.board.memory, fw)?; + self.setup_coco(&mut rom)?; + self.ctx.board.setup_fw_cfg(payload)?; + Ok(init_state) + } + + pub(crate) fn reset_vcpu(&self) -> Result<()> { + Ok(()) + } +} diff --git a/alioth/src/cpu/cpu_x86_64/sev.rs b/alioth/src/cpu/cpu_x86_64/sev.rs new file mode 100644 index 00000000..e1d98b7a --- /dev/null +++ b/alioth/src/cpu/cpu_x86_64/sev.rs @@ -0,0 +1,155 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::iter::zip; +use std::sync::atomic::Ordering; + +use zerocopy::FromZeros; + +use crate::arch::layout::MEM_64_START; +use crate::arch::reg::{Reg, SegAccess, SegReg, SegRegVal}; +use crate::arch::sev::{SevPolicy, SnpPageType}; +use crate::cpu::{Result, VcpuThread}; +use crate::firmware::ovmf::sev::{ + SevDescType, SevMetadataDesc, SnpCpuidFunc, SnpCpuidInfo, parse_desc, parse_sev_ap_eip, +}; +use crate::hv::{Vcpu, Vm}; +use crate::mem::mapped::ArcMemPages; + +impl VcpuThread +where + V: Vm, +{ + fn fill_snp_cpuid(&self, entries: &mut [SnpCpuidFunc]) { + for ((in_, out), dst) in zip(self.ctx.board.arch.cpuids.iter(), entries.iter_mut()) { + dst.eax_in = in_.func; + dst.ecx_in = in_.index.unwrap_or(0); + dst.eax = out.eax; + dst.ebx = out.ebx; + dst.ecx = out.ecx; + dst.edx = out.edx; + if dst.eax_in == 0xd && (dst.ecx_in == 0x0 || dst.ecx_in == 0x1) { + dst.ebx = 0x240; + dst.xcr0_in = 1; + dst.xss_in = 0; + } + } + } + + fn parse_sev_ap_eip(&self, data: &[u8]) -> Result<()> { + let ap_eip = parse_sev_ap_eip(data)?; + let sev_ap_eip = &self.ctx.board.arch.sev_ap_eip; + sev_ap_eip.store(ap_eip, Ordering::Release); + Ok(()) + } + + fn update_snp_desc(&self, desc: &SevMetadataDesc) -> Result<()> { + let mut cpuid_table = SnpCpuidInfo::new_zeroed(); + let ram_bus = self.ctx.board.memory.ram_bus(); + let ram = ram_bus.lock_layout(); + let page_type = match desc.type_ { + SevDescType::SNP_DESC_MEM => SnpPageType::UNMEASURED, + SevDescType::SNP_SECRETS => SnpPageType::SECRETS, + SevDescType::CPUID => { + assert!(desc.len as usize >= size_of::()); + assert!(cpuid_table.entries.len() >= self.ctx.board.arch.cpuids.len()); + cpuid_table.count = self.ctx.board.arch.cpuids.len() as u32; + self.fill_snp_cpuid(&mut cpuid_table.entries); + ram.write_t(desc.base as _, &cpuid_table)?; + SnpPageType::CPUID + } + _ => SnpPageType::ZERO, + }; + let range_ref = ram.get_slice::(desc.base as u64, desc.len as u64)?; + let bytes = + unsafe { 
std::slice::from_raw_parts_mut(range_ref.as_ptr() as _, range_ref.len()) }; + let memory = &self.ctx.board.memory; + memory.mark_private_memory(desc.base as _, desc.len as _, true)?; + let vm = &self.ctx.board.vm; + let ret = vm.snp_launch_update(bytes, desc.base as _, page_type); + if ret.is_err() && desc.type_ == SevDescType::CPUID { + let updated_cpuid: SnpCpuidInfo = ram.read_t(desc.base as _)?; + for (set, got) in zip(&cpuid_table.entries, &updated_cpuid.entries) { + if set != got { + log::error!("set {set:#x?}, but firmware expects {got:#x?}"); + } + } + } + ret?; + Ok(()) + } + + pub(crate) fn setup_sev(&self, fw: &mut ArcMemPages, policy: SevPolicy) -> Result<()> { + let board = &self.ctx.board; + + board.memory.register_encrypted_pages(fw)?; + + let data = fw.as_slice_mut(); + if policy.es() { + self.parse_sev_ap_eip(data)?; + } + self.ctx.board.vm.sev_launch_update_data(data)?; + Ok(()) + } + + pub(crate) fn setup_snp(&self, fw: &mut ArcMemPages) -> Result<()> { + let memory = &self.ctx.board.memory; + memory.register_encrypted_pages(fw)?; + + let data = fw.as_slice_mut(); + self.parse_sev_ap_eip(data)?; + for desc in parse_desc(data)? 
{ + self.update_snp_desc(desc)?; + } + let fw_gpa = MEM_64_START - data.len() as u64; + + memory.mark_private_memory(fw_gpa, data.len() as _, true)?; + let vm = &self.ctx.board.vm; + vm.snp_launch_update(data, fw_gpa, SnpPageType::NORMAL)?; + Ok(()) + } + + pub(crate) fn sev_finalize(&self, policy: SevPolicy) -> Result<()> { + if policy.es() { + self.ctx.board.vm.sev_launch_update_vmsa()?; + } + self.ctx.board.vm.sev_launch_measure()?; + self.ctx.board.vm.sev_launch_finish()?; + Ok(()) + } + + pub(crate) fn snp_finalize(&self) -> Result<()> { + self.ctx.board.vm.snp_launch_finish()?; + Ok(()) + } + + pub(crate) fn sev_init_ap(&mut self) -> Result<()> { + let eip = self.ctx.board.arch.sev_ap_eip.load(Ordering::Acquire); + self.vcpu.set_regs(&[(Reg::Rip, eip as u64 & 0xffff)])?; + self.vcpu.set_sregs( + &[], + &[( + SegReg::Cs, + SegRegVal { + selector: 0xf000, + base: eip as u64 & 0xffff_0000, + limit: 0xffff, + access: SegAccess(0x9b), + }, + )], + &[], + )?; + Ok(()) + } +} diff --git a/alioth/src/cpu/cpu_x86_64/tdx.rs b/alioth/src/cpu/cpu_x86_64/tdx.rs new file mode 100644 index 00000000..954bfd7c --- /dev/null +++ b/alioth/src/cpu/cpu_x86_64/tdx.rs @@ -0,0 +1,106 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::atomic::Ordering; + +use crate::arch::layout::MEM_64_START; +use crate::cpu::{Result, VcpuThread, error}; +use crate::firmware::ovmf::tdx::{TdvfSectionAttr, TdvfSectionType, create_hob, parse_entries}; +use crate::hv::{Vcpu, Vm}; +use crate::mem::mapped::ArcMemPages; + +impl VcpuThread +where + V: Vm, +{ + pub(crate) fn create_hob(&self, dst: &mut [u8], mut accepted: Vec<(u64, u64)>) -> Result { + let hob_phys = self.ctx.board.arch.tdx_hob.load(Ordering::Relaxed); + let mut entries = self.ctx.board.memory.mem_region_entries(); + create_hob(dst, hob_phys, &mut entries, &mut accepted)?; + Ok(hob_phys) + } + + pub(crate) fn setup_tdx(&self, fw: &mut ArcMemPages) -> Result<()> { + let data = fw.as_slice(); + let entries = parse_entries(data)?; + + let fw_gpa = MEM_64_START - data.len() as u64; + let memory = &self.ctx.board.memory; + memory.mark_private_memory(fw_gpa, data.len() as _, true)?; + + let mut accepted = Vec::new(); + let mut hob_ram = None; + for entry in entries { + match entry.r#type { + TdvfSectionType::TD_HOB => { + let p = ArcMemPages::from_anonymous(entry.size as usize, None, None)?; + hob_ram = Some(p); + let tdx_hob = &self.ctx.board.arch.tdx_hob; + tdx_hob.store(entry.address, Ordering::Relaxed); + accepted.push((entry.address, entry.size)); + } + TdvfSectionType::TEMP_MEM => { + accepted.push((entry.address, entry.size)); + } + _ => {} + }; + } + + let Some(hob_ram) = &mut hob_ram else { + return error::MissingPayload.fail(); + }; + let hob_phys = self.create_hob(hob_ram.as_slice_mut(), accepted)?; + + self.vcpu.tdx_init_vcpu(hob_phys)?; + + for entry in entries { + let tmp_mem; + let region = match entry.r#type { + TdvfSectionType::TD_HOB => hob_ram.as_slice(), + TdvfSectionType::TEMP_MEM => { + tmp_mem = ArcMemPages::from_anonymous(entry.size as usize, None, None)?; + tmp_mem.as_slice() + } + TdvfSectionType::BFV | TdvfSectionType::CFV => { + let start = entry.data_offset as usize; + let end = start + entry.size as usize; + let 
Some(d) = data.get(start..end) else { + return error::MissingPayload.fail(); + }; + d + } + t => { + log::error!("Unknown entry type: {t:x?}"); + return error::UnknownFirmwareMetadata.fail(); + } + }; + let measure = entry.attributes.contains(TdvfSectionAttr::MR_EXTEND); + self.vcpu + .tdx_init_mem_region(region, entry.address, measure)?; + } + + Ok(()) + } + + pub(crate) fn tdx_init_ap(&self) -> Result<()> { + let hob = self.ctx.board.arch.tdx_hob.load(Ordering::Relaxed); + self.vcpu.tdx_init_vcpu(hob)?; + Ok(()) + } + + pub(crate) fn tdx_finalize(&self) -> Result<()> { + self.ctx.board.vm.tdx_finalize_vm()?; + Ok(()) + } +} diff --git a/alioth/src/lib.rs b/alioth/src/lib.rs index e143a361..5e393c5e 100644 --- a/alioth/src/lib.rs +++ b/alioth/src/lib.rs @@ -18,6 +18,8 @@ pub mod arch; pub mod blk; #[path = "board/board.rs"] pub mod board; +#[path = "cpu/cpu.rs"] +pub mod cpu; #[path = "device/device.rs"] pub mod device; pub mod errors; diff --git a/alioth/src/vm/vm.rs b/alioth/src/vm/vm.rs index 006a488d..9957f3e6 100644 --- a/alioth/src/vm/vm.rs +++ b/alioth/src/vm/vm.rs @@ -30,6 +30,7 @@ use crate::arch::layout::{PL011_START, PL031_START}; #[cfg(target_arch = "x86_64")] use crate::arch::layout::{PORT_CMOS_REG, PORT_COM1, PORT_FW_CFG_SELECTOR, PORT_FWDBG}; use crate::board::{Board, BoardConfig}; +use crate::cpu::{Context, State, VcpuHandle, stop_vcpus, vcpu_thread}; use crate::device::clock::SystemClock; #[cfg(target_arch = "x86_64")] use crate::device::cmos::Cmos; @@ -72,13 +73,18 @@ use crate::virtio::pci::VirtioPciDevice; pub enum Error { #[snafu(display("Hypervisor internal error"), context(false))] HvError { source: Box }, - #[snafu(display("Failed to create board"), context(false))] - CreateBoard { source: Box }, #[snafu(display("Failed to create VCPU-{index} thread"))] - VcpuThread { index: u16, error: std::io::Error }, + CreateVcpu { index: u16, error: std::io::Error }, + #[snafu(display("Failed to stop VCPUs"))] + StopVcpus { source: Box }, + 
#[snafu(display("VCPU-{index} thread exited unexpectedly"))] + VcpuExit { + index: u16, + source: Box, + }, #[snafu(display("Failed to create a console"))] CreateConsole { error: crate::device::Error }, - #[snafu(display("Failed to create fw-cfg device"))] + #[snafu(display("Failed to configure firmware"))] FwCfg { error: std::io::Error }, #[snafu(display("Failed to create a VirtIO device"), context(false))] CreateVirtio { source: Box }, @@ -87,28 +93,29 @@ pub enum Error { #[cfg(target_os = "linux")] #[snafu(display("Failed to create a VFIO device"), context(false))] CreateVfio { source: Box }, - #[snafu(display("VCPU-{index} error"))] - VcpuError { - index: u16, - source: Box, - }, #[snafu(display("Failed to configure guest memory"), context(false))] Memory { source: Box }, + #[snafu(display("Failed to setup board"), context(false))] + Board { source: Box }, #[cfg(target_os = "linux")] #[snafu(display("{name:?} already exists"))] AlreadyExists { name: Box }, #[cfg(target_os = "linux")] #[snafu(display("{name:?} does not exist"))] NotExist { name: Box }, + #[snafu(display("Unexpected state: {state:?}, want {want:?}"))] + UnexpectedState { state: State, want: State }, } -type Result = std::result::Result; +pub type Result = std::result::Result; pub struct Machine where H: Hypervisor, { - board: Arc>, + ctx: Arc>, + event_rx: Receiver, + _event_tx: Sender, #[cfg(target_os = "linux")] iommu: Mutex>>, @@ -116,9 +123,6 @@ where pub vfio_ioases: Mutex, Arc>>, #[cfg(target_os = "linux")] pub vfio_containers: Mutex, Arc>>, - - event_rx: Receiver, - _event_tx: Sender, } pub type VirtioPciDev = VirtioPciDevice< @@ -131,30 +135,31 @@ where H: Hypervisor, { pub fn new(hv: &H, config: BoardConfig) -> Result { - let board = Arc::new(Board::new(hv, config)?); + let ctx = Arc::new(Context::new(Board::new(hv, config)?)); let (event_tx, event_rx) = flume::unbounded(); - let mut vcpus = board.vcpus.write(); - for index in 0..board.config.cpu.count { + let mut vcpus = 
ctx.vcpus.write(); + for index in 0..ctx.board.config.cpu.count { let event_tx = event_tx.clone(); - let board = board.clone(); + let ctx = ctx.clone(); let handle = thread::Builder::new() .name(format!("vcpu_{index}")) - .spawn(move || board.run_vcpu(index, event_tx)) - .context(error::VcpuThread { index })?; + .spawn(move || vcpu_thread(index, ctx, event_tx)) + .context(error::CreateVcpu { index })?; if event_rx.recv_timeout(Duration::from_secs(2)).is_err() { let err = std::io::ErrorKind::TimedOut.into(); - Err(err).context(error::VcpuThread { index })?; + Err(err).context(error::CreateVcpu { index })?; } + let handle = VcpuHandle { thread: handle }; vcpus.push(handle); } drop(vcpus); - board.arch_init()?; + ctx.board.arch_init()?; let vm = Machine { - board, + ctx, event_rx, _event_tx: event_tx, #[cfg(target_os = "linux")] @@ -170,33 +175,34 @@ where #[cfg(target_arch = "x86_64")] pub fn add_com1(&self) -> Result<(), Error> { - let io_apic = self.board.arch.io_apic.clone(); + let io_apic = self.ctx.board.arch.io_apic.clone(); let console = StdioConsole::new().context(error::CreateConsole)?; let com1 = Serial::new(PORT_COM1, io_apic, 4, console).context(error::CreateConsole)?; - self.board.io_devs.write().push((PORT_COM1, Arc::new(com1))); + let mut io_devs = self.ctx.board.io_devs.write(); + io_devs.push((PORT_COM1, Arc::new(com1))); Ok(()) } #[cfg(target_arch = "x86_64")] pub fn add_cmos(&self) -> Result<(), Error> { - let mut io_devs = self.board.io_devs.write(); + let mut io_devs = self.ctx.board.io_devs.write(); io_devs.push((PORT_CMOS_REG, Arc::new(Cmos::new(SystemClock)))); Ok(()) } #[cfg(target_arch = "x86_64")] pub fn add_fw_dbg(&self) -> Result<(), Error> { - let mut io_devs = self.board.io_devs.write(); + let mut io_devs = self.ctx.board.io_devs.write(); io_devs.push((PORT_FWDBG, Arc::new(FwDbg::new()))); Ok(()) } #[cfg(target_arch = "aarch64")] pub fn add_pl011(&self) -> Result<(), Error> { - let irq_line = self.board.vm.create_irq_sender(1)?; + let 
irq_line = self.ctx.board.vm.create_irq_sender(1)?; let console = StdioConsole::new().context(error::CreateConsole)?; let pl011_dev = Pl011::new(PL011_START, irq_line, console).context(error::CreateConsole)?; - let mut mmio_devs = self.board.mmio_devs.write(); + let mut mmio_devs = self.ctx.board.mmio_devs.write(); mmio_devs.push((PL011_START, Arc::new(pl011_dev))); Ok(()) } @@ -204,7 +210,7 @@ where #[cfg(target_arch = "aarch64")] pub fn add_pl031(&self) { let pl031_dev = Pl031::new(PL031_START, SystemClock); - let mut mmio_devs = self.board.mmio_devs.write(); + let mut mmio_devs = self.ctx.board.mmio_devs.write(); mmio_devs.push((PL031_START, Arc::new(pl031_dev))); } @@ -212,11 +218,11 @@ where let bdf = if let Some(bdf) = bdf { bdf } else { - self.board.pci_bus.reserve(None).unwrap() + self.ctx.board.pci_bus.reserve(None).unwrap() }; dev.config().get_header().set_bdf(bdf); log::info!("{bdf}: device: {}", dev.name()); - self.board.pci_bus.add(bdf, dev); + self.ctx.board.pci_bus.add(bdf, dev); Ok(()) } @@ -236,11 +242,11 @@ where .collect::, _>>() .context(error::FwCfg)?; let fw_cfg = Arc::new(Mutex::new( - FwCfg::new(self.board.memory.ram_bus(), items).context(error::FwCfg)?, + FwCfg::new(self.ctx.board.memory.ram_bus(), items).context(error::FwCfg)?, )); - let mut io_devs = self.board.io_devs.write(); + let mut io_devs = self.ctx.board.io_devs.write(); io_devs.push((PORT_FW_CFG_SELECTOR, fw_cfg.clone())); - *self.board.fw_cfg.lock() = Some(fw_cfg.clone()); + *self.ctx.board.fw_cfg.lock() = Some(fw_cfg.clone()); Ok(fw_cfg) } @@ -253,26 +259,26 @@ where P: DevParam, D: Virtio, { - if param.needs_mem_shared_fd() && !self.board.config.mem.has_shared_fd() { + if param.needs_mem_shared_fd() && !self.ctx.board.config.mem.has_shared_fd() { return error::MemNotSharedFd.fail(); } let name = name.into(); - let bdf = self.board.pci_bus.reserve(None).unwrap(); + let bdf = self.ctx.board.pci_bus.reserve(None).unwrap(); let dev = param.build(name.clone())?; if let 
Some(callback) = dev.mem_update_callback() { - self.board.memory.register_update_callback(callback)?; + self.ctx.board.memory.register_update_callback(callback)?; } if let Some(callback) = dev.mem_change_callback() { - self.board.memory.register_change_callback(callback)?; + self.ctx.board.memory.register_change_callback(callback)?; } - let registry = self.board.vm.create_ioeventfd_registry()?; + let registry = self.ctx.board.vm.create_ioeventfd_registry()?; let virtio_dev = VirtioDevice::new( name.clone(), dev, - self.board.memory.ram_bus(), - self.board.config.coco.is_some(), + self.ctx.board.memory.ram_bus(), + self.ctx.board.config.coco.is_some(), )?; - let msi_sender = self.board.vm.create_msi_sender( + let msi_sender = self.ctx.board.vm.create_msi_sender( #[cfg(target_arch = "aarch64")] u32::from(bdf.0), )?; @@ -283,35 +289,7 @@ where } pub fn add_payload(&self, payload: Payload) { - *self.board.payload.write() = Some(payload) - } - - pub fn boot(&self) -> Result<(), Error> { - self.board.boot()?; - Ok(()) - } - - pub fn wait(&self) -> Result<()> { - self.event_rx.recv().unwrap(); - let vcpus = self.board.vcpus.read(); - for _ in 1..vcpus.len() { - self.event_rx.recv().unwrap(); - } - drop(vcpus); - let mut vcpus = self.board.vcpus.write(); - let mut ret = Ok(()); - for (index, handle) in vcpus.drain(..).enumerate() { - let Ok(r) = handle.join() else { - log::error!("Cannot join VCPU-{index}"); - continue; - }; - if r.is_err() && ret.is_ok() { - ret = r.context(error::Vcpu { - index: index as u16, - }); - } - } - ret + *self.ctx.board.payload.write() = Some(payload) } } @@ -342,7 +320,7 @@ where }; let ioas = Arc::new(Ioas::alloc_on(iommu)?); let update = Box::new(UpdateIommuIoas { ioas: ioas.clone() }); - self.board.memory.register_change_callback(update)?; + self.ctx.board.memory.register_change_callback(update)?; ioases.insert(param.name, ioas.clone()); Ok(ioas) } @@ -368,8 +346,8 @@ where let mut cdev = Cdev::new(¶m.path)?; 
cdev.attach_iommu_ioas(ioas.clone())?; - let bdf = self.board.pci_bus.reserve(None).unwrap(); - let msi_sender = self.board.vm.create_msi_sender( + let bdf = self.ctx.board.pci_bus.reserve(None).unwrap(); + let msi_sender = self.ctx.board.vm.create_msi_sender( #[cfg(target_arch = "aarch64")] u32::from(bdf.0), )?; @@ -392,7 +370,7 @@ where let update = Box::new(UpdateContainerMapping { container: container.clone(), }); - self.board.memory.register_change_callback(update)?; + self.ctx.board.memory.register_change_callback(update)?; containers.insert(param.name, container.clone()); Ok(container) } @@ -431,8 +409,8 @@ where } fn add_vfio_devfd(&self, name: Arc, devfd: DevFd) -> Result<()> { - let bdf = self.board.pci_bus.reserve(None).unwrap(); - let msi_sender = self.board.vm.create_msi_sender( + let bdf = self.ctx.board.pci_bus.reserve(None).unwrap(); + let msi_sender = self.ctx.board.vm.create_msi_sender( #[cfg(target_arch = "aarch64")] u32::from(bdf.0), )?; @@ -440,3 +418,64 @@ where self.add_pci_dev(Some(bdf), Arc::new(dev)) } } + +impl Machine +where + H: Hypervisor, +{ + pub fn boot(&self) -> Result<()> { + self.resume() + } + + pub fn resume(&self) -> Result<()> { + let mut sync = self.ctx.sync.lock(); + if !matches!(sync.state, State::Paused) { + return error::UnexpectedState { + state: sync.state, + want: State::Paused, + } + .fail(); + } + sync.state = State::Running; + self.ctx.cond.notify_all(); + Ok(()) + } + + pub fn pause(&self) -> Result<()> { + let vcpus = self.ctx.vcpus.read(); + let mut sync = self.ctx.sync.lock(); + if !matches!(sync.state, State::Running) { + return error::UnexpectedState { + state: sync.state, + want: State::Running, + } + .fail(); + } + sync.state = State::Paused; + stop_vcpus(&self.ctx.board, None, &vcpus).context(error::StopVcpus)?; + Ok(()) + } + + pub fn wait(&self) -> Result<()> { + self.event_rx.recv().unwrap(); + let vcpus = self.ctx.vcpus.read(); + for _ in 1..vcpus.len() { + self.event_rx.recv().unwrap(); + } + 
drop(vcpus); + let mut vcpus = self.ctx.vcpus.write(); + let mut ret = Ok(()); + for (index, handle) in vcpus.drain(..).enumerate() { + let Ok(r) = handle.thread.join() else { + log::error!("Cannot join VCPU-{index}"); + continue; + }; + if ret.is_ok() { + ret = r.context(error::VcpuExit { + index: index as u16, + }); + } + } + ret + } +}