From 61798a5afafa143118e9e998df44cebd8c621752 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 14:34:00 +0100 Subject: [PATCH 1/9] io_uring flush fixes --- src/vmm/src/device_manager/pci_mngr.rs | 2 +- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/virtio/block/device.rs | 6 ++-- .../src/devices/virtio/block/virtio/device.rs | 32 +++++++++++++------ .../virtio/block/virtio/io/async_io.rs | 4 +++ .../src/devices/virtio/block/virtio/io/mod.rs | 7 ++++ .../src/devices/virtio/block/virtio/mod.rs | 2 ++ 7 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 8ebf3250581..0bea2c2f7bd 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -311,7 +311,7 @@ impl<'a> Persist<'a> for PciDevices { snapshotting yet" ); } else { - block_dev.prepare_save(); + block_dev.prepare_save()?; let device_state = block_dev.save(); state.block_devices.push(VirtioDeviceState { device_id: block_dev.id().to_string(), diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 0a074dc9fa1..08593ef3b80 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -265,7 +265,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { snapshotting yet" ); } else { - block.prepare_save(); + block.prepare_save()?; let device_state = block.save(); states.block_devices.push(VirtioDeviceState { device_id, diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 93eb56e7760..1c1175f93c4 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -82,10 +82,10 @@ impl Block { } } - pub fn prepare_save(&mut self) { + pub fn prepare_save(&mut self) -> Result<(), BlockError> { match self { - Self::Virtio(b) => b.prepare_save(), - Self::VhostUser(b) => b.prepare_save(), + Self::Virtio(b) => b.prepare_save().map_err(BlockError::VirtioBackend), + Self::VhostUser(_) => Err(BlockError::InvalidBlockBackend), } } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 62ebdbbf5fd..47e72f98fdf 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -563,22 +563,32 @@ impl VirtioBlock { } } - fn drain_and_flush(&mut self, discard: bool) { - if let Err(err) = self.disk.file_engine.drain_and_flush(discard) { - error!("Failed to drain ops and flush block data: {:?}", err); - } + fn drain_and_flush(&mut self, discard: bool) -> Result<(), VirtioBlockError> { + self.disk + .file_engine + .drain_and_flush(discard) + .map_err(VirtioBlockError::FileEngine) } /// Prepare device for being snapshotted. 
- pub fn prepare_save(&mut self) { + pub fn prepare_save(&mut self) -> Result<(), VirtioBlockError> { if !self.is_activated() { - return; + return Ok(()); } - self.drain_and_flush(false); - if let FileEngine::Async(ref _engine) = self.disk.file_engine { + self.drain_and_flush(false)?; + let is_async = matches!(self.disk.file_engine, FileEngine::Async(_)); + if is_async { self.process_async_completion_queue(); + if let FileEngine::Async(ref engine) = self.disk.file_engine { + let pending = engine.pending_ops(); + if pending != 0 { + return Err(VirtioBlockError::PendingAsyncOperations(pending)); + } + } } + + Ok(()) } } @@ -682,7 +692,9 @@ impl Drop for VirtioBlock { } } CacheType::Writeback => { - self.drain_and_flush(true); + if let Err(err) = self.drain_and_flush(true) { + error!("Failed to drain ops and flush block data: {:?}", err); + } } }; } @@ -1673,7 +1685,7 @@ mod tests { // Add a batch of flush requests. add_flush_requests_batch(&mut block, &vq, 5); simulate_queue_event(&mut block, None); - block.prepare_save(); + block.prepare_save().unwrap(); // Check that all the pending flush requests were processed during `prepare_save()`. check_flush_requests_batch(5, &vq); diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..2409f92b6b8 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -119,6 +119,10 @@ impl AsyncFileEngine { &self.completion_evt } + pub fn pending_ops(&self) -> u32 { + self.ring.num_ops() + } + pub fn push_read( &mut self, offset: u64, diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 09cc7c4e31d..05a7982b8e0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -172,6 +172,13 @@ impl FileEngine { FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync), } } + + pub fn pending_async_ops(&self) -> u32 { + match self { + FileEngine::Async(engine) => engine.pending_ops(), + FileEngine::Sync(_) => 0, + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 9e97d6d3897..24fc574cc5f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -63,4 +63,6 @@ pub enum VirtioBlockError { RateLimiter(std::io::Error), /// Persistence error: {0} Persist(crate::devices::virtio::persist::PersistError), + /// Async file engine still has {0} pending operations. 
+ PendingAsyncOperations(u32), } From 0ca53bf41a1050dce19428360d2b342d7e645341 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 14:27:03 +0000 Subject: [PATCH 2/9] compilation fixes --- src/vmm/src/device_manager/pci_mngr.rs | 4 +++- src/vmm/src/device_manager/persist.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 0bea2c2f7bd..e3554c60470 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -311,7 +311,9 @@ impl<'a> Persist<'a> for PciDevices { snapshotting yet" ); } else { - block_dev.prepare_save()?; + if let Err(err) = block_dev.prepare_save() { + warn!("Failed to prepare block device for save: {:?}", err); + } let device_state = block_dev.save(); state.block_devices.push(VirtioDeviceState { device_id: block_dev.id().to_string(), diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 08593ef3b80..9cbab81fd79 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -265,7 +265,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { snapshotting yet" ); } else { - block.prepare_save()?; + if let Err(err) = block.prepare_save() { + warn!("Failed to prepare block device for save: {:?}", err); + } let device_state = block.save(); states.block_devices.push(VirtioDeviceState { device_id, From 8ffa614a36f6c418b8e5f34ea0cd3932bf34de99 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 15:31:15 +0100 Subject: [PATCH 3/9] add debug messages --- .../src/devices/virtio/block/virtio/device.rs | 24 ++++++++++++++++++- .../virtio/block/virtio/io/async_io.rs | 16 ++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 47e72f98fdf..7786c23dd35 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -35,7 +35,7 @@ use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::impl_device_type; -use crate::logger::{IncMetric, error, warn}; +use crate::logger::{IncMetric, debug, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; use crate::utils::u64_to_usize; use crate::vmm_config::RateLimiterConfig; @@ -474,6 +474,7 @@ impl VirtioBlock { // This is safe since we checked in the event handler that the device is activated. let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; + let mut processed = 0_u32; loop { match engine.pop(&active_state.mem) { @@ -504,6 +505,7 @@ impl VirtioBlock { finished.desc_idx, err ) }); + processed = processed.saturating_add(1); } } } @@ -517,6 +519,10 @@ impl VirtioBlock { self.metrics.event_fails.inc(); }); } + debug!( + "virtio-block '{}' processed {} io_uring completions", + self.id, processed + ); } pub fn process_async_completion_event(&mut self) { @@ -573,21 +579,37 @@ impl VirtioBlock { /// Prepare device for being snapshotted. 
pub fn prepare_save(&mut self) -> Result<(), VirtioBlockError> { if !self.is_activated() { + debug!( + "virtio-block '{}' prepare_save skipped (device inactive)", + self.id + ); return Ok(()); } + debug!( + "virtio-block '{}' draining outstanding IO before snapshot", + self.id + ); self.drain_and_flush(false)?; let is_async = matches!(self.disk.file_engine, FileEngine::Async(_)); if is_async { self.process_async_completion_queue(); if let FileEngine::Async(ref engine) = self.disk.file_engine { let pending = engine.pending_ops(); + debug!( + "virtio-block '{}' prepare_save async pending_ops after drain: {}", + self.id, pending + ); if pending != 0 { return Err(VirtioBlockError::PendingAsyncOperations(pending)); } } } + debug!( + "virtio-block '{}' prepare_save finished; async backend drained", + self.id + ); Ok(()) } } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index 2409f92b6b8..843df0b43a5 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -14,7 +14,7 @@ use crate::devices::virtio::block::virtio::{IO_URING_NUM_ENTRIES, PendingRequest use crate::io_uring::operation::{Cqe, OpCode, Operation}; use crate::io_uring::restriction::Restriction; use crate::io_uring::{IoUring, IoUringError}; -use crate::logger::log_dev_preview_warning; +use crate::logger::{debug, log_dev_preview_warning}; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryExtension, GuestMemoryMmap}; #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -224,12 +224,26 @@ impl AsyncFileEngine { } pub fn drain_and_flush(&mut self, discard_cqes: bool) -> Result<(), AsyncIoError> { + let pending_before = self.pending_ops(); + debug!( + "AsyncFileEngine draining: pending_ops={} discard_cqes={}", + pending_before, discard_cqes + ); self.drain(discard_cqes)?; + debug!( + "AsyncFileEngine drain complete: pending_ops={} (discard_cqes={})", + self.pending_ops(), + discard_cqes + ); // Sync data out to physical media on host. // We don't need to call flush first since all the ops are performed through io_uring // and Rust shouldn't manage any data in its internal buffers. self.file.sync_all().map_err(AsyncIoError::SyncAll)?; + debug!( + "AsyncFileEngine sync complete: pending_ops={}", + self.pending_ops() + ); Ok(()) } From ed87706f3abb276296c65b1c69e1542fbc4a37d3 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 15:03:25 +0000 Subject: [PATCH 4/9] change default logging level to debug --- docs/logger.md | 2 +- src/firecracker/swagger/firecracker.yaml | 131 +++++++++-------------- src/vmm/src/logger/logging.rs | 2 +- 3 files changed, 51 insertions(+), 84 deletions(-) diff --git a/docs/logger.md b/docs/logger.md index 4585c4f3f67..3b0f9a082ae 100644 --- a/docs/logger.md +++ b/docs/logger.md @@ -50,7 +50,7 @@ process: ``` The other Logger fields have, in this case, the default values: -`Level -> Warning`, `show_level -> false`, `show_log_origin -> false`. For +`Level -> Debug`, `show_level -> false`, `show_log_origin -> false`. For configuring these too, you can also pass the following optional parameters: `--level `, `--show-level`, `--show-log-origin`: diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 5f16fc1df40..6bdc63f7792 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -85,12 +85,12 @@ paths: Will fail if update is not possible. 
operationId: putBalloon parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/Balloon" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/Balloon" responses: 204: description: Balloon device created/updated @@ -109,12 +109,12 @@ paths: Will fail if update is not possible. operationId: patchBalloon parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/BalloonUpdate" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/BalloonUpdate" responses: 204: description: Balloon device updated @@ -151,12 +151,12 @@ paths: Will fail if update is not possible. operationId: patchBalloonStatsInterval parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/BalloonStatsUpdate" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/BalloonStatsUpdate" responses: 204: description: Balloon statistics interval updated @@ -220,7 +220,6 @@ paths: schema: $ref: "#/definitions/Error" - /drives/{drive_id}: put: summary: Creates or updates a drive. Pre-boot only. @@ -488,8 +487,7 @@ paths: /entropy: put: summary: Creates an entropy device. Pre-boot only. - description: - Enables an entropy device that provides high-quality random data to the guest. + description: Enables an entropy device that provides high-quality random data to the guest. operationId: putEntropyDevice parameters: - name: body @@ -506,12 +504,10 @@ paths: schema: $ref: "#/definitions/Error" - /network-interfaces/{iface_id}: put: summary: Creates a network interface. Pre-boot only. - description: - Creates new network interface with ID specified by iface_id path parameter. + description: Creates new network interface with ID specified by iface_id path parameter. operationId: putGuestNetworkInterfaceByID parameters: - name: iface_id @@ -538,8 +534,7 @@ paths: $ref: "#/definitions/Error" patch: summary: Updates the rate limiters applied to a network interface. Post-boot only. - description: - Updates the rate limiters applied to a network interface. + description: Updates the rate limiters applied to a network interface. operationId: patchGuestNetworkInterfaceByID parameters: - name: iface_id @@ -594,8 +589,7 @@ paths: /snapshot/load: put: summary: Loads a snapshot. Pre-boot only. - description: - Loads the microVM state from a snapshot. + description: Loads the microVM state from a snapshot. Only accepted on a fresh Firecracker process (before configuring any resource other than the Logger and Metrics). operationId: loadSnapshot @@ -635,8 +629,7 @@ paths: /vm: patch: summary: Updates the microVM state. - description: - Sets the desired state (Paused or Resumed) for the microVM. + description: Sets the desired state (Paused or Resumed) for the microVM. operationId: patchVm parameters: - name: body @@ -707,8 +700,7 @@ definitions: required: - amount_mib - deflate_on_oom - description: - Balloon device descriptor. + description: Balloon device descriptor. properties: amount_mib: type: integer @@ -724,8 +716,7 @@ definitions: type: object required: - amount_mib - description: - Balloon device descriptor. + description: Balloon device descriptor. 
properties: amount_mib: type: integer @@ -733,8 +724,7 @@ definitions: BalloonStats: type: object - description: - Describes the balloon device statistics. + description: Describes the balloon device statistics. required: - target_pages - actual_pages @@ -798,8 +788,7 @@ definitions: type: object required: - stats_polling_interval_s - description: - Update the statistics polling interval, with the first statistics update scheduled immediately. Statistics cannot be turned on/off after boot. + description: Update the statistics polling interval, with the first statistics update scheduled immediately. Statistics cannot be turned on/off after boot. properties: stats_polling_interval_s: type: integer @@ -809,8 +798,7 @@ definitions: type: object required: - kernel_image_path - description: - Boot source descriptor. + description: Boot source descriptor. properties: boot_args: type: string @@ -879,21 +867,18 @@ definitions: type: boolean cache_type: type: string - description: - Represents the caching strategy for the block device. + description: Represents the caching strategy for the block device. enum: ["Unsafe", "Writeback"] default: "Unsafe" # VirtioBlock specific parameters is_read_only: type: boolean - description: - Is block read only. + description: Is block read only. This field is required for virtio-block config and should be omitted for vhost-user-block configuration. path_on_host: type: string - description: - Host level path for the guest drive. + description: Host level path for the guest drive. This field is required for virtio-block config and should be omitted for vhost-user-block configuration. rate_limiter: $ref: "#/definitions/RateLimiter" @@ -909,8 +894,7 @@ definitions: # VhostUserBlock specific parameters socket: type: string - description: - Path to the socket of vhost-user-block backend. + description: Path to the socket of vhost-user-block backend. This field is required for vhost-user-block config should be omitted for virtio-block configuration. Error: @@ -955,8 +939,7 @@ definitions: InstanceActionInfo: type: object - description: - Variant wrapper containing the real action. + description: Variant wrapper containing the real action. required: - action_type properties: @@ -970,8 +953,7 @@ definitions: InstanceInfo: type: object - description: - Describes MicroVM instance information. + description: Describes MicroVM instance information. required: - app_name - id @@ -999,14 +981,13 @@ definitions: Logger: type: object - description: - Describes the configuration option for the logging capability. + description: Describes the configuration option for the logging capability. properties: level: type: string description: Set the level. The possible values are case-insensitive. enum: [Error, Warning, Info, Debug, Trace, Off] - default: Info + default: Debug log_path: type: string description: Path to the named pipe or file for the human readable log output. @@ -1085,8 +1066,7 @@ definitions: Metrics: type: object - description: - Describes the configuration option for the metrics capability. + description: Describes the configuration option for the metrics capability. required: - metrics_path properties: @@ -1096,8 +1076,7 @@ definitions: MmdsConfig: type: object - description: - Defines the MMDS configuration. + description: Defines the MMDS configuration. required: - network_interfaces properties: @@ -1134,13 +1113,11 @@ definitions: MmdsContentsObject: type: object - description: - Describes the contents of MMDS in JSON format. 
+ description: Describes the contents of MMDS in JSON format. NetworkInterface: type: object - description: - Defines a network interface. + description: Defines a network interface. required: - host_dev_name - iface_id @@ -1166,8 +1143,7 @@ definitions: type: string path_on_host: type: string - description: - Host level path for the guest drive. + description: Host level path for the guest drive. This field is optional for virtio-block config and should be omitted for vhost-user-block configuration. rate_limiter: $ref: "#/definitions/RateLimiter" @@ -1233,12 +1209,10 @@ definitions: properties: iface_id: type: string - description: - The name of the interface to modify + description: The name of the interface to modify host_dev_name: type: string - description: - The new host device of the interface + description: The new host device of the interface SnapshotLoadParams: type: object @@ -1250,12 +1224,10 @@ definitions: properties: enable_diff_snapshots: type: boolean - description: - (Deprecated) Enable dirty page tracking to improve space efficiency of diff snapshots + description: (Deprecated) Enable dirty page tracking to improve space efficiency of diff snapshots track_dirty_pages: type: boolean - description: - Enable dirty page tracking to improve space efficiency of diff snapshots + description: Enable dirty page tracking to improve space efficiency of diff snapshots mem_file_path: type: string description: @@ -1273,15 +1245,13 @@ definitions: description: Path to the file that contains the microVM state to be loaded. resume_vm: type: boolean - description: - When set to true, the vm is also resumed if the snapshot load is successful. + description: When set to true, the vm is also resumed if the snapshot load is successful. network_overrides: type: array description: Network host device names to override items: $ref: "#/definitions/NetworkOverride" - TokenBucket: type: object description: @@ -1315,8 +1285,7 @@ definitions: Vm: type: object - description: - Defines the microVM running state. It is especially useful in the snapshotting context. + description: Defines the microVM running state. It is especially useful in the snapshotting context. required: - state properties: @@ -1328,16 +1297,14 @@ definitions: EntropyDevice: type: object - description: - Defines an entropy device. + description: Defines an entropy device. properties: rate_limiter: $ref: "#/definitions/RateLimiter" FirecrackerVersion: type: object - description: - Describes the Firecracker version. + description: Describes the Firecracker version. required: - firecracker_version properties: diff --git a/src/vmm/src/logger/logging.rs b/src/vmm/src/logger/logging.rs index e5cdd8a33d3..7ab240dd3c7 100644 --- a/src/vmm/src/logger/logging.rs +++ b/src/vmm/src/logger/logging.rs @@ -17,7 +17,7 @@ use super::metrics::{IncMetric, METRICS}; /// Default level filter for logger matching the swagger specification /// (`src/firecracker/swagger/firecracker.yaml`). -pub const DEFAULT_LEVEL: log::LevelFilter = log::LevelFilter::Info; +pub const DEFAULT_LEVEL: log::LevelFilter = log::LevelFilter::Debug; /// Default instance id. pub const DEFAULT_INSTANCE_ID: &str = "anonymous-instance"; /// Instance id. 
From 5d20657f65188f6af5b932af439ec2afa829003b Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 20:47:33 +0000 Subject: [PATCH 5/9] add more debug messages --- .../src/devices/virtio/block/virtio/device.rs | 11 ++++- .../virtio/block/virtio/io/async_io.rs | 40 +++++++++++++++++-- .../devices/virtio/block/virtio/persist.rs | 6 +++ .../devices/virtio/block/virtio/request.rs | 12 ++++++ 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 7786c23dd35..b7e26ea4d40 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -475,6 +475,11 @@ impl VirtioBlock { let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; let mut processed = 0_u32; + let pending_before = engine.pending_ops(); + debug!( + "virtio-block '{}' starting async completion drain: pending_ops={}", + self.id, pending_before + ); loop { match engine.pop(&active_state.mem) { @@ -520,8 +525,10 @@ impl VirtioBlock { }); } debug!( - "virtio-block '{}' processed {} io_uring completions", - self.id, processed + "virtio-block '{}' processed {} io_uring completions; pending_ops={} after drain", + self.id, + processed, + engine.pending_ops() ); } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index 843df0b43a5..4b7e4517d4a 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -131,6 +131,8 @@ impl AsyncFileEngine { count: u32, req: PendingRequest, ) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let buf = match mem.get_slice(addr, count as usize) { Ok(slice) => slice.ptr_guard_mut().as_ptr(), Err(err) => { @@ -154,7 +156,17 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} offset={} len={} pending_ops={}", + req_type, + desc_idx, + offset, + count, + self.pending_ops() + ); + Ok(()) } pub fn push_write( @@ -165,6 +177,8 @@ impl AsyncFileEngine { count: u32, req: PendingRequest, ) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let buf = match mem.get_slice(addr, count as usize) { Ok(slice) => slice.ptr_guard_mut().as_ptr(), Err(err) => { @@ -188,10 +202,22 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} offset={} len={} pending_ops={}", + req_type, + desc_idx, + offset, + count, + self.pending_ops() + ); + Ok(()) } pub fn push_flush(&mut self, req: PendingRequest) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let wrapped_user_data = WrappedRequest::new(req); self.ring @@ -199,7 +225,15 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} pending_ops={}", + req_type, + desc_idx, + self.pending_ops() + ); + Ok(()) } pub fn kick_submission_queue(&mut self) -> Result<(), AsyncIoError> { diff --git 
a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 5b544a12c19..99804b634d2 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,6 +16,7 @@ use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK; use crate::devices::virtio::persist::VirtioDeviceState; +use crate::logger::debug; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -69,6 +70,11 @@ impl Persist<'_> for VirtioBlock { type Error = VirtioBlockError; fn save(&self) -> Self::State { + let pending_ops = self.disk.file_engine.pending_async_ops(); + debug!( + "virtio-block '{}' save invoked with pending_async_ops={}", + self.id, pending_ops + ); // Save device state. VirtioBlockState { id: self.id.clone(), diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 8fc83cf43da..97119083720 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -98,6 +98,18 @@ pub struct PendingRequest { } impl PendingRequest { + pub fn request_type(&self) -> RequestType { + self.r#type + } + + pub fn desc_idx(&self) -> u16 { + self.desc_idx + } + + pub fn data_len(&self) -> u32 { + self.data_len + } + fn write_status_and_finish( self, status: &Status, From cec59685ef3c264b5cd8f4a37ab078f036e476b3 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 17:44:42 -0800 Subject: [PATCH 6/9] Cleanup --- .gitignore | 1 + tests/README.md | 28 +- .../functional/test_snapshot_basic.py | 283 +++++++++++++++++- tools/test.sh | 2 +- 4 files changed, 301 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 155e4cbd8a8..1044507cef7 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ test_results/* /resources/linux /resources/x86_64 /resources/aarch64 +aws/ \ No newline at end of file diff --git a/tests/README.md b/tests/README.md index e8ad62d0792..b08ab5746c2 100644 --- a/tests/README.md +++ b/tests/README.md @@ -81,7 +81,7 @@ exercise its programmatic API without the HTTP integration. `Cargo` automatically picks up these tests when `cargo test` is issued. They also count towards code coverage. -To run *only* the Rust integration tests: +To run _only_ the Rust integration tests: ```bash cargo test --test integration_tests --all @@ -92,7 +92,7 @@ Unlike unit tests, Rust integration tests are each run in a separate process. 1. Only the `pub` functions can be called. This is fine, as it allows the VMM to be consumed as a programmatic user would. If any function is necessary but - not `pub`, please consider carefully whether it conceptually *needs* to be in + not `pub`, please consider carefully whether it conceptually _needs_ to be in the public interface before making it so. 1. The correct functioning scenario of the `vmm` implies that it `exit`s with @@ -413,33 +413,33 @@ setting to achieve consistent performance. Please see the `test` section of ## FAQ -`Q1:` *I have a shell script that runs my tests and I don't want to rewrite -it.*\ +`Q1:` _I have a shell script that runs my tests and I don't want to rewrite +it._\ `A1:` Insofar as it makes sense, you should write it as a python test function. 
However, you can always call the script from a shim python test function. You can also add it as a microvm image resource in the s3 bucket (and it will be made available under `microvm.slot.path`) or copy it over to a guest filesystem as part of your test. -`Q2:` *I want to add more tests that I don't want to commit to the Firecracker -repository.*\ +`Q2:` _I want to add more tests that I don't want to commit to the Firecracker +repository._\ `A2:` Before a testrun or test session, just add your test directory under `tests/`. `pytest` will discover all tests in this tree. -`Q3:` *I want to have my own test fixtures, and not commit them in the repo.*\ +`Q3:` _I want to have my own test fixtures, and not commit them in the repo._\ `A3:` Add a `conftest.py` file in your test directory, and place your fixtures there. `pytest` will bring them into scope for all your tests. -`Q4:` *I want to use more/other microvm test images, but I don't want to add -them to the common s3 bucket.*\ +`Q4:` _I want to use more/other microvm test images, but I don't want to add +them to the common s3 bucket._\ `A4:` Add your custom images to the `build/img` subdirectory in the Firecracker source tree. This directory is bind-mounted in the container and used as a local image cache. -`Q5:` *How can I get live logger output from the tests?*\ +`Q5:` _How can I get live logger output from the tests?_\ `A5:` Accessing **pytest.ini** will allow you to modify logger settings. -`Q6:` *Is there a way to speed up integration tests execution time?*\ +`Q6:` _Is there a way to speed up integration tests execution time?_\ `A6:` You can narrow down the test selection as described in the **Running** section. For example: @@ -658,3 +658,9 @@ sudo env PYTHONPATH=tests HOME=$HOME ~/.local/bin/ipython3 -i tools/sandbox.py - > [!WARNING] > > **Notice this runs as root!** + +## Repro the freeze by + +```sh +tools/devtool -y test -- -s integration_tests/functional/test_snapshot_basic.py::test_snapshot_with_heavy_async_io +``` diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index c4eac866028..1540a8ce01d 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -19,8 +19,9 @@ import host_tools.drive as drive_tools import host_tools.network as net_tools from framework import utils +from framework.microvm import SnapshotType from framework.properties import global_props -from framework.utils import check_filesystem, check_output +from framework.utils import Timeout, check_filesystem, check_output from framework.utils_vsock import ( ECHO_SERVER_PORT, VSOCK_UDS_PATH, @@ -586,3 +587,283 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): rename_interfaces={iface_override.dev_name: iface_override.tap_name}, resume=True, ) + + +@pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) +@pytest.mark.parametrize("pci_enabled", [False]) +@pytest.mark.parametrize("iteration", range(100)) # Run many iterations to catch non-zero drain +def test_snapshot_with_heavy_async_io( + microvm_factory, guest_kernel_linux_6_1, rootfs, snapshot_type, pci_enabled, iteration +): + """ + Test snapshot with heavy filesystem I/O using async/io_uring engine. 
+ + This test verifies that async I/O operations in-flight during snapshot + are properly completed and their completion information is written to + guest memory (used ring) so that after restore, the guest driver sees + the completions and doesn't freeze. + + CRITICAL: The test restores from the snapshot IMMEDIATELY after creation. + In the error case (if async I/O completions weren't written to guest memory + during prepare_save()), the guest virtio driver will be stuck waiting for + completions that will never come, causing the VM to freeze. This test + detects that freeze by attempting to run a command immediately after restore. + + The test focuses on freeze detection - if async I/O completions aren't written + to guest memory during snapshot, the VM will freeze after restore. + + The test: + 1. Configures VM with async io_engine (kernel 6.1 only) + 2. Performs heavy async write operations (no sync) on the filesystem + 3. Creates a snapshot IMMEDIATELY while I/O operations are still in-flight + 4. Restores from the snapshot IMMEDIATELY after creation + 5. Verifies VM is responsive (freeze detection - will timeout if frozen) + 6. Verifies filesystem integrity and that all I/O completed correctly + """ + logger = logging.getLogger("snapshot_heavy_async_io") + + print("=" * 80) + print(f"Starting iteration {iteration + 1}/100 - Testing for non-zero async I/O drain") + print("=" * 80) + + # Build VM with kernel 6.1 only + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) + # Enable Trace-level logging to see ALL async I/O drain operations during snapshot + # Trace level shows the most detailed device operations including block device async I/O + # The debug! macros in async_io.rs (like drain_and_flush) require Debug or Trace level + vm.spawn(log_level="Trace", log_show_level=True, log_show_origin=True) + vm.basic_config( + vcpu_count=2, + mem_size_mib=1024, + # track_dirty_pages=snapshot_type.needs_dirty_page_tracking, + rootfs_io_engine="Async", # Use async/io_uring engine + ) + vm.add_net_iface() + vm.start() + + logger.info("Starting heavy write I/O workload on guest filesystem...") + + # Perform heavy WRITE operations before snapshot + # Writes are more likely to be async and non-blocking, generating many + # in-flight async I/O requests that will need completion info written to memory + # We want to maximize the chance of having pending_ops > 0 during drain + write_io_script = """ + # Create a test directory + mkdir -p /tmp/io_test + cd /tmp/io_test + + # Strategy: Fire THOUSANDS of small writes VERY quickly to maximize io_uring queue depth + # Small writes complete faster but queue more operations + # Goal: Schedule thousands of operations to increase chance of pending_ops > 0 + + # Fire 2000+ very small writes in parallel - these will queue up quickly + # Small block sizes (4k-16k) are more likely to stay queued + for i in $(seq 1 2000); do + dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & + done + + # Fire 1000 medium writes to add more operations to the queue + for i in $(seq 1 1000); do + dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & + done + + # Fire 500 larger writes + for i in $(seq 1 500); do + dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & + done + + # Fire 200 even larger writes + for i in $(seq 1 200); do + dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & + done + + # Use fio with MAXIMUM iodepth to maximize in-flight 
operations + # io_uring queue depth is typically 4096, so we want to fill it up + # Total: 3700+ dd operations + fio operations + if command -v fio >/dev/null 2>&1; then + # Maximum iodepth (512) with many jobs to fill the queue + # Random writes are better than sequential for keeping ops in-flight + fio --name=heavy_write --filename=/tmp/io_test/fio_write_test \ + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write.log >/dev/null 2>&1 & + + # Start multiple fio jobs on different files for even more operations + fio --name=heavy_write2 --filename=/tmp/io_test/fio_write_test2 \ + --rw=randwrite --bs=8k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write2.log >/dev/null 2>&1 & + + fio --name=heavy_write3 --filename=/tmp/io_test/fio_write_test3 \ + --rw=randwrite --bs=16k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write3.log >/dev/null 2>&1 & + + fio --name=heavy_write4 --filename=/tmp/io_test/fio_write_test4 \ + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write4.log >/dev/null 2>&1 & + fi + + # Total operations scheduled: ~3700 dd + ~256 fio workers (64*4) = ~3956 operations + """ + + # Execute initial write workload + vm.ssh.run(write_io_script) + # No wait - fire first snapshot immediately to catch operations in-flight + + # Perform 4 snapshot-resume cycles + # Strategy: Take snapshots as fast as possible after firing I/O + # This keeps operations overlapping across snapshots so each has pending_ops > 0 + NUM_SNAPSHOTS = 4 + current_vm = vm + snapshots = [] + all_pending_ops = {} # Track pending_ops for each snapshot + queued_ops_count = [] + + for snap_num in range(1, NUM_SNAPSHOTS + 1): + print(f"\n{'='*80}") + print(f"Iteration {iteration + 1}, Snapshot {snap_num}/{NUM_SNAPSHOTS}") + print(f"{'='*80}") + + # Fire ONE or FEW really large operations - simple approach + # Large operations take time to complete, naturally staying in-flight + print(f"Snapshot {snap_num}: Firing large I/O operations...") + heavy_io_script = f""" + cd /tmp/io_test + + # Fire one or few really large operations + # Large operations take time to complete, keeping them in-flight in io_uring queue + # Simple and effective - just big writes that take time + + # One really big write - this will take time and stay in-flight + dd if=/dev/urandom of=snap{snap_num}_huge bs=1M count=1000 oflag=direct 2>/dev/null & + + # A few more large operations for good measure + dd if=/dev/urandom of=snap{snap_num}_large1 bs=1M count=500 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=snap{snap_num}_large2 bs=1M count=500 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=snap{snap_num}_large3 bs=1M count=500 oflag=direct 2>/dev/null & + """ + current_vm.ssh.run(heavy_io_script) + + # Short wait - just enough for operations to be submitted to io_uring + # Large operations will take time to complete, staying in-flight + time.sleep(0.01) # 10ms - just enough for submission + + print(f"Snapshot {snap_num}: Creating snapshot immediately (operations still in-flight)...") + snapshot = current_vm.make_snapshot(snapshot_type) + snapshots.append(snapshot) + + # Minimal wait for logs to flush + time.sleep(0.05) + + # Parse 
logs for pending_ops for EVERY snapshot + pending_ops_during_drain = None + if current_vm.log_file and current_vm.log_file.exists(): + try: + log_data = current_vm.log_data + log_lines = log_data.splitlines() + + # Parse the drain messages to extract pending_ops + for line in log_lines: + # Look for: "AsyncFileEngine queued ... request ... pending_ops=X" + if "AsyncFileEngine queued" in line and "pending_ops=" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + queued_ops_count.append(int(match.group(1))) + + # Look for: "AsyncFileEngine draining: pending_ops=X discard_cqes=..." + # We want to find the MOST RECENT drain message for this snapshot + if "AsyncFileEngine draining:" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + pending_ops_during_drain = int(match.group(1)) + print(f"Snapshot {snap_num}: Found drain start: pending_ops={pending_ops_during_drain}") + except Exception as e: + print(f"ERROR: Failed to parse log file: {e}") + + # Store pending_ops for this snapshot + all_pending_ops[snap_num] = pending_ops_during_drain + + # Kill current VM before restoring + current_vm.kill() + + # Restore immediately and fire next I/O as fast as possible + print(f"Snapshot {snap_num}: Restoring from snapshot IMMEDIATELY...") + restored_vm = microvm_factory.build_from_snapshot(snapshot) + + # Verify VM is responsive (freeze detection) - do this quickly + print(f"Snapshot {snap_num}: Verifying VM is responsive (freeze detection)...") + print(f"Snapshot {snap_num}: Waiting up to 30 seconds for VM to respond...") + try: + # Use a timeout to detect freeze - if VM is frozen, this will timeout + # The default SSH timeout is 60s, but we want to fail faster for freeze detection + with Timeout(30): + restored_vm.ssh.check_output("true") + print(f"Snapshot {snap_num}: ✓ VM is responsive - no freeze detected") + + # If not the last snapshot, immediately prepare for next snapshot + # This keeps operations overlapping across snapshots + if snap_num < NUM_SNAPSHOTS: + print(f"Snapshot {snap_num}: Ready for next snapshot (operations may still be running)...") + + # Report findings for this snapshot + print(f"\nSnapshot {snap_num} Results:") + print(f" pending_ops during drain: {pending_ops_during_drain}") + + if pending_ops_during_drain is not None and pending_ops_during_drain > 0: + print(f" *** NON-ZERO DRAIN: pending_ops={pending_ops_during_drain} ***") + elif pending_ops_during_drain == 0: + print(f" WARNING: pending_ops=0 (operations completed before drain)") + else: + print(f" WARNING: Could not parse pending_ops from logs") + + # Report summary for last snapshot + if snap_num == NUM_SNAPSHOTS: + print(f"\n{'='*80}") + print(f"Summary for all {NUM_SNAPSHOTS} snapshots:") + for s in range(1, NUM_SNAPSHOTS + 1): + ops = all_pending_ops.get(s, "unknown") + status = "✓" if (isinstance(ops, int) and ops > 0) else "✗" + print(f" Snapshot {s}: pending_ops={ops} {status}") + + if queued_ops_count: + max_queued = max(queued_ops_count) + min_queued = min(queued_ops_count) + print(f"\nOperation counts from FC logs:") + print(f" Max pending_ops seen in logs: {max_queued}") + print(f" Min pending_ops seen in logs: {min_queued}") + print(f" Sample of queued ops counts: {queued_ops_count[-10:]}") + + # Check if we got non-zero drain in any snapshot + non_zero_snapshots = [s for s, ops in all_pending_ops.items() + if isinstance(ops, int) and ops > 0] + if non_zero_snapshots: + print(f"\n{'='*80}") + print(f"SUCCESS: Found non-zero drain in snapshots: {non_zero_snapshots}") + 
print(f"All snapshots with non-zero drain resumed correctly - no freeze!") + print(f"This proves that non-zero drain with proper completion handling works correctly.") + print(f"{'='*80}\n") + else: + print(f"\nNo non-zero drain found in any snapshot - continuing to next iteration...") + print(f"{'='*80}") + + except Exception as e: + print(f"\n{'='*80}") + print(f"FAILURE: Snapshot {snap_num} - VM FROZE after restore!") + print(f"pending_ops during drain: {pending_ops_during_drain}") + print(f"Error: {e}") + print(f"{'='*80}\n") + restored_vm.kill() + raise + + # For next iteration, use the restored VM + if snap_num < NUM_SNAPSHOTS: + current_vm = restored_vm + else: + # Last snapshot - cleanup + restored_vm.kill() diff --git a/tools/test.sh b/tools/test.sh index 0bf67a65666..5b5c80159d3 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -47,7 +47,7 @@ export PYTEST_ADDOPTS="${PYTEST_ADDOPTS:-} --pdbcls=IPython.terminal.debugger:Te # if the tests failed and we are running in CI, print some disk usage stats # to help troubleshooting -if [ $ret != 0 ] && [ "$BUILDKITE" == "true" ]; then +if [ $ret != 0 ] && [ "${BUILDKITE:-false}" == "true" ]; then df -ih df -h du -h / 2>/dev/null |sort -h |tail -32 From d056e696cd677a0cfdad494a8cb0db359b9daed9 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 22:53:01 -0800 Subject: [PATCH 7/9] Improve test --- .../functional/test_snapshot_basic.py | 271 ++++-------------- 1 file changed, 48 insertions(+), 223 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 1540a8ce01d..2a425811a0e 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -591,279 +591,104 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) @pytest.mark.parametrize("pci_enabled", [False]) -@pytest.mark.parametrize("iteration", range(100)) # Run many iterations to catch non-zero drain +@pytest.mark.parametrize("iteration", range(100)) def test_snapshot_with_heavy_async_io( - microvm_factory, guest_kernel_linux_6_1, rootfs, snapshot_type, pci_enabled, iteration + microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration ): - """ - Test snapshot with heavy filesystem I/O using async/io_uring engine. - - This test verifies that async I/O operations in-flight during snapshot - are properly completed and their completion information is written to - guest memory (used ring) so that after restore, the guest driver sees - the completions and doesn't freeze. - - CRITICAL: The test restores from the snapshot IMMEDIATELY after creation. - In the error case (if async I/O completions weren't written to guest memory - during prepare_save()), the guest virtio driver will be stuck waiting for - completions that will never come, causing the VM to freeze. This test - detects that freeze by attempting to run a command immediately after restore. - - The test focuses on freeze detection - if async I/O completions aren't written - to guest memory during snapshot, the VM will freeze after restore. - - The test: - 1. Configures VM with async io_engine (kernel 6.1 only) - 2. Performs heavy async write operations (no sync) on the filesystem - 3. Creates a snapshot IMMEDIATELY while I/O operations are still in-flight - 4. Restores from the snapshot IMMEDIATELY after creation - 5. 
Verifies VM is responsive (freeze detection - will timeout if frozen) - 6. Verifies filesystem integrity and that all I/O completed correctly - """ - logger = logging.getLogger("snapshot_heavy_async_io") - print("=" * 80) print(f"Starting iteration {iteration + 1}/100 - Testing for non-zero async I/O drain") print("=" * 80) - # Build VM with kernel 6.1 only - vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) - # Enable Trace-level logging to see ALL async I/O drain operations during snapshot - # Trace level shows the most detailed device operations including block device async I/O - # The debug! macros in async_io.rs (like drain_and_flush) require Debug or Trace level + # CHECK: if we set `rootfs` to just "rootfs" (readonly) it also freezes on async. + # By default though, even the rw rootfs has only like 37MB of free space. + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs_rw, pci=pci_enabled) + vm.spawn(log_level="Trace", log_show_level=True, log_show_origin=True) vm.basic_config( vcpu_count=2, mem_size_mib=1024, - # track_dirty_pages=snapshot_type.needs_dirty_page_tracking, - rootfs_io_engine="Async", # Use async/io_uring engine + # CHECK: If we set this to "Sync", it will not freeze. + rootfs_io_engine="Async", ) vm.add_net_iface() vm.start() - logger.info("Starting heavy write I/O workload on guest filesystem...") - - # Perform heavy WRITE operations before snapshot - # Writes are more likely to be async and non-blocking, generating many - # in-flight async I/O requests that will need completion info written to memory - # We want to maximize the chance of having pending_ops > 0 during drain write_io_script = """ # Create a test directory - mkdir -p /tmp/io_test - cd /tmp/io_test + mkdir -p /root/io_test + cd /root/io_test - # Strategy: Fire THOUSANDS of small writes VERY quickly to maximize io_uring queue depth - # Small writes complete faster but queue more operations - # Goal: Schedule thousands of operations to increase chance of pending_ops > 0 - - # Fire 2000+ very small writes in parallel - these will queue up quickly - # Small block sizes (4k-16k) are more likely to stay queued for i in $(seq 1 2000); do dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & done - # Fire 1000 medium writes to add more operations to the queue for i in $(seq 1 1000); do dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & done - # Fire 500 larger writes for i in $(seq 1 500); do dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & done - # Fire 200 even larger writes for i in $(seq 1 200); do dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & done - # Use fio with MAXIMUM iodepth to maximize in-flight operations - # io_uring queue depth is typically 4096, so we want to fill it up - # Total: 3700+ dd operations + fio operations if command -v fio >/dev/null 2>&1; then - # Maximum iodepth (512) with many jobs to fill the queue - # Random writes are better than sequential for keeping ops in-flight - fio --name=heavy_write --filename=/tmp/io_test/fio_write_test \ + fio --name=heavy_write --filename=/root/io_test/fio_write_test \ --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write.log >/dev/null 2>&1 & - # Start multiple fio jobs on different files for even more operations 
- fio --name=heavy_write2 --filename=/tmp/io_test/fio_write_test2 \ + fio --name=heavy_write2 --filename=/root/io_test/fio_write_test2 \ --rw=randwrite --bs=8k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write2.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write2.log >/dev/null 2>&1 & - fio --name=heavy_write3 --filename=/tmp/io_test/fio_write_test3 \ + fio --name=heavy_write3 --filename=/root/io_test/fio_write_test3 \ --rw=randwrite --bs=16k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write3.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write3.log >/dev/null 2>&1 & - fio --name=heavy_write4 --filename=/tmp/io_test/fio_write_test4 \ + fio --name=heavy_write4 --filename=/root/io_test/fio_write_test4 \ --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write4.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write4.log >/dev/null 2>&1 & fi - - # Total operations scheduled: ~3700 dd + ~256 fio workers (64*4) = ~3956 operations """ - # Execute initial write workload vm.ssh.run(write_io_script) - # No wait - fire first snapshot immediately to catch operations in-flight - - # Perform 4 snapshot-resume cycles - # Strategy: Take snapshots as fast as possible after firing I/O - # This keeps operations overlapping across snapshots so each has pending_ops > 0 - NUM_SNAPSHOTS = 4 - current_vm = vm - snapshots = [] - all_pending_ops = {} # Track pending_ops for each snapshot - queued_ops_count = [] + + # if set to cca 2 seconds it stops freezing for Async. 
+ time.sleep(0.01) - for snap_num in range(1, NUM_SNAPSHOTS + 1): - print(f"\n{'='*80}") - print(f"Iteration {iteration + 1}, Snapshot {snap_num}/{NUM_SNAPSHOTS}") - print(f"{'='*80}") - - # Fire ONE or FEW really large operations - simple approach - # Large operations take time to complete, naturally staying in-flight - print(f"Snapshot {snap_num}: Firing large I/O operations...") - heavy_io_script = f""" - cd /tmp/io_test - - # Fire one or few really large operations - # Large operations take time to complete, keeping them in-flight in io_uring queue - # Simple and effective - just big writes that take time - - # One really big write - this will take time and stay in-flight - dd if=/dev/urandom of=snap{snap_num}_huge bs=1M count=1000 oflag=direct 2>/dev/null & - - # A few more large operations for good measure - dd if=/dev/urandom of=snap{snap_num}_large1 bs=1M count=500 oflag=direct 2>/dev/null & - dd if=/dev/urandom of=snap{snap_num}_large2 bs=1M count=500 oflag=direct 2>/dev/null & - dd if=/dev/urandom of=snap{snap_num}_large3 bs=1M count=500 oflag=direct 2>/dev/null & - """ - current_vm.ssh.run(heavy_io_script) - - # Short wait - just enough for operations to be submitted to io_uring - # Large operations will take time to complete, staying in-flight - time.sleep(0.01) # 10ms - just enough for submission - - print(f"Snapshot {snap_num}: Creating snapshot immediately (operations still in-flight)...") - snapshot = current_vm.make_snapshot(snapshot_type) - snapshots.append(snapshot) - - # Minimal wait for logs to flush - time.sleep(0.05) - - # Parse logs for pending_ops for EVERY snapshot - pending_ops_during_drain = None - if current_vm.log_file and current_vm.log_file.exists(): - try: - log_data = current_vm.log_data - log_lines = log_data.splitlines() - - # Parse the drain messages to extract pending_ops - for line in log_lines: - # Look for: "AsyncFileEngine queued ... request ... pending_ops=X" - if "AsyncFileEngine queued" in line and "pending_ops=" in line: - match = re.search(r'pending_ops=(\d+)', line) - if match: - queued_ops_count.append(int(match.group(1))) - - # Look for: "AsyncFileEngine draining: pending_ops=X discard_cqes=..." 
- # We want to find the MOST RECENT drain message for this snapshot - if "AsyncFileEngine draining:" in line: - match = re.search(r'pending_ops=(\d+)', line) - if match: - pending_ops_during_drain = int(match.group(1)) - print(f"Snapshot {snap_num}: Found drain start: pending_ops={pending_ops_during_drain}") - except Exception as e: - print(f"ERROR: Failed to parse log file: {e}") - - # Store pending_ops for this snapshot - all_pending_ops[snap_num] = pending_ops_during_drain - - # Kill current VM before restoring - current_vm.kill() - - # Restore immediately and fire next I/O as fast as possible - print(f"Snapshot {snap_num}: Restoring from snapshot IMMEDIATELY...") - restored_vm = microvm_factory.build_from_snapshot(snapshot) - - # Verify VM is responsive (freeze detection) - do this quickly - print(f"Snapshot {snap_num}: Verifying VM is responsive (freeze detection)...") - print(f"Snapshot {snap_num}: Waiting up to 30 seconds for VM to respond...") + snapshot = vm.make_snapshot(snapshot_type) + + time.sleep(0.05) + + if vm.log_file and vm.log_file.exists(): try: - # Use a timeout to detect freeze - if VM is frozen, this will timeout - # The default SSH timeout is 60s, but we want to fail faster for freeze detection - with Timeout(30): - restored_vm.ssh.check_output("true") - print(f"Snapshot {snap_num}: ✓ VM is responsive - no freeze detected") - - # If not the last snapshot, immediately prepare for next snapshot - # This keeps operations overlapping across snapshots - if snap_num < NUM_SNAPSHOTS: - print(f"Snapshot {snap_num}: Ready for next snapshot (operations may still be running)...") - - # Report findings for this snapshot - print(f"\nSnapshot {snap_num} Results:") - print(f" pending_ops during drain: {pending_ops_during_drain}") - - if pending_ops_during_drain is not None and pending_ops_during_drain > 0: - print(f" *** NON-ZERO DRAIN: pending_ops={pending_ops_during_drain} ***") - elif pending_ops_during_drain == 0: - print(f" WARNING: pending_ops=0 (operations completed before drain)") - else: - print(f" WARNING: Could not parse pending_ops from logs") - - # Report summary for last snapshot - if snap_num == NUM_SNAPSHOTS: - print(f"\n{'='*80}") - print(f"Summary for all {NUM_SNAPSHOTS} snapshots:") - for s in range(1, NUM_SNAPSHOTS + 1): - ops = all_pending_ops.get(s, "unknown") - status = "✓" if (isinstance(ops, int) and ops > 0) else "✗" - print(f" Snapshot {s}: pending_ops={ops} {status}") - - if queued_ops_count: - max_queued = max(queued_ops_count) - min_queued = min(queued_ops_count) - print(f"\nOperation counts from FC logs:") - print(f" Max pending_ops seen in logs: {max_queued}") - print(f" Min pending_ops seen in logs: {min_queued}") - print(f" Sample of queued ops counts: {queued_ops_count[-10:]}") - - # Check if we got non-zero drain in any snapshot - non_zero_snapshots = [s for s, ops in all_pending_ops.items() - if isinstance(ops, int) and ops > 0] - if non_zero_snapshots: - print(f"\n{'='*80}") - print(f"SUCCESS: Found non-zero drain in snapshots: {non_zero_snapshots}") - print(f"All snapshots with non-zero drain resumed correctly - no freeze!") - print(f"This proves that non-zero drain with proper completion handling works correctly.") - print(f"{'='*80}\n") - else: - print(f"\nNo non-zero drain found in any snapshot - continuing to next iteration...") - print(f"{'='*80}") + log_data = vm.log_data + log_lines = log_data.splitlines() + for line in log_lines: + if "AsyncFileEngine draining:" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + 
+                            pending_ops_during_drain = int(match.group(1))
+                            print(f"DRAIN: pending_ops={pending_ops_during_drain}")
+
         except Exception as e:
-            print(f"\n{'='*80}")
-            print(f"FAILURE: Snapshot {snap_num} - VM FROZE after restore!")
-            print(f"pending_ops during drain: {pending_ops_during_drain}")
-            print(f"Error: {e}")
-            print(f"{'='*80}\n")
-            restored_vm.kill()
-            raise
-
-    # For next iteration, use the restored VM
-    if snap_num < NUM_SNAPSHOTS:
-        current_vm = restored_vm
-    else:
-        # Last snapshot - cleanup
-        restored_vm.kill()
+            print(f"ERROR: Failed to parse log file: {e}")
+
+    vm.kill()
+
+    print("Restoring from snapshot...")
+    with Timeout(30):
+        restored_vm = microvm_factory.build_from_snapshot(snapshot)
+
+    print("VM resumed - no freeze detected")

From 4f428d2aa49b4845b0cac6eefc5b076432a54971 Mon Sep 17 00:00:00 2001
From: ValentaTomas
Date: Mon, 1 Dec 2025 23:36:53 -0800
Subject: [PATCH 8/9] Resize the rootfs

---
 tests/conftest.py                          | 34 +++++++++++++++++--
 .../functional/test_snapshot_basic.py      | 31 ++++++++++++++---
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 96ee285d192..87ded8df2cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,6 +26,7 @@
 import shutil
 import sys
 import tempfile
+import uuid
 from pathlib import Path
 
 import pytest
@@ -502,10 +503,37 @@ def rootfs():
     return disks("ubuntu-24.04.squashfs")[0]
 
 
-@pytest.fixture
+@pytest.fixture(scope="function")
 def rootfs_rw():
-    """Return an Ubuntu 24.04 ext4 rootfs"""
-    return disks("ubuntu-24.04.ext4")[0]
+    """Return an Ubuntu 24.04 ext4 rootfs, resized to 20GB for each test"""
+    original_rootfs = disks("ubuntu-24.04.ext4")[0]
+    original_ssh_key = original_rootfs.with_suffix(".id_rsa")
+
+    # Create a unique resized copy for this test
+    unique_id = str(uuid.uuid4())[:8]
+    resized_rootfs_path = Path(tempfile.gettempdir()) / f"rootfs_rw_{unique_id}.ext4"
+    resized_ssh_key_path = resized_rootfs_path.with_suffix(".id_rsa")
+
+    # Copy the original rootfs
+    shutil.copyfile(original_rootfs, resized_rootfs_path)
+
+    # Copy the SSH key
+    if original_ssh_key.exists():
+        shutil.copyfile(original_ssh_key, resized_ssh_key_path)
+
+    # Resize to 20GB
+    new_size_bytes = 20 * 1024 * 1024 * 1024
+    os.truncate(resized_rootfs_path, new_size_bytes)
+    utils.check_output(f"e2fsck -f -y {resized_rootfs_path}")
+    utils.check_output(f"resize2fs {resized_rootfs_path}")
+
+    yield resized_rootfs_path
+
+    # Cleanup: remove the resized rootfs file and SSH key after the test
+    if resized_rootfs_path.exists():
+        resized_rootfs_path.unlink()
+    if resized_ssh_key_path.exists():
+        resized_ssh_key_path.unlink()
 
 
 @pytest.fixture
diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py
index 2a425811a0e..60565349927 100644
--- a/tests/integration_tests/functional/test_snapshot_basic.py
+++ b/tests/integration_tests/functional/test_snapshot_basic.py
@@ -591,7 +591,7 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory):
 
 @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL])
 @pytest.mark.parametrize("pci_enabled", [False])
-@pytest.mark.parametrize("iteration", range(100))
+@pytest.mark.parametrize("iteration", range(4))
 def test_snapshot_with_heavy_async_io(
     microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration
 ):
@@ -613,25 +613,29 @@
     vm.add_net_iface()
     vm.start()
 
+    # Check free space on sandbox start
+    _, free_space_start, _ = vm.ssh.check_output("df -h / | awk 'NR==2 {print $4}'")
vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space on sandbox start: {free_space_start.strip()}") + write_io_script = """ # Create a test directory mkdir -p /root/io_test cd /root/io_test for i in $(seq 1 2000); do - dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 1000); do - dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 500); do - dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 200); do - dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & done if command -v fio >/dev/null 2>&1; then @@ -691,4 +695,21 @@ def test_snapshot_with_heavy_async_io( with Timeout(30): restored_vm = microvm_factory.build_from_snapshot(snapshot) + # Check free space after resume + _, free_space_after_resume, _ = restored_vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space after resume: {free_space_after_resume.strip()}") + + # Delete all files/dirs created during the test + cleanup_script = """ + rm -rf /root/io_test + rm -f /root/fio_write*.log + """ + restored_vm.ssh.run(cleanup_script) + + # Check free space after cleanup + _, free_space_after_cleanup, _ = restored_vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space after cleanup: {free_space_after_cleanup.strip()}") + print(f"VM resumed - no freeze detected") + + restored_vm.kill() From 2e504f9241c75d423e52f3dff99fa63bf86cfaaf Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 23:43:46 -0800 Subject: [PATCH 9/9] Change iterations --- tests/integration_tests/functional/test_snapshot_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 60565349927..f5ebd5e0ad7 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -591,7 +591,7 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) @pytest.mark.parametrize("pci_enabled", [False]) -@pytest.mark.parametrize("iteration", range(4)) +@pytest.mark.parametrize("iteration", range(100)) def test_snapshot_with_heavy_async_io( microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration ):