From 61798a5afafa143118e9e998df44cebd8c621752 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 14:34:00 +0100 Subject: [PATCH 1/9] io_uring flush fixes --- src/vmm/src/device_manager/pci_mngr.rs | 2 +- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/virtio/block/device.rs | 6 ++-- .../src/devices/virtio/block/virtio/device.rs | 32 +++++++++++++------ .../virtio/block/virtio/io/async_io.rs | 4 +++ .../src/devices/virtio/block/virtio/io/mod.rs | 7 ++++ .../src/devices/virtio/block/virtio/mod.rs | 2 ++ 7 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 8ebf3250581..0bea2c2f7bd 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -311,7 +311,7 @@ impl<'a> Persist<'a> for PciDevices { snapshotting yet" ); } else { - block_dev.prepare_save(); + block_dev.prepare_save()?; let device_state = block_dev.save(); state.block_devices.push(VirtioDeviceState { device_id: block_dev.id().to_string(), diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 0a074dc9fa1..08593ef3b80 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -265,7 +265,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { snapshotting yet" ); } else { - block.prepare_save(); + block.prepare_save()?; let device_state = block.save(); states.block_devices.push(VirtioDeviceState { device_id, diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 93eb56e7760..1c1175f93c4 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -82,10 +82,10 @@ impl Block { } } - pub fn prepare_save(&mut self) { + pub fn prepare_save(&mut self) -> Result<(), BlockError> { match self { - Self::Virtio(b) => b.prepare_save(), - Self::VhostUser(b) => b.prepare_save(), + Self::Virtio(b) => b.prepare_save().map_err(BlockError::VirtioBackend), + Self::VhostUser(_) => Err(BlockError::InvalidBlockBackend), } } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 62ebdbbf5fd..47e72f98fdf 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -563,22 +563,32 @@ impl VirtioBlock { } } - fn drain_and_flush(&mut self, discard: bool) { - if let Err(err) = self.disk.file_engine.drain_and_flush(discard) { - error!("Failed to drain ops and flush block data: {:?}", err); - } + fn drain_and_flush(&mut self, discard: bool) -> Result<(), VirtioBlockError> { + self.disk + .file_engine + .drain_and_flush(discard) + .map_err(VirtioBlockError::FileEngine) } /// Prepare device for being snapshotted. 
- pub fn prepare_save(&mut self) { + pub fn prepare_save(&mut self) -> Result<(), VirtioBlockError> { if !self.is_activated() { - return; + return Ok(()); } - self.drain_and_flush(false); - if let FileEngine::Async(ref _engine) = self.disk.file_engine { + self.drain_and_flush(false)?; + let is_async = matches!(self.disk.file_engine, FileEngine::Async(_)); + if is_async { self.process_async_completion_queue(); + if let FileEngine::Async(ref engine) = self.disk.file_engine { + let pending = engine.pending_ops(); + if pending != 0 { + return Err(VirtioBlockError::PendingAsyncOperations(pending)); + } + } } + + Ok(()) } } @@ -682,7 +692,9 @@ impl Drop for VirtioBlock { } } CacheType::Writeback => { - self.drain_and_flush(true); + if let Err(err) = self.drain_and_flush(true) { + error!("Failed to drain ops and flush block data: {:?}", err); + } } }; } @@ -1673,7 +1685,7 @@ mod tests { // Add a batch of flush requests. add_flush_requests_batch(&mut block, &vq, 5); simulate_queue_event(&mut block, None); - block.prepare_save(); + block.prepare_save().unwrap(); // Check that all the pending flush requests were processed during `prepare_save()`. check_flush_requests_batch(5, &vq); diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..2409f92b6b8 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -119,6 +119,10 @@ impl AsyncFileEngine { &self.completion_evt } + pub fn pending_ops(&self) -> u32 { + self.ring.num_ops() + } + pub fn push_read( &mut self, offset: u64, diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 09cc7c4e31d..05a7982b8e0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -172,6 +172,13 @@ impl FileEngine { FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync), } } + + pub fn pending_async_ops(&self) -> u32 { + match self { + FileEngine::Async(engine) => engine.pending_ops(), + FileEngine::Sync(_) => 0, + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 9e97d6d3897..24fc574cc5f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -63,4 +63,6 @@ pub enum VirtioBlockError { RateLimiter(std::io::Error), /// Persistence error: {0} Persist(crate::devices::virtio::persist::PersistError), + /// Async file engine still has {0} pending operations. 
+ PendingAsyncOperations(u32), } From 0ca53bf41a1050dce19428360d2b342d7e645341 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 14:27:03 +0000 Subject: [PATCH 2/9] compilation fixes --- src/vmm/src/device_manager/pci_mngr.rs | 4 +++- src/vmm/src/device_manager/persist.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 0bea2c2f7bd..e3554c60470 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -311,7 +311,9 @@ impl<'a> Persist<'a> for PciDevices { snapshotting yet" ); } else { - block_dev.prepare_save()?; + if let Err(err) = block_dev.prepare_save() { + warn!("Failed to prepare block device for save: {:?}", err); + } let device_state = block_dev.save(); state.block_devices.push(VirtioDeviceState { device_id: block_dev.id().to_string(), diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 08593ef3b80..9cbab81fd79 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -265,7 +265,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { snapshotting yet" ); } else { - block.prepare_save()?; + if let Err(err) = block.prepare_save() { + warn!("Failed to prepare block device for save: {:?}", err); + } let device_state = block.save(); states.block_devices.push(VirtioDeviceState { device_id, From 8ffa614a36f6c418b8e5f34ea0cd3932bf34de99 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 15:31:15 +0100 Subject: [PATCH 3/9] add debug messages --- .../src/devices/virtio/block/virtio/device.rs | 24 ++++++++++++++++++- .../virtio/block/virtio/io/async_io.rs | 16 ++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 47e72f98fdf..7786c23dd35 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -35,7 +35,7 @@ use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::impl_device_type; -use crate::logger::{IncMetric, error, warn}; +use crate::logger::{IncMetric, debug, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; use crate::utils::u64_to_usize; use crate::vmm_config::RateLimiterConfig; @@ -474,6 +474,7 @@ impl VirtioBlock { // This is safe since we checked in the event handler that the device is activated. let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; + let mut processed = 0_u32; loop { match engine.pop(&active_state.mem) { @@ -504,6 +505,7 @@ impl VirtioBlock { finished.desc_idx, err ) }); + processed = processed.saturating_add(1); } } } @@ -517,6 +519,10 @@ impl VirtioBlock { self.metrics.event_fails.inc(); }); } + debug!( + "virtio-block '{}' processed {} io_uring completions", + self.id, processed + ); } pub fn process_async_completion_event(&mut self) { @@ -573,21 +579,37 @@ impl VirtioBlock { /// Prepare device for being snapshotted. 
pub fn prepare_save(&mut self) -> Result<(), VirtioBlockError> { if !self.is_activated() { + debug!( + "virtio-block '{}' prepare_save skipped (device inactive)", + self.id + ); return Ok(()); } + debug!( + "virtio-block '{}' draining outstanding IO before snapshot", + self.id + ); self.drain_and_flush(false)?; let is_async = matches!(self.disk.file_engine, FileEngine::Async(_)); if is_async { self.process_async_completion_queue(); if let FileEngine::Async(ref engine) = self.disk.file_engine { let pending = engine.pending_ops(); + debug!( + "virtio-block '{}' prepare_save async pending_ops after drain: {}", + self.id, pending + ); if pending != 0 { return Err(VirtioBlockError::PendingAsyncOperations(pending)); } } } + debug!( + "virtio-block '{}' prepare_save finished; async backend drained", + self.id + ); Ok(()) } } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index 2409f92b6b8..843df0b43a5 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -14,7 +14,7 @@ use crate::devices::virtio::block::virtio::{IO_URING_NUM_ENTRIES, PendingRequest use crate::io_uring::operation::{Cqe, OpCode, Operation}; use crate::io_uring::restriction::Restriction; use crate::io_uring::{IoUring, IoUringError}; -use crate::logger::log_dev_preview_warning; +use crate::logger::{debug, log_dev_preview_warning}; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryExtension, GuestMemoryMmap}; #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -224,12 +224,26 @@ impl AsyncFileEngine { } pub fn drain_and_flush(&mut self, discard_cqes: bool) -> Result<(), AsyncIoError> { + let pending_before = self.pending_ops(); + debug!( + "AsyncFileEngine draining: pending_ops={} discard_cqes={}", + pending_before, discard_cqes + ); self.drain(discard_cqes)?; + debug!( + "AsyncFileEngine drain complete: pending_ops={} (discard_cqes={})", + self.pending_ops(), + discard_cqes + ); // Sync data out to physical media on host. // We don't need to call flush first since all the ops are performed through io_uring // and Rust shouldn't manage any data in its internal buffers. self.file.sync_all().map_err(AsyncIoError::SyncAll)?; + debug!( + "AsyncFileEngine sync complete: pending_ops={}", + self.pending_ops() + ); Ok(()) } From ed87706f3abb276296c65b1c69e1542fbc4a37d3 Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 15:03:25 +0000 Subject: [PATCH 4/9] change default logging level to debug --- docs/logger.md | 2 +- src/firecracker/swagger/firecracker.yaml | 131 +++++++++-------------- src/vmm/src/logger/logging.rs | 2 +- 3 files changed, 51 insertions(+), 84 deletions(-) diff --git a/docs/logger.md b/docs/logger.md index 4585c4f3f67..3b0f9a082ae 100644 --- a/docs/logger.md +++ b/docs/logger.md @@ -50,7 +50,7 @@ process: ``` The other Logger fields have, in this case, the default values: -`Level -> Warning`, `show_level -> false`, `show_log_origin -> false`. For +`Level -> Debug`, `show_level -> false`, `show_log_origin -> false`. For configuring these too, you can also pass the following optional parameters: `--level `, `--show-level`, `--show-log-origin`: diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 5f16fc1df40..6bdc63f7792 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -85,12 +85,12 @@ paths: Will fail if update is not possible. 
operationId: putBalloon parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/Balloon" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/Balloon" responses: 204: description: Balloon device created/updated @@ -109,12 +109,12 @@ paths: Will fail if update is not possible. operationId: patchBalloon parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/BalloonUpdate" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/BalloonUpdate" responses: 204: description: Balloon device updated @@ -151,12 +151,12 @@ paths: Will fail if update is not possible. operationId: patchBalloonStatsInterval parameters: - - name: body - in: body - description: Balloon properties - required: true - schema: - $ref: "#/definitions/BalloonStatsUpdate" + - name: body + in: body + description: Balloon properties + required: true + schema: + $ref: "#/definitions/BalloonStatsUpdate" responses: 204: description: Balloon statistics interval updated @@ -220,7 +220,6 @@ paths: schema: $ref: "#/definitions/Error" - /drives/{drive_id}: put: summary: Creates or updates a drive. Pre-boot only. @@ -488,8 +487,7 @@ paths: /entropy: put: summary: Creates an entropy device. Pre-boot only. - description: - Enables an entropy device that provides high-quality random data to the guest. + description: Enables an entropy device that provides high-quality random data to the guest. operationId: putEntropyDevice parameters: - name: body @@ -506,12 +504,10 @@ paths: schema: $ref: "#/definitions/Error" - /network-interfaces/{iface_id}: put: summary: Creates a network interface. Pre-boot only. - description: - Creates new network interface with ID specified by iface_id path parameter. + description: Creates new network interface with ID specified by iface_id path parameter. operationId: putGuestNetworkInterfaceByID parameters: - name: iface_id @@ -538,8 +534,7 @@ paths: $ref: "#/definitions/Error" patch: summary: Updates the rate limiters applied to a network interface. Post-boot only. - description: - Updates the rate limiters applied to a network interface. + description: Updates the rate limiters applied to a network interface. operationId: patchGuestNetworkInterfaceByID parameters: - name: iface_id @@ -594,8 +589,7 @@ paths: /snapshot/load: put: summary: Loads a snapshot. Pre-boot only. - description: - Loads the microVM state from a snapshot. + description: Loads the microVM state from a snapshot. Only accepted on a fresh Firecracker process (before configuring any resource other than the Logger and Metrics). operationId: loadSnapshot @@ -635,8 +629,7 @@ paths: /vm: patch: summary: Updates the microVM state. - description: - Sets the desired state (Paused or Resumed) for the microVM. + description: Sets the desired state (Paused or Resumed) for the microVM. operationId: patchVm parameters: - name: body @@ -707,8 +700,7 @@ definitions: required: - amount_mib - deflate_on_oom - description: - Balloon device descriptor. + description: Balloon device descriptor. properties: amount_mib: type: integer @@ -724,8 +716,7 @@ definitions: type: object required: - amount_mib - description: - Balloon device descriptor. + description: Balloon device descriptor. 
properties: amount_mib: type: integer @@ -733,8 +724,7 @@ definitions: BalloonStats: type: object - description: - Describes the balloon device statistics. + description: Describes the balloon device statistics. required: - target_pages - actual_pages @@ -798,8 +788,7 @@ definitions: type: object required: - stats_polling_interval_s - description: - Update the statistics polling interval, with the first statistics update scheduled immediately. Statistics cannot be turned on/off after boot. + description: Update the statistics polling interval, with the first statistics update scheduled immediately. Statistics cannot be turned on/off after boot. properties: stats_polling_interval_s: type: integer @@ -809,8 +798,7 @@ definitions: type: object required: - kernel_image_path - description: - Boot source descriptor. + description: Boot source descriptor. properties: boot_args: type: string @@ -879,21 +867,18 @@ definitions: type: boolean cache_type: type: string - description: - Represents the caching strategy for the block device. + description: Represents the caching strategy for the block device. enum: ["Unsafe", "Writeback"] default: "Unsafe" # VirtioBlock specific parameters is_read_only: type: boolean - description: - Is block read only. + description: Is block read only. This field is required for virtio-block config and should be omitted for vhost-user-block configuration. path_on_host: type: string - description: - Host level path for the guest drive. + description: Host level path for the guest drive. This field is required for virtio-block config and should be omitted for vhost-user-block configuration. rate_limiter: $ref: "#/definitions/RateLimiter" @@ -909,8 +894,7 @@ definitions: # VhostUserBlock specific parameters socket: type: string - description: - Path to the socket of vhost-user-block backend. + description: Path to the socket of vhost-user-block backend. This field is required for vhost-user-block config should be omitted for virtio-block configuration. Error: @@ -955,8 +939,7 @@ definitions: InstanceActionInfo: type: object - description: - Variant wrapper containing the real action. + description: Variant wrapper containing the real action. required: - action_type properties: @@ -970,8 +953,7 @@ definitions: InstanceInfo: type: object - description: - Describes MicroVM instance information. + description: Describes MicroVM instance information. required: - app_name - id @@ -999,14 +981,13 @@ definitions: Logger: type: object - description: - Describes the configuration option for the logging capability. + description: Describes the configuration option for the logging capability. properties: level: type: string description: Set the level. The possible values are case-insensitive. enum: [Error, Warning, Info, Debug, Trace, Off] - default: Info + default: Debug log_path: type: string description: Path to the named pipe or file for the human readable log output. @@ -1085,8 +1066,7 @@ definitions: Metrics: type: object - description: - Describes the configuration option for the metrics capability. + description: Describes the configuration option for the metrics capability. required: - metrics_path properties: @@ -1096,8 +1076,7 @@ definitions: MmdsConfig: type: object - description: - Defines the MMDS configuration. + description: Defines the MMDS configuration. required: - network_interfaces properties: @@ -1134,13 +1113,11 @@ definitions: MmdsContentsObject: type: object - description: - Describes the contents of MMDS in JSON format. 
+ description: Describes the contents of MMDS in JSON format. NetworkInterface: type: object - description: - Defines a network interface. + description: Defines a network interface. required: - host_dev_name - iface_id @@ -1166,8 +1143,7 @@ definitions: type: string path_on_host: type: string - description: - Host level path for the guest drive. + description: Host level path for the guest drive. This field is optional for virtio-block config and should be omitted for vhost-user-block configuration. rate_limiter: $ref: "#/definitions/RateLimiter" @@ -1233,12 +1209,10 @@ definitions: properties: iface_id: type: string - description: - The name of the interface to modify + description: The name of the interface to modify host_dev_name: type: string - description: - The new host device of the interface + description: The new host device of the interface SnapshotLoadParams: type: object @@ -1250,12 +1224,10 @@ definitions: properties: enable_diff_snapshots: type: boolean - description: - (Deprecated) Enable dirty page tracking to improve space efficiency of diff snapshots + description: (Deprecated) Enable dirty page tracking to improve space efficiency of diff snapshots track_dirty_pages: type: boolean - description: - Enable dirty page tracking to improve space efficiency of diff snapshots + description: Enable dirty page tracking to improve space efficiency of diff snapshots mem_file_path: type: string description: @@ -1273,15 +1245,13 @@ definitions: description: Path to the file that contains the microVM state to be loaded. resume_vm: type: boolean - description: - When set to true, the vm is also resumed if the snapshot load is successful. + description: When set to true, the vm is also resumed if the snapshot load is successful. network_overrides: type: array description: Network host device names to override items: $ref: "#/definitions/NetworkOverride" - TokenBucket: type: object description: @@ -1315,8 +1285,7 @@ definitions: Vm: type: object - description: - Defines the microVM running state. It is especially useful in the snapshotting context. + description: Defines the microVM running state. It is especially useful in the snapshotting context. required: - state properties: @@ -1328,16 +1297,14 @@ definitions: EntropyDevice: type: object - description: - Defines an entropy device. + description: Defines an entropy device. properties: rate_limiter: $ref: "#/definitions/RateLimiter" FirecrackerVersion: type: object - description: - Describes the Firecracker version. + description: Describes the Firecracker version. required: - firecracker_version properties: diff --git a/src/vmm/src/logger/logging.rs b/src/vmm/src/logger/logging.rs index e5cdd8a33d3..7ab240dd3c7 100644 --- a/src/vmm/src/logger/logging.rs +++ b/src/vmm/src/logger/logging.rs @@ -17,7 +17,7 @@ use super::metrics::{IncMetric, METRICS}; /// Default level filter for logger matching the swagger specification /// (`src/firecracker/swagger/firecracker.yaml`). -pub const DEFAULT_LEVEL: log::LevelFilter = log::LevelFilter::Info; +pub const DEFAULT_LEVEL: log::LevelFilter = log::LevelFilter::Debug; /// Default instance id. pub const DEFAULT_INSTANCE_ID: &str = "anonymous-instance"; /// Instance id. 
From 5d20657f65188f6af5b932af439ec2afa829003b Mon Sep 17 00:00:00 2001 From: Jakub Dobry Date: Mon, 1 Dec 2025 20:47:33 +0000 Subject: [PATCH 5/9] add more debug messages --- .../src/devices/virtio/block/virtio/device.rs | 11 ++++- .../virtio/block/virtio/io/async_io.rs | 40 +++++++++++++++++-- .../devices/virtio/block/virtio/persist.rs | 6 +++ .../devices/virtio/block/virtio/request.rs | 12 ++++++ 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 7786c23dd35..b7e26ea4d40 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -475,6 +475,11 @@ impl VirtioBlock { let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; let mut processed = 0_u32; + let pending_before = engine.pending_ops(); + debug!( + "virtio-block '{}' starting async completion drain: pending_ops={}", + self.id, pending_before + ); loop { match engine.pop(&active_state.mem) { @@ -520,8 +525,10 @@ impl VirtioBlock { }); } debug!( - "virtio-block '{}' processed {} io_uring completions", - self.id, processed + "virtio-block '{}' processed {} io_uring completions; pending_ops={} after drain", + self.id, + processed, + engine.pending_ops() ); } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index 843df0b43a5..4b7e4517d4a 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -131,6 +131,8 @@ impl AsyncFileEngine { count: u32, req: PendingRequest, ) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let buf = match mem.get_slice(addr, count as usize) { Ok(slice) => slice.ptr_guard_mut().as_ptr(), Err(err) => { @@ -154,7 +156,17 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} offset={} len={} pending_ops={}", + req_type, + desc_idx, + offset, + count, + self.pending_ops() + ); + Ok(()) } pub fn push_write( @@ -165,6 +177,8 @@ impl AsyncFileEngine { count: u32, req: PendingRequest, ) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let buf = match mem.get_slice(addr, count as usize) { Ok(slice) => slice.ptr_guard_mut().as_ptr(), Err(err) => { @@ -188,10 +202,22 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} offset={} len={} pending_ops={}", + req_type, + desc_idx, + offset, + count, + self.pending_ops() + ); + Ok(()) } pub fn push_flush(&mut self, req: PendingRequest) -> Result<(), RequestError> { + let req_type = req.request_type(); + let desc_idx = req.desc_idx(); let wrapped_user_data = WrappedRequest::new(req); self.ring @@ -199,7 +225,15 @@ impl AsyncFileEngine { .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), - }) + })?; + + debug!( + "AsyncFileEngine queued {:?} request desc_idx={} pending_ops={}", + req_type, + desc_idx, + self.pending_ops() + ); + Ok(()) } pub fn kick_submission_queue(&mut self) -> Result<(), AsyncIoError> { diff --git 
a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 5b544a12c19..99804b634d2 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,6 +16,7 @@ use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK; use crate::devices::virtio::persist::VirtioDeviceState; +use crate::logger::debug; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -69,6 +70,11 @@ impl Persist<'_> for VirtioBlock { type Error = VirtioBlockError; fn save(&self) -> Self::State { + let pending_ops = self.disk.file_engine.pending_async_ops(); + debug!( + "virtio-block '{}' save invoked with pending_async_ops={}", + self.id, pending_ops + ); // Save device state. VirtioBlockState { id: self.id.clone(), diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 8fc83cf43da..97119083720 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -98,6 +98,18 @@ pub struct PendingRequest { } impl PendingRequest { + pub fn request_type(&self) -> RequestType { + self.r#type + } + + pub fn desc_idx(&self) -> u16 { + self.desc_idx + } + + pub fn data_len(&self) -> u32 { + self.data_len + } + fn write_status_and_finish( self, status: &Status, From cec59685ef3c264b5cd8f4a37ab078f036e476b3 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 17:44:42 -0800 Subject: [PATCH 6/9] Cleanup --- .gitignore | 1 + tests/README.md | 28 +- .../functional/test_snapshot_basic.py | 283 +++++++++++++++++- tools/test.sh | 2 +- 4 files changed, 301 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 155e4cbd8a8..1044507cef7 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ test_results/* /resources/linux /resources/x86_64 /resources/aarch64 +aws/ \ No newline at end of file diff --git a/tests/README.md b/tests/README.md index e8ad62d0792..b08ab5746c2 100644 --- a/tests/README.md +++ b/tests/README.md @@ -81,7 +81,7 @@ exercise its programmatic API without the HTTP integration. `Cargo` automatically picks up these tests when `cargo test` is issued. They also count towards code coverage. -To run *only* the Rust integration tests: +To run _only_ the Rust integration tests: ```bash cargo test --test integration_tests --all @@ -92,7 +92,7 @@ Unlike unit tests, Rust integration tests are each run in a separate process. 1. Only the `pub` functions can be called. This is fine, as it allows the VMM to be consumed as a programmatic user would. If any function is necessary but - not `pub`, please consider carefully whether it conceptually *needs* to be in + not `pub`, please consider carefully whether it conceptually _needs_ to be in the public interface before making it so. 1. The correct functioning scenario of the `vmm` implies that it `exit`s with @@ -413,33 +413,33 @@ setting to achieve consistent performance. Please see the `test` section of ## FAQ -`Q1:` *I have a shell script that runs my tests and I don't want to rewrite -it.*\ +`Q1:` _I have a shell script that runs my tests and I don't want to rewrite +it._\ `A1:` Insofar as it makes sense, you should write it as a python test function. 
However, you can always call the script from a shim python test function. You can also add it as a microvm image resource in the s3 bucket (and it will be made available under `microvm.slot.path`) or copy it over to a guest filesystem as part of your test. -`Q2:` *I want to add more tests that I don't want to commit to the Firecracker -repository.*\ +`Q2:` _I want to add more tests that I don't want to commit to the Firecracker +repository._\ `A2:` Before a testrun or test session, just add your test directory under `tests/`. `pytest` will discover all tests in this tree. -`Q3:` *I want to have my own test fixtures, and not commit them in the repo.*\ +`Q3:` _I want to have my own test fixtures, and not commit them in the repo._\ `A3:` Add a `conftest.py` file in your test directory, and place your fixtures there. `pytest` will bring them into scope for all your tests. -`Q4:` *I want to use more/other microvm test images, but I don't want to add -them to the common s3 bucket.*\ +`Q4:` _I want to use more/other microvm test images, but I don't want to add +them to the common s3 bucket._\ `A4:` Add your custom images to the `build/img` subdirectory in the Firecracker source tree. This directory is bind-mounted in the container and used as a local image cache. -`Q5:` *How can I get live logger output from the tests?*\ +`Q5:` _How can I get live logger output from the tests?_\ `A5:` Accessing **pytest.ini** will allow you to modify logger settings. -`Q6:` *Is there a way to speed up integration tests execution time?*\ +`Q6:` _Is there a way to speed up integration tests execution time?_\ `A6:` You can narrow down the test selection as described in the **Running** section. For example: @@ -658,3 +658,9 @@ sudo env PYTHONPATH=tests HOME=$HOME ~/.local/bin/ipython3 -i tools/sandbox.py - > [!WARNING] > > **Notice this runs as root!** + +## Repro the freeze by + +```sh +tools/devtool -y test -- -s integration_tests/functional/test_snapshot_basic.py::test_snapshot_with_heavy_async_io +``` diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index c4eac866028..1540a8ce01d 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -19,8 +19,9 @@ import host_tools.drive as drive_tools import host_tools.network as net_tools from framework import utils +from framework.microvm import SnapshotType from framework.properties import global_props -from framework.utils import check_filesystem, check_output +from framework.utils import Timeout, check_filesystem, check_output from framework.utils_vsock import ( ECHO_SERVER_PORT, VSOCK_UDS_PATH, @@ -586,3 +587,283 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): rename_interfaces={iface_override.dev_name: iface_override.tap_name}, resume=True, ) + + +@pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) +@pytest.mark.parametrize("pci_enabled", [False]) +@pytest.mark.parametrize("iteration", range(100)) # Run many iterations to catch non-zero drain +def test_snapshot_with_heavy_async_io( + microvm_factory, guest_kernel_linux_6_1, rootfs, snapshot_type, pci_enabled, iteration +): + """ + Test snapshot with heavy filesystem I/O using async/io_uring engine. 
+ + This test verifies that async I/O operations in-flight during snapshot + are properly completed and their completion information is written to + guest memory (used ring) so that after restore, the guest driver sees + the completions and doesn't freeze. + + CRITICAL: The test restores from the snapshot IMMEDIATELY after creation. + In the error case (if async I/O completions weren't written to guest memory + during prepare_save()), the guest virtio driver will be stuck waiting for + completions that will never come, causing the VM to freeze. This test + detects that freeze by attempting to run a command immediately after restore. + + The test focuses on freeze detection - if async I/O completions aren't written + to guest memory during snapshot, the VM will freeze after restore. + + The test: + 1. Configures VM with async io_engine (kernel 6.1 only) + 2. Performs heavy async write operations (no sync) on the filesystem + 3. Creates a snapshot IMMEDIATELY while I/O operations are still in-flight + 4. Restores from the snapshot IMMEDIATELY after creation + 5. Verifies VM is responsive (freeze detection - will timeout if frozen) + 6. Verifies filesystem integrity and that all I/O completed correctly + """ + logger = logging.getLogger("snapshot_heavy_async_io") + + print("=" * 80) + print(f"Starting iteration {iteration + 1}/100 - Testing for non-zero async I/O drain") + print("=" * 80) + + # Build VM with kernel 6.1 only + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) + # Enable Trace-level logging to see ALL async I/O drain operations during snapshot + # Trace level shows the most detailed device operations including block device async I/O + # The debug! macros in async_io.rs (like drain_and_flush) require Debug or Trace level + vm.spawn(log_level="Trace", log_show_level=True, log_show_origin=True) + vm.basic_config( + vcpu_count=2, + mem_size_mib=1024, + # track_dirty_pages=snapshot_type.needs_dirty_page_tracking, + rootfs_io_engine="Async", # Use async/io_uring engine + ) + vm.add_net_iface() + vm.start() + + logger.info("Starting heavy write I/O workload on guest filesystem...") + + # Perform heavy WRITE operations before snapshot + # Writes are more likely to be async and non-blocking, generating many + # in-flight async I/O requests that will need completion info written to memory + # We want to maximize the chance of having pending_ops > 0 during drain + write_io_script = """ + # Create a test directory + mkdir -p /tmp/io_test + cd /tmp/io_test + + # Strategy: Fire THOUSANDS of small writes VERY quickly to maximize io_uring queue depth + # Small writes complete faster but queue more operations + # Goal: Schedule thousands of operations to increase chance of pending_ops > 0 + + # Fire 2000+ very small writes in parallel - these will queue up quickly + # Small block sizes (4k-16k) are more likely to stay queued + for i in $(seq 1 2000); do + dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & + done + + # Fire 1000 medium writes to add more operations to the queue + for i in $(seq 1 1000); do + dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & + done + + # Fire 500 larger writes + for i in $(seq 1 500); do + dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & + done + + # Fire 200 even larger writes + for i in $(seq 1 200); do + dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & + done + + # Use fio with MAXIMUM iodepth to maximize in-flight 
operations + # io_uring queue depth is typically 4096, so we want to fill it up + # Total: 3700+ dd operations + fio operations + if command -v fio >/dev/null 2>&1; then + # Maximum iodepth (512) with many jobs to fill the queue + # Random writes are better than sequential for keeping ops in-flight + fio --name=heavy_write --filename=/tmp/io_test/fio_write_test \ + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write.log >/dev/null 2>&1 & + + # Start multiple fio jobs on different files for even more operations + fio --name=heavy_write2 --filename=/tmp/io_test/fio_write_test2 \ + --rw=randwrite --bs=8k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write2.log >/dev/null 2>&1 & + + fio --name=heavy_write3 --filename=/tmp/io_test/fio_write_test3 \ + --rw=randwrite --bs=16k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write3.log >/dev/null 2>&1 & + + fio --name=heavy_write4 --filename=/tmp/io_test/fio_write_test4 \ + --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ + --iodepth=512 --direct=1 --runtime=30 --time_based \ + --numjobs=64 --group_reporting \ + --output=/tmp/fio_write4.log >/dev/null 2>&1 & + fi + + # Total operations scheduled: ~3700 dd + ~256 fio workers (64*4) = ~3956 operations + """ + + # Execute initial write workload + vm.ssh.run(write_io_script) + # No wait - fire first snapshot immediately to catch operations in-flight + + # Perform 4 snapshot-resume cycles + # Strategy: Take snapshots as fast as possible after firing I/O + # This keeps operations overlapping across snapshots so each has pending_ops > 0 + NUM_SNAPSHOTS = 4 + current_vm = vm + snapshots = [] + all_pending_ops = {} # Track pending_ops for each snapshot + queued_ops_count = [] + + for snap_num in range(1, NUM_SNAPSHOTS + 1): + print(f"\n{'='*80}") + print(f"Iteration {iteration + 1}, Snapshot {snap_num}/{NUM_SNAPSHOTS}") + print(f"{'='*80}") + + # Fire ONE or FEW really large operations - simple approach + # Large operations take time to complete, naturally staying in-flight + print(f"Snapshot {snap_num}: Firing large I/O operations...") + heavy_io_script = f""" + cd /tmp/io_test + + # Fire one or few really large operations + # Large operations take time to complete, keeping them in-flight in io_uring queue + # Simple and effective - just big writes that take time + + # One really big write - this will take time and stay in-flight + dd if=/dev/urandom of=snap{snap_num}_huge bs=1M count=1000 oflag=direct 2>/dev/null & + + # A few more large operations for good measure + dd if=/dev/urandom of=snap{snap_num}_large1 bs=1M count=500 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=snap{snap_num}_large2 bs=1M count=500 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=snap{snap_num}_large3 bs=1M count=500 oflag=direct 2>/dev/null & + """ + current_vm.ssh.run(heavy_io_script) + + # Short wait - just enough for operations to be submitted to io_uring + # Large operations will take time to complete, staying in-flight + time.sleep(0.01) # 10ms - just enough for submission + + print(f"Snapshot {snap_num}: Creating snapshot immediately (operations still in-flight)...") + snapshot = current_vm.make_snapshot(snapshot_type) + snapshots.append(snapshot) + + # Minimal wait for logs to flush + time.sleep(0.05) + + # Parse 
logs for pending_ops for EVERY snapshot + pending_ops_during_drain = None + if current_vm.log_file and current_vm.log_file.exists(): + try: + log_data = current_vm.log_data + log_lines = log_data.splitlines() + + # Parse the drain messages to extract pending_ops + for line in log_lines: + # Look for: "AsyncFileEngine queued ... request ... pending_ops=X" + if "AsyncFileEngine queued" in line and "pending_ops=" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + queued_ops_count.append(int(match.group(1))) + + # Look for: "AsyncFileEngine draining: pending_ops=X discard_cqes=..." + # We want to find the MOST RECENT drain message for this snapshot + if "AsyncFileEngine draining:" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + pending_ops_during_drain = int(match.group(1)) + print(f"Snapshot {snap_num}: Found drain start: pending_ops={pending_ops_during_drain}") + except Exception as e: + print(f"ERROR: Failed to parse log file: {e}") + + # Store pending_ops for this snapshot + all_pending_ops[snap_num] = pending_ops_during_drain + + # Kill current VM before restoring + current_vm.kill() + + # Restore immediately and fire next I/O as fast as possible + print(f"Snapshot {snap_num}: Restoring from snapshot IMMEDIATELY...") + restored_vm = microvm_factory.build_from_snapshot(snapshot) + + # Verify VM is responsive (freeze detection) - do this quickly + print(f"Snapshot {snap_num}: Verifying VM is responsive (freeze detection)...") + print(f"Snapshot {snap_num}: Waiting up to 30 seconds for VM to respond...") + try: + # Use a timeout to detect freeze - if VM is frozen, this will timeout + # The default SSH timeout is 60s, but we want to fail faster for freeze detection + with Timeout(30): + restored_vm.ssh.check_output("true") + print(f"Snapshot {snap_num}: ✓ VM is responsive - no freeze detected") + + # If not the last snapshot, immediately prepare for next snapshot + # This keeps operations overlapping across snapshots + if snap_num < NUM_SNAPSHOTS: + print(f"Snapshot {snap_num}: Ready for next snapshot (operations may still be running)...") + + # Report findings for this snapshot + print(f"\nSnapshot {snap_num} Results:") + print(f" pending_ops during drain: {pending_ops_during_drain}") + + if pending_ops_during_drain is not None and pending_ops_during_drain > 0: + print(f" *** NON-ZERO DRAIN: pending_ops={pending_ops_during_drain} ***") + elif pending_ops_during_drain == 0: + print(f" WARNING: pending_ops=0 (operations completed before drain)") + else: + print(f" WARNING: Could not parse pending_ops from logs") + + # Report summary for last snapshot + if snap_num == NUM_SNAPSHOTS: + print(f"\n{'='*80}") + print(f"Summary for all {NUM_SNAPSHOTS} snapshots:") + for s in range(1, NUM_SNAPSHOTS + 1): + ops = all_pending_ops.get(s, "unknown") + status = "✓" if (isinstance(ops, int) and ops > 0) else "✗" + print(f" Snapshot {s}: pending_ops={ops} {status}") + + if queued_ops_count: + max_queued = max(queued_ops_count) + min_queued = min(queued_ops_count) + print(f"\nOperation counts from FC logs:") + print(f" Max pending_ops seen in logs: {max_queued}") + print(f" Min pending_ops seen in logs: {min_queued}") + print(f" Sample of queued ops counts: {queued_ops_count[-10:]}") + + # Check if we got non-zero drain in any snapshot + non_zero_snapshots = [s for s, ops in all_pending_ops.items() + if isinstance(ops, int) and ops > 0] + if non_zero_snapshots: + print(f"\n{'='*80}") + print(f"SUCCESS: Found non-zero drain in snapshots: {non_zero_snapshots}") + 
print(f"All snapshots with non-zero drain resumed correctly - no freeze!") + print(f"This proves that non-zero drain with proper completion handling works correctly.") + print(f"{'='*80}\n") + else: + print(f"\nNo non-zero drain found in any snapshot - continuing to next iteration...") + print(f"{'='*80}") + + except Exception as e: + print(f"\n{'='*80}") + print(f"FAILURE: Snapshot {snap_num} - VM FROZE after restore!") + print(f"pending_ops during drain: {pending_ops_during_drain}") + print(f"Error: {e}") + print(f"{'='*80}\n") + restored_vm.kill() + raise + + # For next iteration, use the restored VM + if snap_num < NUM_SNAPSHOTS: + current_vm = restored_vm + else: + # Last snapshot - cleanup + restored_vm.kill() diff --git a/tools/test.sh b/tools/test.sh index 0bf67a65666..5b5c80159d3 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -47,7 +47,7 @@ export PYTEST_ADDOPTS="${PYTEST_ADDOPTS:-} --pdbcls=IPython.terminal.debugger:Te # if the tests failed and we are running in CI, print some disk usage stats # to help troubleshooting -if [ $ret != 0 ] && [ "$BUILDKITE" == "true" ]; then +if [ $ret != 0 ] && [ "${BUILDKITE:-false}" == "true" ]; then df -ih df -h du -h / 2>/dev/null |sort -h |tail -32 From d056e696cd677a0cfdad494a8cb0db359b9daed9 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 22:53:01 -0800 Subject: [PATCH 7/9] Improve test --- .../functional/test_snapshot_basic.py | 271 ++++-------------- 1 file changed, 48 insertions(+), 223 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 1540a8ce01d..2a425811a0e 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -591,279 +591,104 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) @pytest.mark.parametrize("pci_enabled", [False]) -@pytest.mark.parametrize("iteration", range(100)) # Run many iterations to catch non-zero drain +@pytest.mark.parametrize("iteration", range(100)) def test_snapshot_with_heavy_async_io( - microvm_factory, guest_kernel_linux_6_1, rootfs, snapshot_type, pci_enabled, iteration + microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration ): - """ - Test snapshot with heavy filesystem I/O using async/io_uring engine. - - This test verifies that async I/O operations in-flight during snapshot - are properly completed and their completion information is written to - guest memory (used ring) so that after restore, the guest driver sees - the completions and doesn't freeze. - - CRITICAL: The test restores from the snapshot IMMEDIATELY after creation. - In the error case (if async I/O completions weren't written to guest memory - during prepare_save()), the guest virtio driver will be stuck waiting for - completions that will never come, causing the VM to freeze. This test - detects that freeze by attempting to run a command immediately after restore. - - The test focuses on freeze detection - if async I/O completions aren't written - to guest memory during snapshot, the VM will freeze after restore. - - The test: - 1. Configures VM with async io_engine (kernel 6.1 only) - 2. Performs heavy async write operations (no sync) on the filesystem - 3. Creates a snapshot IMMEDIATELY while I/O operations are still in-flight - 4. Restores from the snapshot IMMEDIATELY after creation - 5. 
Verifies VM is responsive (freeze detection - will timeout if frozen) - 6. Verifies filesystem integrity and that all I/O completed correctly - """ - logger = logging.getLogger("snapshot_heavy_async_io") - print("=" * 80) print(f"Starting iteration {iteration + 1}/100 - Testing for non-zero async I/O drain") print("=" * 80) - # Build VM with kernel 6.1 only - vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) - # Enable Trace-level logging to see ALL async I/O drain operations during snapshot - # Trace level shows the most detailed device operations including block device async I/O - # The debug! macros in async_io.rs (like drain_and_flush) require Debug or Trace level + # CHECK: if we set `rootfs` to just "rootfs" (readonly) it also freezes on async. + # By default though, even the rw rootfs has only like 37MB of free space. + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs_rw, pci=pci_enabled) + vm.spawn(log_level="Trace", log_show_level=True, log_show_origin=True) vm.basic_config( vcpu_count=2, mem_size_mib=1024, - # track_dirty_pages=snapshot_type.needs_dirty_page_tracking, - rootfs_io_engine="Async", # Use async/io_uring engine + # CHECK: If we set this to "Sync", it will not freeze. + rootfs_io_engine="Async", ) vm.add_net_iface() vm.start() - logger.info("Starting heavy write I/O workload on guest filesystem...") - - # Perform heavy WRITE operations before snapshot - # Writes are more likely to be async and non-blocking, generating many - # in-flight async I/O requests that will need completion info written to memory - # We want to maximize the chance of having pending_ops > 0 during drain write_io_script = """ # Create a test directory - mkdir -p /tmp/io_test - cd /tmp/io_test + mkdir -p /root/io_test + cd /root/io_test - # Strategy: Fire THOUSANDS of small writes VERY quickly to maximize io_uring queue depth - # Small writes complete faster but queue more operations - # Goal: Schedule thousands of operations to increase chance of pending_ops > 0 - - # Fire 2000+ very small writes in parallel - these will queue up quickly - # Small block sizes (4k-16k) are more likely to stay queued for i in $(seq 1 2000); do dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & done - # Fire 1000 medium writes to add more operations to the queue for i in $(seq 1 1000); do dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & done - # Fire 500 larger writes for i in $(seq 1 500); do dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & done - # Fire 200 even larger writes for i in $(seq 1 200); do dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & done - # Use fio with MAXIMUM iodepth to maximize in-flight operations - # io_uring queue depth is typically 4096, so we want to fill it up - # Total: 3700+ dd operations + fio operations if command -v fio >/dev/null 2>&1; then - # Maximum iodepth (512) with many jobs to fill the queue - # Random writes are better than sequential for keeping ops in-flight - fio --name=heavy_write --filename=/tmp/io_test/fio_write_test \ + fio --name=heavy_write --filename=/root/io_test/fio_write_test \ --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write.log >/dev/null 2>&1 & - # Start multiple fio jobs on different files for even more operations 
- fio --name=heavy_write2 --filename=/tmp/io_test/fio_write_test2 \ + fio --name=heavy_write2 --filename=/root/io_test/fio_write_test2 \ --rw=randwrite --bs=8k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write2.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write2.log >/dev/null 2>&1 & - fio --name=heavy_write3 --filename=/tmp/io_test/fio_write_test3 \ + fio --name=heavy_write3 --filename=/root/io_test/fio_write_test3 \ --rw=randwrite --bs=16k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write3.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write3.log >/dev/null 2>&1 & - fio --name=heavy_write4 --filename=/tmp/io_test/fio_write_test4 \ + fio --name=heavy_write4 --filename=/root/io_test/fio_write_test4 \ --rw=randwrite --bs=4k --size=2G --ioengine=libaio \ --iodepth=512 --direct=1 --runtime=30 --time_based \ - --numjobs=64 --group_reporting \ - --output=/tmp/fio_write4.log >/dev/null 2>&1 & + --numjobs=2 --group_reporting \ + --output=/root/fio_write4.log >/dev/null 2>&1 & fi - - # Total operations scheduled: ~3700 dd + ~256 fio workers (64*4) = ~3956 operations """ - # Execute initial write workload vm.ssh.run(write_io_script) - # No wait - fire first snapshot immediately to catch operations in-flight - - # Perform 4 snapshot-resume cycles - # Strategy: Take snapshots as fast as possible after firing I/O - # This keeps operations overlapping across snapshots so each has pending_ops > 0 - NUM_SNAPSHOTS = 4 - current_vm = vm - snapshots = [] - all_pending_ops = {} # Track pending_ops for each snapshot - queued_ops_count = [] + + # if set to cca 2 seconds it stops freezing for Async. 
+ time.sleep(0.01) - for snap_num in range(1, NUM_SNAPSHOTS + 1): - print(f"\n{'='*80}") - print(f"Iteration {iteration + 1}, Snapshot {snap_num}/{NUM_SNAPSHOTS}") - print(f"{'='*80}") - - # Fire ONE or FEW really large operations - simple approach - # Large operations take time to complete, naturally staying in-flight - print(f"Snapshot {snap_num}: Firing large I/O operations...") - heavy_io_script = f""" - cd /tmp/io_test - - # Fire one or few really large operations - # Large operations take time to complete, keeping them in-flight in io_uring queue - # Simple and effective - just big writes that take time - - # One really big write - this will take time and stay in-flight - dd if=/dev/urandom of=snap{snap_num}_huge bs=1M count=1000 oflag=direct 2>/dev/null & - - # A few more large operations for good measure - dd if=/dev/urandom of=snap{snap_num}_large1 bs=1M count=500 oflag=direct 2>/dev/null & - dd if=/dev/urandom of=snap{snap_num}_large2 bs=1M count=500 oflag=direct 2>/dev/null & - dd if=/dev/urandom of=snap{snap_num}_large3 bs=1M count=500 oflag=direct 2>/dev/null & - """ - current_vm.ssh.run(heavy_io_script) - - # Short wait - just enough for operations to be submitted to io_uring - # Large operations will take time to complete, staying in-flight - time.sleep(0.01) # 10ms - just enough for submission - - print(f"Snapshot {snap_num}: Creating snapshot immediately (operations still in-flight)...") - snapshot = current_vm.make_snapshot(snapshot_type) - snapshots.append(snapshot) - - # Minimal wait for logs to flush - time.sleep(0.05) - - # Parse logs for pending_ops for EVERY snapshot - pending_ops_during_drain = None - if current_vm.log_file and current_vm.log_file.exists(): - try: - log_data = current_vm.log_data - log_lines = log_data.splitlines() - - # Parse the drain messages to extract pending_ops - for line in log_lines: - # Look for: "AsyncFileEngine queued ... request ... pending_ops=X" - if "AsyncFileEngine queued" in line and "pending_ops=" in line: - match = re.search(r'pending_ops=(\d+)', line) - if match: - queued_ops_count.append(int(match.group(1))) - - # Look for: "AsyncFileEngine draining: pending_ops=X discard_cqes=..." 
- # We want to find the MOST RECENT drain message for this snapshot - if "AsyncFileEngine draining:" in line: - match = re.search(r'pending_ops=(\d+)', line) - if match: - pending_ops_during_drain = int(match.group(1)) - print(f"Snapshot {snap_num}: Found drain start: pending_ops={pending_ops_during_drain}") - except Exception as e: - print(f"ERROR: Failed to parse log file: {e}") - - # Store pending_ops for this snapshot - all_pending_ops[snap_num] = pending_ops_during_drain - - # Kill current VM before restoring - current_vm.kill() - - # Restore immediately and fire next I/O as fast as possible - print(f"Snapshot {snap_num}: Restoring from snapshot IMMEDIATELY...") - restored_vm = microvm_factory.build_from_snapshot(snapshot) - - # Verify VM is responsive (freeze detection) - do this quickly - print(f"Snapshot {snap_num}: Verifying VM is responsive (freeze detection)...") - print(f"Snapshot {snap_num}: Waiting up to 30 seconds for VM to respond...") + snapshot = vm.make_snapshot(snapshot_type) + + time.sleep(0.05) + + if vm.log_file and vm.log_file.exists(): try: - # Use a timeout to detect freeze - if VM is frozen, this will timeout - # The default SSH timeout is 60s, but we want to fail faster for freeze detection - with Timeout(30): - restored_vm.ssh.check_output("true") - print(f"Snapshot {snap_num}: ✓ VM is responsive - no freeze detected") - - # If not the last snapshot, immediately prepare for next snapshot - # This keeps operations overlapping across snapshots - if snap_num < NUM_SNAPSHOTS: - print(f"Snapshot {snap_num}: Ready for next snapshot (operations may still be running)...") - - # Report findings for this snapshot - print(f"\nSnapshot {snap_num} Results:") - print(f" pending_ops during drain: {pending_ops_during_drain}") - - if pending_ops_during_drain is not None and pending_ops_during_drain > 0: - print(f" *** NON-ZERO DRAIN: pending_ops={pending_ops_during_drain} ***") - elif pending_ops_during_drain == 0: - print(f" WARNING: pending_ops=0 (operations completed before drain)") - else: - print(f" WARNING: Could not parse pending_ops from logs") - - # Report summary for last snapshot - if snap_num == NUM_SNAPSHOTS: - print(f"\n{'='*80}") - print(f"Summary for all {NUM_SNAPSHOTS} snapshots:") - for s in range(1, NUM_SNAPSHOTS + 1): - ops = all_pending_ops.get(s, "unknown") - status = "✓" if (isinstance(ops, int) and ops > 0) else "✗" - print(f" Snapshot {s}: pending_ops={ops} {status}") - - if queued_ops_count: - max_queued = max(queued_ops_count) - min_queued = min(queued_ops_count) - print(f"\nOperation counts from FC logs:") - print(f" Max pending_ops seen in logs: {max_queued}") - print(f" Min pending_ops seen in logs: {min_queued}") - print(f" Sample of queued ops counts: {queued_ops_count[-10:]}") - - # Check if we got non-zero drain in any snapshot - non_zero_snapshots = [s for s, ops in all_pending_ops.items() - if isinstance(ops, int) and ops > 0] - if non_zero_snapshots: - print(f"\n{'='*80}") - print(f"SUCCESS: Found non-zero drain in snapshots: {non_zero_snapshots}") - print(f"All snapshots with non-zero drain resumed correctly - no freeze!") - print(f"This proves that non-zero drain with proper completion handling works correctly.") - print(f"{'='*80}\n") - else: - print(f"\nNo non-zero drain found in any snapshot - continuing to next iteration...") - print(f"{'='*80}") + log_data = vm.log_data + log_lines = log_data.splitlines() + for line in log_lines: + if "AsyncFileEngine draining:" in line: + match = re.search(r'pending_ops=(\d+)', line) + if match: + 
+                            pending_ops_during_drain = int(match.group(1))
+                            print(f"DRAIN: pending_ops={pending_ops_during_drain}")
+
         except Exception as e:
-            print(f"\n{'='*80}")
-            print(f"FAILURE: Snapshot {snap_num} - VM FROZE after restore!")
-            print(f"pending_ops during drain: {pending_ops_during_drain}")
-            print(f"Error: {e}")
-            print(f"{'='*80}\n")
-            restored_vm.kill()
-            raise
-
-    # For next iteration, use the restored VM
-    if snap_num < NUM_SNAPSHOTS:
-        current_vm = restored_vm
-    else:
-        # Last snapshot - cleanup
-        restored_vm.kill()
+            print(f"ERROR: Failed to parse log file: {e}")
+
+    vm.kill()
+
+    print("Restoring from snapshot...")
+    with Timeout(30):
+        restored_vm = microvm_factory.build_from_snapshot(snapshot)
+
+    print("VM resumed - no freeze detected")

From 4f428d2aa49b4845b0cac6eefc5b076432a54971 Mon Sep 17 00:00:00 2001
From: ValentaTomas
Date: Mon, 1 Dec 2025 23:36:53 -0800
Subject: [PATCH 8/9] Resize the rootfs

---
 tests/conftest.py                          | 34 +++++++++++++++++--
 .../functional/test_snapshot_basic.py      | 31 ++++++++++++++---
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 96ee285d192..87ded8df2cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,6 +26,7 @@
 import shutil
 import sys
 import tempfile
+import uuid
 from pathlib import Path
 
 import pytest
@@ -502,10 +503,37 @@ def rootfs():
     return disks("ubuntu-24.04.squashfs")[0]
 
 
-@pytest.fixture
+@pytest.fixture(scope="function")
 def rootfs_rw():
-    """Return an Ubuntu 24.04 ext4 rootfs"""
-    return disks("ubuntu-24.04.ext4")[0]
+    """Return an Ubuntu 24.04 ext4 rootfs, resized to 20GB for each test"""
+    original_rootfs = disks("ubuntu-24.04.ext4")[0]
+    original_ssh_key = original_rootfs.with_suffix(".id_rsa")
+
+    # Create a unique resized copy for this test
+    unique_id = str(uuid.uuid4())[:8]
+    resized_rootfs_path = Path(tempfile.gettempdir()) / f"rootfs_rw_{unique_id}.ext4"
+    resized_ssh_key_path = resized_rootfs_path.with_suffix(".id_rsa")
+
+    # Copy the original rootfs
+    shutil.copyfile(original_rootfs, resized_rootfs_path)
+
+    # Copy the SSH key
+    if original_ssh_key.exists():
+        shutil.copyfile(original_ssh_key, resized_ssh_key_path)
+
+    # Resize to 20GB
+    new_size_bytes = 20 * 1024 * 1024 * 1024
+    os.truncate(resized_rootfs_path, new_size_bytes)
+    utils.check_output(f"e2fsck -f -y {resized_rootfs_path}")
+    utils.check_output(f"resize2fs {resized_rootfs_path}")
+
+    yield resized_rootfs_path
+
+    # Cleanup: remove the resized rootfs file and SSH key after the test
+    if resized_rootfs_path.exists():
+        resized_rootfs_path.unlink()
+    if resized_ssh_key_path.exists():
+        resized_ssh_key_path.unlink()
 
 
 @pytest.fixture
diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py
index 2a425811a0e..60565349927 100644
--- a/tests/integration_tests/functional/test_snapshot_basic.py
+++ b/tests/integration_tests/functional/test_snapshot_basic.py
@@ -591,7 +591,7 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory):
 
 @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL])
 @pytest.mark.parametrize("pci_enabled", [False])
-@pytest.mark.parametrize("iteration", range(100))
+@pytest.mark.parametrize("iteration", range(4))
 def test_snapshot_with_heavy_async_io(
     microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration
 ):
@@ -613,25 +613,29 @@
     vm.add_net_iface()
     vm.start()
 
+    # Check free space on sandbox start
+    _, free_space_start, _ = vm.ssh.check_output("df -h / | awk 'NR==2 {print $4}'")
vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space on sandbox start: {free_space_start.strip()}") + write_io_script = """ # Create a test directory mkdir -p /root/io_test cd /root/io_test for i in $(seq 1 2000); do - dd if=/dev/urandom of=test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/test_file_$i bs=8k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 1000); do - dd if=/dev/urandom of=medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/medium_file_$i bs=16k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 500); do - dd if=/dev/urandom of=large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/large_file_$i bs=32k count=1 oflag=direct 2>/dev/null & done for i in $(seq 1 200); do - dd if=/dev/urandom of=xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & + dd if=/dev/urandom of=/root/io_test/xlarge_file_$i bs=64k count=1 oflag=direct 2>/dev/null & done if command -v fio >/dev/null 2>&1; then @@ -691,4 +695,21 @@ def test_snapshot_with_heavy_async_io( with Timeout(30): restored_vm = microvm_factory.build_from_snapshot(snapshot) + # Check free space after resume + _, free_space_after_resume, _ = restored_vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space after resume: {free_space_after_resume.strip()}") + + # Delete all files/dirs created during the test + cleanup_script = """ + rm -rf /root/io_test + rm -f /root/fio_write*.log + """ + restored_vm.ssh.run(cleanup_script) + + # Check free space after cleanup + _, free_space_after_cleanup, _ = restored_vm.ssh.check_output("df -h / | awk 'NR==2 {{print $4}}'") + print(f"Free space after cleanup: {free_space_after_cleanup.strip()}") + print(f"VM resumed - no freeze detected") + + restored_vm.kill() From 2e504f9241c75d423e52f3dff99fa63bf86cfaaf Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Mon, 1 Dec 2025 23:43:46 -0800 Subject: [PATCH 9/9] Change iterations --- tests/integration_tests/functional/test_snapshot_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 60565349927..f5ebd5e0ad7 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -591,7 +591,7 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): @pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL]) @pytest.mark.parametrize("pci_enabled", [False]) -@pytest.mark.parametrize("iteration", range(4)) +@pytest.mark.parametrize("iteration", range(100)) def test_snapshot_with_heavy_async_io( microvm_factory, guest_kernel_linux_6_1, rootfs_rw, snapshot_type, pci_enabled, iteration ):