From 0b9ff9909e0b6738f656ef7c2ce922d252b02ac6 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 12:05:36 +0000 Subject: [PATCH 01/23] refactor(test): stop hardcoding uffd socket path everywhere Instead use the property of the `UffdHandler` object that `spawn_uffd_handler` returns. Signed-off-by: Patrick Roy --- tests/framework/utils_uffd.py | 6 +++--- .../integration_tests/functional/test_uffd.py | 12 +++++++----- .../performance/test_huge_pages.py | 18 +++++++++++------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/framework/utils_uffd.py b/tests/framework/utils_uffd.py index 9de89809a20..690eb423573 100644 --- a/tests/framework/utils_uffd.py +++ b/tests/framework/utils_uffd.py @@ -21,7 +21,7 @@ def __init__(self, name, socket_path, mem_file, chroot_path, log_file_name): """Instantiate the handler process with arguments.""" self._proc = None self._handler_name = name - self._socket_path = socket_path + self.socket_path = socket_path self._mem_file = mem_file self._chroot = chroot_path self._log_file = log_file_name @@ -35,7 +35,7 @@ def spawn(self, uid, gid): chroot_log_file = Path("/") / self._log_file with open(chroot_log_file, "w", encoding="utf-8") as logfile: - args = [f"/{self._handler_name}", self._socket_path, self._mem_file] + args = [f"/{self._handler_name}", self.socket_path, self._mem_file] self._proc = subprocess.Popen( args, stdout=logfile, stderr=subprocess.STDOUT ) @@ -48,7 +48,7 @@ def spawn(self, uid, gid): # The page fault handler will create the socket path with root rights. # Change rights to the jailer's. 
- os.chown(self._socket_path, uid, gid) + os.chown(self.socket_path, uid, gid) @property def proc(self): diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index cb5ac0c44c9..35b03d7aa34 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -9,7 +9,7 @@ import requests from framework.utils import Timeout, check_output -from framework.utils_uffd import SOCKET_PATH, spawn_pf_handler, uffd_handler +from framework.utils_uffd import spawn_pf_handler, uffd_handler @pytest.fixture(scope="function", name="snapshot") @@ -92,9 +92,9 @@ def test_valid_handler(uvm_plain, snapshot): vm.spawn() # Spawn page fault handler process. - _pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) - vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + vm.restore_from_snapshot(snapshot, resume=True, uffd_path=pf_handler.socket_path) # Inflate balloon. vm.api.balloon.patch(amount_mib=200) @@ -125,13 +125,15 @@ def test_malicious_handler(uvm_plain, snapshot): vm.spawn() # Spawn page fault handler process. - _pf_handler = spawn_pf_handler(vm, uffd_handler("malicious"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("malicious"), snapshot.mem) # We expect Firecracker to freeze while resuming from a snapshot # due to the malicious handler's unavailability. 
try: with Timeout(seconds=30): - vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + vm.restore_from_snapshot( + snapshot, resume=True, uffd_path=pf_handler.socket_path + ) assert False, "Firecracker should freeze" except (TimeoutError, requests.exceptions.ReadTimeout): pass diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 65ae2e6fbc2..acfee58139d 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -10,7 +10,7 @@ from framework.microvm import HugePagesConfig from framework.properties import global_props from framework.utils_ftrace import ftrace_events -from framework.utils_uffd import SOCKET_PATH, spawn_pf_handler, uffd_handler +from framework.utils_uffd import spawn_pf_handler, uffd_handler def check_hugetlbfs_in_use(pid: int, allocation_name: str): @@ -93,9 +93,9 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): vm.spawn() # Spawn page fault handler process. - _pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) - vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + vm.restore_from_snapshot(snapshot, resume=True, uffd_path=pf_handler.socket_path) check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") @@ -135,9 +135,11 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): vm.spawn() # Spawn page fault handler process. - _pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot_merged.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot_merged.mem) - vm.restore_from_snapshot(snapshot_merged, resume=True, uffd_path=SOCKET_PATH) + vm.restore_from_snapshot( + snapshot_merged, resume=True, uffd_path=pf_handler.socket_path + ) # Verify if the restored microvm works. 
@@ -193,10 +195,12 @@ def test_ept_violation_count( vm.spawn() # Spawn page fault handler process. - _pf_handler = spawn_pf_handler(vm, uffd_handler("fault_all"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("fault_all"), snapshot.mem) with ftrace_events("kvm:*"): - vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + vm.restore_from_snapshot( + snapshot, resume=True, uffd_path=pf_handler.socket_path + ) # Verify if guest can run commands, and also wake up the fast page fault helper to trigger page faults. vm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}") From f0207f4ff3d1b50195bea40d441857ccc2ca54d8 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 27 Feb 2025 11:42:47 +0000 Subject: [PATCH 02/23] test: have fast_page_fault_helper print time memset took The memset causes page faults in the guest which will be handled by UFFD. By timing the memset from inside the guest, we can measure the latency of these page faults as observed by an application running inside the guest. Signed-off-by: Patrick Roy --- .../usr/local/bin/fast_page_fault_helper.c | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/resources/overlay/usr/local/bin/fast_page_fault_helper.c b/resources/overlay/usr/local/bin/fast_page_fault_helper.c index 591ac3b9612..96e565f0c3f 100644 --- a/resources/overlay/usr/local/bin/fast_page_fault_helper.c +++ b/resources/overlay/usr/local/bin/fast_page_fault_helper.c @@ -10,17 +10,23 @@ // This way, the `memset` will trigger a fast page fault for every page in // the memory region. 
-#include // perror +#include // perror, fopen, fprintf #include // sigwait and friends #include // memset #include // mmap +#include // clock_gettime +#include // open #define MEM_SIZE_MIB (128 * 1024 * 1024) +#define NANOS_PER_SEC 1000000000 -int main(int argc, char *const argv[]) { +int main() { sigset_t set; int signal; void *ptr; + struct timespec start, end; + long duration_nanos; + FILE *out_file; sigemptyset(&set); if (sigaddset(&set, SIGUSR1) == -1) { @@ -43,7 +49,23 @@ int main(int argc, char *const argv[]) { sigwait(&set, &signal); + clock_gettime(CLOCK_BOOTTIME, &start); memset(ptr, 2, MEM_SIZE_MIB); + clock_gettime(CLOCK_BOOTTIME, &end); + + duration_nanos = (end.tv_sec - start.tv_sec) * NANOS_PER_SEC + end.tv_nsec - start.tv_nsec; + + out_file = fopen("/tmp/fast_page_fault_helper.out", "w"); + if (out_file == NULL) { + perror("fopen"); + return 1; + } + + fprintf(out_file, "%ld", duration_nanos); + if (fclose(out_file)) { + perror("fclose"); + return 1; + } return 0; } \ No newline at end of file From b04661cc68cc9233ed146a473a3edb5bc2f2c3b2 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 12:48:54 +0000 Subject: [PATCH 03/23] refactor(uffd): Make example UFFD fast in absense of balloon device Make the example UFFD handler use a HashSet to only store address ranges that were actually removed due to expanding memory balloon. This eliminates the HashMap that previously held entries for every single page in guest memory, which was causing the example uffd handler to slow to a crawl (taking about 40s to fault in 128MB of memory). This is possible because the UFFD handler doesn't actually care about the 4 states a page can supposedly be in, it only needs to know whether a page needs to be served from the memory snapshot file, or zeroed out. 
Since zero-ing out is the rarer of the two (only happens if a balloon device caused some previously faulted in memory to be madvise'd away), we only store information about pages that we got remove messages about, and simply assume every other page needs to be served from the snapshot file. This means that in the happy path of "no balloon device", we never need to store any information about the memory at all. This one simple optimization brings the fault time down to 500ms (from 40s) for the 128MB scenario, which makes the example handler usable for performance tests for page fault latencies. Signed-off-by: Patrick Roy --- .../examples/uffd/fault_all_handler.rs | 3 +- src/firecracker/examples/uffd/uffd_utils.rs | 121 +++++------------- .../examples/uffd/valid_handler.rs | 7 +- 3 files changed, 40 insertions(+), 91 deletions(-) diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index cfeaa099236..d0f127a8ce1 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -35,8 +35,7 @@ fn main() { match event { userfaultfd::Event::Pagefault { .. 
} => { for region in uffd_handler.mem_regions.clone() { - uffd_handler - .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size); + uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size); } } _ => panic!("Unexpected event on userfaultfd"), diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index a2f7879f591..dcc05151967 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -4,7 +4,7 @@ // Not everything is used by both binaries #![allow(dead_code)] -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; use std::os::unix::net::UnixStream; @@ -34,26 +34,20 @@ pub struct GuestRegionUffdMapping { pub page_size_kib: usize, } -#[derive(Debug, Clone, Copy)] -pub enum MemPageState { - Uninitialized, - FromFile, - Removed, - Anonymous, -} - -#[derive(Debug, Clone)] -pub struct MemRegion { - pub mapping: GuestRegionUffdMapping, - page_states: HashMap, +impl GuestRegionUffdMapping { + fn contains(&self, fault_page_addr: u64) -> bool { + fault_page_addr >= self.base_host_virt_addr + && fault_page_addr < self.base_host_virt_addr + self.size as u64 + } } #[derive(Debug)] pub struct UffdHandler { - pub mem_regions: Vec, + pub mem_regions: Vec, pub page_size: usize, backing_buffer: *const u8, uffd: Uffd, + removed_pages: HashSet, } impl UffdHandler { @@ -92,13 +86,12 @@ impl UffdHandler { let uffd = unsafe { Uffd::from_raw_fd(file.into_raw_fd()) }; - let mem_regions = create_mem_regions(&mappings, page_size); - Self { - mem_regions, + mem_regions: mappings, page_size, backing_buffer, uffd, + removed_pages: HashSet::new(), } } @@ -106,13 +99,12 @@ impl UffdHandler { self.uffd.read_event() } - pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: MemPageState) { - for region in self.mem_regions.iter_mut() { - for (key, value) in 
region.page_states.iter_mut() { - if key >= &start && key < &end { - *value = state; - } - } + pub fn mark_range_removed(&mut self, start: u64, end: u64) { + let pfn_start = start / self.page_size as u64; + let pfn_end = end / self.page_size as u64; + + for pfn in pfn_start..pfn_end { + self.removed_pages.insert(pfn); } } @@ -120,33 +112,16 @@ impl UffdHandler { // Find the start of the page that the current faulting address belongs to. let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; let fault_page_addr = dst as u64; - - // Get the state of the current faulting page. - for region in self.mem_regions.iter() { - match region.page_states.get(&fault_page_addr) { - // Our simple PF handler has a simple strategy: - // There exist 4 states in which a memory page can be in: - // 1. Uninitialized - page was never touched - // 2. FromFile - the page is populated with content from snapshotted memory file - // 3. Removed - MADV_DONTNEED was called due to balloon inflation - // 4. Anonymous - page was zeroed out -> this implies that more than one page fault - // event was received. 
This can be a consequence of guest reclaiming back its - // memory from the host (through balloon device) - Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { - match self.populate_from_file(region, fault_page_addr, len) { - Some((start, end)) => { - self.update_mem_state_mappings(start, end, MemPageState::FromFile) - } - None => return false, - } - return true; - } - Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { - let (start, end) = self.zero_out(fault_page_addr); - self.update_mem_state_mappings(start, end, MemPageState::Anonymous); - return true; + let fault_pfn = fault_page_addr / self.page_size as u64; + + if self.removed_pages.contains(&fault_pfn) { + self.zero_out(fault_page_addr); + return true; + } else { + for region in self.mem_regions.iter() { + if region.contains(fault_page_addr) { + return self.populate_from_file(region, fault_page_addr, len); } - None => {} } } @@ -156,13 +131,14 @@ impl UffdHandler { ); } - fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> { - let offset = dst - region.mapping.base_host_virt_addr; - let src = self.backing_buffer as u64 + region.mapping.offset + offset; + fn populate_from_file(&self, region: &GuestRegionUffdMapping, dst: u64, len: usize) -> bool { + let offset = dst - region.base_host_virt_addr; + let src = self.backing_buffer as u64 + region.offset + offset; - let ret = unsafe { + unsafe { match self.uffd.copy(src as *const _, dst as *mut _, len, true) { - Ok(value) => value, + // Make sure the UFFD copied some bytes. + Ok(value) => assert!(value > 0), // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD // queue while we're processing `pagefault` events. 
// The weird cast is because the `bytes_copied` field is based on the @@ -172,12 +148,12 @@ impl UffdHandler { Err(Error::PartiallyCopied(bytes_copied)) if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize => { - return None + return false } Err(Error::CopyFailed(errno)) if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST => { - len + () } Err(e) => { panic!("Uffd copy failed: {e:?}"); @@ -185,13 +161,10 @@ impl UffdHandler { } }; - // Make sure the UFFD copied some bytes. - assert!(ret > 0); - - Some((dst, dst + len as u64)) + true } - fn zero_out(&mut self, addr: u64) -> (u64, u64) { + fn zero_out(&mut self, addr: u64) { let ret = unsafe { self.uffd .zeropage(addr as *mut _, self.page_size, true) @@ -199,8 +172,6 @@ impl UffdHandler { }; // Make sure the UFFD zeroed out some bytes. assert!(ret > 0); - - (addr, addr + self.page_size as u64) } } @@ -345,28 +316,6 @@ impl Runtime { } } -fn create_mem_regions(mappings: &Vec, page_size: usize) -> Vec { - let mut mem_regions: Vec = Vec::with_capacity(mappings.len()); - - for r in mappings.iter() { - let mapping = r.clone(); - let mut addr = r.base_host_virt_addr; - let end_addr = r.base_host_virt_addr + r.size as u64; - let mut page_states = HashMap::new(); - - while addr < end_addr { - page_states.insert(addr, MemPageState::Uninitialized); - addr += page_size as u64; - } - mem_regions.push(MemRegion { - mapping, - page_states, - }); - } - - mem_regions -} - #[cfg(test)] mod tests { use std::mem::MaybeUninit; diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_handler.rs index 936b9f517a3..3be958b3578 100644 --- a/src/firecracker/examples/uffd/valid_handler.rs +++ b/src/firecracker/examples/uffd/valid_handler.rs @@ -10,7 +10,7 @@ mod uffd_utils; use std::fs::File; use std::os::unix::net::UnixListener; -use uffd_utils::{MemPageState, Runtime, UffdHandler}; +use uffd_utils::{Runtime, UffdHandler}; fn main() { let mut args = std::env::args(); @@ 
-86,8 +86,9 @@ fn main() { deferred_events.push(event); } } - userfaultfd::Event::Remove { start, end } => uffd_handler - .update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed), + userfaultfd::Event::Remove { start, end } => { + uffd_handler.mark_range_removed(start as u64, end as u64) + } _ => panic!("Unexpected event on userfaultfd"), } } From 6c914809ea323c388dbe403e8790cff7977b4bde Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 12:44:36 +0000 Subject: [PATCH 04/23] refactor(test): Store binary_dir inside microvm Just store the binary dir inside the microvm class instead of separate paths to firecracker and jailer binaries. This centralizes the existance assertions, but means we cant theoretically use jailer and firecracker binaries from different locations anymore (however, nothing in the test framework was doing this, as MicroVMFactory already did not support this and hardcoded the binary names relative to the --binary-dir option). Signed-off-by: Patrick Roy --- tests/conftest.py | 17 +++++++---------- tests/framework/defs.py | 7 +++++++ tests/framework/microvm.py | 18 +++++++++++++++--- tests/host_tools/cargo_build.py | 11 ----------- .../security/test_vulnerabilities.py | 6 ++---- tools/sandbox.py | 9 ++++----- .../test-docker-rootfs.py | 4 ++-- 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fa309427ef1..fb0b88b677a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,7 @@ import host_tools.cargo_build as build_tools from framework import defs, utils from framework.artifacts import disks, kernel_params +from framework.defs import DEFAULT_BINARY_DIR from framework.microvm import MicroVMFactory from framework.properties import global_props from framework.utils_cpu_templates import ( @@ -293,14 +294,11 @@ def get(self, _netns_id): def microvm_factory(request, record_property, results_dir, netns_factory): """Fixture to create microvms simply.""" - 
if binary_dir := request.config.getoption("--binary-dir"): - fc_binary_path = Path(binary_dir) / "firecracker" - jailer_binary_path = Path(binary_dir) / "jailer" - if not fc_binary_path.exists(): - raise RuntimeError("Firecracker binary does not exist") - else: - fc_binary_path, jailer_binary_path = build_tools.get_firecracker_binaries() - record_property("firecracker_bin", str(fc_binary_path)) + binary_dir = request.config.getoption("--binary-dir") or DEFAULT_BINARY_DIR + if isinstance(binary_dir, str): + binary_dir = Path(binary_dir) + + record_property("firecracker_bin", str(binary_dir / "firecracker")) # If `--custom-cpu-template` option is provided, the given CPU template will # be applied afterwards unless overwritten. @@ -316,8 +314,7 @@ def microvm_factory(request, record_property, results_dir, netns_factory): # We could override the chroot base like so # jailer_kwargs={"chroot_base": "/srv/jailo"} uvm_factory = MicroVMFactory( - fc_binary_path, - jailer_binary_path, + binary_dir, netns_factory=netns_factory, custom_cpu_template=custom_cpu_template, ) diff --git a/tests/framework/defs.py b/tests/framework/defs.py index f017dc231ee..38fbe6802df 100644 --- a/tests/framework/defs.py +++ b/tests/framework/defs.py @@ -26,6 +26,13 @@ # Absolute path to the test results folder TEST_RESULTS_DIR = FC_WORKSPACE_DIR / "test_results" +DEFAULT_BINARY_DIR = ( + LOCAL_BUILD_PATH + / "cargo_target" + / f"{platform.machine()}-unknown-linux-musl" + / "release" +) + # The minimum required host kernel version for which io_uring is supported in # Firecracker. 
MIN_KERNEL_VERSION_FOR_IO_URING = "5.10.51" diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index f93a0dabf19..85ed588b809 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -1073,13 +1073,25 @@ def wait_for_ssh_up(self): class MicroVMFactory: """MicroVM factory""" - def __init__(self, fc_binary_path: Path, jailer_binary_path: Path, **kwargs): + def __init__(self, binary_path: Path, **kwargs): self.vms = [] - self.fc_binary_path = Path(fc_binary_path) - self.jailer_binary_path = Path(jailer_binary_path) + self.binary_path = binary_path self.netns_factory = kwargs.pop("netns_factory", net_tools.NetNs) self.kwargs = kwargs + assert self.fc_binary_path.exists(), "missing firecracker binary" + assert self.jailer_binary_path.exists(), "missing jailer binary" + + @property + def fc_binary_path(self): + """The path to the firecracker binary from which this factory will build VMs""" + return self.binary_path / "firecracker" + + @property + def jailer_binary_path(self): + """The path to the jailer binary using which this factory will build VMs""" + return self.binary_path / "jailer" + def build(self, kernel=None, rootfs=None, **kwargs): """Build a microvm""" kwargs = self.kwargs | kwargs diff --git a/tests/host_tools/cargo_build.py b/tests/host_tools/cargo_build.py index 5a029f8c5f4..54302901211 100644 --- a/tests/host_tools/cargo_build.py +++ b/tests/host_tools/cargo_build.py @@ -65,17 +65,6 @@ def get_binary(name, *, workspace_dir=FC_WORKSPACE_DIR, example=None): return bin_path -def get_firecracker_binaries(*, workspace_dir=FC_WORKSPACE_DIR): - """Build the Firecracker and Jailer binaries if they don't exist. - - Returns the location of the firecracker related binaries eventually after - building them in case they do not exist at the specified root_path. 
- """ - return get_binary("firecracker", workspace_dir=workspace_dir), get_binary( - "jailer", workspace_dir=workspace_dir - ) - - def get_example(name, *args, package="firecracker", **kwargs): """Build an example binary""" return get_binary(package, *args, **kwargs, example=name) diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index a4461aded78..14307e56888 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -206,10 +206,8 @@ def microvm_factory_a(record_property): """MicroVMFactory using revision A binaries""" revision_a = global_props.buildkite_revision_a bin_dir = git_clone(Path("../build") / revision_a, revision_a).resolve() - fc_bin = bin_dir / "firecracker" - jailer_bin = bin_dir / "jailer" - record_property("firecracker_bin", str(fc_bin)) - uvm_factory = MicroVMFactory(fc_bin, jailer_bin) + record_property("firecracker_bin", str(bin_dir / "firecracker")) + uvm_factory = MicroVMFactory(bin_dir) yield uvm_factory uvm_factory.kill() diff --git a/tools/sandbox.py b/tools/sandbox.py index 3cdcec345a7..8cdf3277352 100755 --- a/tools/sandbox.py +++ b/tools/sandbox.py @@ -14,8 +14,8 @@ from pathlib import Path from framework.artifacts import disks, kernels +from framework.defs import DEFAULT_BINARY_DIR from framework.microvm import MicroVMFactory -from host_tools.cargo_build import get_firecracker_binaries kernels = list(kernels("vmlinux-*")) rootfs = list(disks("ubuntu*ext4")) @@ -61,17 +61,16 @@ def parse_byte_size(param): args = parser.parse_args() print(args) -bins = None +binary_dir = None if args.binary_dir: binary_dir = Path(args.binary_dir).resolve() - bins = binary_dir / "firecracker", binary_dir / "jailer" else: - bins = get_firecracker_binaries() + binary_dir = DEFAULT_BINARY_DIR cpu_template = None if args.cpu_template_path is not None: cpu_template = 
json.loads(args.cpu_template_path.read_text()) -vmfcty = MicroVMFactory(*bins) +vmfcty = MicroVMFactory(binary_dir) print(f"uvm with kernel {args.kernel} ...") uvm = vmfcty.build(args.kernel, args.rootfs) diff --git a/tools/test-popular-containers/test-docker-rootfs.py b/tools/test-popular-containers/test-docker-rootfs.py index aac24ac2ecc..1434e40173c 100755 --- a/tools/test-popular-containers/test-docker-rootfs.py +++ b/tools/test-popular-containers/test-docker-rootfs.py @@ -17,8 +17,8 @@ # pylint: disable=wrong-import-position from framework.artifacts import kernels +from framework.defs import DEFAULT_BINARY_DIR from framework.microvm import MicroVMFactory -from host_tools.cargo_build import get_firecracker_binaries # pylint: enable=wrong-import-position @@ -26,7 +26,7 @@ # Use the latest guest kernel kernel = kernels[-1] -vmfcty = MicroVMFactory(*get_firecracker_binaries()) +vmfcty = MicroVMFactory(DEFAULT_BINARY_DIR) # (may take a while to compile Firecracker...) for rootfs in Path(".").glob("*.ext4"): From 0a2b6d2397e80a84fe2ded2e60ffc3d13174d63d Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 12:50:02 +0000 Subject: [PATCH 05/23] test: replace workspace_dir arg with binary_dir in get_binary Instead of get_binary assuming it always operates on a cargo build directory, have it directly operate on a "binary directory" in the sense of the --binary-dir parameter. Nothing was using the workspace_dir arg, so no functional changes. 
Signed-off-by: Patrick Roy --- tests/host_tools/cargo_build.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/host_tools/cargo_build.py b/tests/host_tools/cargo_build.py index 54302901211..56af399dea7 100644 --- a/tests/host_tools/cargo_build.py +++ b/tests/host_tools/cargo_build.py @@ -7,7 +7,7 @@ from pathlib import Path from framework import defs, utils -from framework.defs import FC_WORKSPACE_DIR +from framework.defs import DEFAULT_BINARY_DIR from framework.with_filelock import with_filelock DEFAULT_TARGET = f"{platform.machine()}-unknown-linux-musl" @@ -56,12 +56,11 @@ def cargo_test(path, extra_args=""): cargo("test", extra_args + " --all --no-fail-fast", env=env) -def get_binary(name, *, workspace_dir=FC_WORKSPACE_DIR, example=None): +def get_binary(name, *, binary_dir=DEFAULT_BINARY_DIR, example=None): """Get a binary. The binaries are built before starting a testrun.""" - target_dir = workspace_dir / "build" / "cargo_target" / DEFAULT_TARGET_DIR - bin_path = target_dir / name + bin_path = binary_dir / name if example: - bin_path = target_dir / "examples" / example + bin_path = binary_dir / "examples" / example return bin_path From 73683cb7d0b49cd7dc9950a8c2be8b5b0aec83d7 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 12:54:51 +0000 Subject: [PATCH 06/23] test: pass through kwargs from uffd_handler() to get_example() To allow specifying a different directory in which to look for the uffd example binaries, so that they can be used in A/B-tests. 
Signed-off-by: Patrick Roy --- tests/framework/utils_uffd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/framework/utils_uffd.py b/tests/framework/utils_uffd.py index 690eb423573..7715a5c2157 100644 --- a/tests/framework/utils_uffd.py +++ b/tests/framework/utils_uffd.py @@ -93,6 +93,6 @@ def spawn_pf_handler(vm, handler_path, mem_path): return uffd_handler -def uffd_handler(handler_name): +def uffd_handler(handler_name, **kwargs): """Retrieves the uffd handler with the given name""" - return cargo_build.get_example(f"uffd_{handler_name}_handler") + return cargo_build.get_example(f"uffd_{handler_name}_handler", **kwargs) From 295bb78c9dca4935f62f17b146e138bdc44283f6 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 12:52:28 +0000 Subject: [PATCH 07/23] devtool: also copy example binaries when doing build --rev Also copy the example binaries into the target directory when building some arbitrary revision (for example for A/B-tests). Signed-off-by: Patrick Roy --- tools/devtool | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/devtool b/tools/devtool index 7bf8d4ad6fb..5f495b838c1 100755 --- a/tools/devtool +++ b/tools/devtool @@ -517,8 +517,9 @@ cmd_build() { if [ ! -z "$revision" ]; then popd git branch -D $branch_name - mkdir -p build/"$revision" + mkdir -p build/"$revision"/examples cp $tmp_dir/build/cargo_target/$(uname -m)-unknown-linux-$libc/$profile/* build/"$revision" + cp $tmp_dir/build/cargo_target/$(uname -m)-unknown-linux-$libc/$profile/examples/* build/"$revision"/examples cmd_sh "rm -rf $tmp_dir" fi From a2ce40cffcec4f196cbe28c798b2ca73bf34d173 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 15:55:44 +0000 Subject: [PATCH 08/23] fix(test): stop passing guest kernel when building vms from snapshot When building a microvm from a snapshot, there is no need to specify a guest kernel. Remove the parameter from SnapshotRestoreTest.sample_latency. 
Signed-off-by: Patrick Roy --- tests/integration_tests/performance/test_snapshot_ab.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 23224de6b31..c718b43f920 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -76,15 +76,12 @@ def configure_vm( return vm - def sample_latency( - self, microvm_factory, snapshot, guest_kernel_linux_5_10 - ) -> List[float]: + def sample_latency(self, microvm_factory, snapshot) -> List[float]: """Collects latency samples for the microvm configuration specified by this instance""" values = [] for _ in range(ITERATIONS): microvm = microvm_factory.build( - kernel=guest_kernel_linux_5_10, monitor_memory=False, ) microvm.spawn(emit_metrics=True) @@ -154,7 +151,6 @@ def test_restore_latency( samples = test_setup.sample_latency( microvm_factory, snapshot, - guest_kernel_linux_5_10, ) for sample in samples: From e926f08ff974431413e70b154ddc94097245fce5 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:39:29 +0000 Subject: [PATCH 09/23] test_snapshot_ab: rename guest_kernel_acpi to guest_kernel This was the result of a find and replace gone awry it seems. 
Signed-off-by: Patrick Roy --- tests/integration_tests/performance/test_snapshot_ab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index c718b43f920..4b09db126e4 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -43,12 +43,12 @@ def id(self): def configure_vm( self, microvm_factory, - guest_kernel_acpi, + guest_kernel, rootfs, ) -> Microvm: """Creates the initial snapshot that will be loaded repeatedly to sample latencies""" vm = microvm_factory.build( - guest_kernel_acpi, + guest_kernel, rootfs, monitor_memory=False, ) From 5ef13faa21ca801e6e0b46d2b013720818b14a22 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 09:02:03 +0000 Subject: [PATCH 10/23] test: store uffd handler in microvm attribute So that we can later refer to it in tests, for example to parse its logs. 
Signed-off-by: Patrick Roy --- tests/framework/microvm.py | 1 + tests/framework/utils_uffd.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 85ed588b809..8d87d137018 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -201,6 +201,7 @@ def __init__( self.ssh_key = None self.initrd_file = None self.boot_args = None + self.uffd_handler = None self.fc_binary_path = Path(fc_binary_path) assert fc_binary_path.exists() diff --git a/tests/framework/utils_uffd.py b/tests/framework/utils_uffd.py index 7715a5c2157..a502003ff5f 100644 --- a/tests/framework/utils_uffd.py +++ b/tests/framework/utils_uffd.py @@ -89,6 +89,7 @@ def spawn_pf_handler(vm, handler_path, mem_path): handler_name, SOCKET_PATH, jailed_mem, vm.chroot(), "uffd.log" ) uffd_handler.spawn(vm.jailer.uid, vm.jailer.gid) + vm.uffd_handler = uffd_handler return uffd_handler From 1ac8a3a1d817d498ea4ef3545493cafc92e93e64 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:17:58 +0000 Subject: [PATCH 11/23] test: Add MicroVMFactory.build_n_from_snapshot Add a utility function that allows building multiple microvms from the same snapshot sequentially. 
Signed-off-by: Patrick Roy --- tests/framework/microvm.py | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 8d87d137018..7d2e9002822 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -38,6 +38,7 @@ from framework.microvm_helpers import MicrovmHelpers from framework.properties import global_props from framework.utils_drive import VhostUserBlkBackend, VhostUserBlkBackendType +from framework.utils_uffd import spawn_pf_handler, uffd_handler from host_tools.fcmetrics import FCMetricsMonitor from host_tools.memory import MemoryMonitor @@ -1128,6 +1129,52 @@ def build_from_snapshot(self, snapshot: Snapshot): vm.restore_from_snapshot(snapshot, resume=True) return vm + def build_n_from_snapshot( + self, + snapshot, + nr_vms, + *, + uffd_handler_name=None, + incremental=False, + use_snapshot_editor=True, + ): + """A generator of `n` microvms restored, either all restored from the same given snapshot + (incremental=False), or created by taking successive snapshots of restored VMs + """ + for _ in range(nr_vms): + microvm = self.build() + microvm.spawn() + + uffd_path = None + if uffd_handler_name is not None: + pf_handler = spawn_pf_handler( + microvm, + uffd_handler(uffd_handler_name, binary_dir=self.binary_path), + snapshot.mem, + ) + uffd_path = pf_handler.socket_path + + snapshot_copy = microvm.restore_from_snapshot( + snapshot, resume=True, uffd_path=uffd_path + ) + + yield microvm + + if incremental: + new_snapshot = microvm.make_snapshot(snapshot.snapshot_type) + + if snapshot.is_diff: + new_snapshot = new_snapshot.rebase_snapshot( + snapshot, use_snapshot_editor + ) + + snapshot = new_snapshot + + microvm.kill() + snapshot_copy.delete() + + snapshot.delete() + def kill(self): """Clean up all built VMs""" for vm in self.vms: From 2e010088ee9d9101ecbb4f63e93007b8fefb895e Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:27:32 
+0000 Subject: [PATCH 12/23] test: use build_n_from_snapshot to replace explicit loops Replace snapshot creation loops in test_5_snapshots, test_vmgenid and test_restore_latency with calls to build_n_from_snapshot Signed-off-by: Patrick Roy --- .../functional/test_snapshot_basic.py | 44 +++---------------- .../performance/test_snapshot_ab.py | 11 +---- 2 files changed, 7 insertions(+), 48 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 6fe41f4d5a6..09e14b92448 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -149,12 +149,9 @@ def test_5_snapshots( snapshot = vm.make_snapshot(snapshot_type) vm.kill() - for i in range(seq_len): - logger.info("Load snapshot #%s, mem %s", i, snapshot.mem) - microvm = microvm_factory.build() - microvm.spawn() - copied_snapshot = microvm.restore_from_snapshot(snapshot, resume=True) - + for microvm in microvm_factory.build_n_from_snapshot( + snapshot, seq_len, incremental=True, use_snapshot_editor=use_snapshot_editor + ): # FIXME: This and the sleep below reduce the rate of vsock/ssh connection # related spurious test failures, although we do not know why this is the case. time.sleep(2) @@ -171,21 +168,6 @@ def test_5_snapshots( check_filesystem(microvm.ssh, "squashfs", "/dev/vda") time.sleep(2) - logger.info("Create snapshot %s #%d.", snapshot_type, i + 1) - new_snapshot = microvm.make_snapshot(snapshot_type) - - # If we are testing incremental snapshots we must merge the base with - # current layer. - if snapshot.is_diff: - logger.info("Base: %s, Layer: %s", snapshot.mem, new_snapshot.mem) - new_snapshot = new_snapshot.rebase_snapshot( - snapshot, use_snapshot_editor=use_snapshot_editor - ) - - microvm.kill() - copied_snapshot.delete() - # Update the base for next iteration. 
- snapshot = new_snapshot def test_patch_drive_snapshot(uvm_nano, microvm_factory): @@ -524,27 +506,13 @@ def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type) base_snapshot = snapshot base_vm.kill() - for i in range(5): - vm = microvm_factory.build() - vm.spawn() - copied_snapshot = vm.restore_from_snapshot(snapshot, resume=True) - + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(base_snapshot, 5, incremental=True) + ): # We should have as DMESG_VMGENID_RESUME messages as # snapshots we have resumed check_vmgenid_update_count(vm, i + 1) - snapshot = vm.make_snapshot(snapshot_type) - vm.kill() - copied_snapshot.delete() - - # If we are testing incremental snapshots we ust merge the base with - # current layer. - if snapshot.is_diff: - snapshot = snapshot.rebase_snapshot(base_snapshot) - - # Update the base for next iteration - base_snapshot = snapshot - # TODO add `global_props.host_os == "amzn2"` condition # once amazon linux kernels have patches. diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 4b09db126e4..b91feb03f80 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -80,13 +80,7 @@ def sample_latency(self, microvm_factory, snapshot) -> List[float]: """Collects latency samples for the microvm configuration specified by this instance""" values = [] - for _ in range(ITERATIONS): - microvm = microvm_factory.build( - monitor_memory=False, - ) - microvm.spawn(emit_metrics=True) - snapshot_copy = microvm.restore_from_snapshot(snapshot, resume=True) - + for microvm in microvm_factory.build_n_from_snapshot(snapshot, ITERATIONS): value = 0 # Parse all metric data points in search of load_snapshot time. 
microvm.flush_metrics() @@ -98,10 +92,7 @@ def sample_latency(self, microvm_factory, snapshot) -> List[float]: break assert value > 0 values.append(value) - microvm.kill() - snapshot_copy.delete() - snapshot.delete() return values From e2690afb408f77597bf5d7eefc42ce9cd0c51025 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:29:43 +0000 Subject: [PATCH 13/23] test_snapshot_ab: move metrics dimension setting into configure_vm Move the initialization of the metrics logger dimensions into `configure_vm`, as later on there will be multiple tests having to do exactly the same sequence of configure + setup metrics. Signed-off-by: Patrick Roy --- .../performance/test_snapshot_ab.py | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index b91feb03f80..5ad055a7495 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -40,12 +40,7 @@ def id(self): """Computes a unique id for this test instance""" return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb" - def configure_vm( - self, - microvm_factory, - guest_kernel, - rootfs, - ) -> Microvm: + def configure_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: """Creates the initial snapshot that will be loaded repeatedly to sample latencies""" vm = microvm_factory.build( guest_kernel, @@ -74,6 +69,16 @@ def configure_vm( ) vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/v.sock") + metrics.set_dimensions( + { + "net_devices": str(self.nets), + "block_devices": str(self.blocks), + "vsock_devices": str(int(self.all_devices)), + "balloon_devices": str(int(self.all_devices)), + **vm.dimensions, + } + ) + return vm def sample_latency(self, microvm_factory, snapshot) -> List[float]: @@ -122,23 +127,16 @@ def test_restore_latency( We only test a 
single guest kernel, as the guest kernel does not "participate" in snapshot restore. """ - vm = test_setup.configure_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) - vm.start() - - metrics.set_dimensions( - { - "performance_test": "test_restore_latency", - "net_devices": str(test_setup.nets), - "block_devices": str(test_setup.blocks), - "vsock_devices": str(int(test_setup.all_devices)), - "balloon_devices": str(int(test_setup.all_devices)), - **vm.dimensions, - } + vm = test_setup.configure_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, metrics ) + vm.start() snapshot = vm.snapshot_full() vm.kill() + metrics.put_dimensions({"performance_test": "test_restore_latency"}) + samples = test_setup.sample_latency( microvm_factory, snapshot, From 5b19b195480c69adfc3cd883b04015389417d240 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:31:02 +0000 Subject: [PATCH 14/23] test_snapshot_ab: start VM inside configure_vm No need to defer the starting. Also rename it to `boot_vm` to more accurately reflect what it does now. 
Signed-off-by: Patrick Roy --- tests/integration_tests/performance/test_snapshot_ab.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 5ad055a7495..189e58b25b1 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -40,7 +40,7 @@ def id(self): """Computes a unique id for this test instance""" return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb" - def configure_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: + def boot_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: """Creates the initial snapshot that will be loaded repeatedly to sample latencies""" vm = microvm_factory.build( guest_kernel, @@ -78,6 +78,7 @@ def configure_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microv **vm.dimensions, } ) + vm.start() return vm @@ -127,10 +128,7 @@ def test_restore_latency( We only test a single guest kernel, as the guest kernel does not "participate" in snapshot restore. """ - vm = test_setup.configure_vm( - microvm_factory, guest_kernel_linux_5_10, rootfs, metrics - ) - vm.start() + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs, metrics) snapshot = vm.snapshot_full() vm.kill() From 873515a0d51f2dc45f0673db8e0db0dab118c21d Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 3 Mar 2025 16:32:11 +0000 Subject: [PATCH 15/23] test_snapshot_ab: open code sample_latency This function only has one possible call-site, and that's test_restore_latency. Inlining it into its call site makes things a bit simpler (no need for a temporary array). 
Signed-off-by: Patrick Roy --- .../performance/test_snapshot_ab.py | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 189e58b25b1..5caa87bb17f 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -4,7 +4,6 @@ import tempfile from dataclasses import dataclass from functools import lru_cache -from typing import List import pytest @@ -82,25 +81,6 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: return vm - def sample_latency(self, microvm_factory, snapshot) -> List[float]: - """Collects latency samples for the microvm configuration specified by this instance""" - values = [] - - for microvm in microvm_factory.build_n_from_snapshot(snapshot, ITERATIONS): - value = 0 - # Parse all metric data points in search of load_snapshot time. - microvm.flush_metrics() - metrics = microvm.get_all_metrics() - for data_point in metrics: - cur_value = data_point["latencies_us"]["load_snapshot"] - if cur_value > 0: - value = cur_value / USEC_IN_MSEC - break - assert value > 0 - values.append(value) - - return values - @pytest.mark.nonci @pytest.mark.parametrize( @@ -135,10 +115,14 @@ def test_restore_latency( metrics.put_dimensions({"performance_test": "test_restore_latency"}) - samples = test_setup.sample_latency( - microvm_factory, - snapshot, - ) - - for sample in samples: - metrics.put_metric("latency", sample, "Milliseconds") + for microvm in microvm_factory.build_n_from_snapshot(snapshot, ITERATIONS): + value = 0 + # Parse all metric data points in search of load_snapshot time. 
+ microvm.flush_metrics() + for data_point in microvm.get_all_metrics(): + cur_value = data_point["latencies_us"]["load_snapshot"] + if cur_value > 0: + value = cur_value / USEC_IN_MSEC + break + assert value > 0 + metrics.put_metric("latency", value, "Milliseconds") From 1488c7b48cfb115a62231e9884a68e54d20c935d Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 27 Feb 2025 16:59:10 +0000 Subject: [PATCH 16/23] test: add post-restore latency test Add a snapshot latency test that outputs metrics on how long it takes to memset a 128MB memory region post-snapshot restore. We only collect these latencies for a single configuration (1024MB, 2vCPU), since they will be the same no matter the size of the memory. Run with an on-demand uffd handler and compare to mmap-ing the snapshot file as a baseline. Also use the fault_all handler, which will allow us to measure latencies due to "fast" page faults, e.g. those that are just for setting up stage-2 page table entries but where otherwise memory is already faulted into userspace. Signed-off-by: Patrick Roy --- .../performance/test_snapshot_ab.py | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 5caa87bb17f..9133eb9d4de 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -1,7 +1,9 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0 """Performance benchmark for snapshot restore.""" +import signal import tempfile +import time from dataclasses import dataclass from functools import lru_cache @@ -11,6 +13,7 @@ from framework.microvm import Microvm USEC_IN_MSEC = 1000 +NS_IN_MSEC = 1_000_000 ITERATIONS = 30 @@ -113,7 +116,9 @@ def test_restore_latency( snapshot = vm.snapshot_full() vm.kill() - metrics.put_dimensions({"performance_test": "test_restore_latency"}) + metrics.put_dimensions( + {"performance_test": "test_restore_latency", "uffd_handler": "None"} + ) for microvm in microvm_factory.build_n_from_snapshot(snapshot, ITERATIONS): value = 0 @@ -126,3 +131,46 @@ def test_restore_latency( break assert value > 0 metrics.put_metric("latency", value, "Milliseconds") + + +# When using the fault-all handler, all guest memory will be faulted in way before the helper tool +# wakes up, because it gets faulted in on the first page fault. In this scenario, we are not measuring UFFD +# latencies, but KVM latencies of setting up missing EPT entries. 
+@pytest.mark.nonci +@pytest.mark.parametrize("uffd_handler", [None, "valid", "fault_all"]) +def test_post_restore_latency( + microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, uffd_handler +): + """Collects latency metric of post-restore memory accesses done inside the guest""" + test_setup = SnapshotRestoreTest(mem=1024, vcpus=2) + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs, metrics) + + vm.ssh.check_output( + "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 Date: Tue, 4 Mar 2025 08:32:36 +0000 Subject: [PATCH 17/23] test: Also emit post-restore latency metrics for huge pages Runs the new test_post_restore_latency test also for hugepages-backed VMs Signed-off-by: Patrick Roy --- .../performance/test_snapshot_ab.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 9133eb9d4de..70d0ed6adc2 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -10,7 +10,7 @@ import pytest import host_tools.drive as drive_tools -from framework.microvm import Microvm +from framework.microvm import HugePagesConfig, Microvm USEC_IN_MSEC = 1000 NS_IN_MSEC = 1_000_000 @@ -36,6 +36,7 @@ class SnapshotRestoreTest: nets: int = 3 blocks: int = 3 all_devices: bool = False + huge_pages: HugePagesConfig = HugePagesConfig.NONE @property def id(self): @@ -55,6 +56,7 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: vcpu_count=self.vcpus, mem_size_mib=self.mem, rootfs_io_engine="Sync", + huge_pages=self.huge_pages, ) for _ in range(self.nets): @@ -77,6 +79,7 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs, metrics) -> Microvm: "block_devices": str(self.blocks), "vsock_devices": str(int(self.all_devices)), "balloon_devices": str(int(self.all_devices)), + "huge_pages_config": 
str(self.huge_pages), **vm.dimensions, } ) @@ -138,11 +141,15 @@ def test_restore_latency( # latencies, but KVM latencies of setting up missing EPT entries. @pytest.mark.nonci @pytest.mark.parametrize("uffd_handler", [None, "valid", "fault_all"]) +@pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_post_restore_latency( - microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, uffd_handler + microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, uffd_handler, huge_pages ): """Collects latency metric of post-restore memory accesses done inside the guest""" - test_setup = SnapshotRestoreTest(mem=1024, vcpus=2) + if huge_pages != HugePagesConfig.NONE and uffd_handler is None: + pytest.skip("huge page snapshots can only be restored using uffd") + + test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages) vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs, metrics) vm.ssh.check_output( From 5af2dc208f2de31dc7fa58958c3bd716057aa7bc Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 08:56:28 +0000 Subject: [PATCH 18/23] test: Add post-restore population latency test While the post_restore_latency test measures the time on-demand faulting takes from the guest's perspective, this test measures the actual time it takes to fault in guest memory. We cannot collect this data by simply using the fault_all handler in the post-restore test, because the handler will get triggered way before the fast_page_fault_helper script continues running (because the first page fault will be triggered by sshd or the kernel, so by the time our helper runs, the uffd handler will already be done and we won't notice any latency). Therefore, have the fault_all handler print the time it took to populate all of guest memory to its log file, and parse this number.
Signed-off-by: Patrick Roy --- .../examples/uffd/fault_all_handler.rs | 5 +++ .../performance/test_snapshot_ab.py | 40 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index d0f127a8ce1..6711350497a 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -11,6 +11,7 @@ use std::fs::File; use std::os::unix::net::UnixListener; use uffd_utils::{Runtime, UffdHandler}; +use utils::time::{get_time_us, ClockType}; fn main() { let mut args = std::env::args(); @@ -34,9 +35,13 @@ fn main() { match event { userfaultfd::Event::Pagefault { .. } => { + let start = get_time_us(ClockType::Monotonic); for region in uffd_handler.mem_regions.clone() { uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size); } + let end = get_time_us(ClockType::Monotonic); + + println!("Finished Faulting All: {}us", end - start); } _ => panic!("Unexpected event on userfaultfd"), } diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 70d0ed6adc2..7e20befcc27 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -1,6 +1,7 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 """Performance benchmark for snapshot restore.""" +import re import signal import tempfile import time @@ -181,3 +182,42 @@ def test_post_restore_latency( ) metrics.put_metric("fault_latency", int(duration) / NS_IN_MSEC, "Milliseconds") + + +@pytest.mark.nonci +@pytest.mark.parametrize("huge_pages", HugePagesConfig) +def test_population_latency( + microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, huge_pages +): + """Collects population latency metrics (e.g. 
how long it takes UFFD handler to fault in all memory)""" + test_setup = SnapshotRestoreTest(mem=128, vcpus=1, huge_pages=huge_pages) + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs, metrics) + snapshot = vm.snapshot_full() + vm.kill() + + metrics.put_dimensions( + {"performance_test": "test_population_latency", "uffd_handler": "fault_all"} + ) + + for microvm in microvm_factory.build_n_from_snapshot( + snapshot, ITERATIONS, uffd_handler_name="fault_all" + ): + # do _something_ to trigger a pagefault, which will then cause the UFFD handler to fault in _everything_ + microvm.ssh.check_output("true") + + for _ in range(5): + time.sleep(1) + + match = re.match( + r"Finished Faulting All: (\d+)us", microvm.uffd_handler.log_data + ) + + if match: + latency_us = int(match.group(1)) + + metrics.put_metric( + "populate_latency", latency_us / 1000, "Milliseconds" + ) + break + else: + raise RuntimeError("UFFD handler did not print population latency after 5s") From b0ecfb4b742e6162959309166e44212f36339da8 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 13:42:55 +0000 Subject: [PATCH 19/23] fix(ab): its dict.keys(), not dict.key() We were calling a non-existing function on an error path. This is why I prefer compiled languages smh. Signed-off-by: Patrick Roy --- tools/ab_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ab_test.py b/tools/ab_test.py index 7349adcb862..b296627ba31 100755 --- a/tools/ab_test.py +++ b/tools/ab_test.py @@ -136,7 +136,7 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False): # multiple EMF log messages. We need to reassemble :( assert ( processed_emf[dimension_set].keys() == result.keys() - ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].key()} in one EMF message, but {result.keys()} in another." 
+ ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].keys()} in one EMF message, but {result.keys()} in another." for metric, (values, unit) in processed_emf[dimension_set].items(): assert result[metric][1] == unit From c2100d90fff1a2a9511faa5bcd280857f1889c8f Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 4 Mar 2025 14:22:03 +0000 Subject: [PATCH 20/23] test(ab): allow dimensions to be split into multiple lists When using metrics.put_dimensions from python code, then the resulting EMF messages will have a list of lists in their "Dimensions" key. Currently, ab_test.py assumes that this list of lists always has length one, meaning it ignores additional dimensions added via put_dimensions. Have it instead flatten the list of lists so that it always considers all dimensions. Signed-off-by: Patrick Roy --- tools/ab_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/ab_test.py b/tools/ab_test.py index b296627ba31..7c4cdcada7a 100755 --- a/tools/ab_test.py +++ b/tools/ab_test.py @@ -64,7 +64,11 @@ def extract_dimensions(emf): # Skipped tests emit a duration metric, but have no dimensions set return {} - dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0] + dimension_list = [ + dim + for dimensions in emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"] + for dim in dimensions + ] return {key: emf[key] for key in emf if key in dimension_list} From 03ff310cb647f5cf85575d64c22a54b4df4e5cd8 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 6 Mar 2025 12:33:21 +0000 Subject: [PATCH 21/23] refactor: rename uffd_valid_handler to uffd_on_demand_handler This better describes what the handler actually does (technically, fault_all_handler is also "valid"), and now that some tests emit the name of the uffd handler as a metric, the names should be descriptive from the get-go to avoid confusion and future renamings. 
Signed-off-by: Patrick Roy --- docs/snapshotting/handling-page-faults-on-snapshot-resume.md | 2 +- src/firecracker/Cargo.toml | 4 ++-- .../examples/uffd/{valid_handler.rs => on_demand_handler.rs} | 0 tests/integration_tests/functional/test_uffd.py | 2 +- tests/integration_tests/performance/test_huge_pages.py | 4 ++-- tests/integration_tests/performance/test_snapshot_ab.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) rename src/firecracker/examples/uffd/{valid_handler.rs => on_demand_handler.rs} (100%) diff --git a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md index ea790624a18..a5da980124e 100644 --- a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md +++ b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md @@ -162,7 +162,7 @@ connect/send data. ### Example An example of a handler process can be found -[here](../../src/firecracker/examples/uffd/valid_handler.rs). The process is +[here](../../src/firecracker/examples/uffd/on_demand_handler.rs). The process is designed to tackle faults on a certain address by loading into memory the entire region that the address belongs to, but users can choose any other behavior that suits their use case best. 
diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 29778af061d..40b2795050d 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -57,8 +57,8 @@ name = "uffd_malicious_handler" path = "examples/uffd/malicious_handler.rs" [[example]] -name = "uffd_valid_handler" -path = "examples/uffd/valid_handler.rs" +name = "uffd_on_demand_handler" +path = "examples/uffd/on_demand_handler.rs" [[example]] name = "uffd_fault_all_handler" diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/on_demand_handler.rs similarity index 100% rename from src/firecracker/examples/uffd/valid_handler.rs rename to src/firecracker/examples/uffd/on_demand_handler.rs diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index 35b03d7aa34..95819844f05 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -92,7 +92,7 @@ def test_valid_handler(uvm_plain, snapshot): vm.spawn() # Spawn page fault handler process. - pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot.mem) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=pf_handler.socket_path) diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index acfee58139d..5839245ebd9 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -93,7 +93,7 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): vm.spawn() # Spawn page fault handler process. 
- pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot.mem) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=pf_handler.socket_path) @@ -135,7 +135,7 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): vm.spawn() # Spawn page fault handler process. - pf_handler = spawn_pf_handler(vm, uffd_handler("valid"), snapshot_merged.mem) + pf_handler = spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot_merged.mem) vm.restore_from_snapshot( snapshot_merged, resume=True, uffd_path=pf_handler.socket_path diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 7e20befcc27..655566dcb13 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -141,7 +141,7 @@ def test_restore_latency( # wakes up, because it gets faulted in on the first page fault. In this scenario, we are not measuring UFFD # latencies, but KVM latencies of setting up missing EPT entries. @pytest.mark.nonci -@pytest.mark.parametrize("uffd_handler", [None, "valid", "fault_all"]) +@pytest.mark.parametrize("uffd_handler", [None, "on_demand", "fault_all"]) @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_post_restore_latency( microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, uffd_handler, huge_pages From 35861f092a2133a289cc77f908dae68678f919f3 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 6 Mar 2025 13:17:50 +0000 Subject: [PATCH 22/23] test: avoid memsetting in favor of just touching one byte per page To bring down the overhead of the testing script, only touch a single byte on each page (as that's enough to fault in the entire page), instead of memsetting everything. 
Signed-off-by: Patrick Roy --- .../overlay/usr/local/bin/fast_page_fault_helper.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/resources/overlay/usr/local/bin/fast_page_fault_helper.c b/resources/overlay/usr/local/bin/fast_page_fault_helper.c index 96e565f0c3f..7558f7b09fc 100644 --- a/resources/overlay/usr/local/bin/fast_page_fault_helper.c +++ b/resources/overlay/usr/local/bin/fast_page_fault_helper.c @@ -19,6 +19,14 @@ #define MEM_SIZE_MIB (128 * 1024 * 1024) #define NANOS_PER_SEC 1000000000 +#define PAGE_SIZE 4096 + +void touch_memory(void *mem, size_t size, char val) { + void *end = mem + size; + for (; mem < end; mem += PAGE_SIZE) { + *((char *)mem) = val; + } +} int main() { sigset_t set; @@ -45,12 +53,12 @@ int main() { return 1; } - memset(ptr, 1, MEM_SIZE_MIB); + touch_memory(ptr, MEM_SIZE_MIB, 1); sigwait(&set, &signal); clock_gettime(CLOCK_BOOTTIME, &start); - memset(ptr, 2, MEM_SIZE_MIB); + touch_memory(ptr, MEM_SIZE_MIB, 2); clock_gettime(CLOCK_BOOTTIME, &end); duration_nanos = (end.tv_sec - start.tv_sec) * NANOS_PER_SEC + end.tv_nsec - start.tv_nsec; From a2fb9442393df5c127e797766c70b5a19cd6041c Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 6 Mar 2025 16:35:12 +0000 Subject: [PATCH 23/23] fix: allocate more huge pages for performance tests On m6a/6.1, post_populate with fault_all handler and 2M pages fails because of the UFFD handler panicking during faulting, returning `PartiallyCopied(1022MB)`. This is because we are depleting the huge pages pool. Thus, allocate some more. Admittedly, I do not understand why this only happens on m6a/6.1.
Signed-off-by: Patrick Roy --- tools/devtool | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/devtool b/tools/devtool index 5f495b838c1..35794256138 100755 --- a/tools/devtool +++ b/tools/devtool @@ -743,9 +743,9 @@ cmd_test() { # It seems that even if the tests using huge pages run sequentially on ag=1 agents, right-sizing the huge pages # pool to the total number of huge pages used across all tests results in spurious failures with pool depletion # anyway (something else on the host seems to be stealing our huge pages, and we cannot "ear mark" them for - # Firecracker processes). Thus, just allocate 4GB of them and call it a day. + # Firecracker processes). Thus, just allocate 8GB of them and call it a day. say "Setting up huge pages pool" - num_hugetlbfs_pages=2048 + num_hugetlbfs_pages=4096 huge_pages_old=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) huge_pages_new=$(echo $num_hugetlbfs_pages |sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages)