From 9f308f92850e867f4685e7a968877557be6a04c4 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 15:15:20 +0100
Subject: [PATCH 1/7] test: Use `Microvm.wait_for_up` in tests that boot via
 config

Tests that boot a microvm via a config file do not use `.start()`, as
`.spawn()` already start the bootup process. This is probably why these
tests were overlooked when we did the `.wait_for_up()` refactor.

Fix potential issues by adding the `wait_for_up()` calls here, to
replace various homegrown waiting solutions.

Some tests do not have network devices, and also don't really need to
wait for the guest to have booted as they only validate firecracker API
responses. Don't try to use wait_for_up in those (since we can't
anyway).

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 .../integration_tests/functional/test_cmd_line_start.py  | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/integration_tests/functional/test_cmd_line_start.py b/tests/integration_tests/functional/test_cmd_line_start.py
index 82425458e89..9b76b6fe5f3 100644
--- a/tests/integration_tests/functional/test_cmd_line_start.py
+++ b/tests/integration_tests/functional/test_cmd_line_start.py
@@ -7,7 +7,6 @@
 import platform
 import re
 import shutil
-import time
 from pathlib import Path
 
 import pytest
@@ -164,7 +163,7 @@ def test_config_start_no_api_exit(uvm_plain, vm_config_file):
     test_microvm.jailer.extra_args.update({"no-api": None})
 
     test_microvm.spawn()  # Start Firecracker and MicroVM
-    time.sleep(3)  # Wait for startup
+    test_microvm.wait_for_up()
     test_microvm.ssh.run("reboot")  # Exit
 
     test_microvm.mark_killed()  # waits for process to terminate
@@ -266,7 +265,7 @@ def test_config_start_with_limit(uvm_plain, vm_config_file):
     response += '{ "error": "Request payload with size 260 is larger than '
     response += "the limit of 250 allowed by server.\n"
     response += 'All previous unanswered requests will be dropped." }'
-    _, stdout, _stderr = utils.check_output(cmd)
+    _, stdout, _ = utils.check_output(cmd)
     assert stdout.encode("utf-8") == response.encode("utf-8")
 
 
@@ -420,8 +419,7 @@ def test_config_start_and_mmds_with_api(uvm_plain, vm_config_file):
 
     # Network namespace has already been created.
     test_microvm.spawn()
-
-    assert test_microvm.state == "Running"
+    test_microvm.wait_for_up()
 
     data_store = {
         "latest": {
@@ -480,6 +478,7 @@ def test_with_config_and_metadata_no_api(uvm_plain, vm_config_file, metadata_fil
     _configure_network_interface(test_microvm)
     test_microvm.jailer.extra_args.update({"no-api": None})
     test_microvm.spawn()
+    test_microvm.wait_for_up()
 
     # Get MMDS version and IPv4 address configured from the file.
     version, ipv4_address = _get_optional_fields_from_file(vm_config_file)

From 6b3ead7a94cf9494bd315dd2e9f484ebbd8b5d6e Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 15:18:35 +0100
Subject: [PATCH 2/7] test: Remove useless variable assignment

The assigned value was immediately overwritten without ever being read.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/integration_tests/functional/test_cmd_line_start.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/functional/test_cmd_line_start.py b/tests/integration_tests/functional/test_cmd_line_start.py
index 9b76b6fe5f3..79793c2157d 100644
--- a/tests/integration_tests/functional/test_cmd_line_start.py
+++ b/tests/integration_tests/functional/test_cmd_line_start.py
@@ -432,7 +432,7 @@ def test_config_start_and_mmds_with_api(uvm_plain, vm_config_file):
     assert response.json() == {}
 
     # Populate MMDS with data.
-    response = test_microvm.api.mmds.put(**data_store)
+    test_microvm.api.mmds.put(**data_store)
 
     # Ensure the MMDS contents have been successfully updated.
     response = test_microvm.api.mmds.get()

From dfdd510345b90f1f12a5eabb407c6ced02813414 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 16:07:46 +0100
Subject: [PATCH 3/7] test: remove useless statement

Remove assignment to a variable that was never read

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/integration_tests/functional/test_balloon.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index cd4b3d33637..10171ff1f12 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -74,8 +74,6 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32):
             logger.error("while running: %s", cmd)
             logger.error("stdout: %s", stdout)
             logger.error("stderr: %s", stderr)
-
-        cmd = "cat /tmp/fillmem_output.txt"
     except TimeoutExpired:
         # It's ok if this expires. Some times the SSH connection
         # gets killed by the OOM killer *after* the fillmem program

From 2b2f9f37205144d36b6b9a2c66aaa3282d072449 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 16:08:18 +0100
Subject: [PATCH 4/7] test: fix typo in comment

"Some times the ..." -> "Sometimes the ..." (no space)

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/integration_tests/functional/test_balloon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 10171ff1f12..1e16b86e5ca 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -75,7 +75,7 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32):
             logger.error("stdout: %s", stdout)
             logger.error("stderr: %s", stderr)
     except TimeoutExpired:
-        # It's ok if this expires. Some times the SSH connection
+        # It's ok if this expires. Sometimes the SSH connection
         # gets killed by the OOM killer *after* the fillmem program
         # started. As a result, we can ignore timeouts here.
         pass

From 06c4d9b15305046120a2046bf17b1ecd922b4616 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 17:17:14 +0100
Subject: [PATCH 5/7] test: Remove `test_snapshot_compatibility`

This test serves no purpose, as all it does is take a snapshot without
verifying anything about it. In this sense, it is perfectly subsumed by
`test_balloon_snapshot`, which also creates snapshots with balloon
devices, but actually verifies they restore as well (something that is
not given in this test, because we do not wait for the guest to boot
before taking a snapshot).

A look at the commit history shows that this test used to be responsible
for verifying cross-release snapshot support for the balloon device, but
since we no longer support such a thing, it became superfluous.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 .../functional/test_balloon.py                 | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 1e16b86e5ca..6b5126cf802 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -518,24 +518,6 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs):
     assert stats_after_snap["available_memory"] > latest_stats["available_memory"]
 
 
-def test_snapshot_compatibility(microvm_factory, guest_kernel, rootfs):
-    """
-    Test that the balloon serializes correctly.
-    """
-    vm = microvm_factory.build(guest_kernel, rootfs)
-    vm.spawn()
-    vm.basic_config(
-        vcpu_count=2,
-        mem_size_mib=256,
-    )
-
-    # Add a memory balloon with stats enabled.
-    vm.api.balloon.put(amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1)
-
-    vm.start()
-    vm.snapshot_full()
-
-
 def test_memory_scrub(microvm_factory, guest_kernel, rootfs):
     """
     Test that the memory is zeroed after deflate.

From 61786cf64b5945f41255efb5b7820b42333843cb Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Tue, 3 Sep 2024 16:26:31 +0100
Subject: [PATCH 6/7] test: Add missing `wait_for_up`s to balloon tests

Some of the tests in test_balloon.py were missing
`microvm.wait_for_up()` calls. This was mainly visible from those that
read the RSS of the firecracker process taking a long time to calibrate,
due to RSS being very unstable during guest boot up.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/integration_tests/functional/test_balloon.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 6b5126cf802..166a79b91dd 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -129,6 +129,7 @@ def test_rss_memory_lower(uvm_plain_any):
 
     # Start the microvm.
     test_microvm.start()
+    test_microvm.wait_for_up()
 
     _test_rss_memory_lower(test_microvm)
 
@@ -150,6 +151,7 @@ def test_inflate_reduces_free(uvm_plain_any):
 
     # Start the microvm
     test_microvm.start()
+    test_microvm.wait_for_up()
     firecracker_pid = test_microvm.firecracker_pid
 
     # Get the free memory before ballooning.
@@ -301,6 +303,7 @@ def test_size_reduction(uvm_plain_any):
 
     # Start the microvm.
     test_microvm.start()
+    test_microvm.wait_for_up()
     firecracker_pid = test_microvm.firecracker_pid
 
     # Check memory usage.
@@ -343,6 +346,7 @@ def test_stats(uvm_plain_any):
 
     # Start the microvm.
     test_microvm.start()
+    test_microvm.wait_for_up()
     firecracker_pid = test_microvm.firecracker_pid
 
     # Get an initial reading of the stats.
@@ -403,6 +407,7 @@ def test_stats_update(uvm_plain_any):
 
     # Start the microvm.
     test_microvm.start()
+    test_microvm.wait_for_up()
     firecracker_pid = test_microvm.firecracker_pid
 
     # Dirty 30MB of pages.
@@ -454,6 +459,7 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs):
     )
 
     vm.start()
+    vm.wait_for_up()
 
     # Dirty 60MB of pages.
     make_guest_dirty_memory(vm.ssh, amount_mib=60)
@@ -533,6 +539,7 @@ def test_memory_scrub(microvm_factory, guest_kernel, rootfs):
     )
 
     microvm.start()
+    microvm.wait_for_up()
 
     # Dirty 60MB of pages.
     make_guest_dirty_memory(microvm.ssh, amount_mib=60)

From 4c35583e3f6c583cb0cbae5cf7d3cd1f5743aa27 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Wed, 4 Sep 2024 08:51:59 +0100
Subject: [PATCH 7/7] test: fix check for major page faults in
 test_balloon.py::test_stats

Major page fault occur when a page fault can only be satisfied via disk
I/O [1]. In `test_stats`, we asserted that the number of major page
faults increased during execution of `make_guest_dirty_memory`, but this
cannot happen, as it uses MAP_ANONYMOUS, and thus will not trigger any
major page faults. The reason the test used to work in the past is that
it was not waiting for the VM to boot, so the first reading of the
balloon statistics happened before the boot process was finished.
However, during boot we _do_ expect major faults to happen, as things
need to be read from the rootfs.

Fix the test to instead just assert a non-zero number of major faults
happened during boot.

[1]: https://en.wikipedia.org/wiki/Page_fault#Major

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/integration_tests/functional/test_balloon.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 166a79b91dd..de6a14b554f 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -341,7 +341,9 @@ def test_stats(uvm_plain_any):
 
     # Add a memory balloon with stats enabled.
     test_microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
+        amount_mib=0,
+        deflate_on_oom=True,
+        stats_polling_interval_s=STATS_POLLING_INTERVAL_S,
     )
 
     # Start the microvm.
@@ -349,9 +351,18 @@ def test_stats(uvm_plain_any):
     test_microvm.wait_for_up()
     firecracker_pid = test_microvm.firecracker_pid
 
+    # Give Firecracker enough time to poll the stats at least once post-boot
+    time.sleep(STATS_POLLING_INTERVAL_S * 2)
+
     # Get an initial reading of the stats.
     initial_stats = test_microvm.api.balloon_stats.get().json()
 
+    # Major faults happen when a page fault has to be satisfied from disk. They are not
+    # triggered by our `make_guest_dirty_memory` workload, as it uses MAP_ANONYMOUS, which
+    # only triggers minor faults. However, during the boot process, things are read from the
+    # rootfs, so we should at least see a non-zero number of major faults.
+    assert initial_stats["major_faults"] > 0
+
     # Dirty 10MB of pages.
     make_guest_dirty_memory(test_microvm.ssh, amount_mib=10)
     time.sleep(1)
@@ -361,7 +372,6 @@ def test_stats(uvm_plain_any):
     # Make sure that the stats catch the page faults.
     after_workload_stats = test_microvm.api.balloon_stats.get().json()
     assert initial_stats.get("minor_faults", 0) < after_workload_stats["minor_faults"]
-    assert initial_stats.get("major_faults", 0) < after_workload_stats["major_faults"]
 
     # Now inflate the balloon with 10MB of pages.
     test_microvm.api.balloon.patch(amount_mib=10)