From dd6783b3f485ec91501da00c3aa441b43789f746 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Thu, 17 Oct 2024 13:21:07 +0100
Subject: [PATCH 1/7] test: dump debug information whenever `ssh.check_output`
 fails

Print microvm logs and stack backtraces whenever running a command via
SSH inside the guest fails (instead of only doing this if
`wait_for_ssh_up` fails).

To this by adding an `on_error` hook that gets called from
`SSHConnection.check_output` if running the SSH command fails.
Potentially, this could later be extended to automatically snapshot the
microvm inside the hook, but for now, let's only print some data.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/framework/microvm.py  | 27 ++++++++++++++++-----------
 tests/host_tools/network.py | 25 ++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py
index 7a0b57cf610..ed02488bc7b 100644
--- a/tests/framework/microvm.py
+++ b/tests/framework/microvm.py
@@ -983,6 +983,7 @@ def ssh_iface(self, iface_idx=0):
             ssh_key=self.ssh_key,
             user="root",
             host=guest_ip,
+            on_error=self._dump_debug_information,
         )
 
     @property
@@ -1002,19 +1003,23 @@ def thread_backtraces(self):
                 )
         return "\n".join(backtraces)
 
+    def _dump_debug_information(self, exc: Exception):
+        """
+        Dumps debug information about this microvm
+
+        Used for example when running a command inside the guest via `SSHConnection.check_output` fails.
+        """
+        print(
+            f"Failure executing command via SSH in microVM: {exc}\n\n"
+            f"Firecracker logs:\n{self.log_data}\n"
+            f"Thread backtraces:\n{self.thread_backtraces}"
+        )
+
     def wait_for_ssh_up(self):
         """Wait for guest running inside the microVM to come up and respond."""
-        try:
-            # Ensure that we have an initialized SSH connection to the guest that can
-            # run commands. The actual connection retry loop happens in SSHConnection._init_connection
-            self.ssh_iface(0)
-        except Exception as exc:
-            print(
-                f"Failed to establish SSH connection to guest: {exc}\n\n"
-                f"Firecracker logs:\n{self.log_data}\n"
-                f"Thread backtraces:\n{self.thread_backtraces}"
-            )
-            raise
+        # Ensure that we have an initialized SSH connection to the guest that can
+        # run commands. The actual connection retry loop happens in SSHConnection._init_connection
+        _ = self.ssh_iface(0)
 
 
 class MicroVMFactory:
diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py
index 4aba89da603..5175084b90f 100644
--- a/tests/host_tools/network.py
+++ b/tests/host_tools/network.py
@@ -25,7 +25,7 @@ class SSHConnection:
     ssh -i ssh_key_path username@hostname
     """
 
-    def __init__(self, netns, ssh_key: Path, host, user):
+    def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
         """Instantiate a SSH client and connect to a microVM."""
         self.netns = netns
         self.ssh_key = ssh_key
@@ -37,6 +37,8 @@ def __init__(self, netns, ssh_key: Path, host, user):
         self.host = host
         self.user = user
 
+        self._on_error = None
+
         self.options = [
             "-o",
             "LogLevel=ERROR",
@@ -52,7 +54,18 @@ def __init__(self, netns, ssh_key: Path, host, user):
             str(self.ssh_key),
         ]
 
-        self._init_connection()
+        # _init_connection loops until it can connect to the guest
+        # dumping debug state on every iteration is not useful or wanted, so
+        # only dump it once if _all_ iterations fail.
+        try:
+            self._init_connection()
+        except Exception as exc:
+            if on_error:
+                on_error(exc)
+
+            raise
+
+        self._on_error = on_error
 
     def remote_path(self, path):
         """Convert a path to remote"""
@@ -123,7 +136,13 @@ def _exec(self, cmd, timeout=None, check=False):
         if self.netns is not None:
             cmd = ["ip", "netns", "exec", self.netns] + cmd
 
-        return utils.run_cmd(cmd, check=check, timeout=timeout)
+        try:
+            return utils.run_cmd(cmd, check=check, timeout=timeout)
+        except Exception as exc:
+            if self._on_error:
+                self._on_error(exc)
+
+            raise
 
 
 def mac_from_ip(ip_address):

From 6461b20297411e382e61dcd2eb274a5198e7c66b Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Fri, 18 Oct 2024 16:47:15 +0100
Subject: [PATCH 2/7] test: stop logging warnings in run_cmd if exception is
 thrown

If the exception ends up being fatal, then it will get printed in full
anyway, and it contains the entire log message anyway, and we just end
up with duplicate messages. If its not fatal (e.g. caught), then its
arguably up to the caller to decide whether it should be logged.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/framework/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/framework/utils.py b/tests/framework/utils.py
index b5fb324cbd7..88e520fa5f3 100644
--- a/tests/framework/utils.py
+++ b/tests/framework/utils.py
@@ -415,8 +415,6 @@ def run_cmd(cmd, check=False, shell=True, cwd=None, timeout=None) -> CommandRetu
 
     # If a non-zero return code was thrown, raise an exception
     if check and proc.returncode != 0:
-        CMDLOG.warning("Command failed: %s\n", output_message)
-
         raise ChildProcessError(output_message)
 
     CMDLOG.debug(output_message)

From b1013563a6d1a994b41970c82b6f5057771ab7b7 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Mon, 21 Oct 2024 07:18:30 +0100
Subject: [PATCH 3/7] test_5_snapshots: clarify variable usage

Only keep a single "snapshot" variable outside the loop

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 .../functional/test_snapshot_basic.py                 | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py
index e07175a662b..ac596440f67 100644
--- a/tests/integration_tests/functional/test_snapshot_basic.py
+++ b/tests/integration_tests/functional/test_snapshot_basic.py
@@ -157,7 +157,6 @@ def test_5_snapshots(
     # Create a snapshot from a microvm.
     start_guest_echo_server(vm)
     snapshot = vm.make_snapshot(snapshot_type)
-    base_snapshot = snapshot
     vm.kill()
 
     for i in range(seq_len):
@@ -183,20 +182,20 @@ def test_5_snapshots(
 
         time.sleep(2)
         logger.info("Create snapshot %s #%d.", snapshot_type, i + 1)
-        snapshot = microvm.make_snapshot(snapshot_type)
+        new_snapshot = microvm.make_snapshot(snapshot_type)
 
         # If we are testing incremental snapshots we must merge the base with
         # current layer.
         if snapshot.is_diff:
-            logger.info("Base: %s, Layer: %s", base_snapshot.mem, snapshot.mem)
-            snapshot = snapshot.rebase_snapshot(
-                base_snapshot, use_snapshot_editor=use_snapshot_editor
+            logger.info("Base: %s, Layer: %s", snapshot.mem, new_snapshot.mem)
+            new_snapshot = new_snapshot.rebase_snapshot(
+                snapshot, use_snapshot_editor=use_snapshot_editor
             )
 
         microvm.kill()
         copied_snapshot.delete()
         # Update the base for next iteration.
-        base_snapshot = snapshot
+        snapshot = new_snapshot
 
 
 def test_patch_drive_snapshot(uvm_nano, microvm_factory):

From c0631e87529d17a51e6ef9fa88c5d2d57359680c Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Mon, 21 Oct 2024 07:28:01 +0100
Subject: [PATCH 4/7] test(vsock): Do not exit with 255 in remote bash script

Exit code 255 is already used by the SSH client to communicate
connection errors [1], meaning if the remote command also exits with 255
in failure cases, this will be indistinguishable from a connection
error. Use exit code 1 instead in `check_guest_connections`.

[1]: https://man.openbsd.org/ssh#EXIT_STATUS

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/framework/utils_vsock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/framework/utils_vsock.py b/tests/framework/utils_vsock.py
index 408aacd7f2f..1f43bb6c897 100644
--- a/tests/framework/utils_vsock.py
+++ b/tests/framework/utils_vsock.py
@@ -180,7 +180,7 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash):
     cmd += "  ({})& ".format(worker_cmd)
     cmd += '  workers="$workers $!";'
     cmd += "done;"
-    cmd += "for w in $workers; do wait $w || exit -1; done"
+    cmd += "for w in $workers; do wait $w || exit 1; done"
 
     ecode, _, stderr = vm.ssh.run(cmd)
     echo_server.terminate()

From 6378988a788a1a9b2f84e685dd0e0d4a191a67a1 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Mon, 21 Oct 2024 07:29:37 +0100
Subject: [PATCH 5/7] test(vsock): wait before exit-ing in
 check_guest_connections

Even if one child process failed, still call `wait` to ensure all other
child vsock_helper processes also exit, to ensure clean socket
teardown/etc.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/framework/utils_vsock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/framework/utils_vsock.py b/tests/framework/utils_vsock.py
index 1f43bb6c897..3f6885e3afd 100644
--- a/tests/framework/utils_vsock.py
+++ b/tests/framework/utils_vsock.py
@@ -180,7 +180,7 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash):
     cmd += "  ({})& ".format(worker_cmd)
     cmd += '  workers="$workers $!";'
     cmd += "done;"
-    cmd += "for w in $workers; do wait $w || exit 1; done"
+    cmd += "for w in $workers; do wait $w || (wait; exit 1); done"
 
     ecode, _, stderr = vm.ssh.run(cmd)
     echo_server.terminate()

From 3ae42d670b513e702c500109bf9d4b39f42cb889 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Mon, 21 Oct 2024 07:32:04 +0100
Subject: [PATCH 6/7] test(vsock_helper): close sockets

After we're done with a socket, close it, to avoid leaking resources.

We do not need to rebuild the rootfs for this, this specific binary is
compiled on the fly and SCP'd into the microVM.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/host_tools/vsock_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/host_tools/vsock_helper.c b/tests/host_tools/vsock_helper.c
index 00424dd6b6e..9368ffd79d6 100644
--- a/tests/host_tools/vsock_helper.c
+++ b/tests/host_tools/vsock_helper.c
@@ -84,7 +84,7 @@ int run_echo(uint32_t cid, uint32_t port) {
         }
     }
 
-    return 0;
+    return close(sock);
 }
 
 

From 8cfed0dd87ec11309c4f070a6431c3b46d2027c9 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Mon, 21 Oct 2024 07:33:06 +0100
Subject: [PATCH 7/7] test(net): relax timeouts

The main reason we have timeouts is so that we can get some useful
debugging information in case they do happen. This means the timesouts
only need to be strict enough to avoid running into the pytest timeout
instead. 10s/1s however still gets us spurious failures, since in the
non-performance tests contention on host networking resources (we use a
TON of networking namespaces) is pretty bad. So increase the timeouts to
something that still allows us to not run into the pytest timeout, while
hopefully returning spurious failures once and for all.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 tests/host_tools/network.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py
index 5175084b90f..ba460a74233 100644
--- a/tests/host_tools/network.py
+++ b/tests/host_tools/network.py
@@ -103,7 +103,7 @@ def _init_connection(self):
         We'll keep trying to execute a remote command that can't fail
         (`/bin/true`), until we get a successful (0) exit code.
         """
-        self.check_output("true", timeout=10, debug=True)
+        self.check_output("true", timeout=100, debug=True)
 
     def run(self, cmd_string, timeout=None, *, check=False, debug=False):
         """