From dd6783b3f485ec91501da00c3aa441b43789f746 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 17 Oct 2024 13:21:07 +0100 Subject: [PATCH 1/7] test: dump debug information whenever `ssh.check_output` fails Print microvm logs and stack backtraces whenever running a command via SSH inside the guest fails (instead of only doing this if `wait_for_ssh_up` fails). To this by adding an `on_error` hook that gets called from `SSHConnection.check_output` if running the SSH command fails. Potentially, this could later be extended to automatically snapshot the microvm inside the hook, but for now, let's only print some data. Signed-off-by: Patrick Roy --- tests/framework/microvm.py | 27 ++++++++++++++++----------- tests/host_tools/network.py | 25 ++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 7a0b57cf610..ed02488bc7b 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -983,6 +983,7 @@ def ssh_iface(self, iface_idx=0): ssh_key=self.ssh_key, user="root", host=guest_ip, + on_error=self._dump_debug_information, ) @property @@ -1002,19 +1003,23 @@ def thread_backtraces(self): ) return "\n".join(backtraces) + def _dump_debug_information(self, exc: Exception): + """ + Dumps debug information about this microvm + + Used for example when running a command inside the guest via `SSHConnection.check_output` fails. + """ + print( + f"Failure executing command via SSH in microVM: {exc}\n\n" + f"Firecracker logs:\n{self.log_data}\n" + f"Thread backtraces:\n{self.thread_backtraces}" + ) + def wait_for_ssh_up(self): """Wait for guest running inside the microVM to come up and respond.""" - try: - # Ensure that we have an initialized SSH connection to the guest that can - # run commands. The actual connection retry loop happens in SSHConnection._init_connection - self.ssh_iface(0) - except Exception as exc: - print( - f"Failed to establish SSH connection to guest: {exc}\n\n" - f"Firecracker logs:\n{self.log_data}\n" - f"Thread backtraces:\n{self.thread_backtraces}" - ) - raise + # Ensure that we have an initialized SSH connection to the guest that can + # run commands. The actual connection retry loop happens in SSHConnection._init_connection + _ = self.ssh_iface(0) class MicroVMFactory: diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py index 4aba89da603..5175084b90f 100644 --- a/tests/host_tools/network.py +++ b/tests/host_tools/network.py @@ -25,7 +25,7 @@ class SSHConnection: ssh -i ssh_key_path username@hostname """ - def __init__(self, netns, ssh_key: Path, host, user): + def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None): """Instantiate a SSH client and connect to a microVM.""" self.netns = netns self.ssh_key = ssh_key @@ -37,6 +37,8 @@ def __init__(self, netns, ssh_key: Path, host, user): self.host = host self.user = user + self._on_error = None + self.options = [ "-o", "LogLevel=ERROR", @@ -52,7 +54,18 @@ def __init__(self, netns, ssh_key: Path, host, user): str(self.ssh_key), ] - self._init_connection() + # _init_connection loops until it can connect to the guest + # dumping debug state on every iteration is not useful or wanted, so + # only dump it once if _all_ iterations fail. + try: + self._init_connection() + except Exception as exc: + if on_error: + on_error(exc) + + raise + + self._on_error = on_error def remote_path(self, path): """Convert a path to remote""" @@ -123,7 +136,13 @@ def _exec(self, cmd, timeout=None, check=False): if self.netns is not None: cmd = ["ip", "netns", "exec", self.netns] + cmd - return utils.run_cmd(cmd, check=check, timeout=timeout) + try: + return utils.run_cmd(cmd, check=check, timeout=timeout) + except Exception as exc: + if self._on_error: + self._on_error(exc) + + raise def mac_from_ip(ip_address): From 6461b20297411e382e61dcd2eb274a5198e7c66b Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 18 Oct 2024 16:47:15 +0100 Subject: [PATCH 2/7] test: stop logging warnings in run_cmd if exception is thrown If the exception ends up being fatal, then it will get printed in full anyway, and it contains the entire log message anyway, and we just end up with duplicate messages. If its not fatal (e.g. caught), then its arguably up to the caller to decide whether it should be logged. Signed-off-by: Patrick Roy --- tests/framework/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/framework/utils.py b/tests/framework/utils.py index b5fb324cbd7..88e520fa5f3 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -415,8 +415,6 @@ def run_cmd(cmd, check=False, shell=True, cwd=None, timeout=None) -> CommandRetu # If a non-zero return code was thrown, raise an exception if check and proc.returncode != 0: - CMDLOG.warning("Command failed: %s\n", output_message) - raise ChildProcessError(output_message) CMDLOG.debug(output_message) From b1013563a6d1a994b41970c82b6f5057771ab7b7 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 21 Oct 2024 07:18:30 +0100 Subject: [PATCH 3/7] test_5_snapshots: clarify variable usage Only keep a single "snapshot" variable outside the loop Signed-off-by: Patrick Roy --- .../functional/test_snapshot_basic.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index e07175a662b..ac596440f67 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -157,7 +157,6 @@ def test_5_snapshots( # Create a snapshot from a microvm. start_guest_echo_server(vm) snapshot = vm.make_snapshot(snapshot_type) - base_snapshot = snapshot vm.kill() for i in range(seq_len): @@ -183,20 +182,20 @@ def test_5_snapshots( time.sleep(2) logger.info("Create snapshot %s #%d.", snapshot_type, i + 1) - snapshot = microvm.make_snapshot(snapshot_type) + new_snapshot = microvm.make_snapshot(snapshot_type) # If we are testing incremental snapshots we must merge the base with # current layer. if snapshot.is_diff: - logger.info("Base: %s, Layer: %s", base_snapshot.mem, snapshot.mem) - snapshot = snapshot.rebase_snapshot( - base_snapshot, use_snapshot_editor=use_snapshot_editor + logger.info("Base: %s, Layer: %s", snapshot.mem, new_snapshot.mem) + new_snapshot = new_snapshot.rebase_snapshot( + snapshot, use_snapshot_editor=use_snapshot_editor ) microvm.kill() copied_snapshot.delete() # Update the base for next iteration. - base_snapshot = snapshot + snapshot = new_snapshot def test_patch_drive_snapshot(uvm_nano, microvm_factory): From c0631e87529d17a51e6ef9fa88c5d2d57359680c Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 21 Oct 2024 07:28:01 +0100 Subject: [PATCH 4/7] test(vsock): Do not exit with 255 in remote bash script Exit code 255 is already used by the SSH client to communicate connection errors [1], meaning if the remote command also exits with 255 in failure cases, this will be indistinguishable from a connection error. Use exit code 1 instead in `check_guest_connections`. [1]: https://man.openbsd.org/ssh#EXIT_STATUS Signed-off-by: Patrick Roy --- tests/framework/utils_vsock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/framework/utils_vsock.py b/tests/framework/utils_vsock.py index 408aacd7f2f..1f43bb6c897 100644 --- a/tests/framework/utils_vsock.py +++ b/tests/framework/utils_vsock.py @@ -180,7 +180,7 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash): cmd += " ({})& ".format(worker_cmd) cmd += ' workers="$workers $!";' cmd += "done;" - cmd += "for w in $workers; do wait $w || exit -1; done" + cmd += "for w in $workers; do wait $w || exit 1; done" ecode, _, stderr = vm.ssh.run(cmd) echo_server.terminate() From 6378988a788a1a9b2f84e685dd0e0d4a191a67a1 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 21 Oct 2024 07:29:37 +0100 Subject: [PATCH 5/7] test(vsock): wait before exit-ing in check_guest_connections Even if one child process failed, still call `wait` to ensure all other child vsock_helper processes also exit, to ensure clean socket teardown/etc. Signed-off-by: Patrick Roy --- tests/framework/utils_vsock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/framework/utils_vsock.py b/tests/framework/utils_vsock.py index 1f43bb6c897..3f6885e3afd 100644 --- a/tests/framework/utils_vsock.py +++ b/tests/framework/utils_vsock.py @@ -180,7 +180,7 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash): cmd += " ({})& ".format(worker_cmd) cmd += ' workers="$workers $!";' cmd += "done;" - cmd += "for w in $workers; do wait $w || exit 1; done" + cmd += "for w in $workers; do wait $w || (wait; exit 1); done" ecode, _, stderr = vm.ssh.run(cmd) echo_server.terminate() From 3ae42d670b513e702c500109bf9d4b39f42cb889 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 21 Oct 2024 07:32:04 +0100 Subject: [PATCH 6/7] test(vsock_helper): close sockets After we're done with a socket, close it, to avoid leaking resources. We do not need to rebuild the rootfs for this, this specific binary is compiled on the fly and SCP'd into the microVM. Signed-off-by: Patrick Roy --- tests/host_tools/vsock_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/host_tools/vsock_helper.c b/tests/host_tools/vsock_helper.c index 00424dd6b6e..9368ffd79d6 100644 --- a/tests/host_tools/vsock_helper.c +++ b/tests/host_tools/vsock_helper.c @@ -84,7 +84,7 @@ int run_echo(uint32_t cid, uint32_t port) { } } - return 0; + return close(sock); } From 8cfed0dd87ec11309c4f070a6431c3b46d2027c9 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 21 Oct 2024 07:33:06 +0100 Subject: [PATCH 7/7] test(net): relax timeouts The main reason we have timeouts is so that we can get some useful debugging information in case they do happen. This means the timesouts only need to be strict enough to avoid running into the pytest timeout instead. 10s/1s however still gets us spurious failures, since in the non-performance tests contention on host networking resources (we use a TON of networking namespaces) is pretty bad. So increase the timeouts to something that still allows us to not run into the pytest timeout, while hopefully returning spurious failures once and for all. Signed-off-by: Patrick Roy --- tests/host_tools/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py index 5175084b90f..ba460a74233 100644 --- a/tests/host_tools/network.py +++ b/tests/host_tools/network.py @@ -103,7 +103,7 @@ def _init_connection(self): We'll keep trying to execute a remote command that can't fail (`/bin/true`), until we get a successful (0) exit code. """ - self.check_output("true", timeout=10, debug=True) + self.check_output("true", timeout=100, debug=True) def run(self, cmd_string, timeout=None, *, check=False, debug=False): """