Skip to content

Commit

Permalink
chore: Upload FCMetrics with reduced dimensions
Browse files Browse the repository at this point in the history
FC metrics were uploaded to CW with the same dimensions
as the test to make it easy to map the metrics with the test
in CW. However, each test has a huge number of dimensions which
when multiplied with the number of FC metrics we upload add up to
too many datapoints stored in CW.
This makes CW slow, makes the dashboard hard to monitor, and also
significantly increases the billing.
So limit the dimensions to (cpu-host_version-guest_version) so
that we can monitor the dashboard properly and reduce the billing.
To map the datapoints with the test that emitted them we'll have
to use the datapoint timestamp with some CW query.

Signed-off-by: Sudan Landge <sudanl@amazon.com>
  • Loading branch information
sudanl0 committed Jan 26, 2024
1 parent c910e44 commit 6f16d70
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 12 deletions.
14 changes: 12 additions & 2 deletions tests/host_tools/fcmetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import jsonschema
import pytest

from framework.properties import global_props
from host_tools.metrics import get_metrics_logger


def validate_fc_metrics(metrics):
"""
Expand Down Expand Up @@ -434,15 +437,22 @@ class FCMetricsMonitor(Thread):
of the metrics.
"""

def __init__(self, vm, timer=60):
    """Initialize the FC metrics monitor thread for the given microvm.

    The monitor runs as a daemon thread and periodically flushes
    Firecracker metrics to CloudWatch. Dimensions are deliberately
    limited to instance/host_kernel/guest_kernel (instead of the full
    per-test dimension set) to cap the number of CloudWatch datapoints
    and keep dashboards and billing manageable.

    :param vm: the microvm whose metrics are monitored; must expose
        ``kernel_file`` (a path-like with a ``.stem``).
    :param timer: flush interval — presumably seconds; TODO confirm
        against ``_flush_metrics`` usage.
    """
    Thread.__init__(self, daemon=True)
    self.vm = vm
    self.timer = timer

    # Index of the next metrics entry to upload.
    self.metrics_index = 0
    # Set to True while the monitor loop should keep running.
    self.running = False

    # Fetch the shared metrics logger instead of taking one from the
    # caller, so every monitor uses the same reduced dimension set.
    self.metrics_logger = get_metrics_logger()
    self.metrics_logger.set_dimensions(
        {
            "instance": global_props.instance,
            "host_kernel": "linux-" + global_props.host_linux_version,
            # Drop the first two characters of the kernel file stem —
            # presumably a fixed prefix like "ci"/"vm"; verify against
            # the kernel artifact naming convention.
            "guest_kernel": vm.kernel_file.stem[2:],
        }
    )

def _flush_metrics(self):
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/integration_tests/performance/test_block_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def test_block_performance(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

vm.pin_threads(0)
Expand Down Expand Up @@ -227,7 +227,7 @@ def test_block_vhost_user_performance(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

next_cpu = vm.pin_threads(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_memory_overhead(
metrics.set_dimensions(
{"performance_test": "test_memory_overhead", **microvm.dimensions}
)
fcmetrics = FCMetricsMonitor(microvm, metrics)
fcmetrics = FCMetricsMonitor(microvm)
fcmetrics.start()

# check that the vm is running
Expand Down
4 changes: 2 additions & 2 deletions tests/integration_tests/performance/test_network_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_network_latency(network_microvm, metrics, iteration):
"iteration": str(iteration),
}
)
fcmetrics = FCMetricsMonitor(network_microvm, metrics)
fcmetrics = FCMetricsMonitor(network_microvm)
fcmetrics.start()

samples = []
Expand Down Expand Up @@ -160,7 +160,7 @@ def test_network_tcp_throughput(
**network_microvm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(network_microvm, metrics)
fcmetrics = FCMetricsMonitor(network_microvm)
fcmetrics.start()

test = TcpIPerf3Test(
Expand Down
7 changes: 3 additions & 4 deletions tests/integration_tests/performance/test_snapshot_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def configure_vm(

return vm

def sample_latency(self, microvm_factory, snapshot, metrics_logger) -> List[float]:
def sample_latency(self, microvm_factory, snapshot) -> List[float]:
"""Collects latency samples for the microvm configuration specified by this instance"""
values = []

Expand All @@ -88,7 +88,7 @@ def sample_latency(self, microvm_factory, snapshot, metrics_logger) -> List[floa
microvm.spawn()
microvm.restore_from_snapshot(snapshot, resume=True)

fcmetrics = FCMetricsMonitor(microvm, metrics_logger)
fcmetrics = FCMetricsMonitor(microvm)
fcmetrics.start()

# Check if guest still runs commands.
Expand Down Expand Up @@ -152,7 +152,7 @@ def test_restore_latency(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

snapshot = vm.snapshot_full()
Expand All @@ -162,7 +162,6 @@ def test_restore_latency(
samples = test_setup.sample_latency(
microvm_factory,
snapshot,
metrics,
)

for sample in samples:
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_tests/performance/test_vsock_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_vsock_throughput(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

vm.pin_threads(0)
Expand Down

0 comments on commit 6f16d70

Please sign in to comment.