Skip to content

Commit

Permalink
chore: Upload FCMetrics with reduced dimensions
Browse files Browse the repository at this point in the history
FC metrics were uploaded to CW with the same dimensions
as the test to make it easy to map the metrics with the test
in CW. However, each test has a huge number of dimensions which
when multiplied with the number of FC metrics we upload add up to
too many datapoints stored in CW.
This makes CW slow, makes the dashboard hard to monitor, and also
significantly increases the billing.
So limit the dimensions to (cpu-host_version-guest_version) so
that we can monitor the dashboard properly and reduce the billing.
To map the datapoints with the test that emitted them we'll have
to use the datapoint timestamp with some CW query.

Signed-off-by: Sudan Landge <sudanl@amazon.com>
  • Loading branch information
sudanl0 committed Jan 26, 2024
1 parent c910e44 commit 6f16d70
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 12 deletions.
14 changes: 12 additions & 2 deletions tests/host_tools/fcmetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import jsonschema
import pytest

from framework.properties import global_props
from host_tools.metrics import get_metrics_logger


def validate_fc_metrics(metrics):
"""
Expand Down Expand Up @@ -434,15 +437,22 @@ class FCMetricsMonitor(Thread):
of the metrics.
"""

def __init__(self, vm, timer=60):
    """Initialize the FC metrics monitor thread for the given microvm.

    The monitor runs as a daemon thread and periodically flushes
    Firecracker metrics to CloudWatch. Dimensions are deliberately
    limited to instance/host_kernel/guest_kernel (instead of the full
    per-test dimension set) to cap the number of CloudWatch datapoints
    and keep dashboards and billing manageable.

    :param vm: the microvm whose metrics are monitored; must expose
        ``kernel_file`` (a path-like with a ``.stem``).
    :param timer: flush interval — presumably seconds; TODO confirm
        against ``_flush_metrics`` usage.
    """
    Thread.__init__(self, daemon=True)
    self.vm = vm
    self.timer = timer

    # Index of the next metrics entry to upload.
    self.metrics_index = 0
    # Set to True while the monitor loop should keep running.
    self.running = False

    # Fetch the shared metrics logger instead of taking one from the
    # caller, so every monitor uses the same reduced dimension set.
    self.metrics_logger = get_metrics_logger()
    self.metrics_logger.set_dimensions(
        {
            "instance": global_props.instance,
            "host_kernel": "linux-" + global_props.host_linux_version,
            # Drop the first two characters of the kernel file stem —
            # presumably a fixed prefix like "ci"/"vm"; verify against
            # the kernel artifact naming convention.
            "guest_kernel": vm.kernel_file.stem[2:],
        }
    )

def _flush_metrics(self):
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/integration_tests/performance/test_block_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def test_block_performance(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

vm.pin_threads(0)
Expand Down Expand Up @@ -227,7 +227,7 @@ def test_block_vhost_user_performance(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

next_cpu = vm.pin_threads(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_memory_overhead(
metrics.set_dimensions(
{"performance_test": "test_memory_overhead", **microvm.dimensions}
)
fcmetrics = FCMetricsMonitor(microvm, metrics)
fcmetrics = FCMetricsMonitor(microvm)
fcmetrics.start()

# check that the vm is running
Expand Down
4 changes: 2 additions & 2 deletions tests/integration_tests/performance/test_network_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_network_latency(network_microvm, metrics, iteration):
"iteration": str(iteration),
}
)
fcmetrics = FCMetricsMonitor(network_microvm, metrics)
fcmetrics = FCMetricsMonitor(network_microvm)
fcmetrics.start()

samples = []
Expand Down Expand Up @@ -160,7 +160,7 @@ def test_network_tcp_throughput(
**network_microvm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(network_microvm, metrics)
fcmetrics = FCMetricsMonitor(network_microvm)
fcmetrics.start()

test = TcpIPerf3Test(
Expand Down
7 changes: 3 additions & 4 deletions tests/integration_tests/performance/test_snapshot_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def configure_vm(

return vm

def sample_latency(self, microvm_factory, snapshot, metrics_logger) -> List[float]:
def sample_latency(self, microvm_factory, snapshot) -> List[float]:
"""Collects latency samples for the microvm configuration specified by this instance"""
values = []

Expand All @@ -88,7 +88,7 @@ def sample_latency(self, microvm_factory, snapshot, metrics_logger) -> List[floa
microvm.spawn()
microvm.restore_from_snapshot(snapshot, resume=True)

fcmetrics = FCMetricsMonitor(microvm, metrics_logger)
fcmetrics = FCMetricsMonitor(microvm)
fcmetrics.start()

# Check if guest still runs commands.
Expand Down Expand Up @@ -152,7 +152,7 @@ def test_restore_latency(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

snapshot = vm.snapshot_full()
Expand All @@ -162,7 +162,6 @@ def test_restore_latency(
samples = test_setup.sample_latency(
microvm_factory,
snapshot,
metrics,
)

for sample in samples:
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_tests/performance/test_vsock_ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_vsock_throughput(
**vm.dimensions,
}
)
fcmetrics = FCMetricsMonitor(vm, metrics)
fcmetrics = FCMetricsMonitor(vm)
fcmetrics.start()

vm.pin_threads(0)
Expand Down

0 comments on commit 6f16d70

Please sign in to comment.