Skip to content

Commit

Permalink
chore: move validation of FC metrics to host_tools
Browse files Browse the repository at this point in the history
test_cli_metrics_path and test_flush_metrics both validated breaking
changes in FC metrics, so move the validation functions to host_tools,
which already had CI-metrics-related functions defined.

Signed-off-by: Sudan Landge <sudanl@amazon.com>
  • Loading branch information
Sudan Landge committed Oct 31, 2023
1 parent 02ee882 commit e48694f
Show file tree
Hide file tree
Showing 3 changed files with 327 additions and 353 deletions.
324 changes: 323 additions & 1 deletion tests/host_tools/metrics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""Fixture to send metrics to AWS CloudWatch
"""Fixture to send metrics to AWS CloudWatch and validate Firecracker metrics
We use the aws-embedded-metrics library although it has some sharp corners,
namely:
Expand Down Expand Up @@ -43,11 +43,15 @@
"""

import asyncio
import datetime
import json
import math
import os
import platform
import socket
from urllib.parse import urlparse

import jsonschema
from aws_embedded_metrics.constants import DEFAULT_NAMESPACE
from aws_embedded_metrics.logger.metrics_logger_factory import create_metrics_logger

Expand Down Expand Up @@ -200,3 +204,321 @@ def format_with_reduced_unit(value, unit):
formatted_unit = UNIT_SHORTHANDS.get(reduced_unit, reduced_unit)

return f"{reduced_value:.2f}{formatted_unit}"


# Expected layout of a Firecracker metrics dump: maps each metrics group
# name (as emitted by the FC metrics system) to the list of numeric fields
# that group must contain.  validate_fc_metrics() turns this table into a
# jsonschema, so it acts as the breaking-change guard for FC metrics.
# Architecture-specific groups (e.g. "rtc" on aarch64) are appended at
# validation time rather than listed here.
FirecrackerMetrics = {
    "api_server": [
        "process_startup_time_us",
        "process_startup_time_cpu_us",
        "sync_response_fails",
        "sync_vmm_send_timeout_count",
    ],
    "balloon": [
        "activate_fails",
        "inflate_count",
        "stats_updates_count",
        "stats_update_fails",
        "deflate_count",
        "event_fails",
    ],
    "block": [
        "activate_fails",
        "cfg_fails",
        "no_avail_buffer",
        "event_fails",
        "execute_fails",
        "invalid_reqs_count",
        "flush_count",
        "queue_event_count",
        "rate_limiter_event_count",
        "update_count",
        "update_fails",
        "read_bytes",
        "write_bytes",
        "read_count",
        "write_count",
        "rate_limiter_throttled_events",
        "io_engine_throttled_events",
    ],
    "deprecated_api": [
        "deprecated_http_api_calls",
        "deprecated_cmd_line_api_calls",
    ],
    "get_api_requests": [
        "instance_info_count",
        "machine_cfg_count",
        "mmds_count",
        "vmm_version_count",
    ],
    "i8042": [
        "error_count",
        "missed_read_count",
        "missed_write_count",
        "read_count",
        "reset_count",
        "write_count",
    ],
    "latencies_us": [
        "full_create_snapshot",
        "diff_create_snapshot",
        "load_snapshot",
        "pause_vm",
        "resume_vm",
        "vmm_full_create_snapshot",
        "vmm_diff_create_snapshot",
        "vmm_load_snapshot",
        "vmm_pause_vm",
        "vmm_resume_vm",
    ],
    "logger": [
        "missed_metrics_count",
        "metrics_fails",
        "missed_log_count",
        "log_fails",
    ],
    "mmds": [
        "rx_accepted",
        "rx_accepted_err",
        "rx_accepted_unusual",
        "rx_bad_eth",
        "rx_count",
        "tx_bytes",
        "tx_count",
        "tx_errors",
        "tx_frames",
        "connections_created",
        "connections_destroyed",
    ],
    "net": [
        "activate_fails",
        "cfg_fails",
        "mac_address_updates",
        "no_rx_avail_buffer",
        "no_tx_avail_buffer",
        "event_fails",
        "rx_queue_event_count",
        "rx_event_rate_limiter_count",
        "rx_partial_writes",
        "rx_rate_limiter_throttled",
        "rx_tap_event_count",
        "rx_bytes_count",
        "rx_packets_count",
        "rx_fails",
        "rx_count",
        "tap_read_fails",
        "tap_write_fails",
        "tx_bytes_count",
        "tx_malformed_frames",
        "tx_fails",
        "tx_count",
        "tx_packets_count",
        "tx_partial_reads",
        "tx_queue_event_count",
        "tx_rate_limiter_event_count",
        "tx_rate_limiter_throttled",
        "tx_spoofed_mac_count",
    ],
    "patch_api_requests": [
        "drive_count",
        "drive_fails",
        "network_count",
        "network_fails",
        "machine_cfg_count",
        "machine_cfg_fails",
        "mmds_count",
        "mmds_fails",
    ],
    "put_api_requests": [
        "actions_count",
        "actions_fails",
        "boot_source_count",
        "boot_source_fails",
        "drive_count",
        "drive_fails",
        "logger_count",
        "logger_fails",
        "machine_cfg_count",
        "machine_cfg_fails",
        "cpu_cfg_count",
        "cpu_cfg_fails",
        "metrics_count",
        "metrics_fails",
        "network_count",
        "network_fails",
        "mmds_count",
        "mmds_fails",
        "vsock_count",
        "vsock_fails",
    ],
    "seccomp": [
        "num_faults",
    ],
    "vcpu": [
        "exit_io_in",
        "exit_io_out",
        "exit_mmio_read",
        "exit_mmio_write",
        "failures",
    ],
    "vmm": [
        "device_events",
        "panic_count",
    ],
    "uart": [
        "error_count",
        "flush_count",
        "missed_read_count",
        "missed_write_count",
        "read_count",
        "write_count",
    ],
    "signals": [
        "sigbus",
        "sigsegv",
        "sigxfsz",
        "sigxcpu",
        "sigpipe",
        "sighup",
        "sigill",
    ],
    "vsock": [
        "activate_fails",
        "cfg_fails",
        "rx_queue_event_fails",
        "tx_queue_event_fails",
        "ev_queue_event_fails",
        "muxer_event_fails",
        "conn_event_fails",
        "rx_queue_event_count",
        "tx_queue_event_count",
        "rx_bytes_count",
        "tx_bytes_count",
        "rx_packets_count",
        "tx_packets_count",
        "conns_added",
        "conns_killed",
        "conns_removed",
        "killq_resync",
        "tx_flush_fails",
        "tx_write_fails",
        "rx_read_fails",
    ],
    "entropy": [
        "activate_fails",
        "entropy_event_fails",
        "entropy_event_count",
        "entropy_bytes",
        "host_rng_fails",
        "entropy_rate_limiter_throttled",
        "rate_limiter_event_count",
    ],
}


def validate_fc_metrics(metrics):
    """Validate that a flushed Firecracker metrics dump is complete.

    Builds a jsonschema from ``FirecrackerMetrics`` (plus the aarch64-only
    ``rtc`` group) and checks that every expected metrics group, and every
    field inside each group, is present and numeric.  Then proves the schema
    really validates individual fields (not just top-level groups) by
    removing one field and expecting a validation failure.  Finally,
    sanity-checks ``utc_timestamp_ms`` against the current UTC time.

    :param metrics: metrics dict as returned by ``microvm.flush_metrics()``
    :raises jsonschema.exceptions.ValidationError: if a group or field is
        missing from ``metrics``
    :raises AssertionError: if the schema fails to flag a removed field, or
        the reported timestamp is not a current Unix-epoch UTC timestamp
    """
    # Work on a copy so the module-level table is never mutated as a side
    # effect of calling this function.
    expected_metrics = dict(FirecrackerMetrics)
    if platform.machine() == "aarch64":
        # The RTC device only exists on aarch64.
        expected_metrics["rtc"] = [
            "error_count",
            "missed_read_count",
            "missed_write_count",
        ]

    firecracker_metrics_schema = {
        "type": "object",
        "properties": {},
        "required": [],
    }
    for group_name, group_fields in expected_metrics.items():
        firecracker_metrics_schema["properties"][group_name] = {
            "type": "object",
            "required": group_fields,
            "properties": {field: {"type": "number"} for field in group_fields},
        }
        firecracker_metrics_schema["required"].append(group_name)

    jsonschema.validate(instance=metrics, schema=firecracker_metrics_schema)

    def check_field_is_validated(group, field):
        """Pop one field and confirm the schema rejects the dump, proving
        that fields (and not just top-level groups) are validated."""
        removed_value = metrics[group].pop(field)
        try:
            jsonschema.validate(
                instance=metrics, schema=firecracker_metrics_schema
            )
            # The schema accepted a dump with a missing field: the
            # field-level validation is broken, so fail loudly instead of
            # silently passing as the old code did.
            raise AssertionError(
                f"schema did not flag missing field '{field}' in '{group}'"
            )
        except jsonschema.exceptions.ValidationError as error:
            # Only the expected "required property" failure is acceptable.
            if error.message.strip() != f"'{field}' is a required property":
                raise
        finally:
            # Always restore the dump so callers see it unmodified.
            metrics[group][field] = removed_value

    check_field_is_validated("api_server", "process_startup_time_us")
    if platform.machine() == "aarch64":
        check_field_is_validated("rtc", "error_count")

    utc_time = datetime.datetime.now(datetime.timezone.utc)
    utc_timestamp_ms = math.floor(utc_time.timestamp() * 1000)

    # Assert that the absolute difference is less than 1 second, to check
    # that the reported utc_timestamp_ms is actually a UTC timestamp from
    # the Unix Epoch. Regression test for:
    # https://github.com/firecracker-microvm/firecracker/issues/2639
    assert abs(utc_timestamp_ms - metrics["utc_timestamp_ms"]) < 1000


class FcDeviceMetrics:
    """
    Validates that per-device metrics are present (breaking-change guard)
    and that the top-level device group is the aggregate of its per-device
    "{name}_*" groups.
    """

    def __init__(self, name, num_dev):
        # Name of the aggregate metrics group, e.g. "net" aggregates "net_*".
        self.dev_name = name
        # Expected number of per-device "{name}_*" groups in the dump.
        self.num_dev = num_dev

    def validate(self, microvm):
        """
        Flush the microvm's metrics, then validate both the overall metrics
        layout and the per-device aggregation for this device.
        """
        fc_metrics = microvm.flush_metrics()

        # make sure all items of FirecrackerMetrics are as expected
        validate_fc_metrics(fc_metrics)

        # make sure "{self.name}" is aggregate of "{self.name}_*"
        # and that there are only {num_dev} entries of "{self.name}_*"
        self.validate_aggregation(fc_metrics)
        print(f"\nsuccessfully validated aggregate of {self.dev_name} metrics")

    def validate_aggregation(self, fc_metrics):
        """
        Assert that the "{dev_name}" group equals the field-wise sum of all
        "{dev_name}_*" groups, and that exactly num_dev such groups exist.
        """
        metrics_aggregate = fc_metrics[self.dev_name]
        metrics_calculated = {}
        actual_num_devices = 0
        print(f"In aggregation of {self.dev_name} expected {self.num_dev=}")
        prefix = f"{self.dev_name}_"
        for group_name, group_values in fc_metrics.items():
            # Use a prefix match: the old substring test ("net_" in name)
            # would also wrongly match groups like "foo_net_bar".
            if group_name.startswith(prefix):
                print(f"found {group_name} during aggr of {self.dev_name}")
                actual_num_devices += 1
                for metric_name, metric_value in group_values.items():
                    metrics_calculated[metric_name] = (
                        metrics_calculated.get(metric_name, 0) + metric_value
                    )
        assert metrics_aggregate == metrics_calculated
        assert self.num_dev == actual_num_devices
31 changes: 2 additions & 29 deletions tests/integration_tests/functional/test_cmd_line_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests that ensure the correctness of the command line parameters."""

import platform
import subprocess
from pathlib import Path

import pytest

from framework.utils import run_cmd
from host_tools.cargo_build import get_firecracker_binaries
from host_tools.metrics import validate_fc_metrics


def test_describe_snapshot_all_versions(
Expand Down Expand Up @@ -56,34 +56,7 @@ def test_cli_metrics_path(uvm_plain):
microvm.basic_config()
microvm.start()
metrics = microvm.flush_metrics()

exp_keys = [
"utc_timestamp_ms",
"api_server",
"balloon",
"block",
"deprecated_api",
"get_api_requests",
"i8042",
"latencies_us",
"logger",
"mmds",
"net",
"patch_api_requests",
"put_api_requests",
"seccomp",
"vcpu",
"vmm",
"uart",
"signals",
"vsock",
"entropy",
]

if platform.machine() == "aarch64":
exp_keys.append("rtc")

assert set(metrics.keys()) == set(exp_keys)
validate_fc_metrics(metrics)


def test_cli_metrics_path_if_metrics_initialized_twice_fail(test_microvm_with_api):
Expand Down
Loading

0 comments on commit e48694f

Please sign in to comment.