Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vhost user blk metrics #4226

Merged
merged 7 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
- [#4063](https://github.com/firecracker-microvm/firecracker/pull/4063):
Adds source-level instrumentation based tracing. See
[tracing](./docs/tracing.md) for more details.
- [#4226](https://github.com/firecracker-microvm/firecracker/pull/4226):
Added support for vhost-user device metrics. Each vhost-user device will
emit metrics under the label `"vhost_user_{device}_{drive_id}"`.
E.g. the associated metrics for vhost-user block device with endpoint
`"/drives/rootfs"` will be available under `"vhost_user_block_rootfs"`
in the metrics json object.
ShadowCurse marked this conversation as resolved.
Show resolved Hide resolved

### Changed

Expand Down
1 change: 1 addition & 0 deletions src/vmm/src/devices/virtio/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub mod rng;
pub mod test_utils;
pub mod vhost_user;
pub mod vhost_user_block;
pub mod vhost_user_metrics;
pub mod virtio_block;
pub mod vsock;

Expand Down
22 changes: 21 additions & 1 deletion src/vmm/src/devices/virtio/vhost_user_block/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
use crate::devices::virtio::queue::Queue;
use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl};
use crate::devices::virtio::vhost_user_metrics::{
VhostUserDeviceMetrics, VhostUserMetricsPerDevice,
};
use crate::devices::virtio::{ActivateError, TYPE_BLOCK};
use crate::logger::{IncMetric, StoreMetric};
use crate::vmm_config::drive::BlockDeviceConfig;
use crate::vstate::memory::GuestMemoryMmap;

Expand Down Expand Up @@ -127,6 +131,7 @@
// Vhost user protocol handle
pub vu_handle: VhostUserHandleImpl<T>,
pub vu_acked_protocol_features: u64,
pub metrics: Arc<VhostUserDeviceMetrics>,
}

// Need custom implementation because otherwise `Debug` is required for `vhost::Master`
Expand All @@ -151,12 +156,14 @@
"vu_acked_protocol_features",
&self.vu_acked_protocol_features,
)
.field("metrics", &self.metrics)

Check warning on line 159 in src/vmm/src/devices/virtio/vhost_user_block/device.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/devices/virtio/vhost_user_block/device.rs#L159

Added line #L159 was not covered by tests
.finish()
}
}

impl<T: VhostUserHandleBackend> VhostUserBlockImpl<T> {
pub fn new(config: VhostUserBlockConfig) -> Result<Self, VhostUserBlockError> {
let start_time = utils::time::get_time_us(utils::time::ClockType::Monotonic);
let mut requested_features = AVAILABLE_FEATURES;

if config.cache_type == CacheType::Writeback {
Expand Down Expand Up @@ -204,6 +211,11 @@
let avail_features = acked_features;
let acked_features = acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits();
let read_only = acked_features & (1 << VIRTIO_BLK_F_RO) != 0;
let vhost_user_block_metrics_name = format!("block_{}", config.drive_id);

let metrics = VhostUserMetricsPerDevice::alloc(vhost_user_block_metrics_name);
let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic) - start_time;
metrics.init_time_us.store(delta_us);

Ok(Self {
avail_features,
Expand All @@ -224,6 +236,7 @@

vu_handle,
vu_acked_protocol_features: acked_protocol_features,
metrics,
})
}

Expand Down Expand Up @@ -285,6 +298,7 @@
let config_len = self.config_space.len() as u64;
if offset >= config_len {
error!("Failed to read config space");
self.metrics.cfg_fails.inc();
sudanl0 marked this conversation as resolved.
Show resolved Hide resolved
return;
}
if let Some(end) = offset.checked_add(data.len() as u64) {
Expand All @@ -303,6 +317,7 @@
}

fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
let start_time = utils::time::get_time_us(utils::time::ClockType::Monotonic);
// Setting features again, because now we negotiated them
// with guest driver as well.
self.vu_handle
Expand All @@ -314,8 +329,13 @@
&[(0, &self.queues[0], &self.queue_evts[0])],
&self.irq_trigger,
)
.map_err(ActivateError::VhostUser)?;
.map_err(|err| {
self.metrics.activate_fails.inc();
ActivateError::VhostUser(err)

Check warning on line 334 in src/vmm/src/devices/virtio/vhost_user_block/device.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/devices/virtio/vhost_user_block/device.rs#L333-L334

Added lines #L333 - L334 were not covered by tests
})?;
self.device_state = DeviceState::Activated(mem);
let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic) - start_time;
self.metrics.activate_time_us.store(delta_us);
Ok(())
}

Expand Down
186 changes: 186 additions & 0 deletions src/vmm/src/devices/virtio/vhost_user_metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//! Defines the metrics system for vhost-user devices.
//!
//! # Metrics format
//! The metrics are flushed in JSON when requested by vmm::logger::metrics::METRICS.write().
//!
//! ## JSON example with metrics:
//! ```json
//! {
//! "vhost_user_{mod}_id0": {
//! "activate_fails": "SharedIncMetric",
//! "cfg_fails": "SharedIncMetric",
//! "init_time_us": SharedStoreMetric,
//! "activate_time_us": SharedStoreMetric,
//! }
//! "vhost_user_{mod}_id1": {
//! "activate_fails": "SharedIncMetric",
//! "cfg_fails": "SharedIncMetric",
//! "init_time_us": SharedStoreMetric,
//! "activate_time_us": SharedStoreMetric,
//! }
//! ...
//! "vhost_user_{mod}_idN": {
//! "activate_fails": "SharedIncMetric",
//! "cfg_fails": "SharedIncMetric",
//! "init_time_us": SharedStoreMetric,
//! "activate_time_us": SharedStoreMetric,
//! }
//! }
//! ```
//! Each `vhost_user` field in the example above is a serializable `VhostUserDeviceMetrics`
//! structure collecting metrics such as `activate_fails`, `cfg_fails`, `init_time_us` and
//! `activate_time_us` for the vhost_user device.
//! For vhost-user block device having endpoint "/drives/drv0" the emitted metrics would be
//! `vhost_user_block_drv0`.
//! For vhost-user block device having endpoint "/drives/drvN" the emitted metrics would be
//! `vhost_user_block_drvN`.
//! Aggregate metrics for `vhost_user` if `not` emitted as it can be easily obtained in
//! typical observability tools.
//!
//! # Design
//! The main design goals of this system are:
//! * To improve vhost_user device metrics by logging them at per device granularity.
//! * `vhost_user` is a new device with no metrics emitted before so, backward compatibility doesn't
//! come into picture like it was in the case of block/net devices. And since, metrics can be
//! easily aggregated using typical observability tools, we chose not to provide aggregate
//! vhost_user metrics.
//! * Rely on `serde` to provide the actual serialization for writing the metrics.
//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to
//! avoid having to initialize everything by hand.
//!
//! * Follow the design of Block and Net device metrics and use a map of vhost_user device name and
//! corresponding metrics.
//! * Metrics are flushed with key `vhost_user_{module_specific_name}` and each module sets an
//! appropriate `module_specific_name` in the format `{mod}_{id}`. e.g. vhost-user block device in
//! this commit set this as `format!("{}_{}", "block_", config.drive_id.clone());` This way
//! vhost_user_metrics stay generic while the specific vhost_user devices can have their unique
//! metrics.
//!
//! The system implements 2 type of metrics:
//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter
//! (i.e the number of times activating a device failed). These metrics are reset upon flush.
//! * Shared Store Metrics (SharedStoreMetrics) - are targeted at keeping a persistent value, it is
//! `not` intended to act as a counter (i.e for measure the process start up time for example).
//! We add VhostUserDeviceMetrics entries from vhost_user_metrics::METRICS into vhost_user device
//! instead of vhost_user device having individual separate VhostUserDeviceMetrics entries because
//! vhost_user device is not accessible from signal handlers to flush metrics and
//! vhost_user_metrics::METRICS is.

use std::collections::BTreeMap;
use std::sync::{Arc, RwLock};

use serde::ser::SerializeMap;
use serde::{Serialize, Serializer};

use crate::logger::{SharedIncMetric, SharedStoreMetric};

/// map of vhost_user drive id and metrics
/// this should be protected by a lock before accessing.
#[allow(missing_debug_implementations)]
pub struct VhostUserMetricsPerDevice {
/// used to access per vhost_user device metrics
pub metrics: BTreeMap<String, Arc<VhostUserDeviceMetrics>>,
}

impl VhostUserMetricsPerDevice {
/// Allocate `VhostUserDeviceMetrics` for vhost_user device having
/// id `drive_id`. Also, allocate only if it doesn't
/// exist to avoid overwriting previously allocated data.
/// lock is always initialized so it is safe the unwrap
/// the lock without a check.
pub fn alloc(drive_id: String) -> Arc<VhostUserDeviceMetrics> {
if METRICS.read().unwrap().metrics.get(&drive_id).is_none() {
METRICS.write().unwrap().metrics.insert(
drive_id.clone(),
Arc::new(VhostUserDeviceMetrics::default()),
);
}
METRICS
.read()
.unwrap()
.metrics
.get(&drive_id)
.unwrap()
.clone()
}
}

/// Pool of vhost_user-related metrics per device behind a lock to
/// keep things thread safe. Since the lock is initialized here
/// it is safe to unwrap it without any check.
static METRICS: RwLock<VhostUserMetricsPerDevice> = RwLock::new(VhostUserMetricsPerDevice {
metrics: BTreeMap::new(),
});

/// This function facilitates serialization of vhost_user device metrics.
pub fn flush_metrics<S: Serializer>(serializer: S) -> Result<S::Ok, S::Error> {
let vhost_user_metrics = METRICS.read().unwrap();
let metrics_len = vhost_user_metrics.metrics.len();
let mut seq = serializer.serialize_map(Some(metrics_len))?;

for (name, metrics) in vhost_user_metrics.metrics.iter() {
let devn = format!("vhost_user_{}", name);
seq.serialize_entry(&devn, metrics)?;
}
seq.end()
}

/// vhost_user Device associated metrics.
#[derive(Debug, Default, Serialize)]
pub struct VhostUserDeviceMetrics {
/// Number of times when activate failed on a vhost_user device.
pub activate_fails: SharedIncMetric,
/// Number of times when interacting with the space config of a vhost-user device failed.
pub cfg_fails: SharedIncMetric,
// Vhost-user init time in microseconds.
pub init_time_us: SharedStoreMetric,
// Vhost-user activate time in microseconds.
pub activate_time_us: SharedStoreMetric,
}

#[cfg(test)]
pub mod tests {
use super::*;
use crate::logger::{IncMetric, StoreMetric};

// vhost-user metrics has both SharedIncMetrics and SharedStoreMetrics
// In this test we try to test one field for each type by creating a
// dummy vhost_user_block metric named `vhost_user_block_drvN`.
// There is no specific reason to storing the measured time taken vs a
// random number in `init_time_us`.
// We add an additional test to confirm that `vhost_user_metrics::METRICS`
// actually has an entry for `vhost_user_block_drvN` and compare it.
// We chose serde_json to compare because that seemed easiest to compare
// the entire struct format and serialization of VhostUserDeviceMetrics.
#[test]
fn test_vhost_user_basic_metrics() {
sudanl0 marked this conversation as resolved.
Show resolved Hide resolved
let vhost_user_dev_name: String = String::from("vhost_user_block_drvN");
let start_time = utils::time::get_time_us(utils::time::ClockType::Monotonic);
let vhost_user_metrics: Arc<VhostUserDeviceMetrics> =
VhostUserMetricsPerDevice::alloc(vhost_user_dev_name.clone());
let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic) - start_time;
vhost_user_metrics.activate_fails.inc();
assert_eq!(vhost_user_metrics.activate_fails.count(), 1);

vhost_user_metrics.init_time_us.store(delta_us);
assert_eq!(vhost_user_metrics.init_time_us.fetch(), delta_us);

// fill another local variable with the same data and use it to compare with the METRICS
// entry
let vhost_user_metrics_backup: VhostUserDeviceMetrics = VhostUserDeviceMetrics::default();
vhost_user_metrics_backup.activate_fails.inc();
vhost_user_metrics_backup.init_time_us.store(delta_us);

// serializing METRICS also flushes the SharedIncMetric data so we have to use _backup
// variable for comparison.
let vhost_user_metrics_global: String =
serde_json::to_string(&METRICS.read().unwrap().metrics.get(&vhost_user_dev_name))
.unwrap();
let vhost_user_metrics_local: String =
serde_json::to_string(&vhost_user_metrics_backup).unwrap();
assert_eq!(vhost_user_metrics_local, vhost_user_metrics_global);
}
}
6 changes: 6 additions & 0 deletions src/vmm/src/logger/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ use vm_superio::rtc_pl031::RtcEvents;

use super::FcLineWriter;
use crate::devices::virtio::net::metrics as net_metrics;
use crate::devices::virtio::vhost_user_metrics;
use crate::devices::virtio::virtio_block::metrics as block_metrics;
use crate::devices::virtio::vsock::metrics as vsock_metrics;
#[cfg(target_arch = "aarch64")]
Expand Down Expand Up @@ -950,6 +951,7 @@ macro_rules! create_serialize_proxy {
create_serialize_proxy!(VsockMetricsSerializeProxy, vsock_metrics);
create_serialize_proxy!(BlockMetricsSerializeProxy, block_metrics);
create_serialize_proxy!(NetMetricsSerializeProxy, net_metrics);
create_serialize_proxy!(VhostUserMetricsSerializeProxy, vhost_user_metrics);

/// Structure storing all metrics while enforcing serialization support on them.
#[derive(Debug, Default, Serialize)]
Expand Down Expand Up @@ -999,6 +1001,9 @@ pub struct FirecrackerMetrics {
pub vsock: VsockMetricsSerializeProxy,
/// Metrics related to virtio-rng entropy device.
pub entropy: EntropyDeviceMetrics,
#[serde(flatten)]
/// Vhost-user device related metrics.
pub vhost_user_ser: VhostUserMetricsSerializeProxy,
}
impl FirecrackerMetrics {
/// Const default construction.
Expand Down Expand Up @@ -1026,6 +1031,7 @@ impl FirecrackerMetrics {
signals: SignalMetrics::new(),
vsock: VsockMetricsSerializeProxy {},
entropy: EntropyDeviceMetrics::new(),
vhost_user_ser: VhostUserMetricsSerializeProxy {},
}
}
}
Expand Down
Loading