From 73bab7ebc76d623c3e8304e6e68925a9163a0ecb Mon Sep 17 00:00:00 2001 From: folex <0xdxdy@gmail.com> Date: Mon, 30 Oct 2023 13:34:13 -0400 Subject: [PATCH] fix(metrics): measure call time and wait time separately (#1858) * fix(metrics): always report mailbox size, actor count * fix(metrics): measure actual call time, without queueing time * fix(metrics): measure lock time, wait time & call time --- aquamarine/src/particle_functions.rs | 31 ++++++++++++------- aquamarine/src/plumber.rs | 6 ++-- .../src/services_metrics/builtin.rs | 1 + .../src/services_metrics/external.rs | 15 +++++++-- .../src/services_metrics/message.rs | 1 + .../peer-metrics/src/services_metrics/mod.rs | 18 +++++++---- particle-services/src/app_services.rs | 7 +++-- 7 files changed, 54 insertions(+), 25 deletions(-) diff --git a/aquamarine/src/particle_functions.rs b/aquamarine/src/particle_functions.rs index bcaf3cb9b0..26359ae4e0 100644 --- a/aquamarine/src/particle_functions.rs +++ b/aquamarine/src/particle_functions.rs @@ -40,8 +40,10 @@ use crate::log::builtin_log_fn; #[derive(Clone, Debug)] /// Performance statistics about executed function call pub struct SingleCallStat { - /// If execution happened, then how much time it took - pub run_time: Option, + /// If execution happened, then how much time it took to execute the call + pub call_time: Option, + /// If execution happened, then how much time call waited to be scheduled on blocking pool + pub wait_time: Option, pub success: bool, /// Whether function call was to builtin functions (like op noop) or to services pub kind: FunctionKind, @@ -139,7 +141,8 @@ impl Functions { call_id, result, stat: SingleCallStat { - run_time: None, + call_time: None, + wait_time: None, success: false, kind: FunctionKind::NotHappened, }, @@ -155,19 +158,21 @@ impl Functions { json!(&args.function_args) ); let service_id = args.service_id.clone(); - let start = Instant::now(); let params = self.particle.clone(); let builtins = self.builtins.clone(); let particle_function = self.particle_function.clone(); let span = tracing::span!(tracing::Level::INFO, "Function"); + let schedule_wait_start = Instant::now(); let result = tokio::task::Builder::new() .name(&format!( "Call function {}:{}", args.service_id, args.function_name )) - .spawn_blocking(|| { + .spawn_blocking(move || { Handle::current().block_on(async move { + // How much time it took to start execution on blocking pool + let schedule_wait_time = schedule_wait_start.elapsed(); let outcome = builtins.call(args, params).await; // record whether call was handled by builtin or not. needed for stats. let mut call_kind = FunctionKind::Service; @@ -187,14 +192,17 @@ impl Functions { // Builtins were called, return their outcome outcome => outcome, }; - (outcome, call_kind) + // How much time it took to execute the call + // TODO: Time for ParticleFunction includes lock time, which is not good. Low priority cuz ParticleFunctions are barely used. + let call_time = schedule_wait_start.elapsed() - schedule_wait_time; + (outcome, call_kind, call_time, schedule_wait_time) }) }) .expect("Could not spawn task"); async move { - let (result, call_kind) = result.await.expect("Could not 'Call function' join"); - let elapsed = start.elapsed(); + let (result, call_kind, call_time, wait_time) = + result.await.expect("Could not 'Call function' join"); let result = match result { FunctionOutcome::NotDefined { args, .. } => Err(JError::new(format!( @@ -211,15 +219,16 @@ impl Functions { particle_id = particle_id, "Failed host call {} ({}): {}", log_args, - pretty(elapsed), + pretty(call_time), err ) } else { - builtin_log_fn(&service_id, &log_args, pretty(elapsed), particle_id); + builtin_log_fn(&service_id, &log_args, pretty(call_time), particle_id); }; let stats = SingleCallStat { - run_time: Some(elapsed), + call_time: Some(call_time), + wait_time: Some(wait_time), success: result.is_ok(), kind: call_kind, }; diff --git a/aquamarine/src/plumber.rs b/aquamarine/src/plumber.rs index ed054b6ce3..fe522bdfb8 100644 --- a/aquamarine/src/plumber.rs +++ b/aquamarine/src/plumber.rs @@ -281,12 +281,12 @@ impl Plumber { let time = stat.interpretation_time.as_secs_f64(); m.interpretation_time_sec.observe(time); - m.total_actors_mailbox.set(mailbox_size as i64); - m.alive_actors.set(self.actors.len() as i64); } + m.total_actors_mailbox.set(mailbox_size as i64); + m.alive_actors.set(self.actors.len() as i64); for stat in &stats { - m.service_call(stat.success, stat.kind, stat.run_time) + m.service_call(stat.success, stat.kind, stat.call_time) } }); diff --git a/crates/peer-metrics/src/services_metrics/builtin.rs b/crates/peer-metrics/src/services_metrics/builtin.rs index 3cf4132828..4b9f4e18ab 100644 --- a/crates/peer-metrics/src/services_metrics/builtin.rs +++ b/crates/peer-metrics/src/services_metrics/builtin.rs @@ -78,6 +78,7 @@ impl Stats { ServiceCallStats::Success { memory_delta_bytes, call_time_sec, + lock_wait_time_sec: _, timestamp, } => { self.memory_deltas_bytes.update( diff --git a/crates/peer-metrics/src/services_metrics/external.rs b/crates/peer-metrics/src/services_metrics/external.rs index 96dc865263..2cb8e9bb91 100644 --- a/crates/peer-metrics/src/services_metrics/external.rs +++ b/crates/peer-metrics/src/services_metrics/external.rs @@ -86,7 +86,8 @@ pub struct ServicesMetricsExternal { pub modules_in_services_count: Histogram, /// Service call time - pub call_time_msec: Family, + pub call_time_sec: Family, + pub lock_wait_time_sec: Family, pub call_success_count: Family, pub call_failed_count: Family, @@ -182,13 +183,20 @@ impl ServicesMetricsExternal { "number of modules per services", ); - let call_time_msec: Family<_, _> = register( + let call_time_sec: Family<_, _> = register( sub_registry, Family::new_with_constructor(|| Histogram::new(execution_time_buckets())), "call_time_msec", "how long it took to execute a call", ); + let lock_wait_time_sec: Family<_, _> = register( + sub_registry, + Family::new_with_constructor(|| Histogram::new(execution_time_buckets())), + "lock_wait_time_sec", + "how long a service waited for Mutex", + ); + let memory_metrics = ServicesMemoryMetrics { mem_max_bytes, mem_max_per_module_bytes, @@ -217,7 +225,8 @@ impl ServicesMetricsExternal { removal_count, creation_failure_count, modules_in_services_count, - call_time_msec, + call_time_sec, + lock_wait_time_sec, call_success_count, call_failed_count, memory_metrics, diff --git a/crates/peer-metrics/src/services_metrics/message.rs b/crates/peer-metrics/src/services_metrics/message.rs index 1c70a46d3b..6f4f48a7ea 100644 --- a/crates/peer-metrics/src/services_metrics/message.rs +++ b/crates/peer-metrics/src/services_metrics/message.rs @@ -11,6 +11,7 @@ pub enum ServiceCallStats { Success { memory_delta_bytes: f64, call_time_sec: f64, + lock_wait_time_sec: f64, timestamp: u64, }, Fail { diff --git a/crates/peer-metrics/src/services_metrics/mod.rs b/crates/peer-metrics/src/services_metrics/mod.rs index 25ac6e45cd..6998f94a05 100644 --- a/crates/peer-metrics/src/services_metrics/mod.rs +++ b/crates/peer-metrics/src/services_metrics/mod.rs @@ -78,7 +78,7 @@ impl ServicesMetrics { service_type: ServiceType::Builtin, }; external - .call_time_msec + .call_time_sec .get_or_create(&label) .observe(call_time); if is_ok { @@ -99,11 +99,17 @@ impl ServicesMetrics { ) { self.observe_external(|external| { let label = ServiceTypeLabel { service_type }; - if let Success { call_time_sec, .. } = &stats { - external - .call_time_msec - .get_or_create(&label) - .observe(*call_time_sec); + if let Success { + call_time_sec, + lock_wait_time_sec, + .. + } = &stats + { + let call_time_metric = external.call_time_sec.get_or_create(&label); + call_time_metric.observe(*call_time_sec); + + let lock_time_metric = external.lock_wait_time_sec.get_or_create(&label); + lock_time_metric.observe(*lock_wait_time_sec); } external.call_success_count.get_or_create(&label).inc(); self.observe_service_mem(service_id.clone(), label.service_type, memory); diff --git a/particle-services/src/app_services.rs b/particle-services/src/app_services.rs index 4800dd0999..030056b48a 100644 --- a/particle-services/src/app_services.rs +++ b/particle-services/src/app_services.rs @@ -443,7 +443,6 @@ impl ParticleAppServices { particle: ParticleParams, create_vault: bool, ) -> FunctionOutcome { - let call_time_start = Instant::now(); let services = self.services.read(); let aliases = self.aliases.read(); let worker_id = particle.host_id; @@ -509,10 +508,12 @@ impl ParticleAppServices { }; let function_name = function_args.function_name; + let lock_acquire_start = Instant::now(); let mut service = service.lock(); let old_memory = service.module_memory_stats(); let old_mem_usage = ServicesMetricsBuiltin::get_used_memory(&old_memory); // TODO: set execution timeout https://github.com/fluencelabs/fluence/issues/1212 + let call_time_start = Instant::now(); let result = service .call( function_name.clone(), @@ -539,8 +540,9 @@ impl ParticleAppServices { ServiceError::Engine(e) })?; - let call_time_sec = call_time_start.elapsed().as_secs_f64(); if let Some(metrics) = self.metrics.as_ref() { + let call_time_sec = call_time_start.elapsed().as_secs_f64(); + let lock_wait_time_sec = lock_acquire_start.elapsed().as_secs_f64(); let new_memory = service.module_memory_stats(); let new_memory_usage = ServicesMetricsBuiltin::get_used_memory(&new_memory); @@ -548,6 +550,7 @@ impl ParticleAppServices { let stats = ServiceCallStats::Success { memory_delta_bytes: memory_delta_bytes as f64, call_time_sec, + lock_wait_time_sec, timestamp, };