From 8febdea4f7f35f940cf6b9f1c05b6f6e94822393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C5=A1a=20Tomi=C4=87?= Date: Mon, 27 Nov 2023 11:08:09 +0000 Subject: [PATCH] feat(cli): Replace unhealthy nodes based on ALERTS --- rs/decentralization/src/network.rs | 2 +- .../src/endpoints/subnet.rs | 10 +++- rs/ic-management-backend/src/health.rs | 47 ++++++++++++++----- rs/ic-management-backend/src/prometheus.rs | 2 +- rs/ic-management-types/src/lib.rs | 2 +- rs/ic-management-types/src/requests.rs | 36 ++++++++++++++ 6 files changed, 84 insertions(+), 15 deletions(-) diff --git a/rs/decentralization/src/network.rs b/rs/decentralization/src/network.rs index 7fcff856..f7aaed45 100644 --- a/rs/decentralization/src/network.rs +++ b/rs/decentralization/src/network.rs @@ -797,7 +797,7 @@ pub trait TopologyManager: SubnetQuerier + AvailableNodesQuerier { } } -#[derive(Default, Clone)] +#[derive(Default, Clone, Debug)] pub struct SubnetChangeRequest { subnet: DecentralizedSubnet, available_nodes: Vec, diff --git a/rs/ic-management-backend/src/endpoints/subnet.rs b/rs/ic-management-backend/src/endpoints/subnet.rs index 36adfe00..1fe49768 100644 --- a/rs/ic-management-backend/src/endpoints/subnet.rs +++ b/rs/ic-management-backend/src/endpoints/subnet.rs @@ -6,6 +6,7 @@ use ic_management_types::requests::{ MembershipReplaceRequest, ReplaceTarget, SubnetCreateRequest, SubnetResizeRequest, }; use ic_management_types::Node; +use log::warn; use serde::Deserialize; use std::collections::BTreeMap; @@ -78,6 +79,8 @@ async fn replace( let mut motivations: Vec = vec![]; + info!("Received MembershipReplaceRequest: {}", request); + let change_request = match &request.target { ReplaceTarget::Subnet(subnet) => registry.modify_subnet_nodes(SubnetQueryBy::SubnetId(*subnet)).await?, ReplaceTarget::Nodes { @@ -116,12 +119,17 @@ async fn replace( if *health == ic_management_types::Status::Healthy { None } else { + info!("Node {} is {:?}", n.id, health); Some(n) } } - None => Some(n), + None => { + 
warn!("Node {} has no known health, assuming unhealthy", n.id); + Some(n) + } }) .collect::>(); + if !unhealthy.is_empty() { replacements_unhealthy.extend(unhealthy); } diff --git a/rs/ic-management-backend/src/health.rs b/rs/ic-management-backend/src/health.rs index 177dc28d..5bb8a536 100644 --- a/rs/ic-management-backend/src/health.rs +++ b/rs/ic-management-backend/src/health.rs @@ -1,4 +1,8 @@ -use std::{collections::BTreeMap, convert::TryInto, str::FromStr}; +use std::{ + collections::{BTreeMap, HashSet}, + convert::TryInto, + str::FromStr, +}; use ic_base_types::PrincipalId; use ic_management_types::{Network, Status}; @@ -20,27 +24,48 @@ impl HealthClient { } pub async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result> { - let query: InstantVector = Selector::new() + let query_up: InstantVector = Selector::new() .metric("up") .with("ic", &self.network.legacy_name()) .with("job", "replica") .with("ic_subnet", subnet.to_string().as_str()) .try_into()?; - let response = self.client.query(query, None, None).await?; - let results = response.as_instant().expect("Expected instant vector"); - Ok(results + let response_up = self.client.query(query_up, None, None).await?; + let instant_up = response_up.as_instant().expect("Expected instant vector"); + + // Alerts are synthetic time series and cannot be queried as regular metrics + // https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#inspecting-alerts-during-runtime + let query_alert = format!( + "ALERTS{{ic=\"{}\", job=\"replica\", ic_subnet=\"{}\", alertstate=\"firing\"}}", + self.network.legacy_name(), + subnet + ); + let response_alert = self.client.query(query_alert, None, None).await?; + let instant_alert = response_alert.as_instant().expect("Expected instant vector"); + let node_ids_with_alerts: HashSet = instant_alert + .iter() + .filter_map(|r| r.metric().get("ic_node").and_then(|id| PrincipalId::from_str(id).ok())) + .collect(); + + Ok(instant_up .iter() .filter_map(|r| { - let 
status = if r.sample().value() == 1.0 { - Status::Healthy - } else { - Status::Dead - }; r.metric() .get("ic_node") .and_then(|id| PrincipalId::from_str(id).ok()) - .map(|id| (id, status)) + .map(|id| { + let status = if r.sample().value() == 1.0 { + if node_ids_with_alerts.contains(&id) { + Status::Degraded + } else { + Status::Healthy + } + } else { + Status::Dead + }; + (id, status) + }) }) .collect()) } diff --git a/rs/ic-management-backend/src/prometheus.rs b/rs/ic-management-backend/src/prometheus.rs index c322b576..0aba7513 100644 --- a/rs/ic-management-backend/src/prometheus.rs +++ b/rs/ic-management-backend/src/prometheus.rs @@ -4,6 +4,6 @@ use prometheus_http_query::Client; pub fn client(network: &Network) -> Client { match network { Network::Mainnet => Client::try_from("https://victoria.mainnet.dfinity.network/select/0/prometheus/").unwrap(), - _ => Client::try_from("https://ic-metrics-victoria.ch1-obsstage1.dfinity.network/select/0/prometheus").unwrap(), + _ => Client::try_from("https://victoria.testnet.dfinity.network/select/0/prometheus").unwrap(), } } diff --git a/rs/ic-management-types/src/lib.rs b/rs/ic-management-types/src/lib.rs index b11c3ca4..4af3a529 100644 --- a/rs/ic-management-types/src/lib.rs +++ b/rs/ic-management-types/src/lib.rs @@ -247,7 +247,7 @@ impl NodeFeature { } } -#[derive(Clone, Debug, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct MinNakamotoCoefficients { pub coefficients: BTreeMap, pub average: f64, diff --git a/rs/ic-management-types/src/requests.rs b/rs/ic-management-types/src/requests.rs index 40322701..2fa99576 100644 --- a/rs/ic-management-types/src/requests.rs +++ b/rs/ic-management-types/src/requests.rs @@ -13,6 +13,42 @@ pub struct MembershipReplaceRequest { pub min_nakamoto_coefficients: Option, } +// impl Display for MembershipReplaceRequest +impl std::fmt::Display for MembershipReplaceRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + let target = match &self.target { + ReplaceTarget::Subnet(subnet) => format!("subnet {}", subnet), + ReplaceTarget::Nodes { nodes, motivation } => { + format!("nodes {:?} ({})", nodes, motivation) + } + }; + write!(f, "target: {}", target)?; + if self.heal { + write!(f, " heal: {}", self.heal)?; + } + if let Some(optimize) = self.optimize { + write!(f, " optimize: {}", optimize)?; + } + if let Some(exclude) = &self.exclude { + if !exclude.is_empty() { + write!(f, " exclude: {:?}", self.exclude)?; + } + } + if !self.only.is_empty() { + write!(f, " only: {:?}", self.only)?; + } + if let Some(include) = &self.include { + if !include.is_empty() { + write!(f, " include: {:?}", include)?; + } + } + if let Some(min_nakamoto_coefficients) = &self.min_nakamoto_coefficients { + write!(f, " min_nakamoto_coefficients: {:?}", min_nakamoto_coefficients)?; + } + Ok(()) + } +} + #[derive(Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum ReplaceTarget {