Skip to content

Commit

Permalink
feat(cli): Replace unhealthy nodes based on ALERTS
Browse files Browse the repository at this point in the history
  • Loading branch information
sasa-tomic committed Jan 4, 2024
1 parent d435d59 commit 8febdea
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 15 deletions.
2 changes: 1 addition & 1 deletion rs/decentralization/src/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ pub trait TopologyManager: SubnetQuerier + AvailableNodesQuerier {
}
}

#[derive(Default, Clone)]
#[derive(Default, Clone, Debug)]
pub struct SubnetChangeRequest {
subnet: DecentralizedSubnet,
available_nodes: Vec<Node>,
Expand Down
10 changes: 9 additions & 1 deletion rs/ic-management-backend/src/endpoints/subnet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use ic_management_types::requests::{
MembershipReplaceRequest, ReplaceTarget, SubnetCreateRequest, SubnetResizeRequest,
};
use ic_management_types::Node;
use log::warn;
use serde::Deserialize;
use std::collections::BTreeMap;

Expand Down Expand Up @@ -78,6 +79,8 @@ async fn replace(

let mut motivations: Vec<String> = vec![];

info!("Received MembershipReplaceRequest: {}", request);

let change_request = match &request.target {
ReplaceTarget::Subnet(subnet) => registry.modify_subnet_nodes(SubnetQueryBy::SubnetId(*subnet)).await?,
ReplaceTarget::Nodes {
Expand Down Expand Up @@ -116,12 +119,17 @@ async fn replace(
if *health == ic_management_types::Status::Healthy {
None
} else {
info!("Node {} is {:?}", n.id, health);
Some(n)
}
}
None => Some(n),
None => {
warn!("Node {} has no known health, assuming unhealthy", n.id);
Some(n)
}
})
.collect::<Vec<_>>();

if !unhealthy.is_empty() {
replacements_unhealthy.extend(unhealthy);
}
Expand Down
47 changes: 36 additions & 11 deletions rs/ic-management-backend/src/health.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
use std::{collections::BTreeMap, convert::TryInto, str::FromStr};
use std::{
collections::{BTreeMap, HashSet},
convert::TryInto,
str::FromStr,
};

use ic_base_types::PrincipalId;
use ic_management_types::{Network, Status};
Expand All @@ -20,27 +24,48 @@ impl HealthClient {
}

pub async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
let query: InstantVector = Selector::new()
let query_up: InstantVector = Selector::new()
.metric("up")
.with("ic", &self.network.legacy_name())
.with("job", "replica")
.with("ic_subnet", subnet.to_string().as_str())
.try_into()?;

let response = self.client.query(query, None, None).await?;
let results = response.as_instant().expect("Expected instant vector");
Ok(results
let response_up = self.client.query(query_up, None, None).await?;
let instant_up = response_up.as_instant().expect("Expected instant vector");

// Alerts are synthetic time series and cannot be queried as regular metrics
// https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#inspecting-alerts-during-runtime
let query_alert = format!(
"ALERTS{{ic=\"{}\", job=\"replica\", ic_subnet=\"{}\", alertstate=\"firing\"}}",
self.network.legacy_name(),
subnet
);
let response_alert = self.client.query(query_alert, None, None).await?;
let instant_alert = response_alert.as_instant().expect("Expected instant vector");
let node_ids_with_alerts: HashSet<PrincipalId> = instant_alert
.iter()
.filter_map(|r| r.metric().get("ic_node").and_then(|id| PrincipalId::from_str(id).ok()))
.collect();

Ok(instant_up
.iter()
.filter_map(|r| {
let status = if r.sample().value() == 1.0 {
Status::Healthy
} else {
Status::Dead
};
r.metric()
.get("ic_node")
.and_then(|id| PrincipalId::from_str(id).ok())
.map(|id| (id, status))
.map(|id| {
let status = if r.sample().value() == 1.0 {
if node_ids_with_alerts.contains(&id) {
Status::Degraded
} else {
Status::Healthy
}
} else {
Status::Dead
};
(id, status)
})
})
.collect())
}
Expand Down
2 changes: 1 addition & 1 deletion rs/ic-management-backend/src/prometheus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ use prometheus_http_query::Client;
pub fn client(network: &Network) -> Client {
match network {
Network::Mainnet => Client::try_from("https://victoria.mainnet.dfinity.network/select/0/prometheus/").unwrap(),
_ => Client::try_from("https://ic-metrics-victoria.ch1-obsstage1.dfinity.network/select/0/prometheus").unwrap(),
_ => Client::try_from("https://victoria.testnet.dfinity.network/select/0/prometheus").unwrap(),
}
}
2 changes: 1 addition & 1 deletion rs/ic-management-types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ impl NodeFeature {
}
}

#[derive(Clone, Debug, Serialize, Deserialize, Default)]
#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
pub struct MinNakamotoCoefficients {
pub coefficients: BTreeMap<NodeFeature, f64>,
pub average: f64,
Expand Down
36 changes: 36 additions & 0 deletions rs/ic-management-types/src/requests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,42 @@ pub struct MembershipReplaceRequest {
pub min_nakamoto_coefficients: Option<MinNakamotoCoefficients>,
}

/// Compact, single-line rendering of a [`MembershipReplaceRequest`] for log output.
///
/// The target is always printed. Every other field is printed only when it
/// carries information: `heal` when it is `true`, optional fields when they are
/// `Some`, and lists when they are non-empty.
impl std::fmt::Display for MembershipReplaceRequest {
    /// Writes `target: ...` followed by space-separated ` name: value` pairs
    /// for the fields that are set.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Write the target straight into the formatter instead of building an
        // intermediate `String` first.
        write!(f, "target: ")?;
        match &self.target {
            ReplaceTarget::Subnet(subnet) => write!(f, "subnet {}", subnet)?,
            ReplaceTarget::Nodes { nodes, motivation } => {
                write!(f, "nodes {:?} ({})", nodes, motivation)?
            }
        }
        if self.heal {
            write!(f, " heal: {}", self.heal)?;
        }
        if let Some(optimize) = self.optimize {
            write!(f, " optimize: {}", optimize)?;
        }
        if let Some(exclude) = &self.exclude {
            if !exclude.is_empty() {
                // Print the unwrapped list so the output is `[..]` rather than
                // `Some([..])`, matching how `include` is rendered below.
                write!(f, " exclude: {:?}", exclude)?;
            }
        }
        if !self.only.is_empty() {
            write!(f, " only: {:?}", self.only)?;
        }
        if let Some(include) = &self.include {
            if !include.is_empty() {
                write!(f, " include: {:?}", include)?;
            }
        }
        if let Some(min_nakamoto_coefficients) = &self.min_nakamoto_coefficients {
            write!(f, " min_nakamoto_coefficients: {:?}", min_nakamoto_coefficients)?;
        }
        Ok(())
    }
}

#[derive(Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ReplaceTarget {
Expand Down

0 comments on commit 8febdea

Please sign in to comment.