Skip to content

Commit

Permalink
Merge branch 'kpop/CON-1057/manifest-splitting-v2' into 'master'
Browse files Browse the repository at this point in the history
feat(ic-recovery): [CON-1057] Add a "ComputeExpectedManifests" step to the subnet splitting tool

It works by calling the `state-tool` several times.

Closes CON-1057 

Closes CON-1057

See merge request dfinity-lab/public/ic!13133
  • Loading branch information
kpop-dfinity committed Jun 26, 2023
2 parents 7618e27 + cd37af7 commit 27a80c3
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 43 deletions.
40 changes: 21 additions & 19 deletions rs/backup/src/backup_helper.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
use crate::notification_client::NotificationClient;
use crate::util::{block_on, sleep_secs};
use ic_recovery::command_helper::exec_cmd;
use ic_recovery::error::RecoveryError;
use ic_recovery::file_sync_helper::download_binary;
use crate::{
notification_client::NotificationClient,
util::{block_on, sleep_secs},
};
use ic_recovery::{
command_helper::exec_cmd, error::RecoveryError, file_sync_helper::download_binary,
};
use ic_registry_client::client::{RegistryClient, RegistryClientImpl};
use ic_registry_client_helpers::node::NodeRegistry;
use ic_registry_client_helpers::subnet::SubnetRegistry;
use ic_registry_client_helpers::{node::NodeRegistry, subnet::SubnetRegistry};
use ic_types::{ReplicaVersion, SubnetId};

use chrono::{DateTime, Utc};
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::{seq::SliceRandom, thread_rng};
use slog::{debug, error, info, warn, Logger};
use std::collections::BTreeMap;
use std::ffi::OsStr;
use std::fs::{create_dir_all, read_dir, remove_dir_all, DirEntry, File};
use std::io::Write;
use std::net::IpAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use std::{
collections::BTreeMap,
ffi::OsStr,
fs::{create_dir_all, read_dir, remove_dir_all, DirEntry, File},
io::Write,
net::IpAddr,
path::{Path, PathBuf},
process::{Command, Stdio},
sync::{Arc, Mutex},
time::Instant,
};

const RETRIES_RSYNC_HOST: u64 = 5;
const RETRIES_BINARY_DOWNLOAD: u64 = 3;
Expand Down Expand Up @@ -196,7 +198,7 @@ impl BackupHelper {
&self.log,
replica_version.clone(),
binary_name.to_string(),
self.binary_dir(replica_version),
&self.binary_dir(replica_version),
));
if res.is_ok() {
return Ok(());
Expand Down
2 changes: 2 additions & 0 deletions rs/recovery/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub enum RecoveryError {
ParsingError(serde_json::Error),
SerializationError(serde_json::Error),
UnexpectedError(String),
StateToolError(String),
CheckpointError(String, CheckpointError),
StepSkipped,
}
Expand Down Expand Up @@ -88,6 +89,7 @@ impl fmt::Display for RecoveryError {
RecoveryError::CheckpointError(msg, e) => {
write!(f, "Checkpoint error, message: {}, error: {}", msg, e)
}
RecoveryError::StateToolError(msg) => write!(f, "State tool error, message: {}", msg),
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion rs/recovery/src/file_sync_helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub async fn download_binary(
logger: &Logger,
replica_version: ReplicaVersion,
binary_name: String,
target_dir: PathBuf,
target_dir: &Path,
) -> RecoveryResult<PathBuf> {
let binary_url = format!(
"https://download.dfinity.systems/ic/{}/release/{}.gz",
Expand Down
2 changes: 1 addition & 1 deletion rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ impl Recovery {
&r.logger,
version,
String::from("ic-admin"),
r.binary_dir.clone(),
&r.binary_dir,
))?;
} else {
info!(r.logger, "No ic-admin version provided, skipping download.");
Expand Down
6 changes: 2 additions & 4 deletions rs/recovery/subnet_splitting/src/admin_helper.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::utils::canister_id_range_to_string;

use ic_base_types::SubnetId;
use ic_recovery::admin_helper::{
quote, AdminHelper, CommandHelper, IcAdmin, SSH_READONLY_ACCESS_ARG, SUMMARY_ARG,
Expand Down Expand Up @@ -118,10 +120,6 @@ pub(crate) fn get_halt_subnet_at_cup_height_command(
ic_admin
}

/// Renders a canister id range as `<start>:<end>`.
fn canister_id_range_to_string(canister_id_range: &CanisterIdRange) -> String {
    let first = &canister_id_range.start;
    let last = &canister_id_range.end;

    format!("{}:{}", first, last)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
2 changes: 2 additions & 0 deletions rs/recovery/subnet_splitting/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ use url::Url;
use std::path::PathBuf;

mod admin_helper;
mod state_tool_helper;
mod steps;
mod subnet_splitting;
mod utils;

#[derive(Parser)]
#[clap(version = "1.0")]
Expand Down
155 changes: 155 additions & 0 deletions rs/recovery/subnet_splitting/src/state_tool_helper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
use crate::utils::canister_id_ranges_to_strings;

use ic_base_types::SubnetId;
use ic_recovery::{
error::{RecoveryError, RecoveryResult},
file_sync_helper::{download_binary, path_exists, write_bytes},
util::block_on,
};
use ic_registry_routing_table::CanisterIdRange;
use ic_types::ReplicaVersion;
use slog::{info, Logger};

use std::{
iter::once,
path::{Path, PathBuf},
process::{Command, Stdio},
};

/// File name of the `state-tool` binary; used both to locate the binary inside the binary
/// directory and as the name of the binary to download.
const BINARY_NAME: &str = "state-tool";

/// Helper struct to simplify executing the `state-tool` binary.
///
/// The binary is expected at (or downloaded into) `bin_dir`, under the name `state-tool`.
#[derive(Clone)]
pub(crate) struct StateToolHelper {
/// Directory which contains the `state-tool` binary and into which it is downloaded when
/// it is missing.
bin_dir: PathBuf,
/// Logger used to report the executed commands, their output and download decisions.
logger: Logger,
}

impl StateToolHelper {
/// Builds a [StateToolHelper] which runs the `state-tool` binary found in `bin_dir`.
///
/// The binary is downloaded first when it is missing and a `replica_version` is given; an
/// error from that step is returned to the caller instead of the helper.
pub(crate) fn new(
    bin_dir: PathBuf,
    replica_version: Option<ReplicaVersion>,
    logger: Logger,
) -> RecoveryResult<Self> {
    let helper = Self { bin_dir, logger };
    helper.download_if_necessary(replica_version)?;

    Ok(helper)
}

/// Computes manifest of a checkpoint.
///
/// Calls `state-tool manifest --state $dir` and writes the standard output of the tool to
/// `output_path`.
///
/// # Errors
///
/// Returns an error when the tool cannot be executed, exits unsuccessfully, or when its
/// output cannot be written to `output_path`.
pub(crate) fn compute_manifest(&self, dir: &Path, output_path: &Path) -> RecoveryResult<()> {
    self.execute("manifest", Some(output_path), |command| {
        // Pass the path as an `OsStr`: `Path::display()` is lossy for non-UTF-8 paths (the
        // tool could be pointed at a different path) and needlessly allocates a `String`.
        command.arg("--state").arg(dir)
    })
}

/// Splits a manifest, to verify the manifests resulting from a subnet split.
///
/// Calls `state-tool split_manifest --path $manifest_path --from-subnet $source_subnet
/// --to-subnet $destination_subnet --subnet-type application --migrated-ranges
/// $canister_id_ranges` and writes the standard output of the tool to `output_path`.
///
/// # Errors
///
/// Returns an error when the tool cannot be executed, exits unsuccessfully, or when its
/// output cannot be written to `output_path`.
pub(crate) fn split_manifest(
    &self,
    manifest_path: &Path,
    source_subnet: SubnetId,
    destination_subnet: SubnetId,
    canister_id_ranges: &[CanisterIdRange],
    output_path: &Path,
) -> RecoveryResult<()> {
    self.execute("split_manifest", Some(output_path), |command| {
        command
            // Pass the path as an `OsStr`: `Path::display()` is lossy for non-UTF-8 paths.
            .arg("--path")
            .arg(manifest_path)
            .arg("--from-subnet")
            .arg(source_subnet.to_string())
            .arg("--to-subnet")
            .arg(destination_subnet.to_string())
            .args(["--subnet-type", "application"])
            // The flag is followed by one argument per migrated range.
            .args(
                once("--migrated-ranges".to_string())
                    .chain(canister_id_ranges_to_strings(canister_id_ranges)),
            )
    })
}

/// Verifies whether the textual representation of a manifest matches its root hash.
///
/// Calls `state-tool verify_manifest --file $manifest_path`. The output of the tool is only
/// logged, it is not written to any file.
///
/// # Errors
///
/// Returns an error when the tool cannot be executed or exits unsuccessfully.
pub(crate) fn verify_manifest(&self, manifest_path: &Path) -> RecoveryResult<()> {
    self.execute("verify_manifest", /*output_path=*/ None, |command| {
        // Pass the path as an `OsStr`: `Path::display()` is lossy for non-UTF-8 paths (the
        // tool could be pointed at a different file) and needlessly allocates a `String`.
        command.arg("--file").arg(manifest_path)
    })
}

/// Runs `state-tool $main_argument ...`, where the remaining arguments are appended by
/// `command_builder`.
///
/// The standard error of the tool is inherited from the current process. The standard
/// output is captured, logged and, when `output_path` is given, written to that file.
///
/// Fails with [RecoveryError::StateToolError] when the tool cannot be started or exits
/// unsuccessfully; an error from writing the output file is propagated as is.
fn execute(
    &self,
    main_argument: &str,
    output_path: Option<&Path>,
    command_builder: impl Fn(&mut Command) -> &mut Command,
) -> RecoveryResult<()> {
    let mut cmd = Command::new(self.binary_path());
    cmd.arg(main_argument).stderr(Stdio::inherit());
    command_builder(&mut cmd);

    info!(self.logger, "Executing {:?}", cmd);

    let output = match cmd.output() {
        Ok(output) => output,
        Err(e) => {
            return Err(RecoveryError::StateToolError(format!(
                "Failed executing the command, error: {}",
                e
            )))
        }
    };

    if output.status.success() {
        info!(
            self.logger,
            "Succeeded executing the command:\n{}",
            String::from_utf8_lossy(&output.stdout)
        );

        // Persist the captured standard output only when the caller asked for it.
        if let Some(destination) = output_path {
            write_bytes(destination, output.stdout)?;
        }

        Ok(())
    } else {
        Err(RecoveryError::StateToolError(format!(
            "The command returned non-zero value: {}",
            output.status,
        )))
    }
}

/// Downloads the `state-tool` binary into the binary directory, unless it is already there.
///
/// Nothing is downloaded (and `Ok` is returned) when the binary already exists or when no
/// `replica_version` is provided.
fn download_if_necessary(&self, replica_version: Option<ReplicaVersion>) -> RecoveryResult<()> {
    let binary_path = self.binary_path();

    if path_exists(&binary_path)? {
        info!(
            self.logger,
            "{} already exists, skipping download",
            &binary_path.display(),
        );

        return Ok(());
    }

    match replica_version {
        Some(version) => {
            block_on(download_binary(
                &self.logger,
                version,
                BINARY_NAME.to_string(),
                &self.bin_dir,
            ))?;

            Ok(())
        }
        None => {
            info!(
                self.logger,
                "No state-tool version provided, skipping download."
            );

            Ok(())
        }
    }
}

/// Returns the location of the `state-tool` binary inside the binary directory.
fn binary_path(&self) -> PathBuf {
    let mut path = self.bin_dir.clone();
    path.push(BINARY_NAME);
    path
}
}
61 changes: 61 additions & 0 deletions rs/recovery/subnet_splitting/src/steps.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::state_tool_helper::StateToolHelper;

use ic_base_types::SubnetId;
use ic_metrics::MetricsRegistry;
use ic_recovery::{
Expand All @@ -12,6 +14,9 @@ use slog::Logger;

use std::path::PathBuf;

/// Name of the file into which the manifest computed by the `state-tool` is written.
const MANIFEST_FILE_NAME: &str = "manifest.data";
/// Name of the file into which the output of the manifest splitting is written.
const EXPECTED_MANIFESTS_FILE_NAME: &str = "expected_manifests.data";

pub(crate) struct CopyWorkDirStep {
pub(crate) from: PathBuf,
pub(crate) to: PathBuf,
Expand Down Expand Up @@ -68,6 +73,7 @@ impl StateSplitStrategy {
/// Step which splits the state of a subnet.
///
/// NOTE(review): only the tail of this step's `exec` is visible here; it computes and
/// verifies the manifest of the latest checkpoint and then removes all but the highest
/// checkpoint. Confirm the rest against the full implementation.
pub(crate) struct SplitStateStep {
// presumably the subnet whose state is being split -- TODO confirm against `exec`
pub(crate) subnet_id: SubnetId,
pub(crate) state_split_strategy: StateSplitStrategy,
// Used to compute and verify the manifest of the latest checkpoint.
pub(crate) state_tool_helper: StateToolHelper,
pub(crate) work_dir: PathBuf,
pub(crate) logger: Logger,
}
Expand Down Expand Up @@ -106,6 +112,61 @@ impl Step for SplitStateStep {
)
.map_err(RecoveryError::OutputError)?;

let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
let max_checkpoint = checkpoints_dir.join(max_name);
let manifest_path = max_checkpoint.join(MANIFEST_FILE_NAME);

self.state_tool_helper
.compute_manifest(&max_checkpoint, &manifest_path)?;
self.state_tool_helper.verify_manifest(&manifest_path)?;

Recovery::remove_all_but_highest_checkpoints(&checkpoints_dir, &self.logger).map(|_| ())
}
}

/// Step which computes the manifest of the latest checkpoint and then splits it with the
/// `state-tool`, storing both results in the recovery directory.
pub(crate) struct ComputeExpectedManifestsStep {
// Directory which contains the `working_dir` with the checkpoints and into which the
// manifest files are written.
pub(crate) recovery_dir: PathBuf,
// Used to run the `manifest` and `split_manifest` commands of the `state-tool`.
pub(crate) state_tool_helper: StateToolHelper,
// Passed to the `state-tool` as the subnet the manifest is split from.
pub(crate) source_subnet_id: SubnetId,
// Passed to the `state-tool` as the subnet the manifest is split to.
pub(crate) destination_subnet_id: SubnetId,
// Passed to the `state-tool` as the migrated canister id ranges.
pub(crate) canister_id_ranges_to_move: Vec<CanisterIdRange>,
}

impl ComputeExpectedManifestsStep {
    /// Returns the path of the checkpoints directory inside the working directory.
    fn checkpoints(&self) -> PathBuf {
        let mut path = self.recovery_dir.join("working_dir");
        path.push(IC_STATE_DIR);
        path.push(CHECKPOINTS);
        path
    }
}

impl Step for ComputeExpectedManifestsStep {
    /// Describes which manifest is going to be split and between which subnets.
    fn descr(&self) -> String {
        let checkpoints_dir = self.checkpoints();

        format!(
            "Compute the expected manifests of the states resulting from splitting the manifest \
            at {} between {} (hosting all canisters in {:?}) and {} (all remaining canisters)",
            checkpoints_dir.display(),
            self.destination_subnet_id,
            self.canister_id_ranges_to_move,
            self.source_subnet_id,
        )
    }

    /// Computes the manifest of the latest checkpoint and splits it between the source and
    /// the destination subnets. Both files are written into the recovery directory.
    fn exec(&self) -> RecoveryResult<()> {
        let checkpoints_dir = self.checkpoints();
        let (latest_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
        let latest_checkpoint = checkpoints_dir.join(latest_name);

        let manifest_path = self.recovery_dir.join(MANIFEST_FILE_NAME);
        let expected_manifests_path = self.recovery_dir.join(EXPECTED_MANIFESTS_FILE_NAME);

        self.state_tool_helper
            .compute_manifest(&latest_checkpoint, &manifest_path)?;

        self.state_tool_helper.split_manifest(
            &manifest_path,
            self.source_subnet_id,
            self.destination_subnet_id,
            &self.canister_id_ranges_to_move,
            &expected_manifests_path,
        )
    }
}

0 comments on commit 27a80c3

Please sign in to comment.