Skip to content

Commit

Permalink
Merge branch 'kpop/CON-1030/more_steps' into 'master'
Browse files Browse the repository at this point in the history
feat(ic-recovery): [CON-1030] reuse some of the steps from `ic-recovery` in `subnet-splitting-tool`

 

See merge request dfinity-lab/public/ic!12891
  • Loading branch information
kpop-dfinity committed Jun 14, 2023
2 parents a6a9e80 + f920336 commit 4639aac
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 38 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions rs/recovery/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ DEPENDENCIES = [
"//rs/registry/nns_data_provider",
"//rs/registry/subnet_features",
"//rs/replay",
"//rs/state_manager",
"//rs/types/base_types",
"//rs/types/ic00_types",
"//rs/types/types",
Expand Down
1 change: 1 addition & 0 deletions rs/recovery/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ic-registry-nns-data-provider = { path = "../registry/nns_data_provider" }
ic-registry-replicator = { path = "../orchestrator/registry_replicator" }
ic-registry-subnet-features = { path = "../registry/subnet_features" }
ic-replay = { path = "../replay" }
ic-state-manager = { path = "../state_manager" }
ic-ic00-types = { path = "../types/ic00_types" }
ic-types = { path = "../types/types" }
prost = "0.11.0"
Expand Down
19 changes: 12 additions & 7 deletions rs/recovery/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use ic_http_utils::file_downloader::FileDownloadError;
use ic_state_manager::CheckpointError;
use std::{
error::Error,
fmt::{self, Display},
Expand All @@ -20,6 +21,7 @@ pub enum RecoveryError {
ParsingError(serde_json::Error),
SerializationError(serde_json::Error),
UnexpectedError(String),
CheckpointError(String, CheckpointError),
StepSkipped,
}

Expand Down Expand Up @@ -60,28 +62,31 @@ impl fmt::Display for RecoveryError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
RecoveryError::IoError(msg, e) => {
write!(f, "IO error, message: {:?}, error: {:?}", msg, e)
write!(f, "IO error, message: {}, error: {}", msg, e)
}
RecoveryError::CommandError(code, msg) => {
write!(f, "Command error, message: {:?}, code: {:?}", msg, code)
write!(f, "Command error, message: {}, code: {:?}", msg, code)
}
RecoveryError::OutputError(msg) => {
write!(f, "Output error, message: {:?}", msg)
write!(f, "Output error, message: {}", msg)
}
RecoveryError::DownloadError(msg, e) => {
write!(f, "Download error, message: {:?}, error: {:?}", msg, e)
write!(f, "Download error, message: {}, error: {}", msg, e)
}
RecoveryError::UnexpectedError(msg) => {
write!(f, "Unexpected error, message: {:?}", msg)
write!(f, "Unexpected error, message: {}", msg)
}
RecoveryError::StepSkipped => {
write!(f, "Recovery step skipped.")
}
RecoveryError::ParsingError(e) => {
write!(f, "Parsing error, error: {:?}", e)
write!(f, "Parsing error, error: {}", e)
}
RecoveryError::SerializationError(e) => {
write!(f, "Serialization error, error: {:?}", e)
write!(f, "Serialization error, error: {}", e)
}
RecoveryError::CheckpointError(msg, e) => {
write!(f, "Checkpoint error, message: {}, error: {}", msg, e)
}
}
}
Expand Down
1 change: 1 addition & 0 deletions rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,7 @@ impl Recovery {
data_src,
require_confirmation: self.ssh_confirmation,
key_file: self.key_file.clone(),
check_ic_replay_height: true,
}
}

Expand Down
18 changes: 11 additions & 7 deletions rs/recovery/src/steps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ pub struct UploadAndRestartStep {
pub data_src: PathBuf,
pub require_confirmation: bool,
pub key_file: Option<PathBuf>,
pub check_ic_replay_height: bool,
}

impl Step for UploadAndRestartStep {
Expand Down Expand Up @@ -546,14 +547,17 @@ impl Step for UploadAndRestartStep {
"Found multiple checkpoints in upload directory"));
};

let replay_height =
replay_helper::read_output(self.work_dir.join(replay_helper::OUTPUT_FILE_NAME))?.height;
if self.check_ic_replay_height {
let replay_height =
replay_helper::read_output(self.work_dir.join(replay_helper::OUTPUT_FILE_NAME))?
.height;

if parse_hex_str(max_checkpoint)? != replay_height.get() {
return Err(RecoveryError::invalid_output_error(format!(
"Latest checkpoint height ({}) doesn't match replay output ({})",
max_checkpoint, replay_height
)));
if parse_hex_str(max_checkpoint)? != replay_height.get() {
return Err(RecoveryError::invalid_output_error(format!(
"Latest checkpoint height ({}) doesn't match replay output ({})",
max_checkpoint, replay_height
)));
}
}

let ic_checkpoints_path = format!("{}/{}", IC_DATA_PATH, IC_CHECKPOINTS_PATH);
Expand Down
191 changes: 167 additions & 24 deletions rs/recovery/subnet_splitting/src/subnet_splitting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@ use crate::admin_helper::{
use clap::Parser;
use ic_base_types::SubnetId;
use ic_recovery::{
cli::read_optional,
error::RecoveryResult,
cli::{consent_given, read_optional},
error::{RecoveryError, RecoveryResult},
recovery_iterator::RecoveryIterator,
recovery_state::{HasRecoveryState, RecoveryState},
steps::{AdminStep, Step},
NeuronArgs, Recovery, RecoveryArgs,
steps::{AdminStep, Step, UploadAndRestartStep, WaitForCUPStep},
NeuronArgs, Recovery, RecoveryArgs, CHECKPOINTS, IC_REGISTRY_LOCAL_STORE, IC_STATE_DIR,
};
use ic_registry_routing_table::CanisterIdRange;
use ic_state_manager::manifest::{manifest_from_path, manifest_hash};
use serde::{Deserialize, Serialize};
use slog::Logger;
use std::{iter::Peekable, net::IpAddr};
use std::{iter::Peekable, net::IpAddr, path::PathBuf};
use strum::{EnumMessage, IntoEnumIterator};
use strum_macros::{EnumIter, EnumString};

const DESTINATION_WORK_DIR: &str = "destination_work_dir";

#[derive(
Debug,
Copy,
Expand Down Expand Up @@ -103,6 +106,12 @@ pub(crate) struct SubnetSplitting {
logger: Logger,
}

/// Selects which of the two subnets taking part in the split an operation
/// applies to.
///
/// The helper methods on `SubnetSplitting` use this value to pick the matching
/// subnet id, upload node and working directory.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum TargetSubnet {
    /// The subnet identified by `source_subnet_id`.
    Source,
    /// The subnet identified by `destination_subnet_id`.
    Destination,
}

impl SubnetSplitting {
pub(crate) fn new(
logger: Logger,
Expand All @@ -122,6 +131,94 @@ impl SubnetSplitting {
logger,
}
}

/// Builds the step which unhalts the given subnet.
///
/// Delegates to `Recovery::halt_subnet` with `is_halted` set to `false` and
/// an empty list of keys.
fn unhalt(&self, target_subnet: TargetSubnet) -> impl Step {
    let subnet_id = self.subnet_id(target_subnet);
    let is_halted = false;

    self.recovery
        .halt_subnet(subnet_id, is_halted, /*keys=*/ &[])
}

/// Builds the step which proposes a recovery CUP for the given subnet.
///
/// The latest checkpoint is looked up under `<work_dir>/IC_STATE_DIR/CHECKPOINTS`,
/// its manifest is computed from the files on disk, and the hex-encoded
/// manifest hash is passed on as the state hash of the proposal.
///
/// # Errors
///
/// * Propagates the error of `Recovery::get_latest_checkpoint_name_and_height`.
/// * Returns `RecoveryError::CheckpointError` when the manifest cannot be
///   computed from the latest checkpoint directory.
/// * Propagates whatever `Recovery::update_recovery_cup` returns.
fn propose_cup(&self, target_subnet: TargetSubnet) -> RecoveryResult<impl Step> {
    let checkpoints_dir = self
        .work_dir(target_subnet)
        .join(IC_STATE_DIR)
        .join(CHECKPOINTS);

    let (latest_name, latest_height) =
        Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
    let latest_checkpoint_dir = checkpoints_dir.join(latest_name);

    let manifest = match manifest_from_path(&latest_checkpoint_dir) {
        Ok(manifest) => manifest,
        Err(err) => {
            let message = format!(
                "Failed to read the manifest from path {}",
                latest_checkpoint_dir.display()
            );
            return Err(RecoveryError::CheckpointError(message, err));
        }
    };
    let state_hash = hex::encode(manifest_hash(&manifest));

    let subnet_id = self.subnet_id(target_subnet);
    let recovery_height = Recovery::get_recovery_height(latest_height);

    self.recovery.update_recovery_cup(
        subnet_id,
        recovery_height,
        state_hash,
        /*replacement_nodes=*/ &[],
        /*registry_params=*/ None,
        /*ecdsa_subnet_id=*/ None,
    )
}

/// Builds the step which uploads the state of the given subnet to its upload
/// node and restarts it.
///
/// The uploaded data is taken from `<work_dir>/IC_STATE_DIR` of the target
/// subnet. The step is created with `check_ic_replay_height: false`, i.e.
/// without the comparison against the `ic-replay` output height.
///
/// # Errors
///
/// Returns `RecoveryError::StepSkipped` when no upload node is configured for
/// the target subnet.
fn upload_and_restart_step(&self, target_subnet: TargetSubnet) -> RecoveryResult<impl Step> {
    let Some(node_ip) = self.upload_node(target_subnet) else {
        return Err(RecoveryError::StepSkipped);
    };

    let work_dir = self.work_dir(target_subnet);
    let data_src = work_dir.join(IC_STATE_DIR);

    Ok(UploadAndRestartStep {
        logger: self.recovery.logger.clone(),
        node_ip,
        work_dir,
        data_src,
        require_confirmation: true,
        key_file: self.recovery.key_file.clone(),
        check_ic_replay_height: false,
    })
}

/// Builds the `WaitForCUPStep` targeting the upload node of the given subnet.
///
/// # Errors
///
/// Returns `RecoveryError::StepSkipped` when no upload node is configured for
/// the target subnet.
fn wait_for_cup_step(&self, target_subnet: TargetSubnet) -> RecoveryResult<impl Step> {
    self.upload_node(target_subnet)
        .map(|node_ip| WaitForCUPStep {
            logger: self.recovery.logger.clone(),
            node_ip,
            work_dir: self.work_dir(target_subnet),
        })
        .ok_or(RecoveryError::StepSkipped)
}

/// Returns the configured upload node IP for the given subnet, or `None` if
/// it has not been set.
fn upload_node(&self, target_subnet: TargetSubnet) -> Option<IpAddr> {
    let params = &self.params;

    match target_subnet {
        TargetSubnet::Source => params.upload_node_source,
        TargetSubnet::Destination => params.upload_node_destination,
    }
}

/// Returns the id of the given subnet as stored in the parameters.
fn subnet_id(&self, target_subnet: TargetSubnet) -> SubnetId {
    let params = &self.params;

    match target_subnet {
        TargetSubnet::Source => params.source_subnet_id,
        TargetSubnet::Destination => params.destination_subnet_id,
    }
}

/// Returns the working directory used for the given subnet.
///
/// The source subnet uses the recovery working directory itself. The
/// destination subnet uses that path with its final component replaced by
/// `DESTINATION_WORK_DIR`.
fn work_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
    let source_dir = &self.recovery.work_dir;

    match target_subnet {
        TargetSubnet::Source => source_dir.clone(),
        TargetSubnet::Destination => source_dir.with_file_name(DESTINATION_WORK_DIR),
    }
}
}

impl RecoveryIterator<StepType, StepTypeIter> for SubnetSplitting {
Expand Down Expand Up @@ -152,18 +249,39 @@ impl RecoveryIterator<StepType, StepTypeIter> for SubnetSplitting {
}
}

StepType::DownloadStateFromSourceSubnet => todo!(),
StepType::CopyDir => todo!(),
StepType::DownloadStateFromSourceSubnet => {
if self.params.download_node_source.is_none() {
self.params.download_node_source =
read_optional(&self.logger, "Enter download IP on the Source Subnet:");
}

self.params.keep_downloaded_state = Some(consent_given(
&self.logger,
"Preserve original downloaded state locally?",
));
}

StepType::SplitOutSourceState => todo!(),
StepType::SplitOutDestinationState => todo!(),
StepType::ProposeCupForSourceSubnet => todo!(),
StepType::UploadStateToSourceSubnet => todo!(),
StepType::ProposeCupForDestinationSubnet => todo!(),
StepType::UploadStateToDestinationSubnet => todo!(),
StepType::WaitForCUPOnSourceSubnet => todo!(),
StepType::WaitForCUPOnDestinationSubnet => todo!(),
StepType::UnhaltSourceSubnet => todo!(),
StepType::UnhaltDestinationSubnet => todo!(),

StepType::UploadStateToSourceSubnet => {
if self.params.upload_node_source.is_none() {
self.params.upload_node_source = read_optional(
&self.logger,
"Enter IP of node in the Source Subnet with admin access: ",
);
}
}

StepType::UploadStateToDestinationSubnet => {
if self.params.upload_node_destination.is_none() {
self.params.upload_node_destination = read_optional(
&self.logger,
"Enter IP of node in the Destination Subnet with admin access: ",
);
}
}

StepType::Cleanup => todo!(),
_ => (),
}
Expand Down Expand Up @@ -203,18 +321,43 @@ impl RecoveryIterator<StepType, StepTypeIter> for SubnetSplitting {
}
.into(),

StepType::DownloadStateFromSourceSubnet => todo!(),
StepType::DownloadStateFromSourceSubnet => {
let Some(node_ip) = self.params.download_node_source else {
return Err(RecoveryError::StepSkipped);
};

self.recovery
.get_download_state_step(
node_ip,
self.params.pub_key.is_some(),
self.params.keep_downloaded_state == Some(true),
/*additional_excludes=*/
vec!["orchestrator", "ic_consensus_pool", IC_REGISTRY_LOCAL_STORE],
)
.into()
}

StepType::CopyDir => todo!(),
StepType::SplitOutSourceState => todo!(),
StepType::SplitOutDestinationState => todo!(),
StepType::ProposeCupForSourceSubnet => todo!(),
StepType::UploadStateToSourceSubnet => todo!(),
StepType::ProposeCupForDestinationSubnet => todo!(),
StepType::UploadStateToDestinationSubnet => todo!(),
StepType::WaitForCUPOnSourceSubnet => todo!(),
StepType::WaitForCUPOnDestinationSubnet => todo!(),
StepType::UnhaltSourceSubnet => todo!(),
StepType::UnhaltDestinationSubnet => todo!(),
StepType::ProposeCupForSourceSubnet => self.propose_cup(TargetSubnet::Source)?.into(),
StepType::UploadStateToSourceSubnet => {
self.upload_and_restart_step(TargetSubnet::Source)?.into()
}
StepType::ProposeCupForDestinationSubnet => {
self.propose_cup(TargetSubnet::Destination)?.into()
}
StepType::UploadStateToDestinationSubnet => self
.upload_and_restart_step(TargetSubnet::Destination)?
.into(),
StepType::WaitForCUPOnSourceSubnet => {
self.wait_for_cup_step(TargetSubnet::Source)?.into()
}
StepType::WaitForCUPOnDestinationSubnet => {
self.wait_for_cup_step(TargetSubnet::Destination)?.into()
}
StepType::UnhaltSourceSubnet => self.unhalt(TargetSubnet::Source).into(),
StepType::UnhaltDestinationSubnet => self.unhalt(TargetSubnet::Destination).into(),

StepType::CompleteCanisterMigration => AdminStep {
logger: self.recovery.logger.clone(),
Expand Down

0 comments on commit 4639aac

Please sign in to comment.