Skip to content

Commit

Permalink
Merge branch 'kpop/CON-1030/common_4.1' into 'master'
Browse files Browse the repository at this point in the history
chore(ic-recovery): [CON-1030] extract the checkpoints removal logic to a function so it can be reused

`remove_all_but_highest_checkpoints` and `get_latest_checkpoint_name_and_height` will be needed by the subnet splitting tool. 

Closes CON-1030

See merge request dfinity-lab/public/ic!12834
  • Loading branch information
kpop-dfinity committed Jun 8, 2023
2 parents 5a082e1 + bd394c0 commit 4497886
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 57 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions rs/recovery/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ DEPENDENCIES = [

DEV_DEPENDENCIES = [
"//rs/test_utilities",
"//rs/test_utilities/tmpdir",
"@crate_index//:tempfile",
]

Expand Down
1 change: 1 addition & 0 deletions rs/recovery/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ url = { version = "2.1.1", features = ["serde"] }
[dev-dependencies]
tempfile = "3.1.0"
ic-test-utilities = { path = "../test_utilities" }
ic-test-utilities-tmpdir = { path = "../test_utilities/tmpdir" }

[[bin]]
name = "ic-recovery"
Expand Down
129 changes: 127 additions & 2 deletions rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
//! returned in form of a recovery [Step], holding the human-readable (and
//! reproducible) description of the step, as well as its potential automatic
//! execution.
use crate::{cli::wait_for_confirmation, file_sync_helper::read_file};
use crate::{
cli::wait_for_confirmation,
file_sync_helper::{read_file, remove_dir},
};
use admin_helper::{AdminHelper, IcAdmin, RegistryParams};
use command_helper::exec_cmd;
use error::{RecoveryError, RecoveryResult};
Expand Down Expand Up @@ -43,7 +46,7 @@ use std::{
};
use steps::*;
use url::Url;
use util::block_on;
use util::{block_on, parse_hex_str};

pub mod admin_helper;
pub mod app_subnet_recovery;
Expand Down Expand Up @@ -348,6 +351,28 @@ impl Recovery {
write_bytes(path, bytes)
}

/// Removes all the checkpoints except the "highest" one.
///
/// Returns an error when there are no checkpoints.
pub fn remove_all_but_highest_checkpoints(
checkpoint_path: &Path,
logger: &Logger,
) -> RecoveryResult<Height> {
let checkpoints = Self::get_checkpoint_names(checkpoint_path)?;
let (max_name, max_height) = Self::get_latest_checkpoint_name_and_height(checkpoint_path)?;

for checkpoint in checkpoints {
if checkpoint == max_name {
continue;
}

info!(logger, "Deleting checkpoint {}", checkpoint);
remove_dir(&checkpoint_path.join(checkpoint))?;
}

Ok(max_height)
}

/// Return a recovery [AdminStep] to halt or unhalt the given subnet
pub fn halt_subnet(&self, subnet_id: SubnetId, is_halted: bool, keys: &[String]) -> impl Step {
AdminStep {
Expand Down Expand Up @@ -536,6 +561,21 @@ impl Recovery {
Ok(res)
}

/// Get the name and the height of the latest checkpoint currently on disk
///
/// Returns an error when there are no checkpoints.
pub fn get_latest_checkpoint_name_and_height(
checkpoints_path: &Path,
) -> RecoveryResult<(String, Height)> {
Self::get_checkpoint_names(checkpoints_path)?
.into_iter()
.map(|name| parse_hex_str(&name).map(|height| (name, Height::from(height))))
.collect::<RecoveryResult<Vec<_>>>()?
.into_iter()
.max_by_key(|(_name, height)| *height)
.ok_or_else(|| RecoveryError::invalid_output_error("No checkpoints"))
}

/// Parse and return the output of the replay step.
pub fn get_replay_output(&self) -> RecoveryResult<StateParams> {
replay_helper::read_output(self.work_dir.join(replay_helper::OUTPUT_FILE_NAME))
Expand Down Expand Up @@ -1092,3 +1132,88 @@ pub fn get_member_ips(
})
.collect()
}

#[cfg(test)]
mod tests {
use super::*;
use ic_test_utilities_tmpdir::tmpdir;

#[test]
fn get_latest_checkpoint_name_and_height_test() {
let checkpoints_dir = tmpdir("checkpoints");
create_fake_checkpoint_dirs(
checkpoints_dir.path(),
&[
/*height=64800*/ "000000000000fd20",
/*height=64900*/ "000000000000fd84",
],
);

let (name, height) =
Recovery::get_latest_checkpoint_name_and_height(checkpoints_dir.path())
.expect("Failed getting the latest checkpoint name and height");

assert_eq!(name, "000000000000fd84");
assert_eq!(height, Height::from(64900));
}

#[test]
fn get_latest_checkpoint_name_and_height_returns_error_on_invalid_checkpoint_name() {
let checkpoints_dir = tmpdir("checkpoints");
create_fake_checkpoint_dirs(
checkpoints_dir.path(),
&[
/*height=64800*/ "000000000000fd20",
/*height=64900*/ "000000000000fd84",
/*height=???*/ "invalid_checkpoint_name",
],
);

assert!(Recovery::get_latest_checkpoint_name_and_height(checkpoints_dir.path()).is_err());
}

#[test]
fn get_latest_checkpoint_name_and_height_returns_error_when_no_checkpoints() {
let checkpoints_dir = tmpdir("checkpoints");

assert!(Recovery::get_latest_checkpoint_name_and_height(checkpoints_dir.path()).is_err());
}

#[test]
fn remove_all_but_highest_checkpoints_test() {
let logger = util::make_logger();
let checkpoints_dir = tmpdir("checkpoints");
create_fake_checkpoint_dirs(
checkpoints_dir.path(),
&[
/*height=64800*/ "000000000000fd20",
/*height=64900*/ "000000000000fd84",
],
);

let height = Recovery::remove_all_but_highest_checkpoints(checkpoints_dir.path(), &logger)
.expect("Failed to remove checkpoints");

assert_eq!(height, Height::from(64900));
assert_eq!(
Recovery::get_checkpoint_names(checkpoints_dir.path()).unwrap(),
vec![String::from("000000000000fd84")]
);
}

#[test]
fn remove_all_but_highest_checkpoints_returns_error_when_no_checkpoints() {
let logger = util::make_logger();
let checkpoints_dir = tmpdir("checkpoints");

assert!(
Recovery::remove_all_but_highest_checkpoints(checkpoints_dir.path(), &logger).is_err()
);
}

fn create_fake_checkpoint_dirs(root: &Path, checkpoint_names: &[&str]) {
for checkpoint_name in checkpoint_names {
create_dir(&root.join(checkpoint_name)).unwrap();
}
}
}
82 changes: 27 additions & 55 deletions rs/recovery/src/steps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -408,60 +408,36 @@ impl Step for ReplayStep {
fn exec(&self) -> RecoveryResult<()> {
let checkpoint_path = self.work_dir.join("data").join(IC_CHECKPOINTS_PATH);

let checkpoints = Recovery::get_checkpoint_names(&checkpoint_path)?;

let checkpoint_heights = checkpoints
.iter()
.map(|c| parse_hex_str(c))
.collect::<RecoveryResult<Vec<u64>>>()?;

let delete_checkpoints = |except: &u64| {
Recovery::get_checkpoint_names(&checkpoint_path)?
.iter()
.filter(|c| parse_hex_str(c).unwrap() != *except)
.map(|c| {
info!(self.logger, "Deleting checkpoint {}", c);
remove_dir(&checkpoint_path.join(c))
})
.collect::<RecoveryResult<Vec<_>>>()
};

if let Some(max) = checkpoint_heights.iter().max() {
delete_checkpoints(max)?;
let height = Height::from(*max);
let checkpoint_height =
Recovery::remove_all_but_highest_checkpoints(&checkpoint_path, &self.logger)?;

let state_params = block_on(replay_helper::replay(
self.subnet_id,
self.config.clone(),
self.canister_caller_id,
self.work_dir.join("data"),
self.subcmd.as_ref().map(|c| c.cmd.clone()),
self.result.clone(),
))?;

let latest_height = state_params.height;
let state_hash = state_params.hash;
let state_params = block_on(replay_helper::replay(
self.subnet_id,
self.config.clone(),
self.canister_caller_id,
self.work_dir.join("data"),
self.subcmd.as_ref().map(|c| c.cmd.clone()),
self.result.clone(),
))?;

info!(self.logger, "Checkpoint height: {}", height);
info!(self.logger, "Height after replay: {}", latest_height);
let latest_height = state_params.height;
let state_hash = state_params.hash;

if latest_height < height {
return Err(RecoveryError::invalid_output_error(
"Replay height and checkpoint height diverged.",
));
}
info!(self.logger, "Checkpoint height: {}", checkpoint_height);
info!(self.logger, "Height after replay: {}", latest_height);

info!(self.logger, "State hash: {}", state_hash);
if latest_height < checkpoint_height {
return Err(RecoveryError::invalid_output_error(
"Replay height and checkpoint height diverged.",
));
}

info!(self.logger, "Deleting old checkpoints");
delete_checkpoints(&latest_height.get())?;
info!(self.logger, "State hash: {}", state_hash);

return Ok(());
}
info!(self.logger, "Deleting old checkpoints");
Recovery::remove_all_but_highest_checkpoints(&checkpoint_path, &self.logger)?;

Err(RecoveryError::invalid_output_error(
"Did not find any checkpoints",
))
Ok(())
}
}

Expand Down Expand Up @@ -559,19 +535,15 @@ impl Step for UploadAndRestartStep {
let checkpoint_path = self.data_src.join(CHECKPOINTS);
let checkpoints = Recovery::get_checkpoint_names(&checkpoint_path)?;

if checkpoints.len() != 1 {
let [max_checkpoint] = checkpoints.as_slice() else {
return Err(RecoveryError::invalid_output_error(
"Found multiple checkpoints in upload directory".to_string(),
));
}
"Found multiple checkpoints in upload directory"));
};

let max_checkpoint = checkpoints.into_iter().max().ok_or_else(|| {
RecoveryError::invalid_output_error("No checkpoints found".to_string())
})?;
let replay_height =
replay_helper::read_output(self.work_dir.join(replay_helper::OUTPUT_FILE_NAME))?.height;

if parse_hex_str(&max_checkpoint)? != replay_height.get() {
if parse_hex_str(max_checkpoint)? != replay_height.get() {
return Err(RecoveryError::invalid_output_error(format!(
"Latest checkpoint height ({}) doesn't match replay output ({})",
max_checkpoint, replay_height
Expand Down

0 comments on commit 4497886

Please sign in to comment.