Skip to content

Commit

Permalink
Merge branch 'kpop/CON-1066/verify_hashes' into 'master'
Browse files Browse the repository at this point in the history
chore(ic-recovery): [CON-1066] Verify that the hashes of the states after split match the expected hashes computed in one of the previous steps

Also, I've created a new struct `Layout` that describes the layout of the working directory. During the development of `subnet_splitting` I made too many mistakes by mixing up the paths. By keeping all the paths in one file, I hope it will be harder to make such mistakes in the future...

Closes CON-1066 

Closes CON-1066

See merge request dfinity-lab/public/ic!13220
  • Loading branch information
kpop-dfinity committed Jun 28, 2023
2 parents cfabc9a + 396da51 commit cce0628
Show file tree
Hide file tree
Showing 12 changed files with 335 additions and 91 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions rs/recovery/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pub enum RecoveryError {
StateToolError(String),
CheckpointError(String, CheckpointError),
RegistryError(String),
ValidationFailed(String),
StepSkipped,
}

Expand Down Expand Up @@ -92,6 +93,9 @@ impl fmt::Display for RecoveryError {
}
RecoveryError::RegistryError(msg) => write!(f, "Registry error, message: {}", msg),
RecoveryError::StateToolError(msg) => write!(f, "State tool error, message: {}", msg),
RecoveryError::ValidationFailed(msg) => {
write!(f, "Validation failed, message: {}", msg)
}
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions rs/recovery/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ pub fn parse_hex_str(string: &str) -> RecoveryResult<u64> {
})
}

pub fn subnet_id_from_str(s: &str) -> Result<SubnetId, String> {
pub fn subnet_id_from_str(s: &str) -> RecoveryResult<SubnetId> {
PrincipalId::from_str(s)
.map_err(|e| format!("Unable to parse subnet_id {:?}", e))
.map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse subnet_id {:?}", e)))
.map(SubnetId::from)
}

pub fn node_id_from_str(s: &str) -> Result<NodeId, String> {
pub fn node_id_from_str(s: &str) -> RecoveryResult<NodeId> {
PrincipalId::from_str(s)
.map_err(|e| format!("Unable to parse node_id {:?}", e))
.map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse node_id {:?}", e)))
.map(NodeId::from)
}

Expand Down
2 changes: 2 additions & 0 deletions rs/recovery/subnet_splitting/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ DEPENDENCIES = [
]

DEV_DEPENDENCIES = [
"//rs/test_utilities/tmpdir",
]

MACRO_DEPENDENCIES = [
Expand All @@ -44,5 +45,6 @@ rust_binary(
rust_test(
name = "subnet_splitting_tool_test",
crate = "subnet-splitting-tool",
data = ["test_data/fake_expected_manifests.data"],
deps = DEPENDENCIES + DEV_DEPENDENCIES,
)
3 changes: 3 additions & 0 deletions rs/recovery/subnet_splitting/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ slog = { version = "2.5.2", features = ["release_max_level_trace"] }
strum = "0.24.1"
strum_macros = "0.24.1"

[dev-dependencies]
ic-test-utilities-tmpdir = { path = "../../test_utilities/tmpdir" }

[[bin]]
name = "subnet-splitting-tool"
path = "src/main.rs"
76 changes: 76 additions & 0 deletions rs/recovery/subnet_splitting/src/layout.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
use crate::target_subnet::TargetSubnet;

use ic_base_types::SubnetId;
use ic_recovery::{error::RecoveryResult, Recovery, CHECKPOINTS, IC_STATE_DIR};

use std::path::{Path, PathBuf};

/// Describes the layout of the working directory of subnet splitting:
///
/// ```text
/// |-- root/
/// |   |-- ${source_subnet_id}.manifest
/// |   |-- ${destination_subnet_id}.manifest
/// |   |-- original_source_manifest.data
/// |   |-- expected_manifests.data
/// |   |-- (destination_)working_dir/
/// |   |   |-- data/
/// |   |   |   |-- ic_state/checkpoints/
/// |   |   |   |   |-- 1/
/// |   |   |   |   |-- 2/
/// ```
///
/// The destination working directory is always `root/destination_working_dir`;
/// the source working directory is whatever `Recovery::work_dir` pointed to when
/// the layout was built (presumably `root/working_dir` -- confirm against
/// `ic_recovery`).
#[derive(Clone)]
pub(crate) struct Layout {
    /// Top-level recovery directory; per-subnet manifests and the destination
    /// working directory are placed directly beneath it.
    root: PathBuf,
    /// Working directory of the source subnet, copied from `Recovery::work_dir`.
    source_working_dir: PathBuf,

    /// `root/original_source_manifest.data`.
    original_state_manifest: PathBuf,
    /// `root/expected_manifests.data`.
    expected_manifests: PathBuf,
}

impl Layout {
    /// Builds the layout from an existing [`Recovery`]: the root is its
    /// `recovery_dir`, the source working directory is its `work_dir`, and the
    /// two manifest data files live directly under the root.
    pub(crate) fn new(recovery: &Recovery) -> Self {
        let root = recovery.recovery_dir.clone();

        Self {
            original_state_manifest: root.join("original_source_manifest.data"),
            expected_manifests: root.join("expected_manifests.data"),
            source_working_dir: recovery.work_dir.clone(),
            root,
        }
    }

    /// Working directory of the given subnet: the stored source working
    /// directory for `Source`, `root/destination_working_dir` for `Destination`.
    pub(crate) fn work_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
        match target_subnet {
            TargetSubnet::Destination => self.root.join("destination_working_dir"),
            TargetSubnet::Source => self.source_working_dir.clone(),
        }
    }

    /// `IC_STATE_DIR` appended to the working directory of the given subnet.
    pub(crate) fn ic_state_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
        let mut state_dir = self.work_dir(target_subnet);
        state_dir.push(IC_STATE_DIR);
        state_dir
    }

    /// `CHECKPOINTS` appended to the state directory of the given subnet.
    pub(crate) fn checkpoints_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
        let mut checkpoints = self.ic_state_dir(target_subnet);
        checkpoints.push(CHECKPOINTS);
        checkpoints
    }

    /// Path of the checkpoint, inside the subnet's checkpoints directory, whose
    /// name is reported by `Recovery::get_latest_checkpoint_name_and_height`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by that lookup.
    pub(crate) fn latest_checkpoint_dir(
        &self,
        target_subnet: TargetSubnet,
    ) -> RecoveryResult<PathBuf> {
        let checkpoints = self.checkpoints_dir(target_subnet);
        // Only the checkpoint name is needed here; the height is ignored.
        let latest_name = Recovery::get_latest_checkpoint_name_and_height(&checkpoints)?.0;

        Ok(checkpoints.join(latest_name))
    }

    /// Path of the `original_source_manifest.data` file under the root.
    pub(crate) fn original_state_manifest_file(&self) -> &Path {
        self.original_state_manifest.as_path()
    }

    /// Path of the `expected_manifests.data` file under the root.
    pub(crate) fn expected_manifests_file(&self) -> &Path {
        self.expected_manifests.as_path()
    }

    /// Path of the `${subnet_id}.manifest` file under the root.
    pub(crate) fn actual_manifest_file(&self, subnet_id: SubnetId) -> PathBuf {
        let file_name = format!("{}.manifest", subnet_id);
        self.root.join(file_name)
    }
}
2 changes: 2 additions & 0 deletions rs/recovery/subnet_splitting/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ use url::Url;
use std::path::PathBuf;

mod admin_helper;
mod layout;
mod state_tool_helper;
mod steps;
mod subnet_splitting;
mod target_subnet;
mod utils;

#[derive(Parser)]
Expand Down
113 changes: 67 additions & 46 deletions rs/recovery/subnet_splitting/src/steps.rs
Original file line number Diff line number Diff line change
@@ -1,43 +1,46 @@
use crate::state_tool_helper::StateToolHelper;
use crate::{
layout::Layout,
state_tool_helper::StateToolHelper,
target_subnet::TargetSubnet,
utils::{find_expected_state_hash_for_subnet_id, get_state_hash},
};

use ic_base_types::SubnetId;
use ic_metrics::MetricsRegistry;
use ic_recovery::{
error::{RecoveryError, RecoveryResult},
file_sync_helper::rsync,
steps::Step,
Recovery, CHECKPOINTS, CUPS_DIR, IC_REGISTRY_LOCAL_STORE, IC_STATE_DIR,
Recovery, CUPS_DIR, IC_REGISTRY_LOCAL_STORE,
};
use ic_registry_routing_table::CanisterIdRange;
use ic_state_manager::split::resolve_ranges_and_split;
use slog::Logger;

use std::path::PathBuf;

const MANIFEST_FILE_NAME: &str = "manifest.data";
const EXPECTED_MANIFESTS_FILE_NAME: &str = "expected_manifests.data";
use slog::{info, Logger};

pub(crate) struct CopyWorkDirStep {
pub(crate) from: PathBuf,
pub(crate) to: PathBuf,
pub(crate) layout: Layout,
pub(crate) logger: Logger,
}

impl Step for CopyWorkDirStep {
fn descr(&self) -> String {
format!(
"Copying {} to {}. Excluding cups and registry local store",
self.from.display(),
self.to.display()
self.layout.work_dir(TargetSubnet::Source).display(),
self.layout.work_dir(TargetSubnet::Destination).display(),
)
}

fn exec(&self) -> RecoveryResult<()> {
rsync(
&self.logger,
vec![CUPS_DIR, IC_REGISTRY_LOCAL_STORE],
&format!("{}/", self.from.display()),
&self.to.display().to_string(),
&format!("{}/", self.layout.work_dir(TargetSubnet::Source).display()),
&self
.layout
.work_dir(TargetSubnet::Destination)
.display()
.to_string(),
/*require_confirmation=*/ false,
/*key_file=*/ None,
)
Expand Down Expand Up @@ -74,7 +77,8 @@ pub(crate) struct SplitStateStep {
pub(crate) subnet_id: SubnetId,
pub(crate) state_split_strategy: StateSplitStrategy,
pub(crate) state_tool_helper: StateToolHelper,
pub(crate) work_dir: PathBuf,
pub(crate) layout: Layout,
pub(crate) target_subnet: TargetSubnet,
pub(crate) logger: Logger,
}

Expand All @@ -86,24 +90,23 @@ impl Step for SplitStateStep {
and removing all but the highest checkpoints. Work dir: {}",
retained_canister_id_ranges,
self.subnet_id,
self.work_dir.display(),
self.layout.work_dir(self.target_subnet).display(),
),
StateSplitStrategy::Drop(dropped_canister_id_ranges) => format!(
"Dropping the canister id ranges {:#?} from state for the subnet {}. \
and removing all but the highest checkpoints. Work dir: {}",
dropped_canister_id_ranges,
self.subnet_id,
self.work_dir.display(),
self.layout.work_dir(self.target_subnet).display(),
),
}
}

fn exec(&self) -> RecoveryResult<()> {
let state_dir = self.work_dir.join(IC_STATE_DIR);
let checkpoints_dir = state_dir.join(CHECKPOINTS);

// 1. Split the state.
info!(self.logger, "Splitting the state");
resolve_ranges_and_split(
state_dir,
self.layout.ic_state_dir(self.target_subnet),
self.subnet_id.get(),
self.state_split_strategy.retained_canister_id_ranges(),
self.state_split_strategy.dropped_canister_id_ranges(),
Expand All @@ -112,61 +115,79 @@ impl Step for SplitStateStep {
)
.map_err(RecoveryError::OutputError)?;

let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
let max_checkpoint = checkpoints_dir.join(max_name);
let manifest_path = max_checkpoint.join(MANIFEST_FILE_NAME);
// 2. Compute the manifest
info!(self.logger, "Computing the state manifest");
let latest_checkpoint_dir = self.layout.latest_checkpoint_dir(self.target_subnet)?;
let manifest_path = self.layout.actual_manifest_file(self.subnet_id);

self.state_tool_helper
.compute_manifest(&max_checkpoint, &manifest_path)?;
self.state_tool_helper.verify_manifest(&manifest_path)?;
.compute_manifest(&latest_checkpoint_dir, &manifest_path)?;

// 3. Validate the manifest
info!(self.logger, "Validating the manifest");
self.state_tool_helper
.verify_manifest(&manifest_path)
.map_err(|err| {
RecoveryError::ValidationFailed(format!("Manifest verification failed: {}", err))
})?;

let expected_state_hash = find_expected_state_hash_for_subnet_id(
self.layout.expected_manifests_file(),
self.subnet_id,
)?;
let actual_state_hash = get_state_hash(&latest_checkpoint_dir)?;

if actual_state_hash != expected_state_hash {
return Err(RecoveryError::ValidationFailed(format!(
"State hash after split {} doesn't match the expected state hash {}",
actual_state_hash, expected_state_hash,
)));
}

info!(self.logger, "Validation passed!");
// 4. Remove all the other checkpoints
info!(self.logger, "Removing past checkpoints");

Recovery::remove_all_but_highest_checkpoints(&checkpoints_dir, &self.logger).map(|_| ())
Recovery::remove_all_but_highest_checkpoints(
&self.layout.checkpoints_dir(self.target_subnet),
&self.logger,
)
.map(|_| ())
}
}

pub(crate) struct ComputeExpectedManifestsStep {
pub(crate) recovery_dir: PathBuf,
pub(crate) state_tool_helper: StateToolHelper,
pub(crate) source_subnet_id: SubnetId,
pub(crate) destination_subnet_id: SubnetId,
pub(crate) canister_id_ranges_to_move: Vec<CanisterIdRange>,
}

impl ComputeExpectedManifestsStep {
fn checkpoints(&self) -> PathBuf {
self.recovery_dir
.join("working_dir")
.join(IC_STATE_DIR)
.join(CHECKPOINTS)
}
pub(crate) layout: Layout,
}

impl Step for ComputeExpectedManifestsStep {
fn descr(&self) -> String {
format!(
"Compute the expected manifests of the states resulting from splitting the manifest \
at {} between {} (hosting all canisters in {:?}) and {} (all remaining canisters)",
self.checkpoints().display(),
self.layout.checkpoints_dir(TargetSubnet::Source).display(),
self.destination_subnet_id,
self.canister_id_ranges_to_move,
self.source_subnet_id,
)
}

fn exec(&self) -> RecoveryResult<()> {
let checkpoints_dir = self.checkpoints();
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
let max_checkpoint = checkpoints_dir.join(max_name);
let manifest_path = self.recovery_dir.join(MANIFEST_FILE_NAME);
self.state_tool_helper.compute_manifest(
&self.layout.latest_checkpoint_dir(TargetSubnet::Source)?,
self.layout.original_state_manifest_file(),
)?;

self.state_tool_helper
.compute_manifest(&max_checkpoint, &manifest_path)?;
self.state_tool_helper.split_manifest(
&manifest_path,
self.layout.original_state_manifest_file(),
self.source_subnet_id,
self.destination_subnet_id,
&self.canister_id_ranges_to_move,
&self.recovery_dir.join(EXPECTED_MANIFESTS_FILE_NAME),
self.layout.expected_manifests_file(),
)
}
}

0 comments on commit cce0628

Please sign in to comment.