Merge branch 'kpop/CON-1066/verify_hashes' into 'master'

chore(ic-recovery): [CON-1066] Verify that the hashes of the states after the split match the expected hashes computed in one of the previous steps. Also, I've created a new struct `Layout` which describes the layout of the working directory. During the development of `subnet_splitting` I made too many mistakes by messing up the paths. With all the paths defined in one file, I hope it's going to be harder to make mistakes in the future... Closes CON-1066. See merge request dfinity-lab/public/ic!13220
dfinity · Jun 28, 2023 · cce0628 · cce0628
2 parents cfabc9a + 396da51
commit cce0628
Show file tree

Hide file tree

Showing 12 changed files with 335 additions and 91 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/rs/recovery/src/error.rs b/rs/recovery/src/error.rs
@@ -24,6 +24,7 @@ pub enum RecoveryError {
     StateToolError(String),
     CheckpointError(String, CheckpointError),
     RegistryError(String),
+    ValidationFailed(String),
     StepSkipped,
 }
 
@@ -92,6 +93,9 @@ impl fmt::Display for RecoveryError {
             }
             RecoveryError::RegistryError(msg) => write!(f, "Registry error, message: {}", msg),
             RecoveryError::StateToolError(msg) => write!(f, "State tool error, message: {}", msg),
+            RecoveryError::ValidationFailed(msg) => {
+                write!(f, "Validation failed, message: {}", msg)
+            }
         }
     }
 }

diff --git a/rs/recovery/src/util.rs b/rs/recovery/src/util.rs
@@ -18,15 +18,15 @@ pub fn parse_hex_str(string: &str) -> RecoveryResult<u64> {
     })
 }
 
-pub fn subnet_id_from_str(s: &str) -> Result<SubnetId, String> {
+pub fn subnet_id_from_str(s: &str) -> RecoveryResult<SubnetId> {
     PrincipalId::from_str(s)
-        .map_err(|e| format!("Unable to parse subnet_id {:?}", e))
+        .map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse subnet_id {:?}", e)))
         .map(SubnetId::from)
 }
 
-pub fn node_id_from_str(s: &str) -> Result<NodeId, String> {
+pub fn node_id_from_str(s: &str) -> RecoveryResult<NodeId> {
     PrincipalId::from_str(s)
-        .map_err(|e| format!("Unable to parse node_id {:?}", e))
+        .map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse node_id {:?}", e)))
         .map(NodeId::from)
 }
 

diff --git a/rs/recovery/subnet_splitting/BUILD.bazel b/rs/recovery/subnet_splitting/BUILD.bazel
@@ -22,6 +22,7 @@ DEPENDENCIES = [
 ]
 
 DEV_DEPENDENCIES = [
+    "//rs/test_utilities/tmpdir",
 ]
 
 MACRO_DEPENDENCIES = [
@@ -44,5 +45,6 @@ rust_binary(
 rust_test(
     name = "subnet_splitting_tool_test",
     crate = "subnet-splitting-tool",
+    data = ["test_data/fake_expected_manifests.data"],
     deps = DEPENDENCIES + DEV_DEPENDENCIES,
 )
diff --git a/rs/recovery/subnet_splitting/Cargo.toml b/rs/recovery/subnet_splitting/Cargo.toml
@@ -20,6 +20,9 @@ slog = { version = "2.5.2", features = ["release_max_level_trace"] }
 strum = "0.24.1"
 strum_macros = "0.24.1"
 
+[dev-dependencies]
+ic-test-utilities-tmpdir = { path = "../../test_utilities/tmpdir" }
+
 [[bin]]
 name = "subnet-splitting-tool"
 path = "src/main.rs"
diff --git a/rs/recovery/subnet_splitting/src/layout.rs b/rs/recovery/subnet_splitting/src/layout.rs
@@ -0,0 +1,76 @@
+use crate::target_subnet::TargetSubnet;
+
+use ic_base_types::SubnetId;
+use ic_recovery::{error::RecoveryResult, Recovery, CHECKPOINTS, IC_STATE_DIR};
+
+use std::path::{Path, PathBuf};
+
+#[derive(Clone)]
+/// Describes the layout of the working directory of subnet splitting:
+///
+/// |-- root/
+/// |  |-- ${source_subnet_id}.manifest
+/// |  |-- ${destination_subnet_id}.manifest
+/// |  |-- original_source_manifest.data
+/// |  |-- expected_manifests.data
+/// |  |-- (destination_)working_dir/
+/// |  |   |-- data/
+/// |  |   |   |-- ic_state/checkpoints/
+/// |  |   |   |   |-- 1/
+/// |  |   |   |   |-- 2/
+pub(crate) struct Layout {
+    root: PathBuf,
+
+    original_state_manifest: PathBuf,
+    expected_manifests: PathBuf,
+    source_working_dir: PathBuf,
+}
+
+impl Layout {
+    pub(crate) fn new(recovery: &Recovery) -> Self {
+        Self {
+            root: recovery.recovery_dir.clone(),
+            source_working_dir: recovery.work_dir.clone(),
+            original_state_manifest: recovery.recovery_dir.join("original_source_manifest.data"),
+            expected_manifests: recovery.recovery_dir.join("expected_manifests.data"),
+        }
+    }
+
+    pub(crate) fn original_state_manifest_file(&self) -> &Path {
+        &self.original_state_manifest
+    }
+
+    pub(crate) fn expected_manifests_file(&self) -> &Path {
+        &self.expected_manifests
+    }
+
+    pub(crate) fn actual_manifest_file(&self, subnet_id: SubnetId) -> PathBuf {
+        self.root.join(format!("{}.manifest", subnet_id))
+    }
+
+    pub(crate) fn ic_state_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
+        self.work_dir(target_subnet).join(IC_STATE_DIR)
+    }
+
+    pub(crate) fn checkpoints_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
+        self.ic_state_dir(target_subnet).join(CHECKPOINTS)
+    }
+
+    pub(crate) fn work_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
+        match target_subnet {
+            TargetSubnet::Source => self.source_working_dir.clone(),
+            TargetSubnet::Destination => self.root.join("destination_working_dir"),
+        }
+    }
+
+    pub(crate) fn latest_checkpoint_dir(
+        &self,
+        target_subnet: TargetSubnet,
+    ) -> RecoveryResult<PathBuf> {
+        let checkpoints_dir = self.checkpoints_dir(target_subnet);
+
+        let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
+
+        Ok(checkpoints_dir.join(max_name))
+    }
+}
diff --git a/rs/recovery/subnet_splitting/src/main.rs b/rs/recovery/subnet_splitting/src/main.rs
@@ -9,9 +9,11 @@ use url::Url;
 use std::path::PathBuf;
 
 mod admin_helper;
+mod layout;
 mod state_tool_helper;
 mod steps;
 mod subnet_splitting;
+mod target_subnet;
 mod utils;
 
 #[derive(Parser)]

diff --git a/rs/recovery/subnet_splitting/src/steps.rs b/rs/recovery/subnet_splitting/src/steps.rs
@@ -1,43 +1,46 @@
-use crate::state_tool_helper::StateToolHelper;
+use crate::{
+    layout::Layout,
+    state_tool_helper::StateToolHelper,
+    target_subnet::TargetSubnet,
+    utils::{find_expected_state_hash_for_subnet_id, get_state_hash},
+};
 
 use ic_base_types::SubnetId;
 use ic_metrics::MetricsRegistry;
 use ic_recovery::{
     error::{RecoveryError, RecoveryResult},
     file_sync_helper::rsync,
     steps::Step,
-    Recovery, CHECKPOINTS, CUPS_DIR, IC_REGISTRY_LOCAL_STORE, IC_STATE_DIR,
+    Recovery, CUPS_DIR, IC_REGISTRY_LOCAL_STORE,
 };
 use ic_registry_routing_table::CanisterIdRange;
 use ic_state_manager::split::resolve_ranges_and_split;
-use slog::Logger;
-
-use std::path::PathBuf;
-
-const MANIFEST_FILE_NAME: &str = "manifest.data";
-const EXPECTED_MANIFESTS_FILE_NAME: &str = "expected_manifests.data";
+use slog::{info, Logger};
 
 pub(crate) struct CopyWorkDirStep {
-    pub(crate) from: PathBuf,
-    pub(crate) to: PathBuf,
+    pub(crate) layout: Layout,
     pub(crate) logger: Logger,
 }
 
 impl Step for CopyWorkDirStep {
     fn descr(&self) -> String {
         format!(
             "Copying {} to {}. Excluding cups and registry local store",
-            self.from.display(),
-            self.to.display()
+            self.layout.work_dir(TargetSubnet::Source).display(),
+            self.layout.work_dir(TargetSubnet::Destination).display(),
         )
     }
 
     fn exec(&self) -> RecoveryResult<()> {
         rsync(
             &self.logger,
             vec![CUPS_DIR, IC_REGISTRY_LOCAL_STORE],
-            &format!("{}/", self.from.display()),
-            &self.to.display().to_string(),
+            &format!("{}/", self.layout.work_dir(TargetSubnet::Source).display()),
+            &self
+                .layout
+                .work_dir(TargetSubnet::Destination)
+                .display()
+                .to_string(),
             /*require_confirmation=*/ false,
             /*key_file=*/ None,
         )
@@ -74,7 +77,8 @@ pub(crate) struct SplitStateStep {
     pub(crate) subnet_id: SubnetId,
     pub(crate) state_split_strategy: StateSplitStrategy,
     pub(crate) state_tool_helper: StateToolHelper,
-    pub(crate) work_dir: PathBuf,
+    pub(crate) layout: Layout,
+    pub(crate) target_subnet: TargetSubnet,
     pub(crate) logger: Logger,
 }
 
@@ -86,24 +90,23 @@ impl Step for SplitStateStep {
                  and removing all but the highest checkpoints. Work dir: {}",
                 retained_canister_id_ranges,
                 self.subnet_id,
-                self.work_dir.display(),
+                self.layout.work_dir(self.target_subnet).display(),
             ),
             StateSplitStrategy::Drop(dropped_canister_id_ranges) => format!(
                 "Dropping the canister id ranges {:#?} from state for the subnet {}. \
                  and removing all but the highest checkpoints. Work dir: {}",
                 dropped_canister_id_ranges,
                 self.subnet_id,
-                self.work_dir.display(),
+                self.layout.work_dir(self.target_subnet).display(),
             ),
         }
     }
 
     fn exec(&self) -> RecoveryResult<()> {
-        let state_dir = self.work_dir.join(IC_STATE_DIR);
-        let checkpoints_dir = state_dir.join(CHECKPOINTS);
-
+        // 1. Split the state.
+        info!(self.logger, "Splitting the state");
         resolve_ranges_and_split(
-            state_dir,
+            self.layout.ic_state_dir(self.target_subnet),
             self.subnet_id.get(),
             self.state_split_strategy.retained_canister_id_ranges(),
             self.state_split_strategy.dropped_canister_id_ranges(),
@@ -112,61 +115,79 @@ impl Step for SplitStateStep {
         )
         .map_err(RecoveryError::OutputError)?;
 
-        let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
-        let max_checkpoint = checkpoints_dir.join(max_name);
-        let manifest_path = max_checkpoint.join(MANIFEST_FILE_NAME);
+        // 2. Compute the manifest
+        info!(self.logger, "Computing the state manifest");
+        let latest_checkpoint_dir = self.layout.latest_checkpoint_dir(self.target_subnet)?;
+        let manifest_path = self.layout.actual_manifest_file(self.subnet_id);
 
         self.state_tool_helper
-            .compute_manifest(&max_checkpoint, &manifest_path)?;
-        self.state_tool_helper.verify_manifest(&manifest_path)?;
+            .compute_manifest(&latest_checkpoint_dir, &manifest_path)?;
+
+        // 3. Validate the manifest
+        info!(self.logger, "Validating the manifest");
+        self.state_tool_helper
+            .verify_manifest(&manifest_path)
+            .map_err(|err| {
+                RecoveryError::ValidationFailed(format!("Manifest verification failed: {}", err))
+            })?;
+
+        let expected_state_hash = find_expected_state_hash_for_subnet_id(
+            self.layout.expected_manifests_file(),
+            self.subnet_id,
+        )?;
+        let actual_state_hash = get_state_hash(&latest_checkpoint_dir)?;
+
+        if actual_state_hash != expected_state_hash {
+            return Err(RecoveryError::ValidationFailed(format!(
+                "State hash after split {} doesn't match the expected state hash {}",
+                actual_state_hash, expected_state_hash,
+            )));
+        }
+
+        info!(self.logger, "Validation passed!");
+        // 4. Remove all the other checkpoints
+        info!(self.logger, "Removing past checkpoints");
 
-        Recovery::remove_all_but_highest_checkpoints(&checkpoints_dir, &self.logger).map(|_| ())
+        Recovery::remove_all_but_highest_checkpoints(
+            &self.layout.checkpoints_dir(self.target_subnet),
+            &self.logger,
+        )
+        .map(|_| ())
     }
 }
 
 pub(crate) struct ComputeExpectedManifestsStep {
-    pub(crate) recovery_dir: PathBuf,
     pub(crate) state_tool_helper: StateToolHelper,
     pub(crate) source_subnet_id: SubnetId,
     pub(crate) destination_subnet_id: SubnetId,
     pub(crate) canister_id_ranges_to_move: Vec<CanisterIdRange>,
-}
-
-impl ComputeExpectedManifestsStep {
-    fn checkpoints(&self) -> PathBuf {
-        self.recovery_dir
-            .join("working_dir")
-            .join(IC_STATE_DIR)
-            .join(CHECKPOINTS)
-    }
+    pub(crate) layout: Layout,
 }
 
 impl Step for ComputeExpectedManifestsStep {
     fn descr(&self) -> String {
         format!(
             "Compute the expected manifests of the states resulting from splitting the manifest \
             at {} between {} (hosting all canisters in {:?}) and {} (all remaining canisters)",
-            self.checkpoints().display(),
+            self.layout.checkpoints_dir(TargetSubnet::Source).display(),
             self.destination_subnet_id,
             self.canister_id_ranges_to_move,
             self.source_subnet_id,
         )
     }
 
     fn exec(&self) -> RecoveryResult<()> {
-        let checkpoints_dir = self.checkpoints();
-        let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
-        let max_checkpoint = checkpoints_dir.join(max_name);
-        let manifest_path = self.recovery_dir.join(MANIFEST_FILE_NAME);
+        self.state_tool_helper.compute_manifest(
+            &self.layout.latest_checkpoint_dir(TargetSubnet::Source)?,
+            self.layout.original_state_manifest_file(),
+        )?;
 
-        self.state_tool_helper
-            .compute_manifest(&max_checkpoint, &manifest_path)?;
         self.state_tool_helper.split_manifest(
-            &manifest_path,
+            self.layout.original_state_manifest_file(),
             self.source_subnet_id,
             self.destination_subnet_id,
             &self.canister_id_ranges_to_move,
-            &self.recovery_dir.join(EXPECTED_MANIFESTS_FILE_NAME),
+            self.layout.expected_manifests_file(),
         )
     }
 }