Skip to content

Commit cce0628

Browse files
committed
Merge branch 'kpop/CON-1066/verify_hashes' into 'master'
chore(ic-recovery): [CON-1066] Verify that the hashes of the states after split match the expected hashes computed in one of the previous steps Also, I've created a new struct `Layout` with the layout of the working directory. During the development of `subnet_splitting` I've made too many mistakes by messing up the paths. With having all the paths in one file, I hope it's going to be harder to make mistakes in the future... Closes CON-1066 Closes CON-1066 See merge request dfinity-lab/public/ic!13220
2 parents cfabc9a + 396da51 commit cce0628

File tree

12 files changed

+335
-91
lines changed

12 files changed

+335
-91
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rs/recovery/src/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pub enum RecoveryError {
2424
StateToolError(String),
2525
CheckpointError(String, CheckpointError),
2626
RegistryError(String),
27+
ValidationFailed(String),
2728
StepSkipped,
2829
}
2930

@@ -92,6 +93,9 @@ impl fmt::Display for RecoveryError {
9293
}
9394
RecoveryError::RegistryError(msg) => write!(f, "Registry error, message: {}", msg),
9495
RecoveryError::StateToolError(msg) => write!(f, "State tool error, message: {}", msg),
96+
RecoveryError::ValidationFailed(msg) => {
97+
write!(f, "Validation failed, message: {}", msg)
98+
}
9599
}
96100
}
97101
}

rs/recovery/src/util.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ pub fn parse_hex_str(string: &str) -> RecoveryResult<u64> {
1818
})
1919
}
2020

21-
pub fn subnet_id_from_str(s: &str) -> Result<SubnetId, String> {
21+
pub fn subnet_id_from_str(s: &str) -> RecoveryResult<SubnetId> {
2222
PrincipalId::from_str(s)
23-
.map_err(|e| format!("Unable to parse subnet_id {:?}", e))
23+
.map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse subnet_id {:?}", e)))
2424
.map(SubnetId::from)
2525
}
2626

27-
pub fn node_id_from_str(s: &str) -> Result<NodeId, String> {
27+
pub fn node_id_from_str(s: &str) -> RecoveryResult<NodeId> {
2828
PrincipalId::from_str(s)
29-
.map_err(|e| format!("Unable to parse node_id {:?}", e))
29+
.map_err(|e| RecoveryError::UnexpectedError(format!("Unable to parse node_id {:?}", e)))
3030
.map(NodeId::from)
3131
}
3232

rs/recovery/subnet_splitting/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ DEPENDENCIES = [
2222
]
2323

2424
DEV_DEPENDENCIES = [
25+
"//rs/test_utilities/tmpdir",
2526
]
2627

2728
MACRO_DEPENDENCIES = [
@@ -44,5 +45,6 @@ rust_binary(
4445
rust_test(
4546
name = "subnet_splitting_tool_test",
4647
crate = "subnet-splitting-tool",
48+
data = ["test_data/fake_expected_manifests.data"],
4749
deps = DEPENDENCIES + DEV_DEPENDENCIES,
4850
)

rs/recovery/subnet_splitting/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ slog = { version = "2.5.2", features = ["release_max_level_trace"] }
2020
strum = "0.24.1"
2121
strum_macros = "0.24.1"
2222

23+
[dev-dependencies]
24+
ic-test-utilities-tmpdir = { path = "../../test_utilities/tmpdir" }
25+
2326
[[bin]]
2427
name = "subnet-splitting-tool"
2528
path = "src/main.rs"
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use crate::target_subnet::TargetSubnet;
2+
3+
use ic_base_types::SubnetId;
4+
use ic_recovery::{error::RecoveryResult, Recovery, CHECKPOINTS, IC_STATE_DIR};
5+
6+
use std::path::{Path, PathBuf};
7+
8+
#[derive(Clone)]
9+
/// Describes the layout of the working directory of subnet splitting:
10+
///
11+
/// |-- root/
12+
/// | |-- ${source_subnet_id}.manifest
13+
/// | |-- ${destination_subnet_id}.manifest
14+
/// | |-- original_source_manifest.data
15+
/// | |-- expected_manifests.data
16+
/// | |-- (destination_)work_dir/
17+
/// | | |-- data/
18+
/// | | | |-- ic_state/checkpoints/
19+
/// | | | | |-- 1/
20+
/// | | | | |-- 2/
21+
pub(crate) struct Layout {
22+
root: PathBuf,
23+
24+
original_state_manifest: PathBuf,
25+
expected_manifests: PathBuf,
26+
source_working_dir: PathBuf,
27+
}
28+
29+
impl Layout {
30+
pub(crate) fn new(recovery: &Recovery) -> Self {
31+
Self {
32+
root: recovery.recovery_dir.clone(),
33+
source_working_dir: recovery.work_dir.clone(),
34+
original_state_manifest: recovery.recovery_dir.join("original_source_manifest.data"),
35+
expected_manifests: recovery.recovery_dir.join("expected_manifests.data"),
36+
}
37+
}
38+
39+
pub(crate) fn original_state_manifest_file(&self) -> &Path {
40+
&self.original_state_manifest
41+
}
42+
43+
pub(crate) fn expected_manifests_file(&self) -> &Path {
44+
&self.expected_manifests
45+
}
46+
47+
pub(crate) fn actual_manifest_file(&self, subnet_id: SubnetId) -> PathBuf {
48+
self.root.join(format!("{}.manifest", subnet_id))
49+
}
50+
51+
pub(crate) fn ic_state_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
52+
self.work_dir(target_subnet).join(IC_STATE_DIR)
53+
}
54+
55+
pub(crate) fn checkpoints_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
56+
self.ic_state_dir(target_subnet).join(CHECKPOINTS)
57+
}
58+
59+
pub(crate) fn work_dir(&self, target_subnet: TargetSubnet) -> PathBuf {
60+
match target_subnet {
61+
TargetSubnet::Source => self.source_working_dir.clone(),
62+
TargetSubnet::Destination => self.root.join("destination_working_dir"),
63+
}
64+
}
65+
66+
pub(crate) fn latest_checkpoint_dir(
67+
&self,
68+
target_subnet: TargetSubnet,
69+
) -> RecoveryResult<PathBuf> {
70+
let checkpoints_dir = self.checkpoints_dir(target_subnet);
71+
72+
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
73+
74+
Ok(checkpoints_dir.join(max_name))
75+
}
76+
}

rs/recovery/subnet_splitting/src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@ use url::Url;
99
use std::path::PathBuf;
1010

1111
mod admin_helper;
12+
mod layout;
1213
mod state_tool_helper;
1314
mod steps;
1415
mod subnet_splitting;
16+
mod target_subnet;
1517
mod utils;
1618

1719
#[derive(Parser)]
Lines changed: 67 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,46 @@
1-
use crate::state_tool_helper::StateToolHelper;
1+
use crate::{
2+
layout::Layout,
3+
state_tool_helper::StateToolHelper,
4+
target_subnet::TargetSubnet,
5+
utils::{find_expected_state_hash_for_subnet_id, get_state_hash},
6+
};
27

38
use ic_base_types::SubnetId;
49
use ic_metrics::MetricsRegistry;
510
use ic_recovery::{
611
error::{RecoveryError, RecoveryResult},
712
file_sync_helper::rsync,
813
steps::Step,
9-
Recovery, CHECKPOINTS, CUPS_DIR, IC_REGISTRY_LOCAL_STORE, IC_STATE_DIR,
14+
Recovery, CUPS_DIR, IC_REGISTRY_LOCAL_STORE,
1015
};
1116
use ic_registry_routing_table::CanisterIdRange;
1217
use ic_state_manager::split::resolve_ranges_and_split;
13-
use slog::Logger;
14-
15-
use std::path::PathBuf;
16-
17-
const MANIFEST_FILE_NAME: &str = "manifest.data";
18-
const EXPECTED_MANIFESTS_FILE_NAME: &str = "expected_manifests.data";
18+
use slog::{info, Logger};
1919

2020
pub(crate) struct CopyWorkDirStep {
21-
pub(crate) from: PathBuf,
22-
pub(crate) to: PathBuf,
21+
pub(crate) layout: Layout,
2322
pub(crate) logger: Logger,
2423
}
2524

2625
impl Step for CopyWorkDirStep {
2726
fn descr(&self) -> String {
2827
format!(
2928
"Copying {} to {}. Excluding cups and registry local store",
30-
self.from.display(),
31-
self.to.display()
29+
self.layout.work_dir(TargetSubnet::Source).display(),
30+
self.layout.work_dir(TargetSubnet::Destination).display(),
3231
)
3332
}
3433

3534
fn exec(&self) -> RecoveryResult<()> {
3635
rsync(
3736
&self.logger,
3837
vec![CUPS_DIR, IC_REGISTRY_LOCAL_STORE],
39-
&format!("{}/", self.from.display()),
40-
&self.to.display().to_string(),
38+
&format!("{}/", self.layout.work_dir(TargetSubnet::Source).display()),
39+
&self
40+
.layout
41+
.work_dir(TargetSubnet::Destination)
42+
.display()
43+
.to_string(),
4144
/*require_confirmation=*/ false,
4245
/*key_file=*/ None,
4346
)
@@ -74,7 +77,8 @@ pub(crate) struct SplitStateStep {
7477
pub(crate) subnet_id: SubnetId,
7578
pub(crate) state_split_strategy: StateSplitStrategy,
7679
pub(crate) state_tool_helper: StateToolHelper,
77-
pub(crate) work_dir: PathBuf,
80+
pub(crate) layout: Layout,
81+
pub(crate) target_subnet: TargetSubnet,
7882
pub(crate) logger: Logger,
7983
}
8084

@@ -86,24 +90,23 @@ impl Step for SplitStateStep {
8690
and removing all but the highest checkpoints. Work dir: {}",
8791
retained_canister_id_ranges,
8892
self.subnet_id,
89-
self.work_dir.display(),
93+
self.layout.work_dir(self.target_subnet).display(),
9094
),
9195
StateSplitStrategy::Drop(dropped_canister_id_ranges) => format!(
9296
"Dropping the canister id ranges {:#?} from state for the subnet {}. \
9397
and removing all but the highest checkpoints. Work dir: {}",
9498
dropped_canister_id_ranges,
9599
self.subnet_id,
96-
self.work_dir.display(),
100+
self.layout.work_dir(self.target_subnet).display(),
97101
),
98102
}
99103
}
100104

101105
fn exec(&self) -> RecoveryResult<()> {
102-
let state_dir = self.work_dir.join(IC_STATE_DIR);
103-
let checkpoints_dir = state_dir.join(CHECKPOINTS);
104-
106+
// 1. Split the state.
107+
info!(self.logger, "Splitting the state");
105108
resolve_ranges_and_split(
106-
state_dir,
109+
self.layout.ic_state_dir(self.target_subnet),
107110
self.subnet_id.get(),
108111
self.state_split_strategy.retained_canister_id_ranges(),
109112
self.state_split_strategy.dropped_canister_id_ranges(),
@@ -112,61 +115,79 @@ impl Step for SplitStateStep {
112115
)
113116
.map_err(RecoveryError::OutputError)?;
114117

115-
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
116-
let max_checkpoint = checkpoints_dir.join(max_name);
117-
let manifest_path = max_checkpoint.join(MANIFEST_FILE_NAME);
118+
// 2. Compute the manifest
119+
info!(self.logger, "Computing the state manifest");
120+
let latest_checkpoint_dir = self.layout.latest_checkpoint_dir(self.target_subnet)?;
121+
let manifest_path = self.layout.actual_manifest_file(self.subnet_id);
118122

119123
self.state_tool_helper
120-
.compute_manifest(&max_checkpoint, &manifest_path)?;
121-
self.state_tool_helper.verify_manifest(&manifest_path)?;
124+
.compute_manifest(&latest_checkpoint_dir, &manifest_path)?;
125+
126+
// 3. Validate the manifest
127+
info!(self.logger, "Validating the manifest");
128+
self.state_tool_helper
129+
.verify_manifest(&manifest_path)
130+
.map_err(|err| {
131+
RecoveryError::ValidationFailed(format!("Manifest verification failed: {}", err))
132+
})?;
133+
134+
let expected_state_hash = find_expected_state_hash_for_subnet_id(
135+
self.layout.expected_manifests_file(),
136+
self.subnet_id,
137+
)?;
138+
let actual_state_hash = get_state_hash(&latest_checkpoint_dir)?;
139+
140+
if actual_state_hash != expected_state_hash {
141+
return Err(RecoveryError::ValidationFailed(format!(
142+
"State hash after split {} doesn't match the expected state hash {}",
143+
actual_state_hash, expected_state_hash,
144+
)));
145+
}
146+
147+
info!(self.logger, "Validation passed!");
148+
// 4. Remove all the other checkpoints
149+
info!(self.logger, "Removing past checkpoints");
122150

123-
Recovery::remove_all_but_highest_checkpoints(&checkpoints_dir, &self.logger).map(|_| ())
151+
Recovery::remove_all_but_highest_checkpoints(
152+
&self.layout.checkpoints_dir(self.target_subnet),
153+
&self.logger,
154+
)
155+
.map(|_| ())
124156
}
125157
}
126158

127159
pub(crate) struct ComputeExpectedManifestsStep {
128-
pub(crate) recovery_dir: PathBuf,
129160
pub(crate) state_tool_helper: StateToolHelper,
130161
pub(crate) source_subnet_id: SubnetId,
131162
pub(crate) destination_subnet_id: SubnetId,
132163
pub(crate) canister_id_ranges_to_move: Vec<CanisterIdRange>,
133-
}
134-
135-
impl ComputeExpectedManifestsStep {
136-
fn checkpoints(&self) -> PathBuf {
137-
self.recovery_dir
138-
.join("working_dir")
139-
.join(IC_STATE_DIR)
140-
.join(CHECKPOINTS)
141-
}
164+
pub(crate) layout: Layout,
142165
}
143166

144167
impl Step for ComputeExpectedManifestsStep {
145168
fn descr(&self) -> String {
146169
format!(
147170
"Compute the expected manifests of the states resulting from splitting the manifest \
148171
at {} between {} (hosting all canisters in {:?}) and {} (all remaining canisters)",
149-
self.checkpoints().display(),
172+
self.layout.checkpoints_dir(TargetSubnet::Source).display(),
150173
self.destination_subnet_id,
151174
self.canister_id_ranges_to_move,
152175
self.source_subnet_id,
153176
)
154177
}
155178

156179
fn exec(&self) -> RecoveryResult<()> {
157-
let checkpoints_dir = self.checkpoints();
158-
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
159-
let max_checkpoint = checkpoints_dir.join(max_name);
160-
let manifest_path = self.recovery_dir.join(MANIFEST_FILE_NAME);
180+
self.state_tool_helper.compute_manifest(
181+
&self.layout.latest_checkpoint_dir(TargetSubnet::Source)?,
182+
self.layout.original_state_manifest_file(),
183+
)?;
161184

162-
self.state_tool_helper
163-
.compute_manifest(&max_checkpoint, &manifest_path)?;
164185
self.state_tool_helper.split_manifest(
165-
&manifest_path,
186+
self.layout.original_state_manifest_file(),
166187
self.source_subnet_id,
167188
self.destination_subnet_id,
168189
&self.canister_id_ranges_to_move,
169-
&self.recovery_dir.join(EXPECTED_MANIFESTS_FILE_NAME),
190+
self.layout.expected_manifests_file(),
170191
)
171192
}
172193
}

0 commit comments

Comments
 (0)