Skip to content

Commit 27a80c3

Browse files
committed
Merge branch 'kpop/CON-1057/manifest-splitting-v2' into 'master'
feat(ic-recovery): [CON-1057] Add a "ComputeExpectedManifests" step to the subnet splitting tool. It works by calling the `state-tool` several times. Closes CON-1057. See merge request dfinity-lab/public/ic!13133
2 parents 7618e27 + cd37af7 commit 27a80c3

File tree

11 files changed

+300
-43
lines changed

11 files changed

+300
-43
lines changed

rs/backup/src/backup_helper.rs

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,28 @@
1-
use crate::notification_client::NotificationClient;
2-
use crate::util::{block_on, sleep_secs};
3-
use ic_recovery::command_helper::exec_cmd;
4-
use ic_recovery::error::RecoveryError;
5-
use ic_recovery::file_sync_helper::download_binary;
1+
use crate::{
2+
notification_client::NotificationClient,
3+
util::{block_on, sleep_secs},
4+
};
5+
use ic_recovery::{
6+
command_helper::exec_cmd, error::RecoveryError, file_sync_helper::download_binary,
7+
};
68
use ic_registry_client::client::{RegistryClient, RegistryClientImpl};
7-
use ic_registry_client_helpers::node::NodeRegistry;
8-
use ic_registry_client_helpers::subnet::SubnetRegistry;
9+
use ic_registry_client_helpers::{node::NodeRegistry, subnet::SubnetRegistry};
910
use ic_types::{ReplicaVersion, SubnetId};
1011

1112
use chrono::{DateTime, Utc};
12-
use rand::seq::SliceRandom;
13-
use rand::thread_rng;
13+
use rand::{seq::SliceRandom, thread_rng};
1414
use slog::{debug, error, info, warn, Logger};
15-
use std::collections::BTreeMap;
16-
use std::ffi::OsStr;
17-
use std::fs::{create_dir_all, read_dir, remove_dir_all, DirEntry, File};
18-
use std::io::Write;
19-
use std::net::IpAddr;
20-
use std::path::{Path, PathBuf};
21-
use std::process::{Command, Stdio};
22-
use std::sync::{Arc, Mutex};
23-
use std::time::Instant;
15+
use std::{
16+
collections::BTreeMap,
17+
ffi::OsStr,
18+
fs::{create_dir_all, read_dir, remove_dir_all, DirEntry, File},
19+
io::Write,
20+
net::IpAddr,
21+
path::{Path, PathBuf},
22+
process::{Command, Stdio},
23+
sync::{Arc, Mutex},
24+
time::Instant,
25+
};
2426

2527
const RETRIES_RSYNC_HOST: u64 = 5;
2628
const RETRIES_BINARY_DOWNLOAD: u64 = 3;
@@ -196,7 +198,7 @@ impl BackupHelper {
196198
&self.log,
197199
replica_version.clone(),
198200
binary_name.to_string(),
199-
self.binary_dir(replica_version),
201+
&self.binary_dir(replica_version),
200202
));
201203
if res.is_ok() {
202204
return Ok(());

rs/recovery/src/error.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub enum RecoveryError {
2121
ParsingError(serde_json::Error),
2222
SerializationError(serde_json::Error),
2323
UnexpectedError(String),
24+
StateToolError(String),
2425
CheckpointError(String, CheckpointError),
2526
StepSkipped,
2627
}
@@ -88,6 +89,7 @@ impl fmt::Display for RecoveryError {
8889
RecoveryError::CheckpointError(msg, e) => {
8990
write!(f, "Checkpoint error, message: {}, error: {}", msg, e)
9091
}
92+
RecoveryError::StateToolError(msg) => write!(f, "State tool error, message: {}", msg),
9193
}
9294
}
9395
}

rs/recovery/src/file_sync_helper.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pub async fn download_binary(
2323
logger: &Logger,
2424
replica_version: ReplicaVersion,
2525
binary_name: String,
26-
target_dir: PathBuf,
26+
target_dir: &Path,
2727
) -> RecoveryResult<PathBuf> {
2828
let binary_url = format!(
2929
"https://download.dfinity.systems/ic/{}/release/{}.gz",

rs/recovery/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ impl Recovery {
181181
&r.logger,
182182
version,
183183
String::from("ic-admin"),
184-
r.binary_dir.clone(),
184+
&r.binary_dir,
185185
))?;
186186
} else {
187187
info!(r.logger, "No ic-admin version provided, skipping download.");

rs/recovery/subnet_splitting/src/admin_helper.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use crate::utils::canister_id_range_to_string;
2+
13
use ic_base_types::SubnetId;
24
use ic_recovery::admin_helper::{
35
quote, AdminHelper, CommandHelper, IcAdmin, SSH_READONLY_ACCESS_ARG, SUMMARY_ARG,
@@ -118,10 +120,6 @@ pub(crate) fn get_halt_subnet_at_cup_height_command(
118120
ic_admin
119121
}
120122

121-
fn canister_id_range_to_string(canister_id_range: &CanisterIdRange) -> String {
122-
format!("{}:{}", canister_id_range.start, canister_id_range.end)
123-
}
124-
125123
#[cfg(test)]
126124
mod tests {
127125
use super::*;

rs/recovery/subnet_splitting/src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@ use url::Url;
99
use std::path::PathBuf;
1010

1111
mod admin_helper;
12+
mod state_tool_helper;
1213
mod steps;
1314
mod subnet_splitting;
15+
mod utils;
1416

1517
#[derive(Parser)]
1618
#[clap(version = "1.0")]
rs/recovery/subnet_splitting/src/state_tool_helper.rs

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
use crate::utils::canister_id_ranges_to_strings;
2+
3+
use ic_base_types::SubnetId;
4+
use ic_recovery::{
5+
error::{RecoveryError, RecoveryResult},
6+
file_sync_helper::{download_binary, path_exists, write_bytes},
7+
util::block_on,
8+
};
9+
use ic_registry_routing_table::CanisterIdRange;
10+
use ic_types::ReplicaVersion;
11+
use slog::{info, Logger};
12+
13+
use std::{
14+
iter::once,
15+
path::{Path, PathBuf},
16+
process::{Command, Stdio},
17+
};
18+
19+
const BINARY_NAME: &str = "state-tool";
20+
21+
/// Helper struct to simplify executing the `state-tool` binary.
22+
#[derive(Clone)]
23+
pub(crate) struct StateToolHelper {
24+
bin_dir: PathBuf,
25+
logger: Logger,
26+
}
27+
28+
impl StateToolHelper {
29+
/// Creates a new instance of the [StateToolHelper].
30+
///
31+
/// If necessary, it will download the `state-tool` binary.
32+
pub(crate) fn new(
33+
bin_dir: PathBuf,
34+
replica_version: Option<ReplicaVersion>,
35+
logger: Logger,
36+
) -> RecoveryResult<Self> {
37+
let state_tool_helper = Self { bin_dir, logger };
38+
39+
state_tool_helper
40+
.download_if_necessary(replica_version)
41+
.map(|_| state_tool_helper)
42+
}
43+
44+
/// Computes the manifest of a checkpoint.
45+
///
46+
/// Calls `state-tool manifest --state $dir > $output_path`.
47+
pub(crate) fn compute_manifest(&self, dir: &Path, output_path: &Path) -> RecoveryResult<()> {
48+
self.execute("manifest", Some(output_path), |command| {
49+
command.args(["--state", dir.display().to_string().as_str()])
50+
})
51+
}
52+
53+
/// Splits a manifest, to verify the manifests resulting from a subnet split.
54+
///
55+
/// Calls `state-tool split_manifest --path $manifest_path --from-subnet $source_subnet
56+
/// --to-subnet $destination_subnet --subnet-type application --migrated-ranges
57+
/// $canister_id_ranges`.
58+
pub(crate) fn split_manifest(
59+
&self,
60+
manifest_path: &Path,
61+
source_subnet: SubnetId,
62+
destination_subnet: SubnetId,
63+
canister_id_ranges: &[CanisterIdRange],
64+
output_path: &Path,
65+
) -> RecoveryResult<()> {
66+
self.execute("split_manifest", Some(output_path), |command| {
67+
command
68+
.args(["--path", manifest_path.display().to_string().as_str()])
69+
.args(["--from-subnet", source_subnet.to_string().as_str()])
70+
.args(["--to-subnet", destination_subnet.to_string().as_str()])
71+
.args(["--subnet-type", "application"])
72+
.args(
73+
once("--migrated-ranges".to_string())
74+
.chain(canister_id_ranges_to_strings(canister_id_ranges).into_iter()),
75+
)
76+
})
77+
}
78+
79+
/// Verifies whether the textual representation of a manifest matches its root hash.
80+
///
81+
/// Calls `state-tool verify_manifest --file $manifest_path`.
82+
pub(crate) fn verify_manifest(&self, manifest_path: &Path) -> RecoveryResult<()> {
83+
self.execute("verify_manifest", /*output_path=*/ None, |command| {
84+
command.args(["--file", manifest_path.display().to_string().as_str()])
85+
})
86+
}
87+
88+
fn execute(
89+
&self,
90+
main_argument: &str,
91+
output_path: Option<&Path>,
92+
command_builder: impl Fn(&mut Command) -> &mut Command,
93+
) -> RecoveryResult<()> {
94+
let mut command = Command::new(self.binary_path());
95+
command.arg(main_argument).stderr(Stdio::inherit());
96+
command_builder(&mut command);
97+
98+
info!(self.logger, "Executing {:?}", command);
99+
100+
let output = command.output().map_err(|e| {
101+
RecoveryError::StateToolError(format!("Failed executing the command, error: {}", e))
102+
})?;
103+
104+
if !output.status.success() {
105+
return Err(RecoveryError::StateToolError(format!(
106+
"The command returned non-zero value: {}",
107+
output.status,
108+
)));
109+
}
110+
111+
info!(
112+
self.logger,
113+
"Succeeded executing the command:\n{}",
114+
String::from_utf8_lossy(&output.stdout)
115+
);
116+
117+
if let Some(output_path) = output_path {
118+
write_bytes(output_path, output.stdout)?;
119+
}
120+
121+
Ok(())
122+
}
123+
124+
fn download_if_necessary(&self, replica_version: Option<ReplicaVersion>) -> RecoveryResult<()> {
125+
if path_exists(&self.binary_path())? {
126+
info!(
127+
self.logger,
128+
"{} already exists, skipping download",
129+
&self.binary_path().display(),
130+
);
131+
132+
return Ok(());
133+
}
134+
135+
if let Some(version) = replica_version {
136+
block_on(download_binary(
137+
&self.logger,
138+
version,
139+
BINARY_NAME.to_string(),
140+
&self.bin_dir,
141+
))
142+
.map(|_| ())
143+
} else {
144+
info!(
145+
self.logger,
146+
"No state-tool version provided, skipping download."
147+
);
148+
Ok(())
149+
}
150+
}
151+
152+
fn binary_path(&self) -> PathBuf {
153+
self.bin_dir.join(BINARY_NAME)
154+
}
155+
}

rs/recovery/subnet_splitting/src/steps.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use crate::state_tool_helper::StateToolHelper;
2+
13
use ic_base_types::SubnetId;
24
use ic_metrics::MetricsRegistry;
35
use ic_recovery::{
@@ -12,6 +14,9 @@ use slog::Logger;
1214

1315
use std::path::PathBuf;
1416

17+
const MANIFEST_FILE_NAME: &str = "manifest.data";
18+
const EXPECTED_MANIFESTS_FILE_NAME: &str = "expected_manifests.data";
19+
1520
pub(crate) struct CopyWorkDirStep {
1621
pub(crate) from: PathBuf,
1722
pub(crate) to: PathBuf,
@@ -68,6 +73,7 @@ impl StateSplitStrategy {
6873
pub(crate) struct SplitStateStep {
6974
pub(crate) subnet_id: SubnetId,
7075
pub(crate) state_split_strategy: StateSplitStrategy,
76+
pub(crate) state_tool_helper: StateToolHelper,
7177
pub(crate) work_dir: PathBuf,
7278
pub(crate) logger: Logger,
7379
}
@@ -106,6 +112,61 @@ impl Step for SplitStateStep {
106112
)
107113
.map_err(RecoveryError::OutputError)?;
108114

115+
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
116+
let max_checkpoint = checkpoints_dir.join(max_name);
117+
let manifest_path = max_checkpoint.join(MANIFEST_FILE_NAME);
118+
119+
self.state_tool_helper
120+
.compute_manifest(&max_checkpoint, &manifest_path)?;
121+
self.state_tool_helper.verify_manifest(&manifest_path)?;
122+
109123
Recovery::remove_all_but_highest_checkpoints(&checkpoints_dir, &self.logger).map(|_| ())
110124
}
111125
}
126+
127+
pub(crate) struct ComputeExpectedManifestsStep {
128+
pub(crate) recovery_dir: PathBuf,
129+
pub(crate) state_tool_helper: StateToolHelper,
130+
pub(crate) source_subnet_id: SubnetId,
131+
pub(crate) destination_subnet_id: SubnetId,
132+
pub(crate) canister_id_ranges_to_move: Vec<CanisterIdRange>,
133+
}
134+
135+
impl ComputeExpectedManifestsStep {
136+
fn checkpoints(&self) -> PathBuf {
137+
self.recovery_dir
138+
.join("working_dir")
139+
.join(IC_STATE_DIR)
140+
.join(CHECKPOINTS)
141+
}
142+
}
143+
144+
impl Step for ComputeExpectedManifestsStep {
145+
fn descr(&self) -> String {
146+
format!(
147+
"Compute the expected manifests of the states resulting from splitting the manifest \
148+
at {} between {} (hosting all canisters in {:?}) and {} (all remaining canisters)",
149+
self.checkpoints().display(),
150+
self.destination_subnet_id,
151+
self.canister_id_ranges_to_move,
152+
self.source_subnet_id,
153+
)
154+
}
155+
156+
fn exec(&self) -> RecoveryResult<()> {
157+
let checkpoints_dir = self.checkpoints();
158+
let (max_name, _) = Recovery::get_latest_checkpoint_name_and_height(&checkpoints_dir)?;
159+
let max_checkpoint = checkpoints_dir.join(max_name);
160+
let manifest_path = self.recovery_dir.join(MANIFEST_FILE_NAME);
161+
162+
self.state_tool_helper
163+
.compute_manifest(&max_checkpoint, &manifest_path)?;
164+
self.state_tool_helper.split_manifest(
165+
&manifest_path,
166+
self.source_subnet_id,
167+
self.destination_subnet_id,
168+
&self.canister_id_ranges_to_move,
169+
&self.recovery_dir.join(EXPECTED_MANIFESTS_FILE_NAME),
170+
)
171+
}
172+
}

0 commit comments

Comments
 (0)