Skip to content

Commit

Permalink
Merge branch 'eichhorl/nns-certifications' into 'master'
Browse files Browse the repository at this point in the history
feat(recovery): CON-926 Download certifications during NNS recovery

In production the step to download certifications would be replaced by manually placing the pools into the correct directory. The pools are to be acquired by contacting NPs. 

See merge request dfinity-lab/public/ic!10362
  • Loading branch information
eichhorl committed Feb 7, 2023
2 parents f153677 + d72b0c1 commit fba0943
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 37 deletions.
3 changes: 2 additions & 1 deletion rs/recovery/src/app_subnet_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ impl RecoveryIterator<StepType> for AppSubnetRecovery {
StepType::DownloadCertifications => {
if self.params.pub_key.is_some() {
Ok(Box::new(
self.recovery.get_download_certs_step(self.params.subnet_id),
self.recovery
.get_download_certs_step(self.params.subnet_id, false),
))
} else {
Err(RecoveryError::StepSkipped)
Expand Down
3 changes: 2 additions & 1 deletion rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,14 +388,15 @@ impl Recovery {

/// Return a [DownloadCertificationsStep] downloading the certification pools of all reachable
/// nodes in the given subnet to the recovery data directory using the readonly account.
pub fn get_download_certs_step(&self, subnet_id: SubnetId) -> impl Step {
pub fn get_download_certs_step(&self, subnet_id: SubnetId, admin: bool) -> impl Step {
DownloadCertificationsStep {
logger: self.logger.clone(),
subnet_id,
registry_client: self.registry_client.clone(),
work_dir: self.work_dir.clone(),
require_confirmation: self.ssh_confirmation,
key_file: self.key_file.clone(),
admin,
}
}

Expand Down
15 changes: 11 additions & 4 deletions rs/recovery/src/nns_recovery_failover_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ pub const CANISTER_CALLER_ID: &str = "r7inp-6aaaa-aaaaa-aaabq-cai";
#[derive(Debug, Copy, Clone, EnumIter)]
pub enum StepType {
StopReplica,
DownloadCertifications,
MergeCertificationPools,
DownloadState,
ProposeToCreateSubnet,
DownloadParentNNSStore,
Expand All @@ -51,10 +53,6 @@ pub struct NNSRecoveryFailoverNodesArgs {
#[clap(long, parse(try_from_str=::std::convert::TryFrom::try_from))]
pub replica_version: Option<ReplicaVersion>,

/// Public ssh key to be deployed to the subnet for read only access
#[clap(long)]
pub pub_key: Option<String>,

/// IP address of the auxiliary host the registry is uploaded to
#[clap(long)]
pub aux_ip: Option<IpAddr>,
Expand Down Expand Up @@ -220,6 +218,15 @@ impl RecoveryIterator<StepType> for NNSRecoveryFailoverNodes {
}
}

StepType::DownloadCertifications => Ok(Box::new(
self.recovery
.get_download_certs_step(self.params.subnet_id, true),
)),

StepType::MergeCertificationPools => {
Ok(Box::new(self.recovery.get_merge_certification_pools_step()))
}

StepType::DownloadState => {
if let Some(node_ip) = self.params.download_node {
Ok(Box::new(
Expand Down
15 changes: 11 additions & 4 deletions rs/recovery/src/nns_recovery_same_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ use crate::{Recovery, Step};
#[derive(Debug, Copy, Clone, EnumIter)]
pub enum StepType {
StopReplica,
DownloadCertifications,
MergeCertificationPools,
DownloadState,
ICReplay,
ValidateReplayOutput,
Expand All @@ -41,10 +43,6 @@ pub struct NNSRecoverySameNodesArgs {
#[clap(long, parse(try_from_str=::std::convert::TryFrom::try_from))]
pub upgrade_version: Option<ReplicaVersion>,

/// Public ssh key to be deployed to the subnet for read only access
#[clap(long)]
pub pub_key: Option<String>,

/// IP address of the node to download the subnet state from. Should be different to node used in nns-url.
#[clap(long)]
pub download_node: Option<IpAddr>,
Expand Down Expand Up @@ -146,6 +144,15 @@ impl RecoveryIterator<StepType> for NNSRecoverySameNodes {
}
}

StepType::DownloadCertifications => Ok(Box::new(
self.recovery
.get_download_certs_step(self.params.subnet_id, true),
)),

StepType::MergeCertificationPools => {
Ok(Box::new(self.recovery.get_merge_certification_pools_step()))
}

StepType::DownloadState => {
if let Some(node_ip) = self.params.download_node {
Ok(Box::new(
Expand Down
12 changes: 9 additions & 3 deletions rs/recovery/src/steps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pub struct DownloadCertificationsStep {
pub work_dir: PathBuf,
pub require_confirmation: bool,
pub key_file: Option<PathBuf>,
pub admin: bool,
}

impl Step for DownloadCertificationsStep {
Expand All @@ -79,10 +80,11 @@ impl Step for DownloadCertificationsStep {
}

fn exec(&self) -> RecoveryResult<()> {
let user = if self.admin { ADMIN } else { READONLY };
let cert_path = format!("{IC_DATA_PATH}/{IC_CERTIFICATIONS_PATH}");
let ips = get_member_ips(self.registry_client.clone(), self.subnet_id)?;
let downloaded_at_least_once = ips.iter().fold(false, |success, ip| {
let data_src = format!("{READONLY}@[{ip}]:{cert_path}");
let data_src = format!("{user}@[{ip}]:{cert_path}");
let target = self.work_dir.join("certifications").join(ip.to_string());
if let Err(e) = create_dir(&target) {
warn!(self.logger, "Failed to create target dir: {:?}", e);
Expand Down Expand Up @@ -327,8 +329,12 @@ impl Step for DownloadIcStateStep {
excludes.push(cp);
});

// If we have readonly access, we do not download certifications again.
if self.try_readonly {
// If we already have some certifications, we do not download them again.
if PathBuf::from(self.working_dir.clone())
.join("data/ic_consensus_pool/certification")
.exists()
{
info!(self.logger, "Excluding certifications from download");
excludes.push("certification");
excludes.push("certifications");
}
Expand Down
14 changes: 2 additions & 12 deletions rs/tests/src/orchestrator/subnet_recovery_nns_failover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ end::catalog[] */

use super::utils::rw_message::install_nns_and_check_progress;
use crate::canister_http::lib::get_universal_vm_address;
use crate::driver::driver_setup::{SSH_AUTHORIZED_PRIV_KEYS_DIR, SSH_AUTHORIZED_PUB_KEYS_DIR};
use crate::driver::driver_setup::SSH_AUTHORIZED_PRIV_KEYS_DIR;
use crate::driver::ic::{InternetComputer, Subnet};
use crate::driver::test_env::SshKeyGen;
use crate::driver::universal_vm::{insert_file_to_config, UniversalVm, UniversalVms};
Expand All @@ -33,7 +33,6 @@ use crate::orchestrator::utils::rw_message::{
};
use crate::orchestrator::utils::subnet_recovery::set_sandbox_env_vars;
use crate::util::block_on;
use ic_recovery::file_sync_helper;
use ic_recovery::nns_recovery_failover_nodes::{
NNSRecoveryFailoverNodes, NNSRecoveryFailoverNodesArgs, StepType,
};
Expand Down Expand Up @@ -85,16 +84,11 @@ pub fn test(env: TestEnv) {
info!(logger, "IC_VERSION_ID: {:?}", ic_version);

let ssh_authorized_priv_keys_dir = env.get_path(SSH_AUTHORIZED_PRIV_KEYS_DIR);
let ssh_authorized_pub_keys_dir = env.get_path(SSH_AUTHORIZED_PUB_KEYS_DIR);

info!(
logger,
"ssh_authorized_priv_keys_dir: {:?}", ssh_authorized_priv_keys_dir
);
info!(
logger,
"ssh_authorized_pub_keys_dir: {:?}", ssh_authorized_pub_keys_dir
);

// choose a node from the nns subnet
let mut orig_nns_nodes = topo_broken_ic.root_subnet().nodes();
let nns_node = orig_nns_nodes.next().expect("there is no NNS node");
Expand Down Expand Up @@ -156,9 +150,6 @@ pub fn test(env: TestEnv) {
msg
));

let pub_key = file_sync_helper::read_file(&ssh_authorized_pub_keys_dir.join(ADMIN))
.expect("Couldn't read public key");

let recovery_dir = env.get_dependency_path("rs/tests");
set_sandbox_env_vars(recovery_dir.join("recovery/binaries"))
.expect("Failed to set sandbox env vars");
Expand All @@ -172,7 +163,6 @@ pub fn test(env: TestEnv) {
let subnet_args = NNSRecoveryFailoverNodesArgs {
subnet_id: topo_broken_ic.root_subnet_id(),
replica_version: Some(ic_version),
pub_key: Some(pub_key),
aux_ip: None,
aux_user: None,
registry_url: None,
Expand Down
14 changes: 2 additions & 12 deletions rs/tests/src/orchestrator/subnet_recovery_nns_subnet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Success::
end::catalog[] */

use super::utils::rw_message::install_nns_and_check_progress;
use crate::driver::driver_setup::{SSH_AUTHORIZED_PRIV_KEYS_DIR, SSH_AUTHORIZED_PUB_KEYS_DIR};
use crate::driver::driver_setup::SSH_AUTHORIZED_PRIV_KEYS_DIR;
use crate::driver::ic::{InternetComputer, Subnet};
use crate::driver::test_env::SshKeyGen;
use crate::driver::{test_env::TestEnv, test_env_api::*};
Expand All @@ -28,7 +28,7 @@ use crate::orchestrator::utils::rw_message::{
use crate::orchestrator::utils::subnet_recovery::set_sandbox_env_vars;
use crate::util::block_on;
use ic_recovery::nns_recovery_same_nodes::{NNSRecoverySameNodes, NNSRecoverySameNodesArgs};
use ic_recovery::{file_sync_helper, get_node_metrics, RecoveryArgs};
use ic_recovery::{get_node_metrics, RecoveryArgs};
use ic_registry_subnet_type::SubnetType;
use ic_types::{Height, ReplicaVersion};
use slog::info;
Expand Down Expand Up @@ -63,16 +63,10 @@ pub fn test(env: TestEnv) {
let working_version =
ReplicaVersion::try_from(format!("{}-test", ic_version.as_ref())).unwrap();
let ssh_authorized_priv_keys_dir = env.get_path(SSH_AUTHORIZED_PRIV_KEYS_DIR);
let ssh_authorized_pub_keys_dir = env.get_path(SSH_AUTHORIZED_PUB_KEYS_DIR);

info!(
logger,
"ssh_authorized_priv_keys_dir: {:?}", ssh_authorized_priv_keys_dir
);
info!(
logger,
"ssh_authorized_pub_keys_dir: {:?}", ssh_authorized_pub_keys_dir
);

// choose a node from the nns subnet
let mut nns_nodes = topo_snapshot.root_subnet().nodes();
Expand Down Expand Up @@ -108,9 +102,6 @@ pub fn test(env: TestEnv) {
msg
));

let pub_key = file_sync_helper::read_file(&ssh_authorized_pub_keys_dir.join(ADMIN))
.expect("Couldn't read public key");

let recovery_dir = env.get_dependency_path("rs/tests");
set_sandbox_env_vars(recovery_dir.join("recovery/binaries"))
.expect("Failed to set sandbox env vars");
Expand All @@ -126,7 +117,6 @@ pub fn test(env: TestEnv) {
let subnet_args = NNSRecoverySameNodesArgs {
subnet_id: topo_snapshot.root_subnet_id(),
upgrade_version: Some(working_version),
pub_key: Some(pub_key),
download_node: Some(download_node.get_ip_addr()),
upload_node: Some(upload_node.get_ip_addr()),
};
Expand Down

0 comments on commit fba0943

Please sign in to comment.