Merge branch 'alic/fix-upgrade-test' into 'master'
fix(consensus): [ICSUP-3714] Change upgrade cycle in upgrade_downgrade test

We've been observing test failures on staging that should have already surfaced on CI. The discrepancy was caused by a difference in the upgrade cycle:

- During qualification, we do a `mainnet` -> `RC` -> `mainnet` upgrade-downgrade roundtrip. 
- In this test, however, we've been doing `RC` -> `mainnet` -> `RC`. Even though both upgrade directions are exercised either way, there are still subtle opportunities for failure.

This MR fixes the upgrade cycle so that the test starts from `mainnet`, matching the qualification procedure.
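To make the ordering concrete, here's a minimal sketch of the two cycles (the `Version` enum and `roundtrip` helper are illustrative only, not part of the test framework):

```rust
/// Replica versions involved in the roundtrip (illustrative only).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Version {
    Mainnet,
    Rc, // release candidate, i.e. the branch under test
}

/// One upgrade-downgrade roundtrip: start on `start`, move to `target`, and back.
fn roundtrip(start: Version, target: Version) -> [Version; 3] {
    [start, target, start]
}

fn main() {
    // What qualification does: mainnet -> RC -> mainnet.
    let qualification = roundtrip(Version::Mainnet, Version::Rc);
    // What this test did before the fix: RC -> mainnet -> RC.
    let old_test = roundtrip(Version::Rc, Version::Mainnet);
    // Both cycles exercise the same upgrade edges, but from different initial
    // states, so failures that depend on the starting version can slip through.
    assert_ne!(qualification, old_test);
    println!("qualification: {qualification:?}, old test: {old_test:?}");
}
```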


Additionally, to verify that the changes work as expected, I ran this patch on RC `1ad201eb530a5c572bc72d302160783e7fb4c60b` and attempted to downgrade to the mainnet version from that time (`ca5e5052886de781021506814d2c6502e375da48`). I was able to reproduce the failure we observed during qualification.

- Bazel logs: https://dash.zh1-idx1.dfinity.network/invocation/d53b7724-cc17-4f4f-9280-8e67953ad487
- Kibana logs: [Upgrade failure messages](https://kibana.testnet.dfinity.network/app/discover#/?_g=(time:(from:now-1y,to:now))&_a=(columns:!(host.name,message,level),filters:!(('$state':(store:appState),meta:(alias:!n,disabled:!f,key:tags,negate:!f,params:(query:upgrade_downgrade_app_subnet_test--1698343603734),type:phrase),query:(match_phrase:(tags:upgrade_downgrade_app_subnet_test--1698343603734))),('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'35c5ba4b-6db5-42f6-a760-39ca89a177fe',key:message,negate:!f,params:(query:'Check%20for%20upgrade%20failed:%20Ok(Err(GenericError(%22Failed%20to%20upgrade:%20upgrade-install%20failed%22)))'),type:phrase),query:(match_phrase:(message:'Check%20for%20upgrade%20failed:%20Ok(Err(GenericError(%22Failed%20to%20upgrade:%20upgrade-install%20failed%22)))')))),grid:(columns:(host.name:(width:513))),index:'35c5ba4b-6db5-42f6-a760-39ca89a177fe',interval:auto,query:(language:kuery,query:'syslog.identifier:%22orchestrator%22%20'),sort:!(!('@timestamp',desc)))) 

See merge request dfinity-lab/public/ic!15563
dist1ll committed Oct 27, 2023
2 parents dbb870d + edc1dbf commit afee4a8
Showing 1 changed file with 27 additions and 33 deletions.
60 changes: 27 additions & 33 deletions rs/tests/src/orchestrator/upgrade_downgrade.rs
@@ -1,11 +1,11 @@
 /* tag::catalog[]
-Title:: Upgradability to/from oldest prod replica version.
+Title:: Upgradability from/to the mainnet replica version.
 Goal:: Ensure the upgradability of the branch version against the oldest used replica version
 Runbook::
-. Setup an IC with 4-nodes NNS and 4-nodes app subnet using the code from the branch.
-. Downgrade each type of subnet to the mainnet version and back.
+. Setup an IC with 4-nodes NNS and 4-nodes app subnet using the mainnet replica version.
+. Upgrade each type of subnet to the branch version, and downgrade again.
 . During both upgrades simulate a disconnected node and make sure it catches up.
 Success:: Upgrades work into both directions for all subnet types.
@@ -52,6 +52,7 @@ pub const UP_DOWNGRADE_PER_TEST_TIMEOUT: Duration = Duration::from_secs(15 * 60)
 
 pub fn config(env: TestEnv) {
     InternetComputer::new()
+        .with_mainnet_config()
         .add_subnet(
             Subnet::new(SubnetType::System)
                 .add_nodes(SUBNET_SIZE)
@@ -68,17 +69,17 @@ pub fn config(env: TestEnv) {
     install_nns_and_check_progress(env.topology_snapshot());
 }
 
-// Tests a downgrade of the nns subnet to the mainnet version and an upgrade back to the branch version
+// Tests an upgrade of the NNS subnet to the branch version and a downgrade back to the mainnet version
 pub fn upgrade_downgrade_nns_subnet(env: TestEnv) {
     upgrade_downgrade(env, SubnetType::System);
 }
 
-// Tests a downgrade of the app subnet to the mainnet version and an upgrade back to the branch version
+// Tests an upgrade of the app subnet to the branch version and a downgrade back to the mainnet version
 pub fn upgrade_downgrade_app_subnet(env: TestEnv) {
     upgrade_downgrade(env, SubnetType::Application);
 }
 
-// Downgrades a subnet to $TARGET_VERSION and back to branch version
+// Upgrades to the branch version, and back to mainnet NNS version.
 fn upgrade_downgrade(env: TestEnv, subnet_type: SubnetType) {
     let logger = env.logger();

@@ -123,23 +124,10 @@ fn upgrade_downgrade(env: TestEnv, subnet_type: SubnetType) {
     let key = enable_ecdsa_signing_on_subnet(&nns_node, &nns_canister, subnet_id, &logger);
     run_ecdsa_signature_test(&nns_canister, &logger, key);
 
-    let original_branch_version = get_assigned_replica_version(&nns_node).unwrap();
     // We have to upgrade to `<VERSION>-test` because the original version is stored without the
     // download URL in the registry.
+    let original_branch_version = "0000000000000000000000000000000000000000".to_string();
     let branch_version = format!("{}-test", original_branch_version);
 
-    // Check that the two versions do not initially match, which could hide failures.
-    assert!(mainnet_version != original_branch_version);
-
-    // Bless both replica versions
-    block_on(bless_public_replica_version(
-        &nns_node,
-        &mainnet_version,
-        UpdateImageType::Image,
-        UpdateImageType::Image,
-        &logger,
-    ));
-
+    // Bless branch version (mainnet is already blessed)
     let sha256 = env.get_ic_os_update_img_test_sha256().unwrap();
     let upgrade_url = env.get_ic_os_update_img_test_url().unwrap();
     block_on(bless_replica_version(
@@ -152,23 +140,23 @@ fn upgrade_downgrade(env: TestEnv, subnet_type: SubnetType) {
     ));
     info!(&logger, "Blessed all versions");
 
-    downgrade_upgrade_roundtrip(
+    upgrade_downgrade_roundtrip(
         env,
         &nns_node,
-        &mainnet_version,
         &branch_version,
+        &mainnet_version,
         subnet_type,
         &nns_canister,
         key,
     );
 }
 
-// Downgrades and upgrades a subnet with one faulty node.
-fn downgrade_upgrade_roundtrip(
+// Upgrades and downgrades a subnet with one faulty node.
+fn upgrade_downgrade_roundtrip(
     env: TestEnv,
     nns_node: &IcNodeSnapshot,
-    target_version: &str,
-    branch_version: &str,
+    upgrade_version: &str,
+    downgrade_version: &str,
     subnet_type: SubnetType,
     nns_canister: &MessageCanister,
     key: VerifyingKey,
@@ -225,15 +213,15 @@
 
     stop_node(&logger, &faulty_node);
 
-    info!(logger, "Upgrade to version {}", target_version);
-    upgrade_to(nns_node, subnet_id, &subnet_node, target_version, &logger);
+    info!(logger, "Upgrade to version {}", upgrade_version);
+    upgrade_to(nns_node, subnet_id, &subnet_node, upgrade_version, &logger);
 
     // Killing redundant nodes should not prevent the `faulty_node` downgrading to mainnet version and catching up after restarting.
     for redundant_node in &redundant_nodes {
         stop_node(&logger, redundant_node);
     }
     start_node(&logger, &faulty_node);
-    assert_assigned_replica_version(&faulty_node, target_version, env.logger());
+    assert_assigned_replica_version(&faulty_node, upgrade_version, env.logger());
 
     assert!(can_read_msg(
         &logger,
@@ -265,8 +253,14 @@
 
     stop_node(&logger, &faulty_node);
 
-    info!(logger, "Downgrade to version {}", branch_version);
-    upgrade_to(nns_node, subnet_id, &subnet_node, branch_version, &logger);
+    info!(logger, "Downgrade to version {}", downgrade_version);
+    upgrade_to(
+        nns_node,
+        subnet_id,
+        &subnet_node,
+        downgrade_version,
+        &logger,
+    );
 
     let msg_3 = "hello world after upgrade!";
     let can_id_3 = store_message(
@@ -279,7 +273,7 @@ fn downgrade_upgrade_roundtrip(
         stop_node(&logger, redundant_node);
     }
     start_node(&logger, &faulty_node);
-    assert_assigned_replica_version(&faulty_node, branch_version, env.logger());
+    assert_assigned_replica_version(&faulty_node, downgrade_version, env.logger());
 
     for (can_id, msg) in &[(can_id, msg), (can_id_2, msg_2), (can_id_3, msg_3)] {
         assert!(can_read_msg_with_retries(
