Skip to content

Commit 0a7291d

Browse files
committed
Merge branch 'shuo/preparation_for_unverified_marker' into 'master'
chore: [MR-581] Handle unverified checkpoint markers in downgrade This MR is a preliminary step towards adding unverified checkpoint markers. In the case of downgrade, a marker file may be left in the checkpoint if the downgrade cuts in during manifest computation. This MR moves the logic of archiving unverified checkpoint here to to ensure proper handling of leftovers. Otherwise, the marker file will remain in the checkpoint, causing an extra file for manifest computation and even divergence. See merge request dfinity-lab/public/ic!20176
2 parents 07df6d5 + a33c572 commit 0a7291d

File tree

3 files changed

+101
-0
lines changed

3 files changed

+101
-0
lines changed

rs/state_layout/src/state_layout.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ pub const SUBNET_QUEUES_FILE: &str = "subnet_queues.pbuf";
5656
pub const SYSTEM_METADATA_FILE: &str = "system_metadata.pbuf";
5757
pub const STATS_FILE: &str = "stats.pbuf";
5858
pub const WASM_FILE: &str = "software.wasm";
59+
pub const UNVERIFIED_CHECKPOINT_MARKER: &str = "unverified_checkpoint_marker";
5960

6061
/// `ReadOnly` is the access policy used for reading checkpoints. We
6162
/// don't want to ever modify persisted states.
@@ -699,6 +700,19 @@ impl StateLayout {
699700
CheckpointLayout::new(path, height, self.clone())
700701
}
701702

703+
/// Returns the untracked `CheckpointLayout` for the given height (if there is one).
704+
pub fn checkpoint_untracked(
705+
&self,
706+
height: Height,
707+
) -> Result<CheckpointLayout<ReadOnly>, LayoutError> {
708+
let cp_name = Self::checkpoint_name(height);
709+
let path = self.checkpoints().join(cp_name);
710+
if !path.exists() {
711+
return Err(LayoutError::NotFound(height));
712+
}
713+
CheckpointLayout::new_untracked(path, height)
714+
}
715+
702716
fn increment_checkpoint_ref_counter(&self, height: Height) {
703717
let mut checkpoint_ref_registry = self.checkpoint_ref_registry.lock().unwrap();
704718
checkpoint_ref_registry
@@ -1372,6 +1386,10 @@ impl<Permissions: AccessPolicy> CheckpointLayout<Permissions> {
13721386
self.root.join(STATS_FILE).into()
13731387
}
13741388

1389+
pub fn unverified_checkpoint_marker(&self) -> PathBuf {
1390+
self.root.join(UNVERIFIED_CHECKPOINT_MARKER)
1391+
}
1392+
13751393
pub fn canister_ids(&self) -> Result<Vec<CanisterId>, LayoutError> {
13761394
let states_dir = self.root.join(CANISTER_STATES_DIR);
13771395
Permissions::check_dir(&states_dir)?;

rs/state_manager/src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,6 +1411,27 @@ impl StateManagerImpl {
14111411
info!(log, "Loading metadata took {:?}", starting_time.elapsed());
14121412

14131413
let starting_time = Instant::now();
1414+
let checkpoint_heights = state_layout
1415+
.checkpoint_heights()
1416+
.unwrap_or_else(|err| fatal!(&log, "Failed to retrieve checkpoint heights: {:?}", err));
1417+
1418+
for h in checkpoint_heights {
1419+
let cp_layout = state_layout.checkpoint_untracked(h).unwrap_or_else(|err| {
1420+
fatal!(
1421+
log,
1422+
"Failed to create untracked checkpoint layout @{}: {}",
1423+
h,
1424+
err
1425+
)
1426+
});
1427+
if cp_layout.unverified_checkpoint_marker().exists() {
1428+
info!(log, "Archiving unverified checkpoint {} ", h);
1429+
state_layout
1430+
.archive_checkpoint(h)
1431+
.unwrap_or_else(|err| fatal!(&log, "{:?}", err));
1432+
}
1433+
}
1434+
14141435
let mut checkpoint_heights = state_layout
14151436
.checkpoint_heights()
14161437
.unwrap_or_else(|err| fatal!(&log, "Failed to retrieve checkpoint heights: {:?}", err));

rs/state_manager/tests/state_manager.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,68 @@ fn can_filter_by_certification_mask() {
13041304
})
13051305
}
13061306

1307+
#[test]
1308+
fn should_archive_checkpoints_correctly() {
1309+
state_manager_restart_test(|state_manager, restart_fn| {
1310+
let mut heights = vec![height(0)];
1311+
for i in 1..14 {
1312+
let (_height, state) = state_manager.take_tip();
1313+
heights.push(height(i));
1314+
1315+
let scope = if i % 2 == 0 {
1316+
CertificationScope::Full
1317+
} else {
1318+
CertificationScope::Metadata
1319+
};
1320+
1321+
state_manager.commit_and_certify(state, height(i), scope.clone());
1322+
}
1323+
1324+
assert_eq!(height(13), state_manager.latest_state_height());
1325+
let latest_state = state_manager.get_latest_state();
1326+
assert_eq!(height(13), latest_state.height());
1327+
1328+
// Manually marks checkpoint at height 6 and 10 as unverified, and it should be archived on restart.
1329+
let marker_file_6 = state_manager
1330+
.state_layout()
1331+
.checkpoint_untracked(height(6))
1332+
.unwrap()
1333+
.unverified_checkpoint_marker();
1334+
std::fs::File::create(marker_file_6).expect("failed to write to marker file");
1335+
1336+
let marker_file_10 = state_manager
1337+
.state_layout()
1338+
.checkpoint_untracked(height(10))
1339+
.unwrap()
1340+
.unverified_checkpoint_marker();
1341+
std::fs::File::create(marker_file_10).expect("failed to write to marker file");
1342+
1343+
let state_manager = restart_fn(state_manager, Some(height(6)));
1344+
1345+
// The unverified checkpoints at height 6 and 10, and any checkpoints at or above height 8 are archived.
1346+
// However, at most one checkpoint will be stored in the backups directory after cleanup.
1347+
assert_eq!(
1348+
state_manager.state_layout().backup_heights().unwrap(),
1349+
vec![height(12)],
1350+
);
1351+
1352+
// The checkpoints at height 2 and 4 should not be accidentally archived.
1353+
assert_eq!(
1354+
state_manager.checkpoint_heights(),
1355+
vec![height(2), height(4)],
1356+
);
1357+
1358+
// State manager should restart from checkpoint at height 4 instead of 6 or 8.
1359+
assert_eq!(
1360+
state_manager.list_state_heights(CERT_ANY),
1361+
vec![height(0), height(2), height(4)],
1362+
);
1363+
assert_eq!(height(4), state_manager.latest_state_height());
1364+
let (latest_height, _) = state_manager.take_tip();
1365+
assert_eq!(height(4), latest_height);
1366+
});
1367+
}
1368+
13071369
#[test]
13081370
fn can_remove_checkpoints() {
13091371
state_manager_restart_test(|state_manager, restart_fn| {

0 commit comments

Comments
 (0)