Skip to content

Commit

Permalink
Merge branch 'exc-1454' into 'master'
Browse files Browse the repository at this point in the history
(chore) EXC-1454: Make call ID of install code call non-optional

This MR transform call ID of an install code call into an non-optional value. 

Once the metrics are showing that there is no call without call ID, this MR can be merged.

[UPDATE]: There is no install code call without call_id in the metrics. 

See merge request dfinity-lab/public/ic!14414
  • Loading branch information
AlexandraZapuc committed Sep 20, 2023
2 parents c901dc2 + 5c831eb commit f3e3693
Show file tree
Hide file tree
Showing 14 changed files with 62 additions and 153 deletions.
12 changes: 5 additions & 7 deletions rs/execution_environment/src/canister_manager.rs
Expand Up @@ -68,7 +68,7 @@ pub(crate) enum DtsInstallCodeResult {
Finished {
canister: CanisterState,
message: CanisterCall,
call_id: Option<InstallCodeCallId>,
call_id: InstallCodeCallId,
instructions_used: NumInstructions,
result: Result<InstallCodeResult, CanisterManagerError>,
},
Expand Down Expand Up @@ -645,6 +645,7 @@ impl CanisterManager {
&self,
context: InstallCodeContext,
message: CanisterCall,
call_id: InstallCodeCallId,
state: &mut ReplicatedState,
mut execution_parameters: ExecutionParameters,
round_limits: &mut RoundLimits,
Expand Down Expand Up @@ -676,7 +677,7 @@ impl CanisterManager {
let dts_result = self.install_code_dts(
context,
message,
None,
call_id,
None,
old_canister,
time,
Expand Down Expand Up @@ -734,7 +735,7 @@ impl CanisterManager {
&self,
context: InstallCodeContext,
message: CanisterCall,
call_id: Option<InstallCodeCallId>,
call_id: InstallCodeCallId,
prepaid_execution_cycles: Option<Cycles>,
mut canister: CanisterState,
time: Time,
Expand Down Expand Up @@ -1818,10 +1819,7 @@ pub(crate) trait PausedInstallCodeExecution: Send + std::fmt::Debug {
/// Aborts the paused execution.
/// Returns the original message, the cycles prepaid for execution,
/// and a call id that exist only for inter-canister messages.
fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles);
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles);
}

#[cfg(test)]
Expand Down
2 changes: 2 additions & 0 deletions rs/execution_environment/src/canister_manager/tests.rs
Expand Up @@ -35,6 +35,7 @@ use ic_registry_routing_table::{CanisterIdRange, RoutingTable, CANISTER_IDS_PER_
use ic_registry_subnet_type::SubnetType;
use ic_replicated_state::{
canister_state::system_state::CyclesUseCase,
metadata_state::subnet_call_context_manager::InstallCodeCallId,
page_map::{self, TestPageAllocatorFileDescriptorImpl},
testing::CanisterQueuesTesting,
testing::SystemStateTesting,
Expand Down Expand Up @@ -336,6 +337,7 @@ fn install_code(
let (result, instructions_used, canister) = canister_manager.install_code(
context,
CanisterCall::Ingress(Arc::new(ingress)),
InstallCodeCallId::new(0),
state,
execution_parameters,
round_limits,
Expand Down
10 changes: 2 additions & 8 deletions rs/execution_environment/src/execution/install.rs
Expand Up @@ -398,10 +398,7 @@ impl PausedInstallCodeExecution for PausedInitExecution {
}
}

fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles) {
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles) {
info!(
log,
"[DTS] Aborting (canister_init) execution of canister {}.", self.original.canister_id
Expand Down Expand Up @@ -498,10 +495,7 @@ impl PausedInstallCodeExecution for PausedStartExecutionDuringInstall {
}
}

fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles) {
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles) {
info!(
log,
"[DTS] Aborting (start) execution of canister {}.", self.original.canister_id,
Expand Down
3 changes: 1 addition & 2 deletions rs/execution_environment/src/execution/install_code.rs
Expand Up @@ -714,8 +714,7 @@ pub(crate) struct OriginalContext {
pub canister_layout_path: PathBuf,
pub config: CanisterMgrConfig,
pub message: CanisterCall,
// TODO(EXC-1454): Make call_id non-optional.
pub call_id: Option<InstallCodeCallId>,
pub call_id: InstallCodeCallId,
pub prepaid_execution_cycles: Cycles,
pub time: Time,
pub compilation_cost_handling: CompilationCostHandling,
Expand Down
4 changes: 2 additions & 2 deletions rs/execution_environment/src/execution/install_code/tests.rs
Expand Up @@ -1386,7 +1386,7 @@ fn assert_consistent_install_code_calls(state: &ReplicatedState, expected_calls:
message, call_id, ..
}) = canister.next_task()
{
Some((call_id.unwrap(), message))
Some((call_id, message))
} else {
None
}
Expand All @@ -1398,7 +1398,7 @@ fn assert_consistent_install_code_calls(state: &ReplicatedState, expected_calls:
let mut subnet_call_context_manager = state.metadata.subnet_call_context_manager.clone();
for (call_id, call) in canister_install_code_contexts {
subnet_call_context_manager
.remove_install_code_call(call_id)
.remove_install_code_call(*call_id)
.unwrap_or_else(|| {
panic!(
"Canister AbortedInstallCode task without matching subnet InstallCodeCall: {} {:?}",
Expand Down
15 changes: 3 additions & 12 deletions rs/execution_environment/src/execution/upgrade.rs
Expand Up @@ -561,10 +561,7 @@ impl PausedInstallCodeExecution for PausedPreUpgradeExecution {
}
}

fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles) {
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles) {
info!(
log,
"[DTS] Aborting (canister_pre_upgrade) execution of canister {}.",
Expand Down Expand Up @@ -663,10 +660,7 @@ impl PausedInstallCodeExecution for PausedStartExecutionDuringUpgrade {
}
}

fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles) {
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles) {
info!(
log,
"[DTS] Aborting (start) execution of canister {}.", self.original.canister_id
Expand Down Expand Up @@ -761,10 +755,7 @@ impl PausedInstallCodeExecution for PausedPostUpgradeExecution {
}
}

fn abort(
self: Box<Self>,
log: &ReplicaLogger,
) -> (CanisterCall, Option<InstallCodeCallId>, Cycles) {
fn abort(self: Box<Self>, log: &ReplicaLogger) -> (CanisterCall, InstallCodeCallId, Cycles) {
info!(
log,
"[DTS] Aborting (canister_post_upgrade) execution of canister {}.",
Expand Down
50 changes: 34 additions & 16 deletions rs/execution_environment/src/execution_environment.rs
Expand Up @@ -72,6 +72,7 @@ use phantom_newtype::AmountOf;
use prometheus::IntCounter;
use rand::RngCore;
use std::collections::{BTreeMap, HashMap};
use std::fmt;
use std::str::FromStr;
use std::sync::Mutex;
use std::{convert::Into, convert::TryFrom, sync::Arc};
Expand Down Expand Up @@ -270,12 +271,24 @@ pub struct ExecutionEnvironment {

/// This is a helper enum that indicates whether the current DTS execution of
/// install_code is the first execution or not.
#[derive(PartialEq, Eq)]
#[derive(PartialEq, Eq, Debug)]
pub enum DtsInstallCodeStatus {
StartingFirstExecution,
ResumingPausedOrAbortedExecution,
}

impl fmt::Display for DtsInstallCodeStatus {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let status = match &self {
DtsInstallCodeStatus::StartingFirstExecution => "StartingFirstExecution",
DtsInstallCodeStatus::ResumingPausedOrAbortedExecution => {
"ResumingPausedOrAbortedExecution"
}
};
write!(f, "{}", status)
}
}

impl ExecutionEnvironment {
#[allow(clippy::too_many_arguments)]
pub fn new(
Expand Down Expand Up @@ -2014,21 +2027,28 @@ impl ExecutionEnvironment {
}
};

let call_id = match dts_status {
DtsInstallCodeStatus::StartingFirstExecution => {
let call_id = match call_id {
None => {
// Call ID is not provided only if the current
// DTS execution of install_code is the first execution.
debug_assert_eq!(
dts_status,
DtsInstallCodeStatus::StartingFirstExecution,
"Dts status mismatch: expected StartingFirstExecution, got {}",
dts_status
);
// Keep track of all existing long running install code messages.
// During a subnet split, the requests are rejected if the target canister moved to a new subnet.
let call_id = state
state
.metadata
.subnet_call_context_manager
.push_install_code_call(InstallCodeCall {
call: msg.clone(),
time: state.time(),
effective_canister_id: install_context.canister_id,
});
Some(call_id)
})
}
DtsInstallCodeStatus::ResumingPausedOrAbortedExecution => call_id,
Some(call_id) => call_id,
};

// Check the precondition.
Expand Down Expand Up @@ -2140,19 +2160,17 @@ impl ExecutionEnvironment {
state.put_canister_state(canister);
let refund = message.take_cycles();
// The message can be removed because a response was produced.
if let Some(call_id) = call_id {
let install_code_call = state
.metadata
.subnet_call_context_manager
.remove_install_code_call(call_id);
if install_code_call.is_none() {
info!(
let install_code_call = state
.metadata
.subnet_call_context_manager
.remove_install_code_call(call_id);
if install_code_call.is_none() {
info!(
self.log,
"Could not remove call id {} for canister {} after execution of install_code",
call_id,
canister_id
);
}
}
let state =
self.finish_subnet_message_execution(state, message, result, refund, timer);
Expand Down Expand Up @@ -2247,7 +2265,7 @@ impl ExecutionEnvironment {
prepaid_execution_cycles,
} => self.execute_install_code(
message,
call_id,
Some(call_id),
Some(prepaid_execution_cycles),
DtsInstallCodeStatus::ResumingPausedOrAbortedExecution,
state,
Expand Down
12 changes: 2 additions & 10 deletions rs/execution_environment/src/scheduler.rs
Expand Up @@ -1238,7 +1238,6 @@ impl SchedulerImpl {
state: &ReplicatedState,
current_round_type: ExecutionRoundType,
) {
let mut num_aborted_install_code_without_call_id = 0;
let canisters_with_tasks = state
.canister_states
.iter()
Expand All @@ -1252,12 +1251,8 @@ impl SchedulerImpl {
for (id, canister) in canisters_with_tasks {
for task in canister.system_state.task_queue.iter() {
match task {
ExecutionTask::AbortedExecution { .. } => {}
ExecutionTask::AbortedInstallCode { call_id, .. } => {
if call_id.is_none() {
num_aborted_install_code_without_call_id += 1;
}
}
ExecutionTask::AbortedExecution { .. }
| ExecutionTask::AbortedInstallCode { .. } => {}
ExecutionTask::Heartbeat => {
panic!(
"Unexpected heartbeat task after a round in canister {:?}",
Expand Down Expand Up @@ -1289,9 +1284,6 @@ impl SchedulerImpl {
}
}
}
self.metrics
.aborted_install_code_calls_without_call_id
.set(num_aborted_install_code_without_call_id);
}
}

Expand Down
6 changes: 0 additions & 6 deletions rs/execution_environment/src/scheduler/scheduler_metrics.rs
Expand Up @@ -92,8 +92,6 @@ pub(super) struct SchedulerMetrics {
pub(super) inducted_messages: IntCounterVec,
pub(super) ecdsa_signature_agreements: IntGauge,
// TODO(EXC-1466): Remove metric once all calls have `call_id` present.
pub(super) aborted_install_code_calls_without_call_id: IntGauge,
// TODO(EXC-1466): Remove metric once all calls have `call_id` present.
pub(super) stop_canister_calls_without_call_id: IntGauge,
}

Expand Down Expand Up @@ -585,10 +583,6 @@ impl SchedulerMetrics {
"Number of messages inducted, by destination.",
&["destination"],
),
aborted_install_code_calls_without_call_id: metrics_registry.int_gauge(
"scheduler_aborted_install_code_calls_without_call_id",
"Number of aborted install code calls with missing call ID.",
),
stop_canister_calls_without_call_id: metrics_registry.int_gauge(
"scheduler_stop_canister_calls_without_call_id",
"Number of stop canister calls with missing call ID.",
Expand Down
9 changes: 0 additions & 9 deletions rs/execution_environment/src/scheduler/test_utilities.rs
Expand Up @@ -412,15 +412,6 @@ impl SchedulerTest {
.unwrap();
}

/// Aborts all paused executions.
pub fn abort_all_paused_executions(&mut self) {
let mut state = self.state.take().unwrap();
self.scheduler
.exec_env
.abort_all_paused_executions(&mut state, &no_op_logger());
self.state = Some(state);
}

/// Returns all responses from the management canister to
/// `self.xnet_canister_id()`.
pub fn get_responses_to_injected_calls(&mut self) -> Vec<Response> {
Expand Down

0 comments on commit f3e3693

Please sign in to comment.