Skip to content

Commit 7479249

Browse files
committed
feat(CON-1253): Improve HTTP outcall divergence error message
1 parent 6336dc7 commit 7479249

File tree

5 files changed

+137
-22
lines changed

5 files changed

+137
-22
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rs/https_outcalls/consensus/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ DEPENDENCIES = [
1515
"//rs/replicated_state",
1616
"//rs/types/error_types",
1717
"//rs/types/types",
18+
"@crate_index//:hex",
1819
"@crate_index//:prometheus",
1920
"@crate_index//:slog",
2021
]
@@ -32,6 +33,8 @@ DEV_DEPENDENCIES = [
3233
"//rs/test_utilities/types",
3334
"@crate_index//:mockall",
3435
"@crate_index//:proptest",
36+
"@crate_index//:rand",
37+
"@crate_index//:rand_chacha",
3538
]
3639

3740
rust_library(

rs/https_outcalls/consensus/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ ic-protobuf = { path = "../../protobuf" }
1919
ic-registry-client-helpers = { path = "../../registry/helpers" }
2020
ic-replicated-state = { path = "../../replicated_state" }
2121
ic-types = { path = "../../types/types" }
22+
hex = { workspace = true }
2223
prometheus = { workspace = true }
2324
slog = { workspace = true }
2425

@@ -35,3 +36,5 @@ ic-test-utilities-state = { path = "../../test_utilities/state" }
3536
ic-test-utilities-time = { path = "../../test_utilities/time" }
3637
ic-test-utilities-types = { path = "../../test_utilities/types" }
3738
mockall = { workspace = true }
39+
rand = { workspace = true }
40+
rand_chacha = { workspace = true }

rs/https_outcalls/consensus/src/payload_builder.rs

Lines changed: 68 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ use ic_types::{
4444
CountBytes, Height, NodeId, NumBytes, RegistryVersion, SubnetId,
4545
};
4646
use std::{
47-
collections::{BTreeSet, HashSet},
47+
collections::{BTreeMap, BTreeSet, HashSet},
4848
mem::size_of,
4949
sync::{Arc, RwLock},
5050
};
@@ -680,25 +680,10 @@ impl IntoMessages<(Vec<ConsensusResponse>, CanisterHttpBatchStats)>
680680
)
681681
});
682682

683-
let divergece_responses = messages.divergence_responses.iter().filter_map(|response| {
684-
// NOTE: We skip delivering the divergence response, if it has no shares
685-
// Such a divergence response should never validate, therefore this should never happen
686-
// However, if it where ever to happen, we can ignore it here.
687-
// This is sound, since eventually a timeout will end the outstanding callback anyway.
688-
response.shares.first().map(|share| {
689-
// Map divergence responses to reject response
690-
stats.divergence_responses += 1;
691-
ConsensusResponse::new(
692-
share.content.id,
693-
Payload::Reject(RejectContext::new(
694-
RejectCode::SysTransient,
695-
"Canister http responses were different across replicas, \
696-
and no consensus was reached"
697-
.to_string(),
698-
)),
699-
)
700-
})
701-
});
683+
let divergece_responses = messages
684+
.divergence_responses
685+
.iter()
686+
.filter_map(divergence_response_into_reject);
702687

703688
let responses = responses
704689
.chain(timeouts)
@@ -709,6 +694,69 @@ impl IntoMessages<(Vec<ConsensusResponse>, CanisterHttpBatchStats)>
709694
}
710695
}
711696

697+
/// Turns a [`CanisterHttpResponseDivergence`] into a [`ConsensusResponse`] containing a rejection.
698+
///
699+
/// This function generates a detailed error message.
700+
/// This will enable a developer to get some insight into the nature of the divergence problems, which they are facing.
701+
/// It allows to get insight into whether the responses are split among a very small number of possible responses or each replica
702+
/// got a unique response.
703+
/// The first issue could point to some issue rate limiting (e.g. some replicas receive 429s) while the later would point to an
704+
/// issue with the transform function (e.g. some non-deterministic component such as timestamp has not been removed).
705+
///
706+
/// The function includes request id and timeout, which are also part of the hashed value.
707+
fn divergence_response_into_reject(
708+
response: &CanisterHttpResponseDivergence,
709+
) -> Option<ConsensusResponse> {
710+
// Get the id and timeout, which need to be reported in the error message as well
711+
let Some((id, timeout)) = response
712+
.shares
713+
.first()
714+
.map(|share| (share.content.id, share.content.timeout))
715+
else {
716+
// NOTE: We skip delivering the divergence response, if it has no shares
717+
// Such a divergence response should never validate, therefore this should never happen
718+
// However, if it where ever to happen, we can ignore it here.
719+
// This is sound, since eventually a timeout will end the outstanding callback anyway.
720+
return None;
721+
};
722+
723+
// Count the different content hashes, that we have encountered in the divergence resonse
724+
let mut hash_counts = BTreeMap::new();
725+
response
726+
.shares
727+
.iter()
728+
.map(|share| share.content.content_hash.clone().get().0)
729+
.for_each(|share| {
730+
hash_counts
731+
.entry(share)
732+
.and_modify(|count| *count += 1)
733+
.or_insert(1);
734+
});
735+
736+
// Now convert into a vector
737+
let mut hash_counts = hash_counts.into_iter().collect::<Vec<_>>();
738+
739+
// Sort in ascending order by number of counts
740+
hash_counts.sort_by_key(|(_, count)| *count);
741+
// Convert them into hex strings
742+
let hash_counts = hash_counts
743+
.iter()
744+
.rev()
745+
.map(|(hash, count)| format!("[{}: {}]", hex::encode(hash), count))
746+
.collect::<Vec<_>>();
747+
748+
Some(ConsensusResponse::new(
749+
id,
750+
Payload::Reject(RejectContext::new(
751+
RejectCode::SysTransient,
752+
format!(
753+
"No consensus could be reached. Replicas had different responses. Details: request_id: {}, timeout: {}, hashes: {}",
754+
id, timeout.as_nanos_since_unix_epoch(), hash_counts.join(", ")
755+
),
756+
)),
757+
))
758+
}
759+
712760
fn validation_failed(
713761
err: CanisterHttpPayloadValidationFailure,
714762
) -> Result<(), PayloadValidationError> {

rs/https_outcalls/consensus/src/payload_builder/tests.rs

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,15 @@
33
//!
44
//! Some tests are run over a range of subnet configurations to check for corner cases.
55
6-
use crate::payload_builder::parse::{bytes_to_payload, payload_to_bytes};
6+
use crate::payload_builder::{
7+
divergence_response_into_reject,
8+
parse::{bytes_to_payload, payload_to_bytes},
9+
};
710

811
use super::CanisterHttpPayloadBuilderImpl;
912
use ic_artifact_pool::canister_http_pool::CanisterHttpPoolImpl;
1013
use ic_consensus_mocks::{dependencies_with_subnet_params, Dependencies};
14+
use ic_error_types::RejectCode;
1115
use ic_interfaces::{
1216
batch_payload::{BatchPayloadBuilder, PastPayload, ProposalContext},
1317
canister_http::{
@@ -38,12 +42,14 @@ use ic_types::{
3842
},
3943
consensus::get_faults_tolerated,
4044
crypto::{crypto_hash, BasicSig, BasicSigOf, CryptoHash, CryptoHashOf, Signed},
41-
messages::CallbackId,
45+
messages::{CallbackId, Payload, RejectContext},
4246
registry::RegistryClientError,
4347
signature::{BasicSignature, BasicSignatureBatch},
4448
time::UNIX_EPOCH,
4549
Height, NumBytes, RegistryVersion, Time,
4650
};
51+
use rand::Rng;
52+
use rand_chacha::{rand_core::SeedableRng, ChaCha20Rng};
4753
use std::{
4854
collections::BTreeMap,
4955
ops::DerefMut,
@@ -760,6 +766,58 @@ fn divergence_response_validation_test() {
760766
}
761767
}
762768

769+
/// Check that the divergence error message is constructed correctly and readable
770+
#[test]
771+
fn divergence_error_message() {
772+
let (_, metadata) = test_response_and_metadata(1);
773+
774+
let mut rng = ChaCha20Rng::seed_from_u64(1337);
775+
let mut response_shares = (0..6)
776+
.map(|node_id| {
777+
let mut sample = metadata.clone();
778+
let mut new_hash = [0; 32];
779+
rng.fill(&mut new_hash);
780+
781+
sample.content_hash = CryptoHashOf::from(CryptoHash(new_hash.to_vec()));
782+
783+
Signed {
784+
content: sample,
785+
signature: BasicSignature {
786+
signature: BasicSigOf::new(BasicSig(vec![])),
787+
signer: node_test_id(node_id),
788+
},
789+
}
790+
})
791+
.collect::<Vec<_>>();
792+
793+
// Duplicate some responses
794+
response_shares.push(response_shares[0].clone());
795+
response_shares.push(response_shares[0].clone());
796+
response_shares.push(response_shares[1].clone());
797+
798+
let divergence_response = CanisterHttpResponseDivergence {
799+
shares: response_shares,
800+
};
801+
802+
let divergence_reject = divergence_response_into_reject(&divergence_response).unwrap();
803+
804+
assert_eq!(
805+
divergence_reject.payload,
806+
Payload::Reject(RejectContext::new(
807+
RejectCode::SysTransient,
808+
"No consensus could be reached. Replicas had different responses. \
809+
Details: request_id: 1, timeout: 10000000000, hashes: \
810+
[30bb6555f891c35fbc820c74a7db7c1ae621f924340712e12febe78a9be0a908: 3], \
811+
[e93edfdefc581e2bdb54a08b55dd67bc675afafbbb32697ef6b8bf9cc75fe69b: 2], \
812+
[af4dcbc617e83bc998190e3031123dbd26bcf0c5a5013b5465017234a98f7d74: 1], \
813+
[51a9af560377af0994fe4be465ea5adff3372623c6ac692c4d3e23b323ef8486: 1], \
814+
[2b7e888246a3b450c67396062e53c8b6c4b776e082e7d2a81c5536e89fe6013e: 1], \
815+
[000b3b9ca14f1136c076b7f681b0a496f5108f721833e6465d0671c014e60b43: 1]"
816+
.to_string()
817+
))
818+
);
819+
}
820+
763821
/// Build some test metadata and response, which is valid and can be used in
764822
/// different tests
765823
pub(crate) fn test_response_and_metadata(

0 commit comments

Comments
 (0)