@@ -44,7 +44,7 @@ use ic_types::{
44
44
CountBytes , Height , NodeId , NumBytes , RegistryVersion , SubnetId ,
45
45
} ;
46
46
use std:: {
47
- collections:: { BTreeSet , HashSet } ,
47
+ collections:: { BTreeMap , BTreeSet , HashSet } ,
48
48
mem:: size_of,
49
49
sync:: { Arc , RwLock } ,
50
50
} ;
@@ -680,25 +680,10 @@ impl IntoMessages<(Vec<ConsensusResponse>, CanisterHttpBatchStats)>
680
680
)
681
681
} ) ;
682
682
683
- let divergece_responses = messages. divergence_responses . iter ( ) . filter_map ( |response| {
684
- // NOTE: We skip delivering the divergence response, if it has no shares
685
- // Such a divergence response should never validate, therefore this should never happen
686
- // However, if it where ever to happen, we can ignore it here.
687
- // This is sound, since eventually a timeout will end the outstanding callback anyway.
688
- response. shares . first ( ) . map ( |share| {
689
- // Map divergence responses to reject response
690
- stats. divergence_responses += 1 ;
691
- ConsensusResponse :: new (
692
- share. content . id ,
693
- Payload :: Reject ( RejectContext :: new (
694
- RejectCode :: SysTransient ,
695
- "Canister http responses were different across replicas, \
696
- and no consensus was reached"
697
- . to_string ( ) ,
698
- ) ) ,
699
- )
700
- } )
701
- } ) ;
683
+ let divergece_responses = messages
684
+ . divergence_responses
685
+ . iter ( )
686
+ . filter_map ( divergence_response_into_reject) ;
702
687
703
688
let responses = responses
704
689
. chain ( timeouts)
@@ -709,6 +694,69 @@ impl IntoMessages<(Vec<ConsensusResponse>, CanisterHttpBatchStats)>
709
694
}
710
695
}
711
696
697
+ /// Turns a [`CanisterHttpResponseDivergence`] into a [`ConsensusResponse`] containing a rejection.
698
+ ///
699
+ /// This function generates a detailed error message.
700
+ /// This will enable a developer to get some insight into the nature of the divergence problems, which they are facing.
701
+ /// It allows to get insight into whether the responses are split among a very small number of possible responses or each replica
702
+ /// got a unique response.
703
+ /// The first issue could point to some issue rate limiting (e.g. some replicas receive 429s) while the later would point to an
704
+ /// issue with the transform function (e.g. some non-deterministic component such as timestamp has not been removed).
705
+ ///
706
+ /// The function includes request id and timeout, which are also part of the hashed value.
707
+ fn divergence_response_into_reject (
708
+ response : & CanisterHttpResponseDivergence ,
709
+ ) -> Option < ConsensusResponse > {
710
+ // Get the id and timeout, which need to be reported in the error message as well
711
+ let Some ( ( id, timeout) ) = response
712
+ . shares
713
+ . first ( )
714
+ . map ( |share| ( share. content . id , share. content . timeout ) )
715
+ else {
716
+ // NOTE: We skip delivering the divergence response, if it has no shares
717
+ // Such a divergence response should never validate, therefore this should never happen
718
+ // However, if it where ever to happen, we can ignore it here.
719
+ // This is sound, since eventually a timeout will end the outstanding callback anyway.
720
+ return None ;
721
+ } ;
722
+
723
+ // Count the different content hashes, that we have encountered in the divergence resonse
724
+ let mut hash_counts = BTreeMap :: new ( ) ;
725
+ response
726
+ . shares
727
+ . iter ( )
728
+ . map ( |share| share. content . content_hash . clone ( ) . get ( ) . 0 )
729
+ . for_each ( |share| {
730
+ hash_counts
731
+ . entry ( share)
732
+ . and_modify ( |count| * count += 1 )
733
+ . or_insert ( 1 ) ;
734
+ } ) ;
735
+
736
+ // Now convert into a vector
737
+ let mut hash_counts = hash_counts. into_iter ( ) . collect :: < Vec < _ > > ( ) ;
738
+
739
+ // Sort in ascending order by number of counts
740
+ hash_counts. sort_by_key ( |( _, count) | * count) ;
741
+ // Convert them into hex strings
742
+ let hash_counts = hash_counts
743
+ . iter ( )
744
+ . rev ( )
745
+ . map ( |( hash, count) | format ! ( "[{}: {}]" , hex:: encode( hash) , count) )
746
+ . collect :: < Vec < _ > > ( ) ;
747
+
748
+ Some ( ConsensusResponse :: new (
749
+ id,
750
+ Payload :: Reject ( RejectContext :: new (
751
+ RejectCode :: SysTransient ,
752
+ format ! (
753
+ "No consensus could be reached. Replicas had different responses. Details: request_id: {}, timeout: {}, hashes: {}" ,
754
+ id, timeout. as_nanos_since_unix_epoch( ) , hash_counts. join( ", " )
755
+ ) ,
756
+ ) ) ,
757
+ ) )
758
+ }
759
+
712
760
fn validation_failed (
713
761
err : CanisterHttpPayloadValidationFailure ,
714
762
) -> Result < ( ) , PayloadValidationError > {
0 commit comments