Introduces real Number normalization for Crystal::Hasher.

As declared by the Crystal language reference, 1i32.hash should be equal to 1f64.hash. Extracted from #4675; also replaces #4581.
crystal-lang · Nov 11, 2017 · 4dfa15a · 4dfa15a
1 parent 4b6f1c1
commit 4dfa15a
Show file tree

Hide file tree

Showing 2 changed files with 173 additions and 5 deletions.
diff --git a/spec/std/crystal/hasher_spec.cr b/spec/std/crystal/hasher_spec.cr
@@ -1,5 +1,6 @@
 require "spec"
 require "bit_array"
+require "big"
 require "random/secure"
 
 struct Crystal::Hasher
@@ -51,6 +52,15 @@ describe "Crystal::Hasher" do
       2.hash.should eq(2_u64.hash)
     end
 
+    it "Big i64 numbers should be hashed ok" do
+      Int64::MAX.hash.should eq (Int64::MAX.hash)
+    end
+
+    pending "128bit types should be hashed ok" do
+      1.to_i128.hash.should eq (1_i8.hash)
+      1.to_u128.hash.should eq (1_u8.hash)
+    end
+
     it "#float should change state and differ" do
       hasher = TestHasher.for_test
       hasher1 = 1.0.hash(hasher)
@@ -191,8 +201,8 @@ describe "Crystal::Hasher" do
       hasher = TestHasher.for_test
       hasher1 = 1.0.hash(hasher)
       hasher2 = 2.0.hash(hasher)
-      hasher1.result.should eq(0xecfbe7798e8f67f2_u64)
-      hasher2.result.should eq(0x72847386c9572c30_u64)
+      hasher1.result.should eq(10728791798497425537_u64)
+      hasher2.result.should eq(12628815283865879015_u64)
     end
 
     it "#string should match test vectors" do
@@ -229,4 +239,14 @@ describe "Crystal::Hasher" do
       hasher.inspect.should_not contain(hasher.@b.to_s(16))
     end
   end
+
+  describe "normalization of numbers" do
+    it "should 1_i32 and 1_f64 hashes equal" do
+      1_i32.hash.should eq(1_f64.hash)
+    end
+
+    it "should 1_f32 and 1.to_big_f hashes equal" do
+      1_f32.hash.should eq(1.to_big_f.hash)
+    end
+  end
 end
diff --git a/src/crystal/hasher.cr b/src/crystal/hasher.cr
@@ -35,6 +35,51 @@ struct Crystal::Hasher
   # Do not output calculated hash value to user's console/form/
   # html/api response, etc. Use some from digest package instead.
 
+  # Based on https://github.com/python/cpython/blob/f051e43/Python/pyhash.c#L34
+  #
+  # For numeric types, the hash of a number x is based on the reduction
+  # of x modulo the Mersenne prime P = 2**HASH_BITS - 1.  It's designed
+  # so that hash(x) == hash(y) whenever x and y are numerically equal,
+  # even if x and y have different types.
+  # A quick summary of the hashing strategy:
+  # (1) First define the 'reduction of x modulo P' for any rational
+  # number x; this is a standard extension of the usual notion of
+  # reduction modulo P for integers.  If x == p/q (written in lowest
+  # terms), the reduction is interpreted as the reduction of p times
+  # the inverse of the reduction of q, all modulo P; if q is exactly
+  # divisible by P then define the reduction to be infinity.  So we've
+  # got a well-defined map
+  #   reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
+  # (2) Now for a rational number x, define hash(x) by:
+  #   reduce(x)   if x >= 0
+  #   -reduce(-x) if x < 0
+  # If the result of the reduction is infinity (this is impossible for
+  # integers, floats and Decimals) then use the predefined hash value
+  # HASH_INF_PLUS for x >= 0, or HASH_INF_MINUS for x < 0, instead.
+  # HASH_INF_PLUS, HASH_INF_MINUS and HASH_NAN are also used for the
+  # hashes of float and Decimal infinities and nans.
+  # A selling point for the above strategy is that it makes it possible
+  # to compute hashes of decimal and binary floating-point numbers
+  # efficiently, even if the exponent of the binary or decimal number
+  # is large.  The key point is that
+  #   reduce(x * y) == reduce(x) * reduce(y) (modulo HASH_MODULUS)
+  # provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
+  # binary or decimal float is never infinity, since the denominator is a power
+  # of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
+  # for nonnegative x,
+  #   reduce(x * 2**e) == reduce(x) * reduce(2**e) % HASH_MODULUS
+  #   reduce(x * 10**e) == reduce(x) * reduce(10**e) % HASH_MODULUS
+  # and reduce(10**e) can be computed efficiently by the usual modular
+  # exponentiation algorithm.  For reduce(2**e) it's even better: since
+  # P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
+  # by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
+
+  private HASH_BITS      = 61
+  private HASH_MODULUS   = (1_i64 << HASH_BITS) - 1
+  private HASH_NAN       =      0_u64
+  private HASH_INF_PLUS  = 314159_u64
+  private HASH_INF_MINUS = (-314159_i64).unsafe_as(UInt64)
+
   @@seed = uninitialized UInt64[2]
   Random::Secure.random_bytes(Slice.new(pointerof(@@seed).as(UInt8*), sizeof(typeof(@@seed))))
 
@@ -75,12 +120,115 @@ struct Crystal::Hasher
     (value ? 1 : 0).hash(self)
   end
 
-  def int(value)
+  def int(value : Int8 | Int16 | Int32)
+    permute(value.to_i64.unsafe_as(UInt64))
+  end
+
+  def int(value : UInt8 | UInt16 | UInt32)
     permute(value.to_u64)
   end
 
-  def float(value)
-    permute(value.to_f64.unsafe_as(UInt64))
+  def int(value : Int::Unsigned)
+    permute(value.remainder(HASH_MODULUS).to_u64)
+  end
+
+  def int(value : Int)
+    permute(value.remainder(HASH_MODULUS).to_i64.unsafe_as(UInt64))
+  end
+
+  # Reference implementation of the normalization; it is used for `BigFloat`.
+  # For `Float64` and `Float32`, all supported architectures allow a more
+  # efficient bitwise calculation.
+  # Arguments `frac` and `exp` are the results of `Math.frexp` or an equivalent;
+  # for `BigFloat`, a custom calculation is used for more precision.
+  private def float_normalize_reference(value, frac, exp)
+    if value < 0
+      frac = -frac
+    end
+    # Process 28 bits at a time; this should work well both for binary
+    # and hexadecimal floating point.
+    x = 0_i64
+    while frac > 0
+      x = ((x << 28) & HASH_MODULUS) | x >> (HASH_BITS - 28)
+      frac *= 268435456.0 # 2**28
+      exp -= 28
+      y = frac.to_u32 # pull out the integer part
+      frac -= y
+      x += y
+      x -= HASH_MODULUS if x >= HASH_MODULUS
+    end
+    {x, exp}
+  end
+
+  private def float_normalize_wrap(value)
+    return HASH_NAN if value.nan?
+    if value.infinite?
+      return value > 0 ? HASH_INF_PLUS : HASH_INF_MINUS
+    end
+
+    x, exp = yield value
+
+    # Adjust for the exponent: first reduce it modulo HASH_BITS, then rotate x by it.
+    exp = exp >= 0 ? exp % HASH_BITS : HASH_BITS - 1 - ((-1 - exp) % HASH_BITS)
+    x = ((x << exp) & HASH_MODULUS) | x >> (HASH_BITS - exp)
+
+    (x * (value < 0 ? -1 : 1)).to_i64.unsafe_as(UInt64)
+  end
+
+  def float(value : Float32)
+    permute(float_normalize_wrap(value) do |value|
+      # Reference implementation:
+      # ```
+      # frac, exp = Math.frexp value
+      # float_normalize_reference(value, frac, exp)
+      # ```
+      # This optimized version works on every architecture where the endianness
+      # of Float32 and Int32 matches and floats are IEEE 754. All supported
+      # architectures fall into this category.
+      unsafe_int = value.unsafe_as(Int32)
+      exp = (((unsafe_int >> 23) & 0xff) - 127)
+      mantissa = unsafe_int & ((1 << 23) - 1)
+      if exp > -127
+        exp -= 23
+        mantissa |= 1 << 23
+      else
+        # subnormals: no implicit leading bit is set
+        exp -= 22
+      end
+      {mantissa.to_i64, exp}
+    end)
+  end
+
+  def float(value : Float64)
+    permute(float_normalize_wrap(value) do |value|
+      # Reference implementation:
+      # ```
+      # frac, exp = Math.frexp value
+      # float_normalize_reference(value, frac, exp)
+      # ```
+      # This optimized version works on every architecture where the endianness
+      # of Float64 and Int64 matches and floats are IEEE 754. All supported
+      # architectures fall into this category.
+      unsafe_int = value.unsafe_as(Int64)
+      exp = (((unsafe_int >> 52) & 0x7ff) - 1023)
+      mantissa = unsafe_int & ((1_u64 << 52) - 1)
+      if exp > -1023
+        exp -= 52
+        mantissa |= 1_u64 << 52
+      else
+        # subnormals: no implicit leading bit is set
+        exp -= 51
+      end
+
+      {mantissa.to_i64, exp}
+    end)
+  end
+
+  def float(value : Float)
+    frac, exp = Math.frexp value # runs before float_normalize_wrap's NaN/infinity guards
+    permute(float_normalize_wrap(value) do |value|
+      float_normalize_reference(value, frac, exp)
+    end)
+  end
 
   def char(value)