Skip to content

Commit

Permalink
Introduces real Number normalization for Crystal::Hasher.
Browse files Browse the repository at this point in the history
As declared by the Crystal language reference, 1i32.hash should be equal to 1f64.hash.

Extracted from #4675, also replaces #4581.
  • Loading branch information
akzhan committed Nov 11, 2017
1 parent 4b6f1c1 commit 4dfa15a
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 5 deletions.
24 changes: 22 additions & 2 deletions spec/std/crystal/hasher_spec.cr
@@ -1,5 +1,6 @@
require "spec"
require "bit_array"
require "big"
require "random/secure"

struct Crystal::Hasher
Expand Down Expand Up @@ -51,6 +52,15 @@ describe "Crystal::Hasher" do
2.hash.should eq(2_u64.hash)
end

it "Big i64 numbers should be hashed ok" do
  # Int64::MAX is larger than the hasher's modulus, so it has to be reduced
  # before hashing. The same number stored as a UInt64 must produce the same
  # hash; comparing a hash with itself could never fail.
  Int64::MAX.hash.should eq(Int64::MAX.to_u64.hash)
end

# Marked pending, so these expectations are not enforced yet.
# NOTE(review): presumably waiting on complete 128-bit integer support — confirm.
# Intent: a 128-bit integer must hash like the numerically equal 8-bit integer.
pending "128bit types should be hashed ok" do
1.to_i128.hash.should eq (1_i8.hash)
1.to_u128.hash.should eq (1_u8.hash)
end

it "#float should change state and differ" do
hasher = TestHasher.for_test
hasher1 = 1.0.hash(hasher)
Expand Down Expand Up @@ -191,8 +201,8 @@ describe "Crystal::Hasher" do
hasher = TestHasher.for_test
hasher1 = 1.0.hash(hasher)
hasher2 = 2.0.hash(hasher)
hasher1.result.should eq(0xecfbe7798e8f67f2_u64)
hasher2.result.should eq(0x72847386c9572c30_u64)
hasher1.result.should eq(10728791798497425537_u64)
hasher2.result.should eq(12628815283865879015_u64)
end

it "#string should match test vectors" do
Expand Down Expand Up @@ -229,4 +239,14 @@ describe "Crystal::Hasher" do
hasher.inspect.should_not contain(hasher.@b.to_s(16))
end
end

describe "normalization of numbers" do
  # An Int32 and a Float64 holding the same number must share one hash.
  it "should 1_i32 and 1_f64 hashes equal" do
    int_hash = 1_i32.hash
    float_hash = 1_f64.hash
    int_hash.should eq(float_hash)
  end

  # The same must hold between a Float32 and a BigFloat.
  it "should 1_f32 and 1.to_big_f hashes equal" do
    single_hash = 1_f32.hash
    big_hash = 1.to_big_f.hash
    single_hash.should eq(big_hash)
  end
end
end
154 changes: 151 additions & 3 deletions src/crystal/hasher.cr
Expand Up @@ -35,6 +35,51 @@ struct Crystal::Hasher
# Do not output calculated hash value to user's console/form/
# html/api response, etc. Use some from digest package instead.

# Based on https://github.com/python/cpython/blob/f051e43/Python/pyhash.c#L34
#
# For numeric types, the hash of a number x is based on the reduction
# of x modulo the Mersenne prime P = 2**HASH_BITS - 1. It's designed
# so that hash(x) == hash(y) whenever x and y are numerically equal,
# even if x and y have different types.
# A quick summary of the hashing strategy:
# (1) First define the 'reduction of x modulo P' for any rational
# number x; this is a standard extension of the usual notion of
# reduction modulo P for integers. If x == p/q (written in lowest
# terms), the reduction is interpreted as the reduction of p times
# the inverse of the reduction of q, all modulo P; if q is exactly
# divisible by P then define the reduction to be infinity. So we've
# got a well-defined map
# reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
# (2) Now for a rational number x, define hash(x) by:
# reduce(x) if x >= 0
# -reduce(-x) if x < 0
# If the result of the reduction is infinity (this is impossible for
# integers, floats and Decimals) then use the predefined hash value
# HASH_INF_PLUS for x >= 0, or HASH_INF_MINUS for x < 0, instead.
# HASH_INF_PLUS, HASH_INF_MINUS and HASH_NAN are also used for the
# hashes of float and Decimal infinities and nans.
# A selling point for the above strategy is that it makes it possible
# to compute hashes of decimal and binary floating-point numbers
# efficiently, even if the exponent of the binary or decimal number
# is large. The key point is that
# reduce(x * y) == reduce(x) * reduce(y) (modulo HASH_MODULUS)
# provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
# binary or decimal float is never infinity, since the denominator is a power
# of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
# for nonnegative x,
# reduce(x * 2**e) == reduce(x) * reduce(2**e) % HASH_MODULUS
# reduce(x * 10**e) == reduce(x) * reduce(10**e) % HASH_MODULUS
# and reduce(10**e) can be computed efficiently by the usual modular
# exponentiation algorithm. For reduce(2**e) it's even better: since
# P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
# by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.

# Number of bits in the modulus used for numeric hashes.
private HASH_BITS = 61
# The modulus P = 2**HASH_BITS - 1 that numbers are reduced by.
private HASH_MODULUS = (1_i64 << HASH_BITS) - 1
# Predefined values used instead of a reduction for NaN, positive infinity
# and negative infinity.
private HASH_NAN = 0_u64
private HASH_INF_PLUS = 314159_u64
# Kept as the UInt64 bit pattern of the negative number.
private HASH_INF_MINUS = (-314159_i64).unsafe_as(UInt64)

# Two-word seed: declared uninitialized, then filled in place with bytes
# from Random::Secure by viewing its storage as a byte slice.
@@seed = uninitialized UInt64[2]
Random::Secure.random_bytes(Slice.new(pointerof(@@seed).as(UInt8*), sizeof(typeof(@@seed))))

Expand Down Expand Up @@ -75,12 +120,115 @@ struct Crystal::Hasher
(value ? 1 : 0).hash(self)
end

def int(value)
# Hashes a signed integer of at most 32 bits: the value is widened to Int64
# and its bit pattern is passed to `permute` as a UInt64.
def int(value : Int8 | Int16 | Int32)
  widened = value.to_i64
  permute(widened.unsafe_as(UInt64))
end

# Hashes an unsigned integer of at most 32 bits after widening it to UInt64.
def int(value : UInt8 | UInt16 | UInt32)
  widened = value.to_u64
  permute(widened)
end

def float(value)
permute(value.to_f64.unsafe_as(UInt64))
# Hashes any other unsigned integer: the value is first replaced by its
# remainder modulo HASH_MODULUS, then converted to UInt64.
def int(value : Int::Unsigned)
  reduced = value.remainder(HASH_MODULUS)
  permute(reduced.to_u64)
end

# Hashes any other integer: the value is first replaced by its remainder
# modulo HASH_MODULUS, converted to Int64, and its bit pattern is passed to
# `permute` as a UInt64.
def int(value : Int)
  reduced = value.remainder(HASH_MODULUS)
  permute(reduced.to_i64.unsafe_as(UInt64))
end

# Reference implementation of the float reduction; it is used for BigFloat.
# For Float64 and Float32 all supported architectures allow a more efficient
# bitwise calculation.
# Arguments `frac` and `exp` are the result of the equivalent of `Math.frexp`,
# though for `BigFloat` a custom calculation is used for more precision.
#
# Returns the tuple `{x, exp}`: the accumulated value `x`, kept below
# HASH_MODULUS, and the exponent lowered by 28 for every chunk consumed.
private def float_normalize_reference(value, frac, exp)
# work with the magnitude of the fraction
if value < 0
frac = -frac
end
# process 28 bits at a time; this should work well both for binary
# and hexadecimal floating point.
x = 0_i64
while frac > 0
# rotate the accumulator left by 28 bits within HASH_BITS bits
x = ((x << 28) & HASH_MODULUS) | x >> (HASH_BITS - 28)
frac *= 268435456.0 # 2**28
exp -= 28
y = frac.to_u32 # pull out integer part
frac -= y
x += y
# keep the accumulator below the modulus
x -= HASH_MODULUS if x >= HASH_MODULUS
end
{x, exp}
end

# Shared wrapper for the float hashing overloads.
#
# NaN and the infinities map to their predefined constants. For any other
# value the block is yielded the value and must return `{x, exp}`; `x` is
# then rotated left by `exp` (reduced modulo HASH_BITS) within HASH_BITS
# bits, the sign of `value` is applied, and the result is returned as the
# UInt64 bit pattern of the signed number.
private def float_normalize_wrap(value)
  if value.nan?
    return HASH_NAN
  elsif value.infinite?
    return value > 0 ? HASH_INF_PLUS : HASH_INF_MINUS
  end

  reduced, exponent = yield value

  # Bring the exponent into 0...HASH_BITS; negative exponents wrap around.
  shift = if exponent >= 0
            exponent % HASH_BITS
          else
            HASH_BITS - 1 - ((-1 - exponent) % HASH_BITS)
          end

  # Multiplying by 2**shift modulo 2**HASH_BITS - 1 is a rotation of bits.
  low_part = (reduced << shift) & HASH_MODULUS
  high_part = reduced >> (HASH_BITS - shift)
  rotated = low_part | high_part

  sign = value < 0 ? -1 : 1
  (rotated * sign).to_i64.unsafe_as(UInt64)
end

# Hashes a Float32 by splitting its bit pattern into an integer mantissa and
# a power-of-two exponent, which `float_normalize_wrap` then combines.
def float(value : Float32)
permute(float_normalize_wrap(value) do |value|
# Reference implementation:
# ```
# frac, exp = Math.frexp value
# float_normalize_reference(value, frac, exp)
# ```
# This optimized version works on every architecture where the endianness
# of Float32 and Int32 matches and float is IEEE754. All supported
# architectures fall into this category.
unsafe_int = value.unsafe_as(Int32)
# 8-bit exponent field with the bias of 127 removed
exp = (((unsafe_int >> 23) & 0xff) - 127)
# low 23 bits: the stored fraction
mantissa = unsafe_int & ((1 << 23) - 1)
if exp > -127
# normal number: account for the 23 fraction bits and set the leading bit
exp -= 23
mantissa |= 1 << 23
else
# subnormals
exp -= 22
end
{mantissa.to_i64, exp}
end)
end

# Hashes a Float64 by splitting its bit pattern into an integer mantissa and
# a power-of-two exponent, which `float_normalize_wrap` then combines.
def float(value : Float64)
permute(float_normalize_wrap(value) do |value|
# Reference implementation:
# ```
# frac, exp = Math.frexp value
# float_normalize_reference(value, frac, exp)
# ```
# This optimized version works on every architecture where the endianness
# of Float64 and Int64 matches and float is IEEE754. All supported
# architectures fall into this category.
unsafe_int = value.unsafe_as(Int64)
# 11-bit exponent field with the bias of 1023 removed
exp = (((unsafe_int >> 52) & 0x7ff) - 1023)
# low 52 bits: the stored fraction
mantissa = unsafe_int & ((1_u64 << 52) - 1)
if exp > -1023
# normal number: account for the 52 fraction bits and set the leading bit
exp -= 52
mantissa |= 1_u64 << 52
else
# subnormals
exp -= 51
end

{mantissa.to_i64, exp}
end)
end

# Hashes any other float type through the reference normalization, using
# the fraction and exponent returned by `Math.frexp`.
def float(value : Float)
  fraction, exponent = Math.frexp(value)
  normalized = float_normalize_wrap(value) { |v| float_normalize_reference(v, fraction, exponent) }
  permute(normalized)
end

def char(value)
Expand Down

0 comments on commit 4dfa15a

Please sign in to comment.