Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Number normalization for Crystal::Hasher #5276

Merged
merged 6 commits into from Nov 24, 2017
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 30 additions & 2 deletions spec/std/crystal/hasher_spec.cr
@@ -1,5 +1,6 @@
require "spec"
require "bit_array"
require "big"
require "random/secure"

struct Crystal::Hasher
Expand Down Expand Up @@ -51,6 +52,15 @@ describe "Crystal::Hasher" do
2.hash.should eq(2_u64.hash)
end

it "Big i64 numbers should be hashed ok" do
  # Comparing a hash with itself can never fail, so compare against the
  # numerically equal UInt64 instead: this still exercises hashing of a
  # large Int64 and also checks that the result is type-independent.
  Int64::MAX.hash.should eq(Int64::MAX.to_u64.hash)
end

pending "128bit types should be hashed ok" do
  # Marked pending: 128-bit integers are expected to hash like the
  # numerically equal 8-bit values once this can be enabled.
  signed_wide = 1.to_i128.hash
  signed_wide.should eq(1_i8.hash)

  unsigned_wide = 1.to_u128.hash
  unsigned_wide.should eq(1_u8.hash)
end

it "#float should change state and differ" do
hasher = TestHasher.for_test
hasher1 = 1.0.hash(hasher)
Expand Down Expand Up @@ -191,8 +201,8 @@ describe "Crystal::Hasher" do
hasher = TestHasher.for_test
hasher1 = 1.0.hash(hasher)
hasher2 = 2.0.hash(hasher)
hasher1.result.should eq(0xecfbe7798e8f67f2_u64)
hasher2.result.should eq(0x72847386c9572c30_u64)
hasher1.result.should eq(10728791798497425537_u64)
hasher2.result.should eq(12628815283865879015_u64)
end

it "#string should match test vectors" do
Expand Down Expand Up @@ -229,4 +239,22 @@ describe "Crystal::Hasher" do
hasher.inspect.should_not contain(hasher.@b.to_s(16))
end
end

describe "normalization of numbers" do
  # Each example checks that two numerically equal values of different
  # numeric types produce the same hash.
  it "should 1_i32 and 1_f64 hashes equal" do
    int_hash = 1_i32.hash
    int_hash.should eq(1_f64.hash)
  end

  it "should 1_f32 and 1.to_big_f hashes equal" do
    float_hash = 1_f32.hash
    float_hash.should eq(1.to_big_f.hash)
  end

  it "should 1_f32 and 1.to_big_r hashes equal" do
    float_hash = 1_f32.hash
    float_hash.should eq(1.to_big_r.hash)
  end

  it "should 1_f32 and 1.to_big_i hashes equal" do
    float_hash = 1_f32.hash
    float_hash.should eq(1.to_big_i.hash)
  end
end
end
18 changes: 18 additions & 0 deletions src/big/big_float.cr
Expand Up @@ -293,3 +293,21 @@ module Math
BigFloat.new { |mpf| LibGMP.mpf_sqrt(mpf, value) }
end
end

# :nodoc:
struct Crystal::Hasher
def float(value : BigFloat)
permute(float_normalize_wrap(value) do |value|
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please assign the result of `float_normalize_wrap` to a variable and pass that variable to `permute`. Having a multi-line block inside a method call's arguments is just bad style.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@akzhan You missed that one ;)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sija Thanks!

# more exact version of `Math.frexp`
LibGMP.mpf_get_d_2exp(out exp, value)
frac = BigFloat.new do |mpf|
if exp >= 0
LibGMP.mpf_div_2exp(mpf, value, exp)
else
LibGMP.mpf_mul_2exp(mpf, value, -exp)
end
end
float_normalize_reference(value, frac, exp)
end)
end
end
18 changes: 18 additions & 0 deletions src/big/big_int.cr
Expand Up @@ -551,3 +551,21 @@ module Math
sqrt(value.to_big_f)
end
end

# :nodoc:
struct Crystal::Hasher
  # `2**HASH_BITS - 1` and its negation as `BigInt`s, used for range checks.
  private HASH_MODULUS_INT_P = BigInt.new((1_u64 << HASH_BITS) - 1)
  private HASH_MODULUS_INT_N = -BigInt.new((1_u64 << HASH_BITS) - 1)

  # Computes the truncated remainder of *value* modulo `HASH_MODULUS`
  # and returns it as an `Int64` carrying the sign of *value*.
  #
  # NOTE(review): the result is returned as-is and is not passed to
  # `permute` here - confirm that callers expect a raw `Int64`.
  def int(value : BigInt)
    if LibGMP::ULong == UInt64
      # The modulus fits into a GMP unsigned long: let GMP divide, treat the
      # result as a magnitude and re-apply the sign of *value*.
      magnitude = LibGMP.tdiv_ui(value, HASH_MODULUS).to_i64
      return value < 0 ? -magnitude : magnitude
    end

    # Values strictly between -modulus and +modulus need no reduction.
    within_modulus = value < HASH_MODULUS_INT_P && value > HASH_MODULUS_INT_N
    return value.to_i64 if within_modulus

    value.unsafe_truncated_mod(HASH_MODULUS_INT_P).to_i64
  end
end
18 changes: 18 additions & 0 deletions src/big/big_rational.cr
Expand Up @@ -275,3 +275,21 @@ module Math
sqrt(value.to_big_f)
end
end

# :nodoc:
struct Crystal::Hasher
  # `2**HASH_BITS - 1` and its negation as `BigRational`s, used for range checks.
  private HASH_MODULUS_RAT_P = BigRational.new((1_u64 << HASH_BITS) - 1)
  private HASH_MODULUS_RAT_N = -BigRational.new((1_u64 << HASH_BITS) - 1)

  # Hashes a `BigRational` by way of `BigFloat`.
  #
  # When *value* lies outside the open interval between the negative and
  # positive modulus, whole multiples of `HASH_MODULUS` (derived from the
  # truncated integer part) are subtracted first. The possibly reduced value
  # is then converted with `to_big_f` and its `hash` is returned.
  #
  # NOTE(review): `hash` is called without a hasher argument, so this returns
  # whatever that call yields - confirm callers expect that.
  def float(value : BigRational)
    reduced = value
    unless value < HASH_MODULUS_RAT_P && value > HASH_MODULUS_RAT_N
      integer_part = value.numerator.tdiv(value.denominator)
      multiples = integer_part.tdiv(HASH_MODULUS)
      reduced -= multiples * HASH_MODULUS
    end
    reduced.to_big_f.hash
  end
end
148 changes: 145 additions & 3 deletions src/crystal/hasher.cr
Expand Up @@ -35,6 +35,52 @@ struct Crystal::Hasher
# Do not output the calculated hash value to the user's console/form/
# html/api response, etc. Use something from the digest package instead.

# Based on https://github.com/python/cpython/blob/f051e43/Python/pyhash.c#L34
#
# For numeric types, the hash of a number x is based on the reduction
# of x modulo the Mersenne prime P = 2**HASH_BITS - 1. It's designed
# so that hash(x) == hash(y) whenever x and y are numerically equal,
# even if x and y have different types.
# A quick summary of the hashing strategy:
# (1) First define the 'reduction of x modulo P' for any rational
# number x; this is a standard extension of the usual notion of
# reduction modulo P for integers. If x == p/q (written in lowest
# terms), the reduction is interpreted as the reduction of p times
# the inverse of the reduction of q, all modulo P; if q is exactly
# divisible by P then define the reduction to be infinity. So we've
# got a well-defined map
# reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
# (2) Now for a rational number x, define hash(x) by:
# reduce(x) if x >= 0
# -reduce(-x) if x < 0
# If the result of the reduction is infinity (this is impossible for
# integers, floats and Decimals) then use the predefined hash value
# HASH_INF_PLUS for x >= 0, or HASH_INF_MINUS for x < 0, instead.
# HASH_INF_PLUS, HASH_INF_MINUS and HASH_NAN are also used for the
# hashes of float and Decimal infinities and nans.
# A selling point for the above strategy is that it makes it possible
# to compute hashes of decimal and binary floating-point numbers
# efficiently, even if the exponent of the binary or decimal number
# is large. The key point is that
# reduce(x * y) == reduce(x) * reduce(y) (modulo HASH_MODULUS)
# provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
# binary or decimal float is never infinity, since the denominator is a power
# of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
# for nonnegative x,
# reduce(x * 2**e) == reduce(x) * reduce(2**e) % HASH_MODULUS
# reduce(x * 10**e) == reduce(x) * reduce(10**e) % HASH_MODULUS
# and reduce(10**e) can be computed efficiently by the usual modular
# exponentiation algorithm. For reduce(2**e) it's even better: since
# P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
# by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.

# Number of bits in the modulus used for numeric hashing.
private HASH_BITS = 61
# The modulus itself: 2**HASH_BITS - 1.
private HASH_MODULUS = (1_i64 << HASH_BITS) - 1

# Hash value used for NaN floats.
private HASH_NAN = 0_u64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it a formatting issue?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it is aligned with _ of below 314159_u64. I think it is expected.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO it should have a single space after =.

# Hash values used for positive and negative infinity respectively. The
# negative one is the bit pattern of -314159 reinterpreted as a UInt64.
private HASH_INF_PLUS = 314159_u64
private HASH_INF_MINUS = (-314159_i64).unsafe_as(UInt64)

# Two UInt64 words of seed material: declared uninitialized, then filled in
# place (viewed as raw bytes) with output from Random::Secure.
@@seed = uninitialized UInt64[2]
Random::Secure.random_bytes(Slice.new(pointerof(@@seed).as(UInt8*), sizeof(typeof(@@seed))))

Expand Down Expand Up @@ -75,12 +121,108 @@ struct Crystal::Hasher
(value ? 1 : 0).hash(self)
end

def int(value)
# Hashes a signed integer of at most 32 bits: the value is widened to
# Int64 and its bit pattern is handed to `permute` as a UInt64.
def int(value : Int8 | Int16 | Int32)
  widened = value.to_i64
  permute(widened.unsafe_as(UInt64))
end

# Hashes an unsigned integer of at most 32 bits: the value is widened to
# UInt64 and handed to `permute`.
def int(value : UInt8 | UInt16 | UInt32)
  widened = value.to_u64
  permute(widened)
end

def float(value)
permute(value.to_f64.unsafe_as(UInt64))
# Hashes any other unsigned integer: the value is first reduced to its
# remainder modulo `HASH_MODULUS`, then handed to `permute` as a UInt64.
def int(value : Int::Unsigned)
  reduced = value.remainder(HASH_MODULUS)
  permute(reduced.to_u64)
end

# Hashes any other integer: the value is first reduced to its remainder
# modulo `HASH_MODULUS`, converted to Int64, and its bit pattern is handed to
# `permute` as a UInt64.
def int(value : Int)
  reduced = value.remainder(HASH_MODULUS).to_i64
  permute(reduced.unsafe_as(UInt64))
end

# Reference implementation of float normalization; it is used for `BigFloat`.
# For `Float64` and `Float32`, all supported architectures allow a more
# efficient bitwise calculation.
#
# *frac* and *exp* are expected to be the result of a `Math.frexp`-equivalent
# decomposition of *value* (for `BigFloat` a custom, more precise calculation
# is used). Only the magnitude is processed: *frac* is negated when *value*
# is negative.
#
# Returns a tuple of the accumulated integer and the exponent that remains
# to be applied to it.
private def float_normalize_reference(value, frac, exp)
  frac = -frac if value < 0

  # Consume the fraction 28 bits at a time; this should work well both for
  # binary and hexadecimal floating point.
  acc = 0_i64
  while frac > 0
    # Rotate the accumulator left by 28 bits within HASH_BITS bits.
    acc = ((acc << 28) & HASH_MODULUS) | (acc >> (HASH_BITS - 28))
    frac *= 268435456.0 # 2**28
    exp -= 28
    chunk = frac.to_u32 # integer part of the scaled fraction
    frac -= chunk
    acc += chunk
    if acc >= HASH_MODULUS
      acc -= HASH_MODULUS
    end
  end
  {acc, exp}
end

# Shared wrapper for float hashing.
#
# NaN and the infinities map to the fixed constants `HASH_NAN`,
# `HASH_INF_PLUS` and `HASH_INF_MINUS`. For any other *value* the block is
# yielded *value* and must return a tuple `{x, exp}`; `x` is then rotated by
# the exponent (reduced modulo `HASH_BITS`), the sign of *value* is applied,
# and the bit pattern of the result is returned as a UInt64.
private def float_normalize_wrap(value)
  return HASH_NAN if value.nan?
  return (value > 0 ? HASH_INF_PLUS : HASH_INF_MINUS) if value.infinite?

  reduced, exponent = yield value

  # Bring the exponent into the range of a rotation within HASH_BITS bits.
  shift = if exponent >= 0
            exponent % HASH_BITS
          else
            HASH_BITS - 1 - ((-1 - exponent) % HASH_BITS)
          end
  rotated = ((reduced << shift) & HASH_MODULUS) | (reduced >> (HASH_BITS - shift))

  sign = value < 0 ? -1 : 1
  (rotated * sign).to_i64.unsafe_as(UInt64)
end

# Hashes a `Float32` by decoding its bit pattern into an integer mantissa and
# a power-of-two exponent, which `float_normalize_wrap` then reduces.
def float(value : Float32)
  normalized_hash = float_normalize_wrap(value) do |number|
    # This optimized version works on every architecture where the endianness
    # of Float32 and Int32 matches and floats are IEEE754. All supported
    # architectures fall into this category.
    bits = number.unsafe_as(Int32)
    biased = (bits >> 23) & 0xff
    mantissa = bits & ((1 << 23) - 1)
    exp = biased - 127
    if biased == 0
      # subnormals: no implicit leading bit
      exp -= 22
    else
      mantissa |= 1 << 23
      exp -= 23
    end
    {mantissa.to_i64, exp}
  end
  permute(normalized_hash)
end

# Hashes a `Float64` by decoding its bit pattern into an integer mantissa and
# a power-of-two exponent, which `float_normalize_wrap` then reduces.
def float(value : Float64)
  normalized_hash = float_normalize_wrap(value) do |number|
    # This optimized version works on every architecture where the endianness
    # of Float64 and Int64 matches and floats are IEEE754. All supported
    # architectures fall into this category.
    bits = number.unsafe_as(Int64)
    fraction_mask = (1_u64 << 52) - 1
    implicit_bit = 1_u64 << 52

    biased = (bits >> 52) & 0x7ff
    mantissa = bits & fraction_mask
    exp = biased - 1023
    if biased == 0
      # subnormals: no implicit leading bit
      exp -= 51
    else
      mantissa |= implicit_bit
      exp -= 52
    end

    {mantissa.to_i64, exp}
  end
  permute(normalized_hash)
end

# Generic fallback for float types without a dedicated overload: the value
# is decomposed with `Math.frexp` and reduced by the reference algorithm.
def float(value : Float)
  normalized_hash = float_normalize_wrap(value) do |number|
    fraction, exponent = Math.frexp(number)
    float_normalize_reference(number, fraction, exponent)
  end
  permute(normalized_hash)
end

def char(value)
Expand Down