Skip to content

Commit

Permalink
Deprecate buggy murmur3_32 and introduce murmur3_32_fixed.
Browse files Browse the repository at this point in the history
The bug was found by @findepi who also contributed the fix and the new tests via #5649.

Fixes #5648.

RELNOTES=`hash`: Deprecated buggy `murmur3_32`, and introduced `murmur3_32_fixed`.
PiperOrigin-RevId: 395463974
  • Loading branch information
findepi authored and Google Java Core Libraries committed Sep 8, 2021
1 parent a176cd6 commit a36f08f
Show file tree
Hide file tree
Showing 10 changed files with 268 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum HashFunctionEnum {
MD5(Hashing.md5()),
MURMUR3_128(Hashing.murmur3_128()),
MURMUR3_32(Hashing.murmur3_32()),
MURMUR3_32_FIXED(Hashing.murmur3_32_fixed()),
SHA1(Hashing.sha1()),
SHA256(Hashing.sha256()),
SHA384(Hashing.sha384()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ public void testHashIntVsForLoop() {
.put(Hashing.murmur3_32(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.murmur3_32_fixed(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.sha1(), EMPTY_STRING, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
.put(Hashing.sha1(), TQBFJOTLD, "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12")
.put(Hashing.sha1(), TQBFJOTLDP, "408d94384216f890ff7a0c3528e8bed1e0b01621")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
package com.google.common.hash;

import static com.google.common.hash.Hashing.murmur3_32;
import static com.google.common.hash.Hashing.murmur3_32_fixed;

import com.google.common.base.Charsets;
import com.google.common.hash.HashTestUtils.HashFn;
import java.nio.charset.Charset;
import java.util.Random;
import junit.framework.TestCase;

Expand Down Expand Up @@ -51,28 +53,53 @@ public void testKnownStringInputs() {
-528633700, murmur3_32().hashUnencodedChars("The quick brown fox jumps over the lazy dog"));
}

public void testKnownUtf8StringInputs() {
assertHash(0, murmur3_32().hashString("", Charsets.UTF_8));
assertHash(0xcfbda5d1, murmur3_32().hashString("k", Charsets.UTF_8));
assertHash(0xa167dbf3, murmur3_32().hashString("hell", Charsets.UTF_8));
assertHash(0x248bfa47, murmur3_32().hashString("hello", Charsets.UTF_8));
assertHash(0x3d41b97c, murmur3_32().hashString("http://www.google.com/", Charsets.UTF_8));
assertHash(
0x2e4ff723,
murmur3_32().hashString("The quick brown fox jumps over the lazy dog", Charsets.UTF_8));
assertHash(0xfc5ba834, murmur3_32().hashString("毎月1日,毎週月曜日", Charsets.UTF_8));
assertHash(
0x3f4aff5c,
murmur3_32().hashString(Character.toString(Character.MAX_VALUE), Charsets.UTF_8));
assertHash(
0x81db5903,
murmur3_32()
.hashString(new String(Character.toChars(Character.MAX_VALUE + 1)), Charsets.UTF_8));
// Note (https://github.com/google/guava/issues/5648) the hash expected here is not correct
assertHash(
0x256068c8,
murmur3_32()
.hashString(new String(Character.toChars(Character.MAX_CODE_POINT)), Charsets.UTF_8));
@SuppressWarnings("deprecation")
public void testKnownEncodedStringInputs() {
assertStringHash(0, "", Charsets.UTF_8);
assertStringHash(0xcfbda5d1, "k", Charsets.UTF_8);
assertStringHash(0xa167dbf3, "hell", Charsets.UTF_8);
assertStringHash(0x248bfa47, "hello", Charsets.UTF_8);
assertStringHash(0x3d41b97c, "http://www.google.com/", Charsets.UTF_8);
assertStringHash(0x2e4ff723, "The quick brown fox jumps over the lazy dog", Charsets.UTF_8);
assertStringHash(0xb5a4be05, "ABCDefGHI\u0799", Charsets.UTF_8);
assertStringHash(0xfc5ba834, "毎月1日,毎週月曜日", Charsets.UTF_8);
assertStringHash(0x8a5c3699, "surrogate pair: \uD83D\uDCB0", Charsets.UTF_8);

assertStringHash(0, "", Charsets.UTF_16LE);
assertStringHash(0x288418e4, "k", Charsets.UTF_16LE);
assertStringHash(0x5a0cb7c3, "hell", Charsets.UTF_16LE);
assertStringHash(0xd7c31989, "hello", Charsets.UTF_16LE);
assertStringHash(0x73564d8c, "http://www.google.com/", Charsets.UTF_16LE);
assertStringHash(0xe07db09c, "The quick brown fox jumps over the lazy dog", Charsets.UTF_16LE);
assertStringHash(0xfefa3e76, "ABCDefGHI\u0799", Charsets.UTF_16LE);
assertStringHash(0x6a7be132, "毎月1日,毎週月曜日", Charsets.UTF_16LE);
assertStringHash(0x5a2d41c7, "surrogate pair: \uD83D\uDCB0", Charsets.UTF_16LE);
}

@SuppressWarnings("deprecation")
private void assertStringHash(int expected, String string, Charset charset) {
if (allBmp(string)) {
assertHash(expected, murmur3_32().hashString(string, charset));
}
assertHash(expected, murmur3_32_fixed().hashString(string, charset));
assertHash(expected, murmur3_32().newHasher().putString(string, charset).hash());
assertHash(expected, murmur3_32_fixed().newHasher().putString(string, charset).hash());
assertHash(expected, murmur3_32().hashBytes(string.getBytes(charset)));
assertHash(expected, murmur3_32_fixed().hashBytes(string.getBytes(charset)));
assertHash(expected, murmur3_32().newHasher().putBytes(string.getBytes(charset)).hash());
assertHash(expected, murmur3_32_fixed().newHasher().putBytes(string.getBytes(charset)).hash());
}

private boolean allBmp(String string) {
// Ordinarily we'd use something like i += Character.charCount(string.codePointAt(i)) here. But
// we can get away with i++ because the whole point of this method is to return false if we find
// a code point that doesn't fit in a char.
for (int i = 0; i < string.length(); i++) {
if (string.codePointAt(i) > 0xffff) {
return false;
}
}
return true;
}

@SuppressWarnings("deprecation")
Expand All @@ -83,7 +110,7 @@ public void testSimpleStringUtf8() {
}

@SuppressWarnings("deprecation")
public void testStringInputsUtf8() {
public void testEncodedStringInputs() {
Random rng = new Random(0);
for (int z = 0; z < 100; z++) {
String str;
Expand All @@ -100,9 +127,16 @@ public void testStringInputsUtf8() {
builder.appendCodePoint(codePoints[i]);
}
str = builder.toString();
HashCode hashUtf8 = murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8));
assertEquals(
hashUtf8, murmur3_32().newHasher().putBytes(str.getBytes(Charsets.UTF_8)).hash());
assertEquals(hashUtf8, murmur3_32().hashString(str, Charsets.UTF_8));
assertEquals(hashUtf8, murmur3_32().newHasher().putString(str, Charsets.UTF_8).hash());
HashCode hashUtf16 = murmur3_32().hashBytes(str.getBytes(Charsets.UTF_16));
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
hashUtf16, murmur3_32().newHasher().putBytes(str.getBytes(Charsets.UTF_16)).hash());
assertEquals(hashUtf16, murmur3_32().hashString(str, Charsets.UTF_16));
assertEquals(hashUtf16, murmur3_32().newHasher().putString(str, Charsets.UTF_16).hash());
}
}

Expand Down Expand Up @@ -150,14 +184,21 @@ public void testInvalidUnicodeHashString() {
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
assertEquals(
murmur3_32_fixed().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
}

@SuppressWarnings("deprecation")
public void testInvalidUnicodeHasherPutString() {
String str =
new String(
new char[] {'a', Character.MIN_HIGH_SURROGATE, Character.MIN_HIGH_SURROGATE, 'z'});
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().newHasher().putString(str, Charsets.UTF_8).hash());
assertEquals(
murmur3_32_fixed().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32_fixed().newHasher().putString(str, Charsets.UTF_8).hash());
}
}
54 changes: 50 additions & 4 deletions android/guava/src/com/google/common/hash/Hashing.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,56 @@ public static HashFunction goodFastHash(int minimumBits) {
@SuppressWarnings("GoodTime") // reading system time without TimeSource
static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value, <b>with a known
* bug</b> as described in the deprecation text.
*
* <p>The C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A), which however does not
* have the bug.
*
* @deprecated This implementation produces incorrect hash values from the {@link
* HashFunction#hashString} method if the string contains non-BMP characters. Use {@link
* #murmur3_32_fixed(int)} instead.
*/
@Deprecated
public static HashFunction murmur3_32(int seed) {
return new Murmur3_32HashFunction(seed, /* supplementaryPlaneFix= */ false);
}

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value, <b>with a known
* bug</b> as described in the deprecation text.
*
* <p>The C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A), which however does not
* have the bug.
*
* @deprecated This implementation produces incorrect hash values from the {@link
* HashFunction#hashString} method if the string contains non-BMP characters. Use {@link
* #murmur3_32_fixed()} instead.
*/
@Deprecated
public static HashFunction murmur3_32() {
return Murmur3_32HashFunction.MURMUR3_32;
}

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value.
*
* <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
*
* <p>This method is called {@code murmur3_32_fixed} because it fixes a bug in the {@code
* HashFunction} returned by the original {@code murmur3_32} method.
*
* @since NEXT
*/
public static HashFunction murmur3_32(int seed) {
return new Murmur3_32HashFunction(seed);
public static HashFunction murmur3_32_fixed(int seed) {
return new Murmur3_32HashFunction(seed, /* supplementaryPlaneFix= */ true);
}

/**
Expand All @@ -108,9 +149,14 @@ public static HashFunction murmur3_32(int seed) {
* algorithm, x86 variant</a> (little-endian variant), using a seed value of zero.
*
* <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
*
* <p>This method is called {@code murmur3_32_fixed} because it fixes a bug in the {@code
* HashFunction} returned by the original {@code murmur3_32} method.
*
* @since NEXT
*/
public static HashFunction murmur3_32() {
return Murmur3_32HashFunction.MURMUR3_32;
public static HashFunction murmur3_32_fixed() {
return Murmur3_32HashFunction.MURMUR3_32_FIXED;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,27 @@
@Immutable
@ElementTypesAreNonnullByDefault
final class Murmur3_32HashFunction extends AbstractHashFunction implements Serializable {
static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
static final HashFunction MURMUR3_32 =
new Murmur3_32HashFunction(0, /* supplementaryPlaneFix= */ false);
static final HashFunction MURMUR3_32_FIXED =
new Murmur3_32HashFunction(0, /* supplementaryPlaneFix= */ true);

// We can include the non-BMP fix here because Hashing.goodFastHash stresses that the hash is a
// temporary-use one. Therefore it shouldn't be persisted.
static final HashFunction GOOD_FAST_HASH_32 =
new Murmur3_32HashFunction(Hashing.GOOD_FAST_HASH_SEED);
new Murmur3_32HashFunction(Hashing.GOOD_FAST_HASH_SEED, /* supplementaryPlaneFix= */ true);

private static final int CHUNK_SIZE = 4;

private static final int C1 = 0xcc9e2d51;
private static final int C2 = 0x1b873593;

private final int seed;
private final boolean supplementaryPlaneFix;

Murmur3_32HashFunction(int seed) {
Murmur3_32HashFunction(int seed, boolean supplementaryPlaneFix) {
this.seed = seed;
this.supplementaryPlaneFix = supplementaryPlaneFix;
}

@Override
Expand All @@ -88,7 +95,7 @@ public String toString() {
public boolean equals(@CheckForNull Object object) {
if (object instanceof Murmur3_32HashFunction) {
Murmur3_32HashFunction other = (Murmur3_32HashFunction) object;
return seed == other.seed;
return seed == other.seed && supplementaryPlaneFix == other.supplementaryPlaneFix;
}
return false;
}
Expand Down Expand Up @@ -191,6 +198,9 @@ public HashCode hashString(CharSequence input, Charset charset) {
}
i++;
buffer |= codePointToFourUtf8Bytes(codePoint) << shift;
if (supplementaryPlaneFix) { // bug compatibility: earlier versions did not have this add
shift += 32;
}
len += 4;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum HashFunctionEnum {
MD5(Hashing.md5()),
MURMUR3_128(Hashing.murmur3_128()),
MURMUR3_32(Hashing.murmur3_32()),
MURMUR3_32_FIXED(Hashing.murmur3_32_fixed()),
SHA1(Hashing.sha1()),
SHA256(Hashing.sha256()),
SHA384(Hashing.sha384()),
Expand Down
3 changes: 3 additions & 0 deletions guava-tests/test/com/google/common/hash/HashingTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ public void testHashIntVsForLoop() {
.put(Hashing.murmur3_32(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.murmur3_32_fixed(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.sha1(), EMPTY_STRING, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
.put(Hashing.sha1(), TQBFJOTLD, "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12")
.put(Hashing.sha1(), TQBFJOTLDP, "408d94384216f890ff7a0c3528e8bed1e0b01621")
Expand Down
Loading

0 comments on commit a36f08f

Please sign in to comment.