Skip to content

Commit

Permalink
Deprecate buggy murmur3_32 and introduce murmur3_32_fixed.
Browse files Browse the repository at this point in the history
The bug was found by @findepi who also contributed the fix and the new tests via #5649.

Fixes #5648.

RELNOTES=`hash`: Deprecated buggy `murmur3_32`, and introduced `murmur3_32_fixed`.
PiperOrigin-RevId: 386953108
  • Loading branch information
eamonnmcmanus authored and Google Java Core Libraries committed Sep 8, 2021
1 parent a176cd6 commit 5ef0475
Show file tree
Hide file tree
Showing 10 changed files with 268 additions and 66 deletions.
Expand Up @@ -32,6 +32,7 @@ enum HashFunctionEnum {
MD5(Hashing.md5()),
MURMUR3_128(Hashing.murmur3_128()),
MURMUR3_32(Hashing.murmur3_32()),
MURMUR3_32_FIXED(Hashing.murmur3_32_fixed()),
SHA1(Hashing.sha1()),
SHA256(Hashing.sha256()),
SHA384(Hashing.sha384()),
Expand Down
Expand Up @@ -432,6 +432,9 @@ public void testHashIntVsForLoop() {
.put(Hashing.murmur3_32(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.murmur3_32_fixed(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.sha1(), EMPTY_STRING, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
.put(Hashing.sha1(), TQBFJOTLD, "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12")
.put(Hashing.sha1(), TQBFJOTLDP, "408d94384216f890ff7a0c3528e8bed1e0b01621")
Expand Down
Expand Up @@ -17,9 +17,11 @@
package com.google.common.hash;

import static com.google.common.hash.Hashing.murmur3_32;
import static com.google.common.hash.Hashing.murmur3_32_fixed;

import com.google.common.base.Charsets;
import com.google.common.hash.HashTestUtils.HashFn;
import java.nio.charset.Charset;
import java.util.Random;
import junit.framework.TestCase;

Expand Down Expand Up @@ -51,28 +53,53 @@ public void testKnownStringInputs() {
-528633700, murmur3_32().hashUnencodedChars("The quick brown fox jumps over the lazy dog"));
}

public void testKnownUtf8StringInputs() {
assertHash(0, murmur3_32().hashString("", Charsets.UTF_8));
assertHash(0xcfbda5d1, murmur3_32().hashString("k", Charsets.UTF_8));
assertHash(0xa167dbf3, murmur3_32().hashString("hell", Charsets.UTF_8));
assertHash(0x248bfa47, murmur3_32().hashString("hello", Charsets.UTF_8));
assertHash(0x3d41b97c, murmur3_32().hashString("http://www.google.com/", Charsets.UTF_8));
assertHash(
0x2e4ff723,
murmur3_32().hashString("The quick brown fox jumps over the lazy dog", Charsets.UTF_8));
assertHash(0xfc5ba834, murmur3_32().hashString("毎月1日,毎週月曜日", Charsets.UTF_8));
assertHash(
0x3f4aff5c,
murmur3_32().hashString(Character.toString(Character.MAX_VALUE), Charsets.UTF_8));
assertHash(
0x81db5903,
murmur3_32()
.hashString(new String(Character.toChars(Character.MAX_VALUE + 1)), Charsets.UTF_8));
// Note (https://github.com/google/guava/issues/5648) the hash expected here is not correct
assertHash(
0x256068c8,
murmur3_32()
.hashString(new String(Character.toChars(Character.MAX_CODE_POINT)), Charsets.UTF_8));
@SuppressWarnings("deprecation")
public void testKnownEncodedStringInputs() {
assertStringHash(0, "", Charsets.UTF_8);
assertStringHash(0xcfbda5d1, "k", Charsets.UTF_8);
assertStringHash(0xa167dbf3, "hell", Charsets.UTF_8);
assertStringHash(0x248bfa47, "hello", Charsets.UTF_8);
assertStringHash(0x3d41b97c, "http://www.google.com/", Charsets.UTF_8);
assertStringHash(0x2e4ff723, "The quick brown fox jumps over the lazy dog", Charsets.UTF_8);
assertStringHash(0xb5a4be05, "ABCDefGHI\u0799", Charsets.UTF_8);
assertStringHash(0xfc5ba834, "毎月1日,毎週月曜日", Charsets.UTF_8);
assertStringHash(0x8a5c3699, "surrogate pair: \uD83D\uDCB0", Charsets.UTF_8);

assertStringHash(0, "", Charsets.UTF_16LE);
assertStringHash(0x288418e4, "k", Charsets.UTF_16LE);
assertStringHash(0x5a0cb7c3, "hell", Charsets.UTF_16LE);
assertStringHash(0xd7c31989, "hello", Charsets.UTF_16LE);
assertStringHash(0x73564d8c, "http://www.google.com/", Charsets.UTF_16LE);
assertStringHash(0xe07db09c, "The quick brown fox jumps over the lazy dog", Charsets.UTF_16LE);
assertStringHash(0xfefa3e76, "ABCDefGHI\u0799", Charsets.UTF_16LE);
assertStringHash(0x6a7be132, "毎月1日,毎週月曜日", Charsets.UTF_16LE);
assertStringHash(0x5a2d41c7, "surrogate pair: \uD83D\uDCB0", Charsets.UTF_16LE);
}

@SuppressWarnings("deprecation")
private void assertStringHash(int expected, String string, Charset charset) {
if (allBmp(string)) {
assertHash(expected, murmur3_32().hashString(string, charset));
}
assertHash(expected, murmur3_32_fixed().hashString(string, charset));
assertHash(expected, murmur3_32().newHasher().putString(string, charset).hash());
assertHash(expected, murmur3_32_fixed().newHasher().putString(string, charset).hash());
assertHash(expected, murmur3_32().hashBytes(string.getBytes(charset)));
assertHash(expected, murmur3_32_fixed().hashBytes(string.getBytes(charset)));
assertHash(expected, murmur3_32().newHasher().putBytes(string.getBytes(charset)).hash());
assertHash(expected, murmur3_32_fixed().newHasher().putBytes(string.getBytes(charset)).hash());
}

private boolean allBmp(String string) {
// Ordinarily we'd use something like i += Character.charCount(string.codePointAt(i)) here. But
// we can get away with i++ because the whole point of this method is to return false if we find
// a code point that doesn't fit in a char.
for (int i = 0; i < string.length(); i++) {
if (string.codePointAt(i) > 0xffff) {
return false;
}
}
return true;
}

@SuppressWarnings("deprecation")
Expand All @@ -83,7 +110,7 @@ public void testSimpleStringUtf8() {
}

@SuppressWarnings("deprecation")
public void testStringInputsUtf8() {
public void testEncodedStringInputs() {
Random rng = new Random(0);
for (int z = 0; z < 100; z++) {
String str;
Expand All @@ -100,9 +127,16 @@ public void testStringInputsUtf8() {
builder.appendCodePoint(codePoints[i]);
}
str = builder.toString();
HashCode hashUtf8 = murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8));
assertEquals(
hashUtf8, murmur3_32().newHasher().putBytes(str.getBytes(Charsets.UTF_8)).hash());
assertEquals(hashUtf8, murmur3_32().hashString(str, Charsets.UTF_8));
assertEquals(hashUtf8, murmur3_32().newHasher().putString(str, Charsets.UTF_8).hash());
HashCode hashUtf16 = murmur3_32().hashBytes(str.getBytes(Charsets.UTF_16));
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
hashUtf16, murmur3_32().newHasher().putBytes(str.getBytes(Charsets.UTF_16)).hash());
assertEquals(hashUtf16, murmur3_32().hashString(str, Charsets.UTF_16));
assertEquals(hashUtf16, murmur3_32().newHasher().putString(str, Charsets.UTF_16).hash());
}
}

Expand Down Expand Up @@ -150,14 +184,21 @@ public void testInvalidUnicodeHashString() {
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
assertEquals(
murmur3_32_fixed().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().hashString(str, Charsets.UTF_8));
}

@SuppressWarnings("deprecation")
public void testInvalidUnicodeHasherPutString() {
String str =
new String(
new char[] {'a', Character.MIN_HIGH_SURROGATE, Character.MIN_HIGH_SURROGATE, 'z'});
assertEquals(
murmur3_32().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32().newHasher().putString(str, Charsets.UTF_8).hash());
assertEquals(
murmur3_32_fixed().hashBytes(str.getBytes(Charsets.UTF_8)),
murmur3_32_fixed().newHasher().putString(str, Charsets.UTF_8).hash());
}
}
54 changes: 50 additions & 4 deletions android/guava/src/com/google/common/hash/Hashing.java
Expand Up @@ -91,15 +91,56 @@ public static HashFunction goodFastHash(int minimumBits) {
@SuppressWarnings("GoodTime") // reading system time without TimeSource
static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value, <b>with a known
* bug</b> as described in the deprecation text.
*
* <p>The C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A), which however does not
* have the bug.
*
* @deprecated This implementation produces incorrect hash values from the {@link
* HashFunction#hashString} method if the string contains non-BMP characters. Use {@link
* #murmur3_32_fixed(int)} instead.
*/
@Deprecated
public static HashFunction murmur3_32(int seed) {
return new Murmur3_32HashFunction(seed, /* supplementaryPlaneFix= */ false);
}

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value, <b>with a known
* bug</b> as described in the deprecation text.
*
* <p>The C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A), which however does not
* have the bug.
*
* @deprecated This implementation produces incorrect hash values from the {@link
* HashFunction#hashString} method if the string contains non-BMP characters. Use {@link
* #murmur3_32_fixed()} instead.
*/
@Deprecated
public static HashFunction murmur3_32() {
return Murmur3_32HashFunction.MURMUR3_32;
}

/**
* Returns a hash function implementing the <a
* href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">32-bit murmur3
* algorithm, x86 variant</a> (little-endian variant), using the given seed value.
*
* <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
*
* <p>This method is called {@code murmur3_32_fixed} because it fixes a bug in the {@code
* HashFunction} returned by the original {@code murmur3_32} method.
*
* @since NEXT
*/
public static HashFunction murmur3_32(int seed) {
return new Murmur3_32HashFunction(seed);
public static HashFunction murmur3_32_fixed(int seed) {
return new Murmur3_32HashFunction(seed, /* supplementaryPlaneFix= */ true);
}

/**
Expand All @@ -108,9 +149,14 @@ public static HashFunction murmur3_32(int seed) {
* algorithm, x86 variant</a> (little-endian variant), using a seed value of zero.
*
* <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
*
* <p>This method is called {@code murmur3_32_fixed} because it fixes a bug in the {@code
* HashFunction} returned by the original {@code murmur3_32} method.
*
* @since NEXT
*/
public static HashFunction murmur3_32() {
return Murmur3_32HashFunction.MURMUR3_32;
public static HashFunction murmur3_32_fixed() {
return Murmur3_32HashFunction.MURMUR3_32_FIXED;
}

/**
Expand Down
Expand Up @@ -53,20 +53,27 @@
@Immutable
@ElementTypesAreNonnullByDefault
final class Murmur3_32HashFunction extends AbstractHashFunction implements Serializable {
static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
static final HashFunction MURMUR3_32 =
new Murmur3_32HashFunction(0, /* supplementaryPlaneFix= */ false);
static final HashFunction MURMUR3_32_FIXED =
new Murmur3_32HashFunction(0, /* supplementaryPlaneFix= */ true);

// We can include the non-BMP fix here because Hashing.goodFastHash stresses that the hash is a
// temporary-use one. Therefore it shouldn't be persisted.
static final HashFunction GOOD_FAST_HASH_32 =
new Murmur3_32HashFunction(Hashing.GOOD_FAST_HASH_SEED);
new Murmur3_32HashFunction(Hashing.GOOD_FAST_HASH_SEED, /* supplementaryPlaneFix= */ true);

private static final int CHUNK_SIZE = 4;

private static final int C1 = 0xcc9e2d51;
private static final int C2 = 0x1b873593;

private final int seed;
private final boolean supplementaryPlaneFix;

Murmur3_32HashFunction(int seed) {
Murmur3_32HashFunction(int seed, boolean supplementaryPlaneFix) {
this.seed = seed;
this.supplementaryPlaneFix = supplementaryPlaneFix;
}

@Override
Expand All @@ -88,7 +95,7 @@ public String toString() {
public boolean equals(@CheckForNull Object object) {
if (object instanceof Murmur3_32HashFunction) {
Murmur3_32HashFunction other = (Murmur3_32HashFunction) object;
return seed == other.seed;
return seed == other.seed && supplementaryPlaneFix == other.supplementaryPlaneFix;
}
return false;
}
Expand Down Expand Up @@ -191,6 +198,9 @@ public HashCode hashString(CharSequence input, Charset charset) {
}
i++;
buffer |= codePointToFourUtf8Bytes(codePoint) << shift;
if (supplementaryPlaneFix) { // bug compatibility: earlier versions did not have this add
shift += 32;
}
len += 4;
}

Expand Down
Expand Up @@ -32,6 +32,7 @@ enum HashFunctionEnum {
MD5(Hashing.md5()),
MURMUR3_128(Hashing.murmur3_128()),
MURMUR3_32(Hashing.murmur3_32()),
MURMUR3_32_FIXED(Hashing.murmur3_32_fixed()),
SHA1(Hashing.sha1()),
SHA256(Hashing.sha256()),
SHA384(Hashing.sha384()),
Expand Down
3 changes: 3 additions & 0 deletions guava-tests/test/com/google/common/hash/HashingTest.java
Expand Up @@ -432,6 +432,9 @@ public void testHashIntVsForLoop() {
.put(Hashing.murmur3_32(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.murmur3_32_fixed(), EMPTY_STRING, "00000000")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLD, "23f74f2e")
.put(Hashing.murmur3_32_fixed(), TQBFJOTLDP, "fc8bc4d5")
.put(Hashing.sha1(), EMPTY_STRING, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
.put(Hashing.sha1(), TQBFJOTLD, "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12")
.put(Hashing.sha1(), TQBFJOTLDP, "408d94384216f890ff7a0c3528e8bed1e0b01621")
Expand Down

0 comments on commit 5ef0475

Please sign in to comment.