Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster optimized frozen dictionary creation (3/n) #87688

Merged
merged 6 commits into from Jun 22, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -36,23 +36,21 @@ private FrozenHashTable(int[] hashCodes, Bucket[] buckets, ulong fastModMultipli
}

/// <summary>Initializes a frozen hash table.</summary>
/// <param name="hashCodes">Pre-calculated hash codes.</param>
/// <param name="storeDestIndexFromSrcIndex">A delegate that assigns the index to a specific entry. It's passed the destination and source indices.</param>
/// <param name="hashCodes">Pre-calculated hash codes. When the method finishes, it assigns each value to destination index.</param>
/// <param name="optimizeForReading">true to spend additional effort tuning for subsequent read speed on the table; false to prioritize construction time.</param>
/// <param name="hashCodesAreUnique">True when the input hash codes are already unique (provided from a dictionary of integers for example).</param>
/// <remarks>
/// This method will iterate through the incoming entries and will invoke the hasher on each once.
/// It will then determine the optimal number of hash buckets to allocate and will populate the
/// bucket table. In the process of doing so, it calls out to the <paramref name="storeDestIndexFromSrcIndex"/> to indicate
/// the resulting index for that entry. <see cref="FindMatchingEntries(int, out int, out int)"/>
/// then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
/// bucket table. The caller is responsible to consume the values written to <paramref name="hashCodes"/> and update the destination (if desired).
/// <see cref="FindMatchingEntries(int, out int, out int)"/> then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
/// </remarks>
/// <returns>A frozen hash table.</returns>
public static FrozenHashTable Create(ReadOnlySpan<int> hashCodes, Action<int, int> storeDestIndexFromSrcIndex, bool optimizeForReading = true)
public static FrozenHashTable Create(Span<int> hashCodes, bool optimizeForReading = true, bool hashCodesAreUnique = false)
{
// Determine how many buckets to use. This might be fewer than the number of entries
// if any entries have identical hashcodes (not just different hashcodes that might
// map to the same bucket).
int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading);
int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading, hashCodesAreUnique);
ulong fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)numBuckets);

// Create two spans:
Expand Down Expand Up @@ -100,8 +98,10 @@ public static FrozenHashTable Create(ReadOnlySpan<int> hashCodes, Action<int, in
bucketStart = count;
while (index >= 0)
{
hashtableHashcodes[count] = hashCodes[index];
storeDestIndexFromSrcIndex(count, index);
ref int hashCode = ref hashCodes[index];
hashtableHashcodes[count] = hashCode;
// we have used the hash code for the last time, now we re-use the buffer to store destination index
hashCode = count;
count++;
bucketCount++;

Expand Down Expand Up @@ -144,9 +144,10 @@ public void FindMatchingEntries(int hashCode, out int startIndex, out int endInd
/// sizes, starting at the exact number of hash codes and incrementing the bucket count by 1 per trial,
/// this is a trade-off between speed of determining a good number of buckets and maximal density.
/// </remarks>
private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForReading)
private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForReading, bool hashCodesAreUnique)
{
Debug.Assert(hashCodes.Length != 0);
Debug.Assert(!hashCodesAreUnique || new HashSet<int>(hashCodes.ToArray()).Count == hashCodes.Length);

const double AcceptableCollisionRate = 0.05; // What is a satisfactory rate of hash collisions?
const int LargeInputSizeThreshold = 1000; // What is the limit for an input to be considered "small"?
Expand All @@ -159,38 +160,42 @@ private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForR
}

// Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes.
var codes =
HashSet<int>? codes = hashCodesAreUnique ? null :
#if NETCOREAPP2_0_OR_GREATER
new HashSet<int>(hashCodes.Length);
#else
new HashSet<int>();
#endif
foreach (int hashCode in hashCodes)
if (codes is not null)
{
codes.Add(hashCode);
foreach (int hashCode in hashCodes)
{
codes.Add(hashCode);
}
}
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
Debug.Assert(codes.Count != 0);
int uniqueCodesCount = hashCodesAreUnique ? hashCodes.Length : codes!.Count;
Debug.Assert(uniqueCodesCount != 0);

// In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of
// hash codes. If there are more codes than in our precomputed primes table, which accommodates millions of values,
// give up and just use the next prime.
ReadOnlySpan<int> primes = HashHelpers.Primes;
int minPrimeIndexInclusive = 0;
while ((uint)minPrimeIndexInclusive < (uint)primes.Length && codes.Count > primes[minPrimeIndexInclusive])
while ((uint)minPrimeIndexInclusive < (uint)primes.Length && uniqueCodesCount > primes[minPrimeIndexInclusive])
{
minPrimeIndexInclusive++;
}

if (minPrimeIndexInclusive >= primes.Length)
{
return HashHelpers.GetPrime(codes.Count);
return HashHelpers.GetPrime(uniqueCodesCount);
}

// Determine the largest number of buckets we're willing to use, based on a multiple of the number of inputs.
// For smaller inputs, we allow for a larger multiplier.
int maxNumBuckets =
codes.Count *
(codes.Count >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);
uniqueCodesCount *
(uniqueCodesCount >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);

// Find the index of the smallest prime that accommodates our max buckets.
int maxPrimeIndexExclusive = minPrimeIndexInclusive;
Expand All @@ -209,7 +214,7 @@ private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForR
int[] seenBuckets = ArrayPool<int>.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1);

int bestNumBuckets = maxNumBuckets;
int bestNumCollisions = codes.Count;
int bestNumCollisions = uniqueCodesCount;

// Iterate through each available prime between the min and max discovered. For each, compute
// the collision ratio.
Expand All @@ -222,22 +227,50 @@ private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForR
// Determine the bucket for each hash code and mark it as seen. If it was already seen,
// track it as a collision.
int numCollisions = 0;
foreach (int code in codes)

if (codes is not null && uniqueCodesCount != hashCodes.Length)
{
uint bucketNum = (uint)code % (uint)numBuckets;
if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
foreach (int code in codes)
{
numCollisions++;
if (numCollisions >= bestNumCollisions)
uint bucketNum = (uint)code % (uint)numBuckets;
if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would computing the bitmask value once (instead of in line 238 and 250) help anything? e.g.

int bucketMask = 1 << (int)bucketNum;
if ((seenBuckets[bucketNum / BitsPerInt32] &bucketMask) != 0)
{
...
}
else
{
    seenBuckets[bucketNum / BitsPerInt32] |= bucketMask;
}

{
// If we've already hit the previously known best number of collisions,
// there's no point in continuing as worst case we'd just use that.
break;
numCollisions++;
if (numCollisions >= bestNumCollisions)
{
// If we've already hit the previously known best number of collisions,
// there's no point in continuing as worst case we'd just use that.
break;
}
}
else
{
seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
}
}
else
}
else
{
// We can get here in two cases:
// 1. hashCodesAreUnique was true (the input comes from a unique collection of integers), so we have skipped the hash set creation.
// 2. all hash codes turned out to be unique. In such scenario, it's faster to iterate over a span.
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
foreach (int code in hashCodes)
{
seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
uint bucketNum = (uint)code % (uint)numBuckets;
if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would computing the bitmask value once (instead of in line 260 and 272) help anything?

{
numCollisions++;
if (numCollisions >= bestNumCollisions)
{
// If we've already hit the previously known best number of collisions,
// there's no point in continuing as worst case we'd just use that.
break;
}
}
else
{
seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
}
}
Comment on lines +257 to 274
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After all of the work done to clone the inputs, allocate the dictionaries, analyze the keys, and so on, iterating over a span instead of a HashSet really provides a meaningful enough gain to duplicate this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After all of the work done to clone the inputs, allocate the dictionaries, analyze the keys, and so on, iterating over a span instead of a HashSet really provides a meaningful enough gain to duplicate this?

Yes: "Up to +5% gain where string keys turned out to have unique hash codes"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw that in the PR description, but I'm still skeptical.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've used a profiler and benchmarked it more than once. This loop is the hottest place in the entire process of creating frozen dictionaries/hash sets (because all other parts got optimized in other PRs and are now relatively cheap).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still skeptical :) But putting aside my skepticism, can you at least dedup this by putting it into an aggressively-inlined helper?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stephentoub do you mind if I do that in my next PR that is going to touch this area?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

}

Expand All @@ -247,7 +280,7 @@ private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForR
{
bestNumBuckets = numBuckets;

if (numCollisions / (double)codes.Count <= AcceptableCollisionRate)
if (numCollisions / (double)uniqueCodesCount <= AcceptableCollisionRate)
{
break;
}
Expand Down
Expand Up @@ -35,9 +35,14 @@ internal Int32FrozenDictionary(Dictionary<int, TValue> source) : base(EqualityCo
hashCodes[i] = entries[i].Key;
}

_hashTable = FrozenHashTable.Create(
hashCodes,
(destIndex, srcIndex) => _values[destIndex] = entries[srcIndex].Value);
_hashTable = FrozenHashTable.Create(hashCodes, hashCodesAreUnique: true);

for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
{
int destIndex = hashCodes[srcIndex];

_values[destIndex] = entries[srcIndex].Value;
}

ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
}
Expand Down
Expand Up @@ -25,9 +25,7 @@ internal Int32FrozenSet(HashSet<int> source) : base(EqualityComparer<int>.Defaul
int[] entries = ArrayPool<int>.Shared.Rent(count);
source.CopyTo(entries);

_hashTable = FrozenHashTable.Create(
new ReadOnlySpan<int>(entries, 0, count),
static delegate { });
_hashTable = FrozenHashTable.Create(new Span<int>(entries, 0, count), hashCodesAreUnique: true);

ArrayPool<int>.Shared.Return(entries);
}
Expand Down
Expand Up @@ -30,10 +30,14 @@ protected ItemsFrozenSet(HashSet<T> source, bool optimizeForReading = true) : ba
hashCodes[i] = entries[i] is T t ? Comparer.GetHashCode(t) : 0;
}

_hashTable = FrozenHashTable.Create(
hashCodes,
(destIndex, srcIndex) => _items[destIndex] = entries[srcIndex],
optimizeForReading);
_hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading);

for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
{
int destIndex = hashCodes[srcIndex];

_items[destIndex] = entries[srcIndex];
}

ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
}
Expand Down
Expand Up @@ -32,14 +32,15 @@ protected KeysAndValuesFrozenDictionary(Dictionary<TKey, TValue> source, bool op
hashCodes[i] = Comparer.GetHashCode(entries[i].Key);
}

_hashTable = FrozenHashTable.Create(
hashCodes,
(destIndex, srcIndex) =>
{
_keys[destIndex] = entries[srcIndex].Key;
_values[destIndex] = entries[srcIndex].Value;
},
optimizeForReading);
_hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading);

for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
{
int destIndex = hashCodes[srcIndex];

_keys[destIndex] = entries[srcIndex].Key;
_values[destIndex] = entries[srcIndex].Value;
}

ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
}
Expand Down
Expand Up @@ -47,13 +47,15 @@ internal abstract class OrdinalStringFrozenDictionary<TValue> : FrozenDictionary
hashCodes[i] = GetHashCode(keys[i]);
}

_hashTable = FrozenHashTable.Create(
hashCodes,
(destIndex, srcIndex) =>
{
_keys[destIndex] = keys[srcIndex];
_values[destIndex] = values[srcIndex];
});
_hashTable = FrozenHashTable.Create(hashCodes);

for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
{
int destIndex = hashCodes[srcIndex];

_keys[destIndex] = keys[srcIndex];
_values[destIndex] = values[srcIndex];
}

ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
}
Expand Down
Expand Up @@ -38,9 +38,14 @@ internal abstract class OrdinalStringFrozenSet : FrozenSetInternalBase<string, O
hashCodes[i] = GetHashCode(entries[i]);
}

_hashTable = FrozenHashTable.Create(
hashCodes,
(destIndex, srcIndex) => _items[destIndex] = entries[srcIndex]);
_hashTable = FrozenHashTable.Create(hashCodes);

for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
{
int destIndex = hashCodes[srcIndex];

_items[destIndex] = entries[srcIndex];
}

ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
}
Expand Down