-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Simple trim of ArrayPool buffers #17078
Changes from all commits
25f7ffc
fca3bf1
f9d6628
3d8d0df
f9b1a05
9e1e893
5847dd0
51c18c5
1a2da39
d95957b
2e6630a
91fc5bf
1834b4a
18934b8
65ac364
1f6bcc2
e4211d8
c4014ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,12 @@ | |
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.Win32; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Runtime.CompilerServices; | ||
using System.Threading; | ||
|
||
using Internal.Runtime.Augments; | ||
using Internal.Runtime.CompilerServices; | ||
|
||
namespace System.Buffers | ||
{ | ||
|
@@ -24,7 +25,6 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool | |
{ | ||
// TODO: #7747: "Investigate optimizing ArrayPool heuristics" | ||
// - Explore caching in TLS more than one array per size per thread, and moving stale buffers to the global queue. | ||
// - Explore dumping stale buffers from the global queue, similar to PinnableBufferCache (maybe merging them). | ||
// - Explore changing the size of each per-core bucket, potentially dynamically or based on other factors like array size. | ||
// - Explore changing number of buckets and what sizes of arrays are cached. | ||
// - Investigate whether false sharing is causing any issues, in particular on LockedStack's count and the contents of its array. | ||
|
@@ -48,6 +48,15 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool | |
[ThreadStatic] | ||
private static T[][] t_tlsBuckets; | ||
|
||
private int _callbackCreated; | ||
|
||
private readonly static bool s_TrimBuffers = GetTrimBuffers(); | ||
|
||
/// <summary> | ||
/// Used to keep track of all thread local buckets for trimming if needed | ||
/// </summary> | ||
private static readonly ConditionalWeakTable<T[][], object> s_AllTlsBuckets = s_TrimBuffers ? new ConditionalWeakTable<T[][], object>() : null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: s_allTlsBuckets There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixing |
||
|
||
/// <summary>Initialize the pool.</summary> | ||
public TlsOverPerCoreLockedStacksArrayPool() | ||
{ | ||
|
@@ -182,15 +191,24 @@ public override void Return(T[] array, bool clearArray = false) | |
{ | ||
t_tlsBuckets = tlsBuckets = new T[NumBuckets][]; | ||
tlsBuckets[bucketIndex] = array; | ||
if (s_TrimBuffers) | ||
{ | ||
s_AllTlsBuckets.Add(tlsBuckets, null); | ||
if (Interlocked.Exchange(ref _callbackCreated, 1) != 1) | ||
{ | ||
Gen2GcCallback.Register(Gen2GcCallbackFunc, this); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I may have already asked this, but we're now going to end up with one of these Gen2 callback object things per T. We decided that's ok? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. GetGC2Callback is pretty efficient (it holds just the callback and (weak) pointer to 'this'. The main 'overhead' is that the object is finalizable. You could imagine fixing that by sharing the finalizable object, but that is something that arguably should be done in the implementation of Gen2GcCallback. It think it is OK to wait in see (if it is a problem we can fix it). Also we may ditch Gen2GCCallback (since we really want to do this as we APPROACH a gen2 GC not after it), which is another reason not to invest alot of effort in it now. |
||
} | ||
} | ||
} | ||
else | ||
{ | ||
T[] prev = tlsBuckets[bucketIndex]; | ||
tlsBuckets[bucketIndex] = array; | ||
|
||
if (prev != null) | ||
{ | ||
PerCoreLockedStacks bucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex); | ||
bucket.TryPush(prev); | ||
PerCoreLockedStacks stackBucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex); | ||
stackBucket.TryPush(prev); | ||
} | ||
} | ||
} | ||
|
@@ -203,6 +221,92 @@ public override void Return(T[] array, bool clearArray = false) | |
} | ||
} | ||
|
||
public bool Trim() | ||
{ | ||
int milliseconds = Environment.TickCount; | ||
MemoryPressure pressure = GetMemoryPressure(); | ||
|
||
ArrayPoolEventSource log = ArrayPoolEventSource.Log; | ||
if (log.IsEnabled()) | ||
log.BufferTrimPoll(milliseconds, (int)pressure); | ||
|
||
foreach (PerCoreLockedStacks bucket in _buckets) | ||
{ | ||
bucket?.Trim((uint)milliseconds, Id, pressure, _bucketArraySizes); | ||
} | ||
|
||
if (pressure == MemoryPressure.High) | ||
{ | ||
// Under high pressure, release all thread locals | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a little counter-intuitive to me. The array stored in the TLS slot is going to be the most recently used one; presumably we're more interested in keeping that one around than ones in the per-core stacks. But maybe this is the best we can do right now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Best for now. We didn't want to risk a perf hit from GetTicks or code complexity at this point. |
||
foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets) | ||
{ | ||
T[][] buckets = tlsBuckets.Key; | ||
for (int i = 0; i < NumBuckets; i++) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why NumBuckets rather than buckets.Length? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changing |
||
{ | ||
T[] buffer = buckets[i]; | ||
buckets[i] = null; | ||
|
||
if (log.IsEnabled() && buffer != null) | ||
{ | ||
log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might be wrong: it's possible a thread grabbed the buffer at the same time we did here, in which case we'll report having trimmed a buffer even though we didn't. I assume we're ok with that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, unless you can think of a way to do this without incurring a hit. I'll clarify in comments. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, interlocked here isn't a bad idea since we're in cleanup. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That might reduce the chances, but it won't remove them, unless you also used an interlocked on the rent path, which we don't want to do. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you think? Worth the interlocked for a slimmer chance of false positive? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably not, but if you wanted to, I'd suggest changing it to something like: if (log.IsEnabled())
{
T[] buffer = Interlocked.Exchange(ref buckets[i], null);
if (buffer != null)
{
log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id);
}
}
else
{
buckets[i] = null;
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or, actually, better yet: if (log.IsEnabled())
{
foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets)
{
T[][] buckets = tlsBuckets.Key;
for (int i = 0; i < buckets.Length; i++)
{
T[] bucket = Interlocked.Exchange(ref buckets[i], null);
if (bucket != null)
{
log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id);
}
}
}
}
else
{
foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets)
{
T[][] buckets = tlsBuckets.Key;
Array.Clear(buckets, 0, buckets.Length);
}
} |
||
} | ||
} | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
/// <summary> | ||
/// This is the static function that is called from the gen2 GC callback. | ||
/// The input object is the instance we want the callback on. | ||
/// </summary> | ||
/// <remarks> | ||
/// The reason that we make this function static and take the instance as a parameter is that | ||
/// we would otherwise root the instance to the Gen2GcCallback object, leaking the instance even when | ||
/// the application no longer needs it. | ||
/// </remarks> | ||
private static bool Gen2GcCallbackFunc(object target) | ||
{ | ||
return ((TlsOverPerCoreLockedStacksArrayPool<T>)(target)).Trim(); | ||
} | ||
|
||
private enum MemoryPressure | ||
{ | ||
Low, | ||
Medium, | ||
High | ||
} | ||
|
||
private static MemoryPressure GetMemoryPressure() | ||
{ | ||
const double HighPressureThreshold = .90; // Percent of GC memory pressure threshold we consider "high" | ||
const double MediumPressureThreshold = .70; // Percent of GC memory pressure threshold we consider "medium" | ||
|
||
GC.GetMemoryInfo(out uint threshold, out _, out uint lastLoad, out _, out _); | ||
if (lastLoad >= threshold * HighPressureThreshold) | ||
{ | ||
return MemoryPressure.High; | ||
} | ||
else if (lastLoad >= threshold * MediumPressureThreshold) | ||
{ | ||
return MemoryPressure.Medium; | ||
} | ||
return MemoryPressure.Low; | ||
} | ||
|
||
private static bool GetTrimBuffers() | ||
{ | ||
// Environment uses ArrayPool, so we have to hit the API directly. | ||
#if !CORECLR | ||
// P/Invokes are different for CoreCLR/RT- for RT we'll not allow | ||
// enabling/disabling for now. | ||
return true; | ||
#else | ||
return CLRConfig.GetBoolValueWithFallbacks("System.Buffers.TrimBuffers", "DOTNET_SYSTEM_BUFFERS_TRIMBUFFERS", defaultValue: true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we use a name that's specific to ArrayPool, or even to ArrayPool.Shared, rather than for the general System.Buffers namespace? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure |
||
#endif | ||
} | ||
|
||
/// <summary> | ||
/// Stores a set of stacks of arrays, with one stack per core. | ||
/// </summary> | ||
|
@@ -254,13 +358,24 @@ public T[] TryPop() | |
} | ||
return null; | ||
} | ||
|
||
public bool Trim(uint tickCount, int id, MemoryPressure pressure, int[] bucketSizes) | ||
{ | ||
LockedStack[] stacks = _perCoreStacks; | ||
for (int i = 0; i < stacks.Length; i++) | ||
{ | ||
stacks[i].Trim(tickCount, id, pressure, bucketSizes[i]); | ||
} | ||
return true; | ||
} | ||
} | ||
|
||
/// <summary>Provides a simple stack of arrays, protected by a lock.</summary> | ||
private sealed class LockedStack | ||
{ | ||
private readonly T[][] _arrays = new T[MaxBuffersPerArraySizePerCore][]; | ||
private int _count; | ||
private uint _firstStackItemMS; | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public bool TryPush(T[] array) | ||
|
@@ -269,6 +384,12 @@ public bool TryPush(T[] array) | |
Monitor.Enter(this); | ||
if (_count < MaxBuffersPerArraySizePerCore) | ||
{ | ||
if (s_TrimBuffers && _count == 0) | ||
{ | ||
// Stash the time the bottom of the stack was filled | ||
_firstStackItemMS = (uint)Environment.TickCount; | ||
} | ||
|
||
_arrays[_count++] = array; | ||
enqueued = true; | ||
} | ||
|
@@ -289,6 +410,76 @@ public T[] TryPop() | |
Monitor.Exit(this); | ||
return arr; | ||
} | ||
|
||
public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize) | ||
{ | ||
const uint StackTrimAfterMS = 60 * 1000; // Trim after 60 seconds for low/moderate pressure | ||
const uint StackHighTrimAfterMS = 10 * 1000; // Trim after 10 seconds for high pressure | ||
const uint StackRefreshMS = StackTrimAfterMS / 4; // Time bump after trimming (1/4 trim time) | ||
const int StackLowTrimCount = 1; // Trim one item when pressure is low | ||
const int StackMediumTrimCount = 2; // Trim two items when pressure is moderate | ||
const int StackHighTrimCount = MaxBuffersPerArraySizePerCore; // Trim all items when pressure is high | ||
const int StackLargeBucket = 16384; // If the bucket is larger than this we'll trim an extra when under high pressure | ||
const int StackModerateTypeSize = 16; // If T is larger than this we'll trim an extra when under high pressure | ||
const int StackLargeTypeSize = 32; // If T is larger than this we'll trim an extra (additional) when under high pressure | ||
|
||
if (_count == 0) | ||
return; | ||
|
||
lock (this) | ||
{ | ||
uint trimTicks = pressure == MemoryPressure.High ? StackHighTrimAfterMS : StackTrimAfterMS; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: this can be moved to before the lock There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moving |
||
if (_count > 0 && _firstStackItemMS > tickCount || (tickCount - _firstStackItemMS) > trimTicks) | ||
{ | ||
// We've wrapped the tick count or elapsed enough time since the | ||
// first item went into the stack. Drop the top item so it can | ||
// be collected and make the stack look a little newer. | ||
|
||
ArrayPoolEventSource log = ArrayPoolEventSource.Log; | ||
int trimCount = StackLowTrimCount; | ||
switch (pressure) | ||
{ | ||
case MemoryPressure.High: | ||
trimCount = StackHighTrimCount; | ||
|
||
// When pressure is high, aggressively trim larger arrays. | ||
if (bucketSize > StackLargeBucket) | ||
{ | ||
trimCount++; | ||
} | ||
if (Unsafe.SizeOf<T>() > StackModerateTypeSize) | ||
{ | ||
trimCount++; | ||
} | ||
if (Unsafe.SizeOf<T>() > StackLargeTypeSize) | ||
{ | ||
trimCount++; | ||
} | ||
break; | ||
case MemoryPressure.Medium: | ||
trimCount = StackMediumTrimCount; | ||
break; | ||
} | ||
|
||
while (_count > 0 && trimCount-- > 0) | ||
{ | ||
T[] array = _arrays[--_count]; | ||
_arrays[_count] = null; | ||
|
||
if (log.IsEnabled()) | ||
{ | ||
log.BufferTrimmed(array.GetHashCode(), array.Length, id); | ||
} | ||
} | ||
|
||
if (_count > 0 && _firstStackItemMS < uint.MaxValue - StackRefreshMS) | ||
{ | ||
// Give the remaining items a bit more time | ||
_firstStackItemMS += StackRefreshMS; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: s_trimBuffers
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixing