Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Simple trim of ArrayPool buffers #17078

Merged
merged 18 commits into from
Apr 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions src/mscorlib/System.Private.CoreLib.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,6 @@
<Compile Include="$(BclSourcesRoot)\System\Runtime\CompilerServices\AsyncMethodBuilder.cs" />
<Compile Include="$(BclSourcesRoot)\System\Runtime\CompilerServices\TaskAwaiter.cs" />
</ItemGroup>
<ItemGroup>
<Compile Include="$(BclSourcesRoot)\System\Runtime\Reliability\CriticalFinalizerObject.cs" />
</ItemGroup>
<ItemGroup>
<Compile Include="$(BclSourcesRoot)\System\Runtime\MemoryFailPoint.cs" />
<Compile Include="$(BclSourcesRoot)\System\Runtime\GcSettings.cs" />
Expand Down Expand Up @@ -353,6 +350,7 @@
<Compile Include="$(BclSourcesRoot)\System\MissingFieldException.cs" />
<Compile Include="$(BclSourcesRoot)\System\MissingMemberException.cs" />
<Compile Include="$(BclSourcesRoot)\System\Number.CoreCLR.cs" />
<Compile Include="$(BclSourcesRoot)\System\PinnableBufferCache.cs" />
<Compile Include="$(BclSourcesRoot)\System\RtType.cs" />
<Compile Include="$(BclSourcesRoot)\System\RuntimeArgumentHandle.cs" />
<Compile Include="$(BclSourcesRoot)\System\RuntimeHandles.cs" />
Expand Down Expand Up @@ -473,7 +471,6 @@
<Compile Include="$(BclSourcesRoot)\System\Threading\Monitor.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Mutex.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Overlapped.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\PinnableBufferCache.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Semaphore.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Thread.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\ThreadInterruptedException.cs" />
Expand Down Expand Up @@ -631,6 +628,7 @@
<Compile Include="$(BclSourcesRoot)\mscorlib.Friends.cs" Condition="'$(FeatureCominterop)' == 'true'" />
</ItemGroup>
<ItemGroup>
<Compile Include="src\System\PinnableBufferCacheEventSource.cs" />
<Compile Include="src\System\Runtime\RuntimeImports.cs" />
</ItemGroup>
<Import Project="shared\System.Private.CoreLib.Shared.projitems" />
Expand Down Expand Up @@ -666,14 +664,10 @@
<!-- Use a different nativeresource file to avoid conflicts with mscorlib-->
<Win32Resource Condition="'$(GenerateNativeVersionInfo)'=='true'">$(IntermediateOutputPath)\System.Private.CoreLib.res</Win32Resource>
</PropertyGroup>

<Import Project="CreateRuntimeRootILLinkDescriptorFile.targets" />

<ItemGroup>
<EmbeddedResource Include="$(_ILLinkRuntimeRootDescriptorFilePath)" />
</ItemGroup>

<Import Project="ILLink.targets" />

<Import Project="GenerateCompilerResponseFile.targets" />
</Project>
</Project>
2 changes: 2 additions & 0 deletions src/mscorlib/shared/System.Private.CoreLib.Shared.projitems
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\FlagsAttribute.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\FormatException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\FormattableString.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Gen2GcCallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\BidiCategory.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\Calendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\CalendarAlgorithmType.cs" />
Expand Down Expand Up @@ -425,6 +426,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\CompilerServices\YieldAwaitable.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ConstrainedExecution\Cer.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ConstrainedExecution\Consistency.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ConstrainedExecution\CriticalFinalizerObject.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ConstrainedExecution\ReliabilityContractAttribute.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ExceptionServices\ExceptionNotification.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\ExceptionServices\HandleProcessCorruptedStateExceptionsAttribute.cs" />
Expand Down
4 changes: 1 addition & 3 deletions src/mscorlib/shared/System/Buffers/ArrayPool.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ public abstract class ArrayPool<T>
/// optimized for very fast access speeds, at the expense of more memory consumption.
/// The shared pool instance is created lazily on first access.
/// </remarks>
public static ArrayPool<T> Shared { get; } =
typeof(T) == typeof(byte) || typeof(T) == typeof(char) ? new TlsOverPerCoreLockedStacksArrayPool<T>() :
Create();
public static ArrayPool<T> Shared { get; } = new TlsOverPerCoreLockedStacksArrayPool<T>();

/// <summary>
/// Creates a new <see cref="ArrayPool{T}"/> instance using default configuration options.
Expand Down
12 changes: 12 additions & 0 deletions src/mscorlib/shared/System/Buffers/ArrayPoolEventSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,17 @@ internal unsafe void BufferAllocated(int bufferId, int bufferSize, int poolId, i
/// </summary>
[Event(3, Level = EventLevel.Verbose)]
internal void BufferReturned(int bufferId, int bufferSize, int poolId) => WriteEvent(3, bufferId, bufferSize, poolId);

/// <summary>
/// Event raised when we free a buffer due to inactivity or memory pressure.
/// </summary>
[Event(4, Level = EventLevel.Informational)]
internal void BufferTrimmed(int bufferId, int bufferSize, int poolId) => WriteEvent(4, bufferId, bufferSize, poolId);

/// <summary>
/// Event raised when we check to trim buffers.
/// </summary>
[Event(5, Level = EventLevel.Informational)]
internal void BufferTrimPoll(int milliseconds, int pressure) => WriteEvent(5, milliseconds, pressure);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Win32;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Threading;

using Internal.Runtime.Augments;
using Internal.Runtime.CompilerServices;

namespace System.Buffers
{
Expand All @@ -24,7 +25,6 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool
{
// TODO: #7747: "Investigate optimizing ArrayPool heuristics"
// - Explore caching in TLS more than one array per size per thread, and moving stale buffers to the global queue.
// - Explore dumping stale buffers from the global queue, similar to PinnableBufferCache (maybe merging them).
// - Explore changing the size of each per-core bucket, potentially dynamically or based on other factors like array size.
// - Explore changing number of buckets and what sizes of arrays are cached.
// - Investigate whether false sharing is causing any issues, in particular on LockedStack's count and the contents of its array.
Expand All @@ -48,6 +48,15 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool
[ThreadStatic]
private static T[][] t_tlsBuckets;

private int _callbackCreated;

private readonly static bool s_TrimBuffers = GetTrimBuffers();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: s_trimBuffers

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixing


/// <summary>
/// Used to keep track of all thread local buckets for trimming if needed
/// </summary>
private static readonly ConditionalWeakTable<T[][], object> s_AllTlsBuckets = s_TrimBuffers ? new ConditionalWeakTable<T[][], object>() : null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: s_allTlsBuckets

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixing


/// <summary>Initialize the pool.</summary>
public TlsOverPerCoreLockedStacksArrayPool()
{
Expand Down Expand Up @@ -182,15 +191,24 @@ public override void Return(T[] array, bool clearArray = false)
{
t_tlsBuckets = tlsBuckets = new T[NumBuckets][];
tlsBuckets[bucketIndex] = array;
if (s_TrimBuffers)
{
s_AllTlsBuckets.Add(tlsBuckets, null);
if (Interlocked.Exchange(ref _callbackCreated, 1) != 1)
{
Gen2GcCallback.Register(Gen2GcCallbackFunc, this);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I may have already asked this, but we're now going to end up with one of these Gen2 callback object things per T. We decided that's ok?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetGC2Callback is pretty efficient (it holds just the callback and (weak) pointer to 'this'. The main 'overhead' is that the object is finalizable. You could imagine fixing that by sharing the finalizable object, but that is something that arguably should be done in the implementation of Gen2GcCallback.

It think it is OK to wait in see (if it is a problem we can fix it).

Also we may ditch Gen2GCCallback (since we really want to do this as we APPROACH a gen2 GC not after it), which is another reason not to invest alot of effort in it now.

}
}
}
else
{
T[] prev = tlsBuckets[bucketIndex];
tlsBuckets[bucketIndex] = array;

if (prev != null)
{
PerCoreLockedStacks bucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex);
bucket.TryPush(prev);
PerCoreLockedStacks stackBucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex);
stackBucket.TryPush(prev);
}
}
}
Expand All @@ -203,6 +221,92 @@ public override void Return(T[] array, bool clearArray = false)
}
}

public bool Trim()
{
int milliseconds = Environment.TickCount;
MemoryPressure pressure = GetMemoryPressure();

ArrayPoolEventSource log = ArrayPoolEventSource.Log;
if (log.IsEnabled())
log.BufferTrimPoll(milliseconds, (int)pressure);

foreach (PerCoreLockedStacks bucket in _buckets)
{
bucket?.Trim((uint)milliseconds, Id, pressure, _bucketArraySizes);
}

if (pressure == MemoryPressure.High)
{
// Under high pressure, release all thread locals
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a little counter-intuitive to me. The array stored in the TLS slot is going to be the most recently used one; presumably we're more interested in keeping that one around than ones in the per-core stacks. But maybe this is the best we can do right now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best for now. We didn't want to risk a perf hit from GetTicks or code complexity at this point.

foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets)
{
T[][] buckets = tlsBuckets.Key;
for (int i = 0; i < NumBuckets; i++)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why NumBuckets rather than buckets.Length?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using buckets.Length here would be faster (the bounds checks would be eliminated)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing

{
T[] buffer = buckets[i];
buckets[i] = null;

if (log.IsEnabled() && buffer != null)
{
log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id);
Copy link
Member

@stephentoub stephentoub Apr 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be wrong: it's possible a thread grabbed the buffer at the same time we did here, in which case we'll report having trimmed a buffer even though we didn't. I assume we're ok with that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, unless you can think of a way to do this without incurring a hit. I'll clarify in comments.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, interlocked here isn't a bad idea since we're in cleanup.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, interlocked here isn't a bad idea since we're in cleanup.

That might reduce the chances, but it won't remove them, unless you also used an interlocked on the rent path, which we don't want to do.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think? Worth the interlocked for a slimmer chance of false positive?

Copy link
Member

@stephentoub stephentoub Apr 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not, but if you wanted to, I'd suggest changing it to something like:

if (log.IsEnabled())
{
    T[] buffer = Interlocked.Exchange(ref buckets[i], null);
    if (buffer != null)
    {
        log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id);
    }
}
else
{
    buckets[i] = null;
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or, actually, better yet:

if (log.IsEnabled())
{
    foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets)
    {
        T[][] buckets = tlsBuckets.Key;
        for (int i = 0; i < buckets.Length; i++)
        {
            T[] bucket = Interlocked.Exchange(ref buckets[i], null);
            if (bucket != null)
            {
                log.BufferTrimmed(buffer.GetHashCode(), buffer.Length, Id);
            }
        }
    }
}
else
{
    foreach (KeyValuePair<T[][], object> tlsBuckets in s_AllTlsBuckets)
    {
         T[][] buckets = tlsBuckets.Key;
         Array.Clear(buckets, 0, buckets.Length);
    }
}

}
}
}
}

return true;
}

/// <summary>
/// This is the static function that is called from the gen2 GC callback.
/// The input object is the instance we want the callback on.
/// </summary>
/// <remarks>
/// The reason that we make this function static and take the instance as a parameter is that
/// we would otherwise root the instance to the Gen2GcCallback object, leaking the instance even when
/// the application no longer needs it.
/// </remarks>
private static bool Gen2GcCallbackFunc(object target)
{
return ((TlsOverPerCoreLockedStacksArrayPool<T>)(target)).Trim();
}

private enum MemoryPressure
{
Low,
Medium,
High
}

private static MemoryPressure GetMemoryPressure()
{
const double HighPressureThreshold = .90; // Percent of GC memory pressure threshold we consider "high"
const double MediumPressureThreshold = .70; // Percent of GC memory pressure threshold we consider "medium"

GC.GetMemoryInfo(out uint threshold, out _, out uint lastLoad, out _, out _);
if (lastLoad >= threshold * HighPressureThreshold)
{
return MemoryPressure.High;
}
else if (lastLoad >= threshold * MediumPressureThreshold)
{
return MemoryPressure.Medium;
}
return MemoryPressure.Low;
}

private static bool GetTrimBuffers()
{
// Environment uses ArrayPool, so we have to hit the API directly.
#if !CORECLR
// P/Invokes are different for CoreCLR/RT- for RT we'll not allow
// enabling/disabling for now.
return true;
#else
return CLRConfig.GetBoolValueWithFallbacks("System.Buffers.TrimBuffers", "DOTNET_SYSTEM_BUFFERS_TRIMBUFFERS", defaultValue: true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we use a name that's specific to ArrayPool, or even to ArrayPool.Shared, rather than for the general System.Buffers namespace?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

#endif
}

/// <summary>
/// Stores a set of stacks of arrays, with one stack per core.
/// </summary>
Expand Down Expand Up @@ -254,13 +358,24 @@ public T[] TryPop()
}
return null;
}

public bool Trim(uint tickCount, int id, MemoryPressure pressure, int[] bucketSizes)
{
LockedStack[] stacks = _perCoreStacks;
for (int i = 0; i < stacks.Length; i++)
{
stacks[i].Trim(tickCount, id, pressure, bucketSizes[i]);
}
return true;
}
}

/// <summary>Provides a simple stack of arrays, protected by a lock.</summary>
private sealed class LockedStack
{
private readonly T[][] _arrays = new T[MaxBuffersPerArraySizePerCore][];
private int _count;
private uint _firstStackItemMS;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryPush(T[] array)
Expand All @@ -269,6 +384,12 @@ public bool TryPush(T[] array)
Monitor.Enter(this);
if (_count < MaxBuffersPerArraySizePerCore)
{
if (s_TrimBuffers && _count == 0)
{
// Stash the time the bottom of the stack was filled
_firstStackItemMS = (uint)Environment.TickCount;
}

_arrays[_count++] = array;
enqueued = true;
}
Expand All @@ -289,6 +410,76 @@ public T[] TryPop()
Monitor.Exit(this);
return arr;
}

public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize)
{
const uint StackTrimAfterMS = 60 * 1000; // Trim after 60 seconds for low/moderate pressure
const uint StackHighTrimAfterMS = 10 * 1000; // Trim after 10 seconds for high pressure
const uint StackRefreshMS = StackTrimAfterMS / 4; // Time bump after trimming (1/4 trim time)
const int StackLowTrimCount = 1; // Trim one item when pressure is low
const int StackMediumTrimCount = 2; // Trim two items when pressure is moderate
const int StackHighTrimCount = MaxBuffersPerArraySizePerCore; // Trim all items when pressure is high
const int StackLargeBucket = 16384; // If the bucket is larger than this we'll trim an extra when under high pressure
const int StackModerateTypeSize = 16; // If T is larger than this we'll trim an extra when under high pressure
const int StackLargeTypeSize = 32; // If T is larger than this we'll trim an extra (additional) when under high pressure

if (_count == 0)
return;

lock (this)
{
uint trimTicks = pressure == MemoryPressure.High ? StackHighTrimAfterMS : StackTrimAfterMS;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: this can be moved to before the lock

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving

if (_count > 0 && _firstStackItemMS > tickCount || (tickCount - _firstStackItemMS) > trimTicks)
{
// We've wrapped the tick count or elapsed enough time since the
// first item went into the stack. Drop the top item so it can
// be collected and make the stack look a little newer.

ArrayPoolEventSource log = ArrayPoolEventSource.Log;
int trimCount = StackLowTrimCount;
switch (pressure)
{
case MemoryPressure.High:
trimCount = StackHighTrimCount;

// When pressure is high, aggressively trim larger arrays.
if (bucketSize > StackLargeBucket)
{
trimCount++;
}
if (Unsafe.SizeOf<T>() > StackModerateTypeSize)
{
trimCount++;
}
if (Unsafe.SizeOf<T>() > StackLargeTypeSize)
{
trimCount++;
}
break;
case MemoryPressure.Medium:
trimCount = StackMediumTrimCount;
break;
}

while (_count > 0 && trimCount-- > 0)
{
T[] array = _arrays[--_count];
_arrays[_count] = null;

if (log.IsEnabled())
{
log.BufferTrimmed(array.GetHashCode(), array.Length, id);
}
}

if (_count > 0 && _firstStackItemMS < uint.MaxValue - StackRefreshMS)
{
// Give the remaining items a bit more time
_firstStackItemMS += StackRefreshMS;
}
}
}
}
}
}
}
Loading