
Improve scalability of monitored Counter / UpDownCounter #91566

Merged
merged 4 commits into dotnet:main on Nov 7, 2023

Conversation

stephentoub (Member)

Avoid locking around every Update. Instead, use interlocked operations, and for CounterAggregator, partition the value being updated into one per core rather than a single shared value. Without the partitioning, the compare-exchange loop can actually be measurably slower than locking under heavy contention. With the partitioning, since we're dealing with floating-point, the reported values can conceivably change slightly, because the order of operations changes (but with concurrency, that order was already non-deterministic).

Contributes to dotnet/aspnetcore#50412
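
For readers who want the shape of the change, here is a minimal sketch of the per-core partitioning technique, assuming 64-byte cache lines. `PartitionedCounter` is an illustrative name, not the actual CounterAggregator code, though the merged change does use a similar `PaddedDouble` slot type (see the review thread below):

```csharp
using System;
using System.Runtime.InteropServices;
using System.Threading;

// Minimal sketch of the per-core partitioning idea; PartitionedCounter is an
// illustrative name, not the actual CounterAggregator implementation.
internal sealed class PartitionedCounter
{
    // Pad each slot to an (assumed) 64-byte cache line to avoid false sharing
    // between cores updating adjacent slots.
    [StructLayout(LayoutKind.Explicit, Size = 64)]
    private struct PaddedDouble
    {
        [FieldOffset(0)]
        public double Value;
    }

    private readonly PaddedDouble[] _deltas = new PaddedDouble[Environment.ProcessorCount];

    public void Add(double value)
    {
        // Pick the slot for the core this thread is currently running on;
        // contention is now limited to threads sharing that core's slot.
        ref double slot = ref _deltas[Thread.GetCurrentProcessorId() % _deltas.Length].Value;

        // Lock-free update via a compare-exchange loop.
        double current;
        do
        {
            current = slot;
        }
        while (Interlocked.CompareExchange(ref slot, current + value, current) != current);
    }

    // Collection-time read: sum the per-core partials. With floating point,
    // the summation order can differ slightly from a single shared value.
    public double Sum()
    {
        double sum = 0;
        for (int i = 0; i < _deltas.Length; i++)
        {
            sum += Volatile.Read(ref _deltas[i].Value);
        }
        return sum;
    }
}
```

Note that Thread.GetCurrentProcessorId() is a hint, not a guarantee: a thread can migrate between the index lookup and the compare-exchange, which is why the update still needs an interlocked operation rather than a plain add.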

| Method                 | Toolchain          | Mean     | Ratio |
|------------------------|--------------------|---------:|------:|
| UpDownCounter_Serial   | \main\corerun.exe  | 36.65 ms |  1.00 |
| UpDownCounter_Serial   | \pr\corerun.exe    | 24.77 ms |  0.68 |
| UpDownCounter_Parallel | \main\corerun.exe  | 55.55 ms |  1.00 |
| UpDownCounter_Parallel | \pr\corerun.exe    | 14.02 ms |  0.25 |
| Counter_Serial         | \main\corerun.exe  | 36.27 ms |  1.00 |
| Counter_Serial         | \pr\corerun.exe    | 25.15 ms |  0.69 |
| Counter_Parallel       | \main\corerun.exe  | 58.72 ms |  1.00 |
| Counter_Parallel       | \pr\corerun.exe    | 13.97 ms |  0.22 |
```csharp
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using System.Diagnostics.Tracing;

BenchmarkSwitcher.FromAssembly(typeof(Tests).Assembly).Run(args);

[HideColumns("Error", "StdDev", "Median", "RatioSD", "Job")]
public class Tests
{
    const int Iters = 1_000_000;

    private Meter _meter;
    private UpDownCounter<int> _upDownCounter;
    private Counter<int> _counter;
    private MetricsEventListener _listener;

    [GlobalSetup]
    public void Setup()
    {
        _meter = new Meter("Example");
        _upDownCounter = _meter.CreateUpDownCounter<int>("upDownCounter");
        _counter = _meter.CreateCounter<int>("counter");
        _listener = new MetricsEventListener();
    }

    [GlobalCleanup]
    public void Cleanup()
    {
        _listener.Dispose();
        _meter.Dispose();
    }

    [Benchmark]
    public void UpDownCounter_Serial()
    {
        for (int i = 0; i < Iters; i++)
        {
            _upDownCounter.Add(1);
            _upDownCounter.Add(-1);
        }
    }

    [Benchmark]
    public void UpDownCounter_Parallel()
    {
        Parallel.For(0, Iters, i =>
        {
            _upDownCounter.Add(1);
            _upDownCounter.Add(-1);
        });
    }

    [Benchmark]
    public void Counter_Serial()
    {
        for (int i = 0; i < Iters; i++)
        {
            _counter.Add(1);
            _counter.Add(1);
        }
    }

    [Benchmark]
    public void Counter_Parallel()
    {
        Parallel.For(0, Iters, i =>
        {
            _counter.Add(1);
            _counter.Add(1);
        });
    }

    private sealed class MetricsEventListener : EventListener
    {
        protected override void OnEventSourceCreated(EventSource eventSource)
        {
            if (eventSource.Name == "System.Diagnostics.Metrics")
            {
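                // Subscribe only to the two instruments under test; the filter value
                // uses the "MeterName\InstrumentName" format, ';'-separated.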
                EnableEvents(eventSource, EventLevel.LogAlways, EventKeywords.All, new Dictionary<string, string>() { { "Metrics", "Example\\upDownCounter;Example\\counter" } });
            }
        }
    }
}
```

@ghost commented Sep 5, 2023

Tagging subscribers to this area: @tarekgh, @tommcdon, @pjanotti
See info in area-owners.md if you want to be subscribed.

Author: stephentoub
Assignees: stephentoub
Labels: area-System.Diagnostics.Tracing
Milestone: -

@tarekgh (Member) left a comment

LGTM. It will be good to have @noahfalk take a look too.

@noahfalk (Member) commented Sep 6, 2023

Sorry, I was OOF last week and am getting caught up.

I like this as something people could opt into, but I worry about the memory usage if we make this the default implementation. In the case of the 80-core machines mentioned in the issue, we are trading an 8-byte double for a ref to an array of 80*cache_line_size bytes, 80*64 = ~5KB per time series, I think? MetricsEventSource has a default tracking limit of 1000 time series. A time series = 1 combination of counter name + all dimension values, so it's not that hard to hit the time series limit. For people running this in production it could be a +5MB VM cost. In some cases that's probably no big deal, but in a container with tight limits that might be a significant regression or an OOM.
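
To make the arithmetic concrete, a quick back-of-the-envelope sketch of the worst case described above:

```csharp
using System;

// Back-of-the-envelope worst case, assuming the 80-core machine from the
// issue, 64-byte cache lines, and the default limit of 1000 time series.
int cores = 80;
int cacheLineBytes = 64;
int timeSeries = 1000;

long perTimeSeries = (long)cores * cacheLineBytes;   // 5,120 B (~5 KB)
long total = perTimeSeries * timeSeries;             // 5,120,000 B (~5 MB)

Console.WriteLine($"{perTimeSeries:N0} B per time series; {total:N0} B total");
```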

In terms of fixes I think we could:

- Cap parallelism at 8 as default behavior.
- Make the cap configurable using a new filter argument on MetricsEventSource and pass it down MetricsEventSource -> AggregationManager -> AggregatorStore -> Aggregator.
- Rather than putting one value per cache line, use the entire cache line for different aggregated values belonging to the same core. The data structure gets more complicated, but we get much better memory-usage density with hopefully minimal increase in contention.

I'd like to find out if they are hitting similar contention issues with the lock on histograms. Histograms are a much bigger data structure, so defaulting them to one copy per core probably isn't viable unless it is opt-in or we change the data structure to make them grow more dynamically than they already do.

@EgorBo (Member) commented Sep 6, 2023

> 80*cache_line_size bytes, 80*64 = ~5KB per time series, I think? MetricsEventSource has a default tracking limit of 1000 time series. A time series = 1 combination of counter name + all dimension values, so it's not that hard to hit the time series limit. For people running this in production it could be a +5MB VM cost.

> Cap parallelism at 8 as default behavior.

I wonder if it's better to just remove the padding than to cap it at 8, since the main beneficiaries of this optimization are many-core systems. Then it's 80*8 = 640 B per time series, *1000 = ~640 KB for the worst case. False sharing is probably the least of the problems here.

@noahfalk (Member) commented Sep 6, 2023

> I wonder if it's better to just remove the padding

If that delivers a better perf result, I am fine with it, though I still think we need to cap the total memory used for these counters at around 512KB (a somewhat arbitrarily selected value). If we had no padding, presumably that raises the cap to 64 doubles per time series (512KB / 1000 time series = 512 bytes = 64 unpadded 8-byte doubles).

@EgorBo (Member) commented Sep 6, 2023

A microbenchmark to play with (on a system with many cores): https://gist.github.com/EgorBo/fb944b46bf9b1ffd35b01bb3b5726e7f

@JamesNK (Member) commented Sep 7, 2023

What about an adaptive solution? If the number of dimensions increases beyond a limit, then switch back to the old behavior to save memory?

And if a counter has a lot of dimensions, then there is probably less contention on the lock, since updates are spread out across the dimensions. (Is there a lock per dimension? If there isn't, then ignore this.)

@noahfalk (Member) commented Sep 7, 2023

> What about an adaptive solution? If the number of dimensions increases beyond a limit, then switch back to the old behavior to save memory?

I worry that the assignment of which time series are optimized for memory and which are optimized for latency will appear largely random, based on the creation order of the different time series. Under high contention the perf cliff seems severe enough that I think I'd rather get an error asking the dev running the perf scenario to raise the memory limits vs. having unexplained significant changes in perf. From what I can see in the perf numbers, I think the contention impacts on @EgorBo's ARM multi-core machine are much larger than what I assume @stephentoub measured on x64.

> is there a lock per dimension?

Today there is one lock per time series. (A time series being a unique combination of instrument + all the dimension values, which I think is what you meant.)

@stephentoub (Member, Author) commented

@noahfalk, what would you like me to do with this? Should I do "cap parallelism at 8"?

@noahfalk (Member) commented Nov 7, 2023

Yeah, I think capping at 8 is a fine place to start. I'd still be interested in making further improvements, but it's likely to get more involved. I assume you'd prefer to bank your incremental improvements and move on vs. working on more complicated options?

@stephentoub (Member, Author) commented

> I assume you'd prefer to bank your incremental improvements and move on vs. working on more complicated options?

Yes, as long as that's acceptable to you, that's my preference.

@noahfalk (Member) left a comment

LGTM, thanks @stephentoub!

@stephentoub merged commit 173eb1f into dotnet:main on Nov 7, 2023 (109 checks passed).
@stephentoub deleted the counterscale branch on November 7, 2023 at 22:52.
```csharp
/// The array is limited to a semi-arbitrary limit of 8 in order to avoid excessive memory
/// consumption when many counters are being used.
/// </remarks>
private readonly PaddedDouble[] _deltas = new PaddedDouble[Math.Min(Environment.ProcessorCount, 8)];
```

@stephentoub shouldn't this be a max if the comment is accurate?

@stephentoub (Member, Author) commented Nov 8, 2023

I believe both the comment and code are correct. We want to increase the number up to the processor count but not go above 8, so we want the processor count or 8, whichever is lower.
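
As a quick illustration (the 4-core value is hypothetical; the 80-core machine is the one mentioned in the issue):

```csharp
using System;

// Math.Min picks whichever is lower: the processor count or the cap of 8.
Console.WriteLine(Math.Min(4, 8));  // 4 -> one slot per core on a small machine
Console.WriteLine(Math.Min(80, 8)); // 8 -> capped on a large machine
```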

Contributor

@bencyoung-Fignum min is ensuring the limit; here it means 8 or less.


Doh, you are right, apologies!
