From 2b574fb50b5357d55db833efd05584530083a797 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 1 Feb 2022 23:15:55 +0300 Subject: [PATCH 1/6] Use better default for LLC size on Linux-arm64 and Windows-arm64 --- src/coreclr/gc/unix/gcenv.unix.cpp | 51 +++++++++++++----------- src/coreclr/gc/windows/gcenv.windows.cpp | 15 +++++++ src/coreclr/pal/src/misc/sysinfo.cpp | 51 +++++++++++++----------- 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index ceadbf1af995b..d55041917ca38 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -915,30 +915,33 @@ static size_t GetLogicalProcessorCacheSizeFromOS() #endif #if defined(HOST_ARM64) && !defined(TARGET_OSX) - if (cacheSize == 0) - { - // It is currently expected to be missing cache size info - // - // _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64 - // - // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems. - // Arm64 patch is in Linux kernel tip. - // - // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", - // but without an exhaustive list of ARM64 processors any decode of midr_el1 - // Would likely be incomplete - - // Published information on ARM64 architectures is limited. - // If we use recent high core count chips as a guide for state of the art, we find - // total L3 cache to be 1-2MB/core. As always, there are exceptions. - - // Estimate cache size based on CPU count - // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs - DWORD logicalCPUs = g_totalCpuCount; - - cacheSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024; - } + // It is currently expected to be missing cache size info + // + // _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64 + // + // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems. + // Arm64 patch is in Linux kernel tip. + // + // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", + // but without an exhaustive list of ARM64 processors any decode of midr_el1 + // Would likely be incomplete + + // Published information on ARM64 architectures is limited. + // If we use recent high core count chips as a guide for state of the art, we find + // total L3 cache to be 1-2MB/core. As always, there are exceptions. + + // Estimate cache size based on CPU count + // Assume lower core count are lighter weight parts which are likely to have smaller caches + // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + + // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only + // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want + // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb) + // More details: https://github.com/dotnet/runtime/issues/60166 + DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS(); + + size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; + cacheSize = std::max(predictedSize, cacheSize); #endif #if HAVE_SYSCTLBYNAME diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp index 10d3128de2071..0268c4142a4fb 100644 --- a/src/coreclr/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/gc/windows/gcenv.windows.cpp @@ -458,6 +458,21 @@ size_t GetLogicalProcessorCacheSizeFromOS() if(pslpi) delete[] pslpi; // release the memory allocated for the SLPI array. + +#ifdef TARGET_ARM64 + // GetLogicalProcessorInformation doesn't report L3 cache size on our win-arm64 environment (current cache_size most + // likely represent L2 instead). We're going to use a processor-count based heuristic to predict its size and pick + // whatever is bigger. The same heuristic is used for Linux-arm64. + // More info: https://github.com/dotnet/runtime/issues/60166 + uint32_t logicalCPUs = GetTotalProcessorCount(); + + // Estimate cache size based on CPU count + // Assume lower core count are lighter weight parts which are likely to have smaller caches + // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + size_t predictedSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024; + cache_size = std::max(predictedSize, cache_size); +#endif + return cache_size; } diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp index 19f9c86fd451c..9bcb13b6c5f1c 100644 --- a/src/coreclr/pal/src/misc/sysinfo.cpp +++ b/src/coreclr/pal/src/misc/sysinfo.cpp @@ -582,30 +582,33 @@ PAL_GetLogicalProcessorCacheSizeFromOS() #endif #if defined(HOST_ARM64) && !defined(TARGET_OSX) - if (cacheSize == 0) - { - // It is currently expected to be missing cache size info - // - // _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64 - // - // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems. - // Arm64 patch is in Linux kernel tip. - // - // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", - // but without an exhaustive list of ARM64 processors any decode of midr_el1 - // Would likely be incomplete - - // Published information on ARM64 architectures is limited. - // If we use recent high core count chips as a guide for state of the art, we find - // total L3 cache to be 1-2MB/core. As always, there are exceptions. - - // Estimate cache size based on CPU count - // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs - DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS(); - - cacheSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; - } + // It is currently expected to be missing cache size info + // + // _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64 + // + // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems. + // Arm64 patch is in Linux kernel tip. + // + // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1", + // but without an exhaustive list of ARM64 processors any decode of midr_el1 + // Would likely be incomplete + + // Published information on ARM64 architectures is limited. + // If we use recent high core count chips as a guide for state of the art, we find + // total L3 cache to be 1-2MB/core. As always, there are exceptions. + + // Estimate cache size based on CPU count + // Assume lower core count are lighter weight parts which are likely to have smaller caches + // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + + // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only + // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want + // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb) + // More details: https://github.com/dotnet/runtime/issues/60166 + DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS(); + + size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; + cacheSize = std::max(predictedSize, cacheSize); #endif #if HAVE_SYSCTLBYNAME From 8b0eececcd5197617fd978e6c311f9419132988b Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Wed, 2 Feb 2022 00:35:47 +0300 Subject: [PATCH 2/6] Update gcenv.unix.cpp --- src/coreclr/gc/unix/gcenv.unix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index d55041917ca38..34d31c3305c49 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -938,7 +938,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS() // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb) // More details: https://github.com/dotnet/runtime/issues/60166 - DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS(); + DWORD logicalCPUs = g_totalCpuCount; size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; cacheSize = std::max(predictedSize, cacheSize); From 7d54099ac5a40ea734a3b7f877902e61d3998576 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 2 Feb 2022 04:00:22 +0300 Subject: [PATCH 3/6] Address feedback --- src/coreclr/gc/unix/gcenv.unix.cpp | 4 ++-- src/coreclr/gc/windows/gcenv.windows.cpp | 4 ++-- src/coreclr/pal/src/misc/sysinfo.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index 34d31c3305c49..5fd41ee2d415d 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -938,9 +938,9 @@ static size_t GetLogicalProcessorCacheSizeFromOS() // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb) // More details: https://github.com/dotnet/runtime/issues/60166 - DWORD logicalCPUs = g_totalCpuCount; + DWORD logicalCPUs = GCToOSInterface::GetTotalProcessorCount(); - size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; + size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs*128))*1024; cacheSize = std::max(predictedSize, cacheSize); #endif diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp index 0268c4142a4fb..49bf4b7d4e699 100644 --- a/src/coreclr/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/gc/windows/gcenv.windows.cpp @@ -464,12 +464,12 @@ size_t GetLogicalProcessorCacheSizeFromOS() // likely represent L2 instead). We're going to use a processor-count based heuristic to predict its size and pick // whatever is bigger. The same heuristic is used for Linux-arm64. // More info: https://github.com/dotnet/runtime/issues/60166 - uint32_t logicalCPUs = GetTotalProcessorCount(); + uint32_t logicalCPUs = GCToOSInterface::GetTotalProcessorCount(); // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs - size_t predictedSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024; + size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs * 128)) * 1024; cache_size = std::max(predictedSize, cache_size); #endif diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp index 9bcb13b6c5f1c..7ba80a4e6fe2a 100644 --- a/src/coreclr/pal/src/misc/sysinfo.cpp +++ b/src/coreclr/pal/src/misc/sysinfo.cpp @@ -607,7 +607,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS() // More details: https://github.com/dotnet/runtime/issues/60166 DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS(); - size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024; + size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs*128))*1024; cacheSize = std::max(predictedSize, cacheSize); #endif From df1976b706bd960ca89a0d3659b90cab51d9d53c Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 2 Feb 2022 04:32:16 +0300 Subject: [PATCH 4/6] Fix build on win-arm64 --- src/coreclr/gc/windows/gcenv.windows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp index 49bf4b7d4e699..acdefaeed9dfa 100644 --- a/src/coreclr/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/gc/windows/gcenv.windows.cpp @@ -469,8 +469,8 @@ size_t GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs - size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs * 128)) * 1024; - cache_size = std::max(predictedSize, cache_size); + size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024; + cache_size = max(predictedSize, cache_size); #endif return cache_size; From 21263eba8792428581c6f8ecb387486e84c30b9a Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 3 Feb 2022 18:49:29 +0300 Subject: [PATCH 5/6] Fix comments --- src/coreclr/gc/unix/gcenv.unix.cpp | 2 +- src/coreclr/gc/windows/gcenv.windows.cpp | 2 +- src/coreclr/pal/src/misc/sysinfo.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index 5fd41ee2d415d..689864179093f 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -932,7 +932,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp index acdefaeed9dfa..6fccf0fddbca0 100644 --- a/src/coreclr/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/gc/windows/gcenv.windows.cpp @@ -468,7 +468,7 @@ size_t GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024; cache_size = max(predictedSize, cache_size); #endif diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp index 7ba80a4e6fe2a..6b96b5c8b7847 100644 --- a/src/coreclr/pal/src/misc/sysinfo.cpp +++ b/src/coreclr/pal/src/misc/sysinfo.cpp @@ -599,7 +599,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want From 37b386d952386204e62d102408aa5a1a3adf66d1 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 7 Feb 2022 22:17:01 +0300 Subject: [PATCH 6/6] Fix comments --- src/coreclr/gc/unix/gcenv.unix.cpp | 2 +- src/coreclr/gc/windows/gcenv.windows.cpp | 2 +- src/coreclr/pal/src/misc/sysinfo.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp index 689864179093f..4bda2dbb2020b 100644 --- a/src/coreclr/gc/unix/gcenv.unix.cpp +++ b/src/coreclr/gc/unix/gcenv.unix.cpp @@ -932,7 +932,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp index 6fccf0fddbca0..5c80881b47d65 100644 --- a/src/coreclr/gc/windows/gcenv.windows.cpp +++ b/src/coreclr/gc/windows/gcenv.windows.cpp @@ -468,7 +468,7 @@ size_t GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024; cache_size = max(predictedSize, cache_size); #endif diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp index 6b96b5c8b7847..4c131339e9fe1 100644 --- a/src/coreclr/pal/src/misc/sysinfo.cpp +++ b/src/coreclr/pal/src/misc/sysinfo.cpp @@ -599,7 +599,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS() // Estimate cache size based on CPU count // Assume lower core count are lighter weight parts which are likely to have smaller caches - // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs + // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want