From 2b574fb50b5357d55db833efd05584530083a797 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Tue, 1 Feb 2022 23:15:55 +0300
Subject: [PATCH 1/6] Use better default for LLC size on Linux-arm64 and
 Windows-arm64

---
 src/coreclr/gc/unix/gcenv.unix.cpp       | 51 +++++++++++++-----------
 src/coreclr/gc/windows/gcenv.windows.cpp | 15 +++++++
 src/coreclr/pal/src/misc/sysinfo.cpp     | 51 +++++++++++++-----------
 3 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index ceadbf1af995b..d55041917ca38 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -915,30 +915,33 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
 #endif
 
 #if defined(HOST_ARM64) && !defined(TARGET_OSX)
-    if (cacheSize == 0)
-    {
-        // It is currently expected to be missing cache size info
-        //
-        // _SC_LEVEL*_*CACHE_SIZE is not yet present.  Work is in progress to enable this for arm64
-        //
-        // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems.
-        // Arm64 patch is in Linux kernel tip.
-        //
-        // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
-        // but without an exhaustive list of ARM64 processors any decode of midr_el1
-        // Would likely be incomplete
-
-        // Published information on ARM64 architectures is limited.
-        // If we use recent high core count chips as a guide for state of the art, we find
-        // total L3 cache to be 1-2MB/core.  As always, there are exceptions.
-
-        // Estimate cache size based on CPU count
-        // Assume lower core count are lighter weight parts which are likely to have smaller caches
-        // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
-        DWORD logicalCPUs = g_totalCpuCount;
-
-        cacheSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024;
-    }
+    // It is currently expected to be missing cache size info
+    //
+    // _SC_LEVEL*_*CACHE_SIZE is not yet present.  Work is in progress to enable this for arm64
+    //
+    // /sys/devices/system/cpu/cpu*/cache/index*/ was historically also missing on most systems
+    // (the Arm64 patch was only in Linux kernel tip); see the 2022 note below for the current state.
+    //
+    // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
+    // but without an exhaustive list of ARM64 processors any decode of midr_el1
+    // Would likely be incomplete
+
+    // Published information on ARM64 architectures is limited.
+    // If we use recent high core count chips as a guide for state of the art, we find
+    // total L3 cache to be 1-2MB/core.  As always, there are exceptions.
+
+    // Estimate cache size based on CPU count
+    // Assume lower core count are lighter weight parts which are likely to have smaller caches
+    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+
+    // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
+    // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
+    // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb)
+    // More details: https://github.com/dotnet/runtime/issues/60166
+    DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
+
+    size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
+    cacheSize = std::max(predictedSize, cacheSize);
 #endif
 
 #if HAVE_SYSCTLBYNAME
diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp
index 10d3128de2071..0268c4142a4fb 100644
--- a/src/coreclr/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/gc/windows/gcenv.windows.cpp
@@ -458,6 +458,21 @@ size_t GetLogicalProcessorCacheSizeFromOS()
     if(pslpi)
         delete[] pslpi;  // release the memory allocated for the SLPI array.
 
+
+#ifdef TARGET_ARM64
+    // GetLogicalProcessorInformation doesn't report L3 cache size on our win-arm64 environment (current cache_size most
+    // likely represent L2 instead). We're going to use a processor-count based heuristic to predict its size and pick
+    // whatever is bigger. The same heuristic is used for Linux-arm64.
+    // More info: https://github.com/dotnet/runtime/issues/60166
+    uint32_t logicalCPUs = GetTotalProcessorCount();
+
+    // Estimate cache size based on CPU count
+    // Assume lower core count are lighter weight parts which are likely to have smaller caches
+    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+    size_t predictedSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024;
+    cache_size = std::max(predictedSize, cache_size);
+#endif
+
     return cache_size;
 }
 
diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp
index 19f9c86fd451c..9bcb13b6c5f1c 100644
--- a/src/coreclr/pal/src/misc/sysinfo.cpp
+++ b/src/coreclr/pal/src/misc/sysinfo.cpp
@@ -582,30 +582,33 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
 #endif
 
 #if defined(HOST_ARM64) && !defined(TARGET_OSX)
-    if (cacheSize == 0)
-    {
-        // It is currently expected to be missing cache size info
-        //
-        // _SC_LEVEL*_*CACHE_SIZE is not yet present.  Work is in progress to enable this for arm64
-        //
-        // /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems.
-        // Arm64 patch is in Linux kernel tip.
-        //
-        // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
-        // but without an exhaustive list of ARM64 processors any decode of midr_el1
-        // Would likely be incomplete
-
-        // Published information on ARM64 architectures is limited.
-        // If we use recent high core count chips as a guide for state of the art, we find
-        // total L3 cache to be 1-2MB/core.  As always, there are exceptions.
-
-        // Estimate cache size based on CPU count
-        // Assume lower core count are lighter weight parts which are likely to have smaller caches
-        // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
-        DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
-
-        cacheSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
-    }
+    // It is currently expected to be missing cache size info
+    //
+    // _SC_LEVEL*_*CACHE_SIZE is not yet present.  Work is in progress to enable this for arm64
+    //
+    // /sys/devices/system/cpu/cpu*/cache/index*/ was historically also missing on most systems
+    // (the Arm64 patch was only in Linux kernel tip); see the 2022 note below for the current state.
+    //
+    // midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
+    // but without an exhaustive list of ARM64 processors any decode of midr_el1
+    // Would likely be incomplete
+
+    // Published information on ARM64 architectures is limited.
+    // If we use recent high core count chips as a guide for state of the art, we find
+    // total L3 cache to be 1-2MB/core.  As always, there are exceptions.
+
+    // Estimate cache size based on CPU count
+    // Assume lower core count are lighter weight parts which are likely to have smaller caches
+    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+
+    // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
+    // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
+    // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb)
+    // More details: https://github.com/dotnet/runtime/issues/60166
+    DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
+
+    size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
+    cacheSize = std::max(predictedSize, cacheSize);
 #endif
 
 #if HAVE_SYSCTLBYNAME

From 8b0eececcd5197617fd978e6c311f9419132988b Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Wed, 2 Feb 2022 00:35:47 +0300
Subject: [PATCH 2/6] Use g_totalCpuCount in gcenv.unix.cpp

---
 src/coreclr/gc/unix/gcenv.unix.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index d55041917ca38..34d31c3305c49 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -938,7 +938,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
     // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb)
     // More details: https://github.com/dotnet/runtime/issues/60166
-    DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
+    DWORD logicalCPUs = g_totalCpuCount;
 
     size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
     cacheSize = std::max(predictedSize, cacheSize);

From 7d54099ac5a40ea734a3b7f877902e61d3998576 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Wed, 2 Feb 2022 04:00:22 +0300
Subject: [PATCH 3/6] Address feedback: predict shared L3 size, capped at 4MB

---
 src/coreclr/gc/unix/gcenv.unix.cpp       | 4 ++--
 src/coreclr/gc/windows/gcenv.windows.cpp | 4 ++--
 src/coreclr/pal/src/misc/sysinfo.cpp     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index 34d31c3305c49..5fd41ee2d415d 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -938,9 +938,9 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
     // to stuck with L2 (e.g. 256Kb on our test machine whether the real L3 is 32Mb)
     // More details: https://github.com/dotnet/runtime/issues/60166
-    DWORD logicalCPUs = g_totalCpuCount;
+    DWORD logicalCPUs = GCToOSInterface::GetTotalProcessorCount();
 
-    size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
+    size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs*128))*1024;
     cacheSize = std::max(predictedSize, cacheSize);
 #endif
 
diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp
index 0268c4142a4fb..49bf4b7d4e699 100644
--- a/src/coreclr/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/gc/windows/gcenv.windows.cpp
@@ -464,12 +464,12 @@ size_t GetLogicalProcessorCacheSizeFromOS()
     // likely represent L2 instead). We're going to use a processor-count based heuristic to predict its size and pick
     // whatever is bigger. The same heuristic is used for Linux-arm64.
     // More info: https://github.com/dotnet/runtime/issues/60166
-    uint32_t logicalCPUs = GetTotalProcessorCount();
+    uint32_t logicalCPUs = GCToOSInterface::GetTotalProcessorCount();
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
     // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
-    size_t predictedSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024;
+    size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs * 128)) * 1024;
     cache_size = std::max(predictedSize, cache_size);
 #endif
 
diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp
index 9bcb13b6c5f1c..7ba80a4e6fe2a 100644
--- a/src/coreclr/pal/src/misc/sysinfo.cpp
+++ b/src/coreclr/pal/src/misc/sysinfo.cpp
@@ -607,7 +607,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
     // More details: https://github.com/dotnet/runtime/issues/60166
     DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
 
-    size_t predictedSize = logicalCPUs*std::min(1536, std::max(256, (int)logicalCPUs*128))*1024;
+    size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs*128))*1024;
     cacheSize = std::max(predictedSize, cacheSize);
 #endif
 

From df1976b706bd960ca89a0d3659b90cab51d9d53c Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Wed, 2 Feb 2022 04:32:16 +0300
Subject: [PATCH 4/6] Fix build on win-arm64

---
 src/coreclr/gc/windows/gcenv.windows.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp
index 49bf4b7d4e699..acdefaeed9dfa 100644
--- a/src/coreclr/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/gc/windows/gcenv.windows.cpp
@@ -469,8 +469,8 @@ size_t GetLogicalProcessorCacheSizeFromOS()
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
     // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
-    size_t predictedSize = std::min(4096, std::max(256, (int)logicalCPUs * 128)) * 1024;
-    cache_size = std::max(predictedSize, cache_size);
+    size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024;
+    cache_size = max(predictedSize, cache_size);
 #endif
 
     return cache_size;

From 21263eba8792428581c6f8ecb387486e84c30b9a Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Thu, 3 Feb 2022 18:49:29 +0300
Subject: [PATCH 5/6] Fix comments: update heuristic bounds to 256Kb-4Mb

---
 src/coreclr/gc/unix/gcenv.unix.cpp       | 2 +-
 src/coreclr/gc/windows/gcenv.windows.cpp | 2 +-
 src/coreclr/pal/src/misc/sysinfo.cpp     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index 5fd41ee2d415d..689864179093f 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -932,7 +932,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
 
     // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp
index acdefaeed9dfa..6fccf0fddbca0 100644
--- a/src/coreclr/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/gc/windows/gcenv.windows.cpp
@@ -468,7 +468,7 @@ size_t GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
     size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024;
     cache_size = max(predictedSize, cache_size);
 #endif
diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp
index 7ba80a4e6fe2a..6b96b5c8b7847 100644
--- a/src/coreclr/pal/src/misc/sysinfo.cpp
+++ b/src/coreclr/pal/src/misc/sysinfo.cpp
@@ -599,7 +599,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
 
     // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want

From 37b386d952386204e62d102408aa5a1a3adf66d1 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Mon, 7 Feb 2022 22:17:01 +0300
Subject: [PATCH 6/6] Fix comments: L3 is shared, scaling spans 2 to 32 CPUs

---
 src/coreclr/gc/unix/gcenv.unix.cpp       | 2 +-
 src/coreclr/gc/windows/gcenv.windows.cpp | 2 +-
 src/coreclr/pal/src/misc/sysinfo.cpp     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index 689864179093f..4bda2dbb2020b 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -932,7 +932,7 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs
 
     // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want
diff --git a/src/coreclr/gc/windows/gcenv.windows.cpp b/src/coreclr/gc/windows/gcenv.windows.cpp
index 6fccf0fddbca0..5c80881b47d65 100644
--- a/src/coreclr/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/gc/windows/gcenv.windows.cpp
@@ -468,7 +468,7 @@ size_t GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs
     size_t predictedSize = min(4096, max(256, logicalCPUs * 128)) * 1024;
     cache_size = max(predictedSize, cache_size);
 #endif
diff --git a/src/coreclr/pal/src/misc/sysinfo.cpp b/src/coreclr/pal/src/misc/sysinfo.cpp
index 6b96b5c8b7847..4c131339e9fe1 100644
--- a/src/coreclr/pal/src/misc/sysinfo.cpp
+++ b/src/coreclr/pal/src/misc/sysinfo.cpp
@@ -599,7 +599,7 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
 
     // Estimate cache size based on CPU count
     // Assume lower core count are lighter weight parts which are likely to have smaller caches
-    // Assume L3$/CPU grows linearly from 256Kb to 4Mb/CPU as logicalCPUs grows from 2 to 12 CPUs
+    // Assume shared L3 grows linearly from 256Kb to 4Mb as logicalCPUs grows from 2 to 32 CPUs
 
     // As of 2022, in most cases /sys/devices/system/cpu/cpu*/cache/index*/ does present, but only 
     // reports L2 cache size and says nothing about L3 even if it exists. In this case we don't want