Perform PhysicalMemoryLimit check for workstation GC, refactor GetLargestOnDieCacheSize into GetCacheSizePerLogicalCpu (#15975)

* refactor: combine GetLargestOnDieCacheSize and GetLogicalCpuCount in GetCacheSizePerLogicalCpu
* Perform PhysicalMemoryLimit check also for workstation GC
dotnet · Jan 29, 2018 · cb73944 · cb73944
1 parent 850a5be
commit cb73944
Show file tree

Hide file tree

Showing 12 changed files with 50 additions and 259 deletions.
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h
@@ -282,16 +282,13 @@ class GCToOSInterface
     // Processor topology
     //
 
-    // Get number of logical processors
-    static uint32_t GetLogicalCpuCount();
-
-    // Get size of the largest cache on the processor die
+    // Get size of the on die cache per logical processor
     // Parameters:
     //  trueSize - true to return true cache size, false to return scaled up size based on
     //             the processor architecture
     // Return:
     //  Size of the cache
-    static size_t GetLargestOnDieCacheSize(bool trueSize = true);
+    static size_t GetCacheSizePerLogicalCpu(bool trueSize = true);
 
     // Get number of processors assigned to the current process
     // Return:

diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp
@@ -15756,7 +15756,7 @@ void gc_heap::gc1()
                     size_t min_gc_size = dd_min_gc_size(dd);
                     // if min GC size larger than true on die cache, then don't bother
                     // limiting the desired size
-                    if ((min_gc_size <= GCToOSInterface::GetLargestOnDieCacheSize(TRUE) / GCToOSInterface::GetLogicalCpuCount()) &&
+                    if ((min_gc_size <= GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)) &&
                         desired_per_heap <= 2*min_gc_size)
                     {
                         desired_per_heap = min_gc_size;
@@ -35523,19 +35523,26 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
 #ifdef SERVER_GC
         // performance data seems to indicate halving the size results
         // in optimal perf.  Ask for adjusted gen0 size.
-        gen0size = max(GCToOSInterface::GetLargestOnDieCacheSize(FALSE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
+        gen0size = max(GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),(256*1024));
 
         // if gen0 size is too large given the available memory, reduce it.
         // Get true cache size, as we don't want to reduce below this.
-        size_t trueSize = max(GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
+        size_t trueSize = max(GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE),(256*1024));
         dprintf (2, ("cache: %Id-%Id, cpu: %Id", 
-            GCToOSInterface::GetLargestOnDieCacheSize(FALSE),
-            GCToOSInterface::GetLargestOnDieCacheSize(TRUE),
-            GCToOSInterface::GetLogicalCpuCount()));
+            GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),
+            GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)));
+
+        int n_heaps = gc_heap::n_heaps;
+#else //SERVER_GC
+        size_t trueSize = GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE);
+        gen0size = max((4*trueSize/5),(256*1024));
+        trueSize = max(trueSize, (256*1024));
+        int n_heaps = 1;
+#endif //SERVER_GC
 
         // if the total min GC across heaps will exceed 1/6th of available memory,
         // then reduce the min GC size until it either fits or has been reduced to cache size.
-        while ((gen0size * gc_heap::n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
+        while ((gen0size * n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
         {
             gen0size = gen0size / 2;
             if (gen0size <= trueSize)
@@ -35544,9 +35551,6 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
                 break;
             }
         }
-#else //SERVER_GC
-        gen0size = max((4*GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/5),(256*1024));
-#endif //SERVER_GC
     }
 
     // Generation 0 must never be more than 1/2 the segment size.

diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp
@@ -221,12 +221,6 @@ void GCToOSInterface::DebugBreak()
 #endif
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    return g_logicalCpuCount;
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -403,7 +397,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     // TODO(segilles) processor detection
     return 0;

diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp
@@ -228,13 +228,6 @@ void GCToOSInterface::DebugBreak()
     ::DebugBreak();
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    // TODO(segilles) processor detection
-    return 1;
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -381,7 +374,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     // TODO(segilles) processor detection (see src/vm/util.cpp:1935)
     return 0;

diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt
@@ -477,7 +477,6 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM)
     )
 elseif(CLR_CMAKE_TARGET_ARCH_ARM64)
     set(VM_SOURCES_DAC_AND_WKS_ARCH
-        ${ARCH_SOURCES_DIR}/cgenarm64.cpp
         ${ARCH_SOURCES_DIR}/stubs.cpp
         exceptionhandling.cpp
         gcinfodecoder.cpp

diff --git a/src/vm/amd64/cgenamd64.cpp b/src/vm/amd64/cgenamd64.cpp
@@ -458,89 +458,6 @@ BOOL GetAnyThunkTarget (CONTEXT *pctx, TADDR *pTarget, TADDR *pTargetMethodDesc)
 // determine the number of logical cpus, or the machine is not populated uniformly with the same
 // type of processors, this function returns 1.
 
-extern "C" DWORD __stdcall getcpuid(DWORD arg, unsigned char result[16]);
-
-// fix this if/when AMD does multicore or SMT
-DWORD GetLogicalCpuCount()
-{
-    // No CONTRACT possible because GetLogicalCpuCount uses SEH
-
-    STATIC_CONTRACT_THROWS;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    static DWORD val = 0;
-
-    // cache value for later re-use
-    if (val)
-    {
-        return val;
-    }   
-
-    struct Param : DefaultCatchFilterParam
-    {
-        DWORD retVal;
-    } param;
-    param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER;
-    param.retVal = 1;    
-
-    PAL_TRY(Param *, pParam, &param)
-    {    
-
-        unsigned char buffer[16];
-        DWORD maxCpuId = getcpuid(0, buffer);
-        DWORD* dwBuffer = (DWORD*)buffer;
-
-        if (maxCpuId < 1)
-            goto qExit;
-
-        if (dwBuffer[1] == 'uneG') {
-            if (dwBuffer[3] == 'Ieni') {
-                if (dwBuffer[2] == 'letn')  {        // get SMT/multicore enumeration for Intel EM64T 
-
-
-                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on 
-                    // multi-core processor, but we never call into those two functions since we don't halve the
-                    // gen0size when it's prescott and above processor. We keep the old version here for earlier
-                    // generation system(Northwood based), perf data suggests on those systems, halve gen0 size 
-                    // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) 
-                    // based, we still go ahead and halve gen0 size.  The logic in GetLogicalCpuCountFromOS() 
-                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. 
-                    // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 
-                    // size at all gives us overall better performance. 
-                    // This is going to be fixed with a new version in orcas time frame. 
-
-                    if( (maxCpuId > 3) && (maxCpuId < 0x80000000) )   
-                        goto qExit;
-
-                    val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
-                    if (val )
-                    {
-                        pParam->retVal = val;     // OS API HT enumeration successful, we are Done
-                        goto qExit;
-                    }
-
-                    val = GetLogicalCpuCountFallback();    // Fallback to HT enumeration using CPUID
-                    if( val )
-                        pParam->retVal = val;
-                }
-            }
-        }
-qExit: ;
-    }
-
-    PAL_EXCEPT_FILTER(DefaultCatchFilter)
-    {
-    }
-    PAL_ENDTRY
-
-    if (val == 0)
-    {
-        val = param.retVal;  
-    }
-
-    return param.retVal;
-}
-
 void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget)
 {
     CONTRACTL

diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp
@@ -3369,13 +3369,6 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target)
 
 #ifndef CROSSGEN_COMPILE
 
-DWORD GetLogicalCpuCount()
-{
-    // Just use the OS to return this information (the APIs used exist on all versions of Windows which
-    // support ARM).
-    return GetLogicalCpuCountFromOS();
-}
-
 #ifdef FEATURE_READYTORUN
 
 //

diff --git a/src/vm/arm64/cgenarm64.cpp b/src/vm/arm64/cgenarm64.cpp
diff --git a/src/vm/cgensys.h b/src/vm/cgensys.h
@@ -34,10 +34,6 @@ int  CallJitEHFilter (CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHC
 void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel);
 #endif // _TARGET_X86_
 
-
-// get number of logical to physical processors.  Returns 1 on failure or non-intel x86 processors.
-DWORD GetLogicalCpuCount();
-
 //These are in util.cpp
 extern size_t GetLogicalProcessorCacheSizeFromOS();
 extern size_t GetIntelDeterministicCacheEnum();
@@ -47,7 +43,7 @@ extern DWORD GetLogicalCpuCountFallback();
 
 
 // Try to determine the largest last-level cache size of the machine - return 0 if unknown or no L2/L3 cache
-size_t GetLargestOnDieCacheSize(BOOL bTrueSize = TRUE);
+size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize = TRUE);
 
 
 #ifdef FEATURE_COMINTEROP

diff --git a/src/vm/gcenv.os.cpp b/src/vm/gcenv.os.cpp
@@ -145,13 +145,6 @@ void GCToOSInterface::DebugBreak()
     ::DebugBreak();
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    LIMITED_METHOD_CONTRACT;
-    return ::GetLogicalCpuCount();
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -322,11 +315,11 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     LIMITED_METHOD_CONTRACT;
 
-    return ::GetLargestOnDieCacheSize(trueSize);
+    return ::GetCacheSizePerLogicalCpu(trueSize);
 }
 
 // Sets the calling thread's affinity to only run on the processor specified