From 7a54f19f429c776c9d404a14f5a440e943d7bf13 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 27 Apr 2026 17:23:31 -0700 Subject: [PATCH] JIT: avoid store forward stall for struct params in GS frames If we have a struct param in a GS frame, we will spill it using narrow writes and then copy it to the shadow param with wide (SIMD) loads and stores; the wide load of the just-spilled bytes causes a store-forward stall. Try to avoid this by forcing the copies to be int-register sized. Addresses #121248. --- src/coreclr/jit/codegenxarch.cpp | 8 +++++++- src/coreclr/jit/lowerxarch.cpp | 19 ++++++++++++++----- src/coreclr/jit/lsraxarch.cpp | 13 +++++++++++-- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3ddb8389951247..dc9c198775e461 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3434,6 +3434,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) unsigned srcAddrIndexScale = 1; int srcOffset = 0; GenTree* src = node->Data(); + bool srcIsRegArg = false; assert(src->isContained()); @@ -3441,6 +3442,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) { srcLclNum = src->AsLclVarCommon()->GetLclNum(); srcOffset = src->AsLclVarCommon()->GetLclOffs(); + + if (src->OperIs(GT_LCL_VAR)) + { + srcIsRegArg = m_compiler->lvaGetDesc(srcLclNum)->lvIsMultiRegArg; + } } else { @@ -3486,7 +3492,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // Get the largest SIMD register available if the size is large enough unsigned regSize = m_compiler->roundDownSIMDSize(size); - if ((size >= regSize) && (regSize > 0)) + if ((size >= regSize) && (regSize > 0) && !srcIsRegArg) { regNumber tempReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ea3684e0ff0ba0..1a0e9cb98c7558 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -467,18 +467,27 @@ void 
Lowering::LowerBlockStore(GenTreeBlk* blkNode) assert(src->OperIs(GT_IND, GT_LCL_VAR, GT_LCL_FLD)); src->SetContained(); + bool isMultiRegArgCopy = false; + if (src->OperIs(GT_LCL_VAR)) { // TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register. const unsigned srcLclNum = src->AsLclVar()->GetLclNum(); m_compiler->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::StoreBlkSrc)); + isMultiRegArgCopy = m_compiler->lvaGetDesc(srcLclNum)->lvIsMultiRegArg; + } + + ClassLayout* layout = blkNode->GetLayout(); + bool doCpObj = layout->HasGCPtr(); + bool isNotHeap = blkNode->IsAddressNotOnHeap(m_compiler); + bool canUseSimd = !doCpObj || isNotHeap; + + if (isMultiRegArgCopy) + { + canUseSimd = false; } - ClassLayout* layout = blkNode->GetLayout(); - bool doCpObj = layout->HasGCPtr(); - bool isNotHeap = blkNode->IsAddressNotOnHeap(m_compiler); - bool canUseSimd = !doCpObj || isNotHeap; - unsigned copyBlockUnrollLimit = m_compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy, canUseSimd); + unsigned copyBlockUnrollLimit = m_compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy, canUseSimd); #ifndef JIT32_GCENCODER if (doCpObj && (size <= copyBlockUnrollLimit)) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 947e838fcd799a..f2ec1f85b710ee 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1495,7 +1495,15 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) unsigned regSize = m_compiler->roundDownSIMDSize(size); unsigned remainder = size; - if ((size >= regSize) && (regSize > 0)) + bool srcIsRegArg = false; + + if (src->OperIs(GT_LCL_VAR)) + { + unsigned srcLclNum = src->AsLclVar()->GetLclNum(); + srcIsRegArg = m_compiler->lvaGetDesc(srcLclNum)->lvIsMultiRegArg; + } + + if ((size >= regSize) && (regSize > 0) && !srcIsRegArg) { // We need a float temporary if we're doing SIMD operations @@ -1505,7 +1513,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) 
remainder %= regSize; } - if ((remainder > 0) && ((regSize == 0) || (isPow2(remainder) && (remainder <= REGSIZE_BYTES)))) + if (srcIsRegArg || + ((remainder > 0) && ((regSize == 0) || (isPow2(remainder) && (remainder <= REGSIZE_BYTES))))) { // We need an int temporary if we're not doing SIMD operations // or if are but the remainder is a power of 2 and less than the