Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Reorder stores to make them amenable to stp optimization #102133

Merged
merged 8 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 108 additions & 26 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
}

case GT_STOREIND:
LowerStoreIndirCommon(node->AsStoreInd());
break;
return LowerStoreIndirCommon(node->AsStoreInd());

case GT_ADD:
{
Expand Down Expand Up @@ -8899,7 +8898,10 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind)
// Arguments:
// ind - the store indirection node we are lowering.
//
void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
// Return Value:
// Next node to lower.
//
GenTree* Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
{
assert(ind->TypeGet() != TYP_STRUCT);

Expand All @@ -8914,28 +8916,30 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
#endif
TryCreateAddrMode(ind->Addr(), isContainable, ind);

if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
if (comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
{
return ind->gtNext;
}

#ifndef TARGET_XARCH
if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
{
const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
{
const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
{
// On platforms with weaker memory model we need to make sure we use a store with the release semantic
// when we publish a potentially mutable object
// See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
// https://github.com/dotnet/runtime/pull/76112#discussion_r980639782
// On platforms with weaker memory model we need to make sure we use a store with the release semantic
// when we publish a potentially mutable object
// See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
// https://github.com/dotnet/runtime/pull/76112#discussion_r980639782

// This can be relaxed to "just make sure to use stlr/memory barrier" if needed
ind->gtFlags |= GTF_IND_VOLATILE;
}
// This can be relaxed to "just make sure to use stlr/memory barrier" if needed
ind->gtFlags |= GTF_IND_VOLATILE;
}
}
#endif

LowerStoreIndirCoalescing(ind);
LowerStoreIndir(ind);
}
LowerStoreIndirCoalescing(ind);
return LowerStoreIndir(ind);
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -9018,7 +9022,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
#ifdef TARGET_ARM64
if (comp->opts.OptimizationEnabled() && ind->OperIs(GT_IND))
{
OptimizeForLdp(ind);
OptimizeForLdpStp(ind);
}
#endif

Expand All @@ -9033,7 +9037,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
// cases passing the distance check, but 82 out of these 112 extra cases were
// then rejected due to interference. So 16 seems like a good number to balance
// the throughput costs.
const int LDP_REORDERING_MAX_DISTANCE = 16;
const int LDP_STP_REORDERING_MAX_DISTANCE = 16;

//------------------------------------------------------------------------
// OptimizeForLdp: Record information about an indirection, and try to optimize
Expand All @@ -9046,7 +9050,7 @@ const int LDP_REORDERING_MAX_DISTANCE = 16;
// Returns:
// True if the optimization was successful.
//
bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
bool Lowering::OptimizeForLdpStp(GenTreeIndir* ind)
kunalspathak marked this conversation as resolved.
Show resolved Hide resolved
{
if (!ind->TypeIs(TYP_INT, TYP_LONG, TYP_FLOAT, TYP_DOUBLE, TYP_SIMD8, TYP_SIMD16) || ind->IsVolatile())
{
Expand All @@ -9064,7 +9068,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)

// Every indirection takes an expected 2+ nodes, so we only expect at most
// half the reordering distance to be candidates for the optimization.
int maxCount = min(m_blockIndirs.Height(), LDP_REORDERING_MAX_DISTANCE / 2);
int maxCount = min(m_blockIndirs.Height(), LDP_STP_REORDERING_MAX_DISTANCE / 2);
for (int i = 0; i < maxCount; i++)
{
SavedIndir& prev = m_blockIndirs.TopRef(i);
Expand All @@ -9079,11 +9083,22 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
continue;
}

if (prevIndir->gtNext == nullptr)
{
// Deleted by other optimization
continue;
}

if (prevIndir->OperIsStore() != ind->OperIsStore())
{
continue;
}

JITDUMP("[%06u] and [%06u] are indirs off the same base with offsets +%03u and +%03u\n",
Compiler::dspTreeID(ind), Compiler::dspTreeID(prevIndir), (unsigned)offs, (unsigned)prev.Offset);
if (std::abs(offs - prev.Offset) == genTypeSize(ind))
{
JITDUMP(" ..and they are amenable to ldp optimization\n");
JITDUMP(" ..and they are amenable to ldp/stp optimization\n");
if (TryMakeIndirsAdjacent(prevIndir, ind))
{
// Do not let the previous one participate in
Expand Down Expand Up @@ -9119,7 +9134,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The baseAddr for both prevIndir and indir is assumed to be same? Can we have an assert for it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not that easy to assert (it won't be exactly the same, just something that we expect the emitter peephole to kick in for). But it's also not a precondition for this function that the addresses must be related in some way -- the function works fine even without that. So I don't think there is a good reason to try to assert it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean will we mistakenly use offsets of different base address and combine them, leading to bad codegen?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This reordering here doesn't combine the stores or loads. It just puts them next to each other. Combining them is done by the peephole in the emitter.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, what I meant is we might end up reordering unrelated indir and prevIndir but the peephole emitter will make sure that we combine the correct matching indir and prevIndir only.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but this function is still correct even if you ask it to make two indirs off of unrelated addresses adjacent. There is no correctness requirement that the addresses relate in some way; this is not a precondition for the function. Hence why I don't see a good reason to try to assert that (and furthermore, it isn't easy to assert it).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

Ok, just wanted to confirm my understanding.

{
GenTree* cur = prevIndir;
for (int i = 0; i < LDP_REORDERING_MAX_DISTANCE; i++)
for (int i = 0; i < LDP_STP_REORDERING_MAX_DISTANCE; i++)
{
cur = cur->gtNext;
if (cur == indir)
Expand Down Expand Up @@ -9176,8 +9191,16 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
INDEBUG(dumpWithMarks());
JITDUMP("\n");

if ((prevIndir->gtLIRFlags & LIR::Flags::Mark) != 0)
{
JITDUMP("Previous indir is part of the data flow of current indir\n");
UnmarkTree(indir);
return false;
}

m_scratchSideEffects.Clear();

bool sawData = false;
for (GenTree* cur = prevIndir->gtNext; cur != indir; cur = cur->gtNext)
{
if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
Expand All @@ -9190,6 +9213,11 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
UnmarkTree(indir);
return false;
}

if (indir->OperIsStore())
{
sawData |= cur == indir->Data();
}
}
else
{
Expand All @@ -9201,6 +9229,13 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi

if (m_scratchSideEffects.InterferesWith(comp, indir, true))
{
if (!indir->OperIsLoad())
{
JITDUMP("Have conservative interference with last store. Giving up.\n");
UnmarkTree(indir);
return false;
}

// Try a bit harder, making use of the following facts:
//
// 1. We know the indir is non-faulting, so we do not need to worry
Expand Down Expand Up @@ -9297,8 +9332,39 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
}
}

JITDUMP("Interference checks passed. Moving nodes that are not part of data flow of [%06u]\n\n",
Compiler::dspTreeID(indir));
JITDUMP("Interference checks passed: can move unrelated nodes past second indir.\n");

if (sawData)
{
// If the data node of 'indir' is between 'prevIndir' and 'indir' then
// try to move the previous indir up to happen after the data
// computation. We will be moving all nodes unrelated to the data flow
// past 'indir', so we only need to check interference between
// 'prevIndir' and all nodes that are part of 'indir's dataflow.
m_scratchSideEffects.Clear();
m_scratchSideEffects.AddNode(comp, prevIndir);

for (GenTree* cur = prevIndir->gtNext;; cur = cur->gtNext)
{
if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
{
if (m_scratchSideEffects.InterferesWith(comp, cur, true))
{
JITDUMP("Cannot move prev indir [%06u] up past [%06u] to get it past the data computation\n",
Compiler::dspTreeID(prevIndir), Compiler::dspTreeID(cur));
UnmarkTree(indir);
return false;
}
}

if (cur == indir->Data())
{
break;
}
}
}

JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir));

GenTree* previous = prevIndir;
for (GenTree* node = prevIndir->gtNext;;)
Expand All @@ -9321,6 +9387,22 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
node = next;
}

if (sawData)
{
// For some reason LSRA is not able to reuse a constant if both LIR
// temps are live simultaneously, so skip moving in those cases and
// expect LSRA to reuse the constant instead.
if (indir->Data()->OperIs(GT_CNS_INT, GT_CNS_DBL) && GenTree::Compare(indir->Data(), prevIndir->Data()))
{
JITDUMP("Not moving previous indir since we are expecting constant reuse for the data\n");
}
else
{
BlockRange().Remove(prevIndir);
BlockRange().InsertAfter(indir->Data(), prevIndir);
}
}

JITDUMP("Result:\n\n");
INDEBUG(dumpWithMarks());
JITDUMP("\n");
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,13 +345,13 @@ class Lowering final : public Phase
bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const;

// Per tree node member functions
void LowerStoreIndirCommon(GenTreeStoreInd* ind);
GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind);
GenTree* LowerIndir(GenTreeIndir* ind);
bool OptimizeForLdp(GenTreeIndir* ind);
bool OptimizeForLdpStp(GenTreeIndir* ind);
bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
void MarkTree(GenTree* root);
void UnmarkTree(GenTree* root);
void LowerStoreIndir(GenTreeStoreInd* node);
GenTree* LowerStoreIndir(GenTreeStoreInd* node);
void LowerStoreIndirCoalescing(GenTreeIndir* node);
GenTree* LowerAdd(GenTreeOp* node);
GenTree* LowerMul(GenTreeOp* mul);
Expand Down
14 changes: 12 additions & 2 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,21 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
GenTree* next = node->gtNext;
ContainCheckStoreIndir(node);

#ifdef TARGET_ARM64
if (comp->opts.OptimizationEnabled())
{
OptimizeForLdpStp(node);
}
#endif

return next;
}

//------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions src/coreclr/jit/lowerloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
ContainCheckStoreIndir(node);
return node->gtNext;
}

//------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions src/coreclr/jit/lowerriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
ContainCheckStoreIndir(node);
return node->gtNext;
}

//------------------------------------------------------------------------
Expand Down
14 changes: 8 additions & 6 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// node - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
// None.
// Next node to lower.
//
void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
// Mark all GT_STOREIND nodes to indicate that it is not known
// whether it represents a RMW memory op.
Expand All @@ -92,7 +92,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
// SSE2 doesn't support RMW form of instructions.
if (LowerRMWMemOp(node))
{
return;
return node->gtNext;
}
}

Expand All @@ -109,23 +109,25 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
{
if (!node->Data()->IsCnsVec())
{
return;
return node->gtNext;
}

if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64))
{
return;
return node->gtNext;
}

if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero())
{
// To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors.
return;
return node->gtNext;
}

TryCompressConstVecData(node);
}
#endif

return node->gtNext;
}

//----------------------------------------------------------------------------------------------
Expand Down
Loading