Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64: Combine if conditions into compare chains #79283

Merged
merged 32 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3aa62af
Create AND chains in bool optimizer pass
a74nh Dec 5, 2022
48b68ec
Remove contained and morph checks
a74nh Jan 26, 2023
56ff631
Add GT_ANDFLAGS node
a74nh Jan 26, 2023
a919a13
Fix X64 build
a74nh Jan 30, 2023
893cbb5
Review fixups
a74nh Feb 1, 2023
31c8560
Add CCMP_EQ/CCMP_NE nodes
a74nh Feb 3, 2023
60341f3
Various cleanups
a74nh Feb 6, 2023
177c92a
remove fgopt changes
a74nh Feb 6, 2023
62dabf4
restore lsraarm64 GT_AND change
a74nh Feb 7, 2023
ca38951
Make GT_CCMP_ into conditional nodes
a74nh Feb 14, 2023
2360f26
Merge branch main
a74nh Mar 8, 2023
8323f71
Remove lowering/codegen changes
a74nh Mar 8, 2023
bef86f0
update header
a74nh Mar 8, 2023
3bdceeb
Add costing with stress overrides
a74nh Mar 9, 2023
adf6b99
Better cbz comment
a74nh Mar 9, 2023
5b99a4d
Use fgRemoveRefPred
a74nh Mar 9, 2023
e8ebd2a
Allow reversed conditions in tests
a74nh Mar 9, 2023
748252f
Move optimize bools pass to a new file
a74nh Mar 9, 2023
6a94a6e
Improve cbz detection
a74nh Mar 9, 2023
8846bc3
Move optbools back into optimizer
a74nh Mar 13, 2023
1350849
Forward iterate through the blocks
a74nh Mar 13, 2023
f056765
Fixup comment block
a74nh Mar 13, 2023
a690be6
Improve scanning for existing chains
a74nh Mar 13, 2023
02bbafe
Fix formatting
a74nh Mar 13, 2023
8fcef67
Allow main optimize bools loop to run again
a74nh Mar 14, 2023
e7f4369
Check for tbz conditions generated from pow2 values
a74nh Mar 15, 2023
7f9fdb8
Fix and expand test cases
a74nh Mar 15, 2023
3d8ed4f
Minor fixups
a74nh Mar 15, 2023
191a650
Allow wider range of conditions
a74nh Mar 16, 2023
6cf9cd3
Minor cleanups
a74nh Mar 17, 2023
18ea05c
Reduce max allowed cost
a74nh Mar 17, 2023
238d810
Fix formatting
a74nh Mar 17, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -9757,6 +9757,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
STRESS_MODE(MERGED_RETURNS) \
STRESS_MODE(BB_PROFILE) \
STRESS_MODE(OPT_BOOLS_GC) \
STRESS_MODE(OPT_BOOLS_COMPARE_CHAIN_COST) \
STRESS_MODE(REMORPH_TREES) \
STRESS_MODE(64RSLT_MUL) \
STRESS_MODE(DO_WHILE_LOOPS) \
Expand Down
284 changes: 281 additions & 3 deletions src/coreclr/jit/optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9100,6 +9100,7 @@ class OptBoolsDsc

public:
bool optOptimizeBoolsCondBlock();
bool optOptimizeCompareChainCondBlock();
bool optOptimizeBoolsReturnBlock(BasicBlock* b3);
#ifdef DEBUG
void optOptimizeBoolsGcStress();
Expand All @@ -9108,8 +9109,9 @@ class OptBoolsDsc
private:
Statement* optOptimizeBoolsChkBlkCond();
GenTree* optIsBoolComp(OptTestInfo* pOptTest);
bool optOptimizeBoolsChkTypeCostCond();
void optOptimizeBoolsUpdateTrees();
bool optOptimizeBoolsChkTypeCostCond();
void optOptimizeBoolsUpdateTrees();
inline bool FindCompareChain(GenTree* condition, bool* isTestCondition);
a74nh marked this conversation as resolved.
Show resolved Hide resolved
};

//-----------------------------------------------------------------------------
Expand Down Expand Up @@ -9335,6 +9337,269 @@ bool OptBoolsDsc::optOptimizeBoolsCondBlock()
return true;
}

//-----------------------------------------------------------------------------
// FindCompareChain: Check if the given condition is a compare chain.
//
// Arguments:
// condition: Condition to check.
// isTestCondition: Returns true if condition is a test condition (one that lowering will turn into cbz/cbnz or tbz/tbnz) but is not a compare chain.
//
// Returns:
// true if the condition is a compare chain.
//
// Assumptions:
// m_b1 and m_b2 are set on entry.
//

inline bool OptBoolsDsc::FindCompareChain(GenTree* condition, bool* isTestCondition)
a74nh marked this conversation as resolved.
Show resolved Hide resolved
{
GenTree* condOp1 = condition->gtGetOp1();
GenTree* condOp2 = condition->gtGetOp2();

// Default the out parameter; it is only set to true by the EQ/NE-against-constant cases below.
*isTestCondition = false;

// Only EQ/NE against an integral constant is of interest. Anything else returns false
// with *isTestCondition left as false.
if (condition->OperIs(GT_EQ, GT_NE) && condOp2->IsIntegralConst())
{
ssize_t condOp2Value = condOp2->AsIntCon()->IconValue();

if (condOp2Value == 0)
{
// Found an EQ/NE(...,0). Does it contain a compare chain (ie - conditions that have
// previously been combined by optOptimizeCompareChainCondBlock) or is it a test condition
// that will be optimised to cbz/cbnz during lowering?

if (condOp1->OperIs(GT_AND, GT_OR))
{
// Check that the second operand of AND/OR ends with a compare operation, as this will be
// the condition the new link in the chain will connect with.
if (condOp1->gtGetOp2()->OperIsCmpCompare() && varTypeIsIntegralOrI(condOp1->gtGetOp2()->gtGetOp1()))
{
return true;
}
}

// Any other EQ/NE(...,0) is reported as a test condition rather than a chain.
*isTestCondition = true;
}
// The constant must be a power of two and must equal the constant second operand of the AND.
else if (condOp1->OperIs(GT_AND) && isPow2(static_cast<target_size_t>(condOp2Value)) &&
condOp1->gtGetOp2()->IsIntegralConst(condOp2Value))
{
// Found an EQ/NE(AND(...,n),n) which will be optimized to tbz/tbnz during lowering.
*isTestCondition = true;
}
}

// Not a compare chain (it may still have been flagged as a test condition above).
return false;
}

//-----------------------------------------------------------------------------
// optOptimizeCompareChainCondBlock: Create a chain when both m_b1 and m_b2 are BBJ_COND.
//
// Returns:
// true if chain optimization is done and m_b1 and m_b2 are folded into m_b1, else false.
//
// Assumptions:
// m_b1 and m_b2 are set on entry.
//
// Notes:
//
// This aims to reduce the number of conditional jumps by joining cases when multiple
// conditions gate the execution of a block.
//
// Example 1:
// If ( a > b || c == d) { x = y; }
//
// Will be represented in IR as:
//
// ------------ BB01 -> BB03 (cond), succs={BB02,BB03}
// * JTRUE (GT a,b)
//
// ------------ BB02 -> BB04 (cond), preds={BB01} succs={BB03,BB04}
// * JTRUE (NE c,d)
//
// ------------ BB03, preds={BB01, BB02} succs={BB04}
// * ASG (x,y)
//
// These operands will be combined into a single AND in the first block (with the first
// condition inverted), wrapped by the test condition (NE(...,0)). Giving:
//
// ------------ BB01 -> BB03 (cond), succs={BB03,BB04}
// * JTRUE (NE (AND (LE a,b), (NE c,d)), 0)
//
// ------------ BB03, preds={BB01} succs={BB04}
// * ASG x,y
//
//
// Example 2:
// If ( a > b && c == d) { x = y; } else { x = z; }
//
// Here the && conditions are connected via an OR. After the pass:
//
// ------------ BB01 -> BB03 (cond), succs={BB03,BB04}
// * JTRUE (NE (OR (LE a,b), (NE c,d)), 0)
//
// ------------ BB03, preds={BB01} succs={BB05}
// * ASG x,y
//
// ------------ BB04, preds={BB01} succs={BB05}
// * ASG x,z
//
//
// Example 3:
// If ( a > b || c == d || e < f ) { x = y; }
// The first pass of the optimization will combine two of the conditions. The
// second pass will then combine the remaining condition with the earlier chain.
//
// ------------ BB01 -> BB03 (cond), succs={BB03,BB04}
// * JTRUE (NE (OR ((NE (OR (NE c,d), (GE e,f)), 0), (LE a,b))), 0)
//
// ------------ BB03, preds={BB01} succs={BB04}
// * ASG x,y
//
//
// This optimization means that every condition within the IF statement is always evaluated,
// as opposed to stopping at the first positive match.
// Theoretically there is no maximum limit on the size of the generated chain. Therefore cost
// checking is used to limit the maximum number of conditions that can be chained together.
//
bool OptBoolsDsc::optOptimizeCompareChainCondBlock()
{
assert((m_b1 != nullptr) && (m_b2 != nullptr) && (m_b3 == nullptr));
m_t3 = nullptr;

// Classify the block layout. Both accepted shapes require m_b2 to directly follow m_b1.
bool foundEndOfOrConditions = false;
if ((m_b1->bbNext == m_b2) && (m_b1->bbJumpDest == m_b2->bbNext))
{
// Found the end of two (or more) conditions being ORed together.
// The final condition has been inverted.
foundEndOfOrConditions = true;
}
else if ((m_b1->bbNext == m_b2) && (m_b1->bbJumpDest == m_b2->bbJumpDest))
{
// Found two conditions connected together.
}
else
{
return false;
}

// s1 is the statement that is later removed from m_b1 (its JTRUE). A null result means the
// block-condition check failed, so there is nothing to combine.
Statement* const s1 = optOptimizeBoolsChkBlkCond();
if (s1 == nullptr)
{
return false;
}
// s2 is re-sequenced below once the combined condition has been wired into m_b2.
Statement* s2 = m_b2->firstStmt();

// Both test trees are expected to be JTRUE nodes; their first operand is the condition.
assert(m_testInfo1.testTree->OperIs(GT_JTRUE));
GenTree* cond1 = m_testInfo1.testTree->gtGetOp1();
assert(m_testInfo2.testTree->OperIs(GT_JTRUE));
GenTree* cond2 = m_testInfo2.testTree->gtGetOp1();

// Ensure both conditions are suitable.
if (!cond1->OperIsCompare() || !cond2->OperIsCompare())
{
return false;
}

// Ensure there are no additional side effects.
if ((cond1->gtFlags & (GTF_SIDE_EFFECT | GTF_ORDER_SIDEEFF)) != 0 ||
(cond2->gtFlags & (GTF_SIDE_EFFECT | GTF_ORDER_SIDEEFF)) != 0)
{
return false;
}

// Integer compares only for now (until support for Arm64 fccmp instruction is added)
if (varTypeIsFloating(cond1->gtGetOp1()) || varTypeIsFloating(cond2->gtGetOp1()))
{
return false;
}

// Check for previously optimized compare chains.
// The two out flags are always written by FindCompareChain, so they need no initializer.
bool op1IsTestCond;
bool op2IsTestCond;
bool op1IsCondChain = FindCompareChain(cond1, &op1IsTestCond);
bool op2IsCondChain = FindCompareChain(cond2, &op2IsTestCond);

// Avoid cases where optimizations in lowering will produce better code than optimizing here.
if (op1IsTestCond || op2IsTestCond)
{
return false;
}

a74nh marked this conversation as resolved.
Show resolved Hide resolved
// Combining conditions means that all conditions are always fully evaluated.
// Put a limit on the max size that can be combined.
// The cost limits are skipped when compStressCompile returns true for this stress mode.
if (!m_comp->compStressCompile(Compiler::STRESS_OPT_BOOLS_COMPARE_CHAIN_COST, 25))
{
int op1Cost = cond1->GetCostEx();
int op2Cost = cond2->GetCostEx();
// An operand that is already a compare chain is allowed a higher cost (35) than a
// single compare (7).
int maxOp1Cost = op1IsCondChain ? 35 : 7;
int maxOp2Cost = op2IsCondChain ? 35 : 7;

// Cost to allow for chain size of three.
if (op1Cost > maxOp1Cost || op2Cost > maxOp2Cost)
{
JITDUMP("Skipping CompareChainCond that will evaluate conditions unconditionally at costs %d,%d\n", op1Cost,
op2Cost);
return false;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you end up adding some handling for potentially expensive trees somewhere else?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're iterating forwards through the blocks now, so that I think gets rid of that issue:

if (expensive1 || cheap1 )

  • both of expensive1 and cheap1 would be checked against cost 7.

if (expensive1 || cheap1 || cheap2 )

  • First iteration both of expensive1 and cheap1 would be checked against cost 7.
  • If first iteration passed, then second pass the combined (expensive1+cheap1) would be checked against 35 and cheap2 against 7.
  • If first iteration failed, then both cheap1 and cheap2 would be checked against 7.

if (cheap1 || cheap2 || expensive1)

  • both cheap1 and cheap2 would be checked against cost 7. Which would pass.
  • Then (cheap1+cheap2) against 35 and expensive1 against 7.

Copy link
Member

@jakobbotsch jakobbotsch Mar 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's odd to me that it's possible to "defeat" the check:

[MethodImpl(MethodImplOptions.NoInlining)]
public static int Foo(int a, int b)
{
    if (a > 10 && (((a * a * a * a * a > 3) & (a == 20)) != false))
    {
        return 42;
    }

    return 100;
}

this produces:

G_M63574_IG01:              ;; offset=0000H
        A9BF7BFD          stp     fp, lr, [sp, #-0x10]!
        910003FD          mov     fp, sp
                                                ;; size=8 bbWeight=1 PerfScore 1.50
G_M63574_IG02:              ;; offset=0008H
        1B007C01          mul     w1, w0, w0
        1B007C21          mul     w1, w1, w0
        1B007C21          mul     w1, w1, w0
        1B007C21          mul     w1, w1, w0
        52800C82          mov     w2, #100
        52800543          mov     w3, #42
        71000C3F          cmp     w1, #3
        7A54C800          ccmp    w0, #20, 0, gt
        7A4A080E          ccmp    w0, #10, nzc, eq
        1A83D040          csel    w0, w2, w3, le
                                                ;; size=40 bbWeight=1 PerfScore 11.00
G_M63574_IG03:              ;; offset=0030H
        A8C17BFD          ldp     fp, lr, [sp], #0x10
        D65F03C0          ret     lr
                                                ;; size=8 bbWeight=1 PerfScore 2.00

Of course it's a very constructed example but I would be afraid that there are more natural looking examples.
I suppose to fix this you would need the recursive walk you wanted to avoid. I don't think that's expensive -- the code below calls gtSetEvalOrder and fgSetStmtSeq and they both end up doing these kinds of full tree walks anyway -- but it's not as elegant. It would make me feel a bit more at ease, however, to find and limit the cost of each "leaf" in the compare chain.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can leave it as is, it's probably fine in practice. Will leave it up to you.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cost of (((a * a * a * a * a > 3) & (a == 20)) is 35, which is on the limit for combing with the a > 10

Whereas the cost after combining, for example, op1 > 3 && op2 == 10 && op3 >= 12 is 32. (It's fairly high because of the two lots of EQ(AND,0)).

Looking at this again, a limit of 35 is probably too high as it's allowing 4 items to chain together. I should probably drop it to 31 to prevent op1 > 3 && op2 == 10 && op3 >= 12 combing with anything else.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the new costing it'll still allow like (((a * a * a * a > 3) & (a == 20)) (ie one less than you had before).
To improve it, further I think we'd have to somehow remove all the EQ(AND,0) blocks from the costings (as we know they'll vanish during lowering). Not sure it's worth it, so happy to leave as is now.

}

GenTree* testcondition = nullptr;

// Remove the first JTRUE statement.
constexpr bool isUnlink = true;
m_comp->fgRemoveStmt(m_b1, s1 DEBUGARG(isUnlink));

// Invert the condition.
if (foundEndOfOrConditions)
{
GenTree* revCond = m_comp->gtReverseCond(cond1);
assert(cond1 == revCond); // Ensure `gtReverseCond` did not create a new node.
}

// Join the two conditions together
// A reversed first condition is joined with AND; otherwise the conditions are joined with OR.
genTreeOps chainedOper = foundEndOfOrConditions ? GT_AND : GT_OR;
GenTree* chainedConditions = m_comp->gtNewOperNode(chainedOper, TYP_INT, cond1, cond2);
// Propagate the effect flags of both operands onto the new node.
chainedConditions->AsOp()->gtFlags |= (cond1->gtFlags & GTF_ALL_EFFECT);
chainedConditions->AsOp()->gtFlags |= (cond2->gtFlags & GTF_ALL_EFFECT);
a74nh marked this conversation as resolved.
Show resolved Hide resolved
// Clear GTF_RELOP_JMP_USED on the operands and set it, together with GTF_DONT_CSE,
// on the combined node.
cond1->gtFlags &= ~GTF_RELOP_JMP_USED;
cond2->gtFlags &= ~GTF_RELOP_JMP_USED;
chainedConditions->gtFlags |= (GTF_RELOP_JMP_USED | GTF_DONT_CSE);

// Add a test condition onto the front of the chain
testcondition = m_comp->gtNewOperNode(GT_NE, TYP_INT, chainedConditions, m_comp->gtNewZeroConNode(TYP_INT));
a74nh marked this conversation as resolved.
Show resolved Hide resolved

// Wire the chain into the second block
m_testInfo2.testTree->AsOp()->gtOp1 = testcondition;
m_testInfo2.testTree->AsOp()->gtFlags |= (testcondition->gtFlags & GTF_ALL_EFFECT);
m_comp->gtSetEvalOrder(m_testInfo2.testTree);
m_comp->fgSetStmtSeq(s2);

// Update the flow.
// m_b1 no longer branches: drop its pred edge on the old jump target and set its
// jump kind to BBJ_NONE.
m_comp->fgRemoveRefPred(m_b1->bbJumpDest, m_b1);
m_b1->bbJumpKind = BBJ_NONE;

// Fixup flags.
m_b2->bbFlags |= (m_b1->bbFlags & BBF_COPY_PROPAGATE);

// Join the two blocks. This is done now to ensure that additional conditions can be chained.
if (m_comp->fgCanCompactBlocks(m_b1, m_b2))
{
m_comp->fgCompactBlocks(m_b1, m_b2);
}

#ifdef DEBUG
if (m_comp->verbose)
{
// NOTE(review): m_b2 is still dereferenced here after the optional fgCompactBlocks call
// above -- confirm that it remains valid to read at this point.
JITDUMP("\nCombined conditions " FMT_BB " and " FMT_BB " into %s chain :\n", m_b1->bbNum, m_b2->bbNum,
GenTree::OpName(chainedOper));
m_comp->fgDumpBlock(m_b1);
JITDUMP("\n");
}
#endif

return true;
}

//-----------------------------------------------------------------------------
// optOptimizeBoolsChkBlkCond: Checks block conditions if it can be boolean optimized
//
Expand Down Expand Up @@ -10076,6 +10341,7 @@ PhaseStatus Compiler::optOptimizeBools()
}
#endif
bool change = false;
bool retry = false;
unsigned numCond = 0;
unsigned numReturn = 0;
unsigned numPasses = 0;
Expand All @@ -10086,8 +10352,10 @@ PhaseStatus Compiler::optOptimizeBools()
numPasses++;
change = false;

for (BasicBlock* const b1 : Blocks())
for (BasicBlock* b1 = fgFirstBB; b1 != nullptr; b1 = retry ? b1 : b1->bbNext)
{
retry = false;

// We're only interested in conditional jumps here

if (b1->bbJumpKind != BBJ_COND)
Expand Down Expand Up @@ -10127,6 +10395,16 @@ PhaseStatus Compiler::optOptimizeBools()
change = true;
numCond++;
}
#ifdef TARGET_ARM64
else if (optBoolsDsc.optOptimizeCompareChainCondBlock())
{
// The optimization will have merged b1 and b2. Retry the loop so that
// b1 and b2->bbNext can be tested.
change = true;
retry = true;
numCond++;
}
#endif
}
else if (b2->bbJumpKind == BBJ_RETURN)
{
Expand Down
Loading