// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                                 Lower                                     XX
XX                                                                           XX
XX  Preconditions:                                                           XX
XX                                                                           XX
XX  Postconditions (for the nodes currently handled):                        XX
XX    - All operands requiring a register are explicit in the graph          XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif
#include "lower.h"
#if !defined(_TARGET_64BIT_)
#include "decomposelongs.h"
#endif // !defined(_TARGET_64BIT_)
//------------------------------------------------------------------------
// MakeSrcContained: Make "childNode" a contained node
//
// Arguments:
// parentNode - is a non-leaf node that can contain its 'childNode'
// childNode - is an op that will now be contained by its parent.
//
// Notes:
// If 'childNode' has any existing sources, they will now be sources for the parent.
//
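// Illustrative example (not from this file): given the LIR pair
//     t1 = GT_CNS_INT 8
//     t2 = GT_ADD(t0, t1)
// marking t1 as contained means LSRA allocates no register for it and the
// parent's codegen folds it into its own instruction (e.g. an "add reg, 8"
// form on targets whose encodings allow it), instead of materializing the
// constant in a register first.
//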
void Lowering::MakeSrcContained(GenTree* parentNode, GenTree* childNode)
{
assert(!parentNode->OperIsLeaf());
assert(childNode->canBeContained());
childNode->SetContained();
assert(childNode->isContained());
}
//------------------------------------------------------------------------
// CheckImmedAndMakeContained: Checks if the 'childNode' is a containable immediate
// and, if so, makes it contained.
//
// Arguments:
// parentNode - is any non-leaf node
// childNode - is a child op of 'parentNode'
//
// Return value:
// true if we are able to make childNode a contained immediate
//
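// Notes:
//    Which immediates are containable is target dependent (see IsContainableImmed).
//    As a rough, illustrative example: x86/x64 can encode most 32-bit immediates
//    directly in ALU instructions, while ARM64 only encodes certain immediate
//    patterns, so the same constant may be contained on one target and not another.
//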
bool Lowering::CheckImmedAndMakeContained(GenTree* parentNode, GenTree* childNode)
{
assert(!parentNode->OperIsLeaf());
// If childNode is a containable immediate
if (IsContainableImmed(parentNode, childNode))
{
// then make it contained within the parentNode
MakeSrcContained(parentNode, childNode);
return true;
}
return false;
}
//------------------------------------------------------------------------
// IsSafeToContainMem: Checks for conflicts between childNode and parentNode,
// and returns 'true' iff memory operand childNode can be contained in parentNode.
//
// Arguments:
// parentNode - any non-leaf node
// childNode - some node that is an input to `parentNode`
//
// Return value:
// true if it is safe to make childNode a contained memory operand.
//
bool Lowering::IsSafeToContainMem(GenTree* parentNode, GenTree* childNode)
{
m_scratchSideEffects.Clear();
m_scratchSideEffects.AddNode(comp, childNode);
for (GenTree* node = childNode->gtNext; node != parentNode; node = node->gtNext)
{
if (m_scratchSideEffects.InterferesWith(comp, node, false))
{
return false;
}
}
return true;
}
//------------------------------------------------------------------------
// This is the main entry point for Lowering.
GenTree* Lowering::LowerNode(GenTree* node)
{
assert(node != nullptr);
switch (node->gtOper)
{
case GT_IND:
TryCreateAddrMode(LIR::Use(BlockRange(), &node->gtOp.gtOp1, node), true);
ContainCheckIndir(node->AsIndir());
break;
case GT_STOREIND:
TryCreateAddrMode(LIR::Use(BlockRange(), &node->gtOp.gtOp1, node), true);
if (!comp->codeGen->gcInfo.gcIsWriteBarrierAsgNode(node))
{
LowerStoreIndir(node->AsIndir());
}
break;
case GT_ADD:
{
GenTree* afterTransform = LowerAdd(node);
if (afterTransform != nullptr)
{
return afterTransform;
}
__fallthrough;
}
#if !defined(_TARGET_64BIT_)
case GT_ADD_LO:
case GT_ADD_HI:
case GT_SUB_LO:
case GT_SUB_HI:
#endif
case GT_SUB:
case GT_AND:
case GT_OR:
case GT_XOR:
ContainCheckBinary(node->AsOp());
break;
case GT_MUL:
case GT_MULHI:
#if defined(_TARGET_X86_)
case GT_MUL_LONG:
#endif
ContainCheckMul(node->AsOp());
break;
case GT_UDIV:
case GT_UMOD:
if (!LowerUnsignedDivOrMod(node->AsOp()))
{
ContainCheckDivOrMod(node->AsOp());
}
break;
case GT_DIV:
case GT_MOD:
return LowerSignedDivOrMod(node);
case GT_SWITCH:
return LowerSwitch(node);
case GT_CALL:
LowerCall(node);
break;
case GT_LT:
case GT_LE:
case GT_GT:
case GT_GE:
case GT_EQ:
case GT_NE:
case GT_TEST_EQ:
case GT_TEST_NE:
case GT_CMP:
return LowerCompare(node);
case GT_JTRUE:
return LowerJTrue(node->AsOp());
case GT_JMP:
LowerJmpMethod(node);
break;
case GT_RETURN:
LowerRet(node);
break;
case GT_RETURNTRAP:
ContainCheckReturnTrap(node->AsOp());
break;
case GT_CAST:
LowerCast(node);
break;
#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
case GT_ARR_BOUNDS_CHECK:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
case GT_HW_INTRINSIC_CHK:
#endif // FEATURE_HW_INTRINSICS
ContainCheckBoundsChk(node->AsBoundsChk());
break;
#endif // _TARGET_XARCH_ || _TARGET_ARM64_
case GT_ARR_ELEM:
return LowerArrElem(node);
case GT_ARR_OFFSET:
ContainCheckArrOffset(node->AsArrOffs());
break;
case GT_ROL:
case GT_ROR:
LowerRotate(node);
break;
#ifndef _TARGET_64BIT_
case GT_LSH_HI:
case GT_RSH_LO:
ContainCheckShiftRotate(node->AsOp());
break;
#endif // !_TARGET_64BIT_
case GT_LSH:
case GT_RSH:
case GT_RSZ:
#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
LowerShift(node->AsOp());
#else
ContainCheckShiftRotate(node->AsOp());
#endif
break;
case GT_STORE_BLK:
case GT_STORE_OBJ:
case GT_STORE_DYN_BLK:
{
GenTreeBlk* blkNode = node->AsBlk();
TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
LowerBlockStore(blkNode);
}
break;
case GT_LCLHEAP:
ContainCheckLclHeap(node->AsOp());
break;
#ifdef _TARGET_XARCH_
case GT_INTRINSIC:
ContainCheckIntrinsic(node->AsOp());
break;
#endif // _TARGET_XARCH_
#ifdef FEATURE_SIMD
case GT_SIMD:
LowerSIMD(node->AsSIMD());
break;
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
case GT_HWIntrinsic:
LowerHWIntrinsic(node->AsHWIntrinsic());
break;
#endif // FEATURE_HW_INTRINSICS
case GT_LCL_FLD:
{
// We should only encounter this for lclVars that are lvDoNotEnregister.
verifyLclFldDoNotEnregister(node->AsLclVarCommon()->gtLclNum);
break;
}
case GT_LCL_VAR:
WidenSIMD12IfNecessary(node->AsLclVarCommon());
break;
case GT_STORE_LCL_VAR:
WidenSIMD12IfNecessary(node->AsLclVarCommon());
__fallthrough;
case GT_STORE_LCL_FLD:
{
#if defined(_TARGET_AMD64_) && defined(FEATURE_SIMD)
GenTreeLclVarCommon* const store = node->AsLclVarCommon();
if ((store->TypeGet() == TYP_SIMD8) != (store->gtOp1->TypeGet() == TYP_SIMD8))
{
GenTreeUnOp* bitcast =
new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, store->TypeGet(), store->gtOp1, nullptr);
store->gtOp1 = bitcast;
BlockRange().InsertBefore(store, bitcast);
}
#endif // _TARGET_AMD64_ && FEATURE_SIMD
// TODO-1stClassStructs: Once we remove the requirement that all struct stores
// are block stores (GT_STORE_BLK or GT_STORE_OBJ), here is where we would put the local
// store under a block store if codegen will require it.
if ((node->TypeGet() == TYP_STRUCT) && (node->gtGetOp1()->OperGet() != GT_PHI))
{
#if FEATURE_MULTIREG_RET
GenTree* src = node->gtGetOp1();
assert((src->OperGet() == GT_CALL) && src->AsCall()->HasMultiRegRetVal());
#else // !FEATURE_MULTIREG_RET
assert(!"Unexpected struct local store in Lowering");
#endif // !FEATURE_MULTIREG_RET
}
LowerStoreLoc(node->AsLclVarCommon());
break;
}
#if defined(_TARGET_ARM64_)
case GT_CMPXCHG:
CheckImmedAndMakeContained(node, node->AsCmpXchg()->gtOpComparand);
break;
case GT_XADD:
CheckImmedAndMakeContained(node, node->gtOp.gtOp2);
break;
#elif defined(_TARGET_XARCH_)
case GT_XADD:
if (node->IsUnusedValue())
{
node->ClearUnusedValue();
// Make sure the types are identical, since the node type is changed to VOID
// CodeGen relies on op2's type to determine the instruction size.
// Note that the node type cannot be a small int but the data operand can.
assert(genActualType(node->gtGetOp2()->TypeGet()) == node->TypeGet());
node->SetOper(GT_LOCKADD);
node->gtType = TYP_VOID;
CheckImmedAndMakeContained(node, node->gtGetOp2());
}
break;
#endif
#ifndef _TARGET_ARM_
// TODO-ARM-CQ: We should contain this as long as the offset fits.
case GT_OBJ:
if (node->AsObj()->Addr()->OperIsLocalAddr())
{
node->AsObj()->Addr()->SetContained();
}
break;
#endif // !_TARGET_ARM_
default:
break;
}
return node->gtNext;
}
/** -- Switch Lowering --
* The main idea of switch lowering is to make the register requirements of this node transparent to LSRA
* downstream. Although the switch instruction is inherently a control statement, which the JIT
* represents as a simple tree node, by the time we actually generate code for it we end up
* emitting instructions that modify the flow of execution, which imposes complicated
* register requirements and lifetimes.
*
* So, for the purpose of LSRA, we want a more detailed specification of what a switch node actually
* means and, more importantly, which registers we need and when for each instruction we want to issue,
* so that they can be correctly allocated downstream.
*
* For this purpose, this procedure performs switch lowering in two different ways:
*
* a) Represent the switch statement as a zero-based jump table. This means that for every destination
* of the switch we store that destination in an array of addresses; the code generator emits
* a data section where this array lives and emits code that, based on the switch index, indirects and
* jumps to the destination specified in the jump table.
*
* For this transformation we introduce a new GT node called GT_SWITCH_TABLE that is a specialization of the switch
* node for jump table based switches.
* The overall structure of a GT_SWITCH_TABLE is:
*
* GT_SWITCH_TABLE
* |_________ localVar (a temporary local that holds the switch index)
* |_________ jumpTable (this is a special node that holds the address of the jump table array)
*
* Now, the way we morph a GT_SWITCH node into this lowered switch table node form is the following:
*
* Input: GT_SWITCH (inside a basic block whose Branch Type is BBJ_SWITCH)
* |_____ expr (an arbitrarily complex GT_NODE that represents the switch index)
*
* This gets transformed into the following statements inside a BBJ_COND basic block (the target is
* the default case of the switch when the conditional evaluates to true).
*
* ----- original block, transformed
* GT_ASG
* |_____ tempLocal (a new temporary local variable used to store the switch index)
* |_____ expr (the index expression)
*
* GT_JTRUE
* |_____ GT_COND
* |_____ GT_GE
* |___ Int_Constant (This constant is the index of the default case
* that happens to be the highest index in the jump table).
* |___ tempLocal (The local variable where we stored the index expression).
*
* ----- new basic block
* GT_SWITCH_TABLE
* |_____ tempLocal
* |_____ jumpTable (a new jump table node that now LSRA can allocate registers for explicitly
* and LinearCodeGen will be responsible to generate downstream).
*
* This way there are no implicit temporaries.
*
* b) For small switches, we will actually morph them into a series of conditionals of the form
* if (case falls into the default){ goto jumpTable[jumpCount - 1]; // last entry in the jump table is the default case }
* (For the default case conditional, we construct the exact same code as in the jump table case.)
* else if (case == firstCase){ goto jumpTable[0]; }
* else if (case == secondCase) { goto jumpTable[1]; } and so on.
*
* This transformation is of course made in JIT-IR, not downstream at the CodeGen level, so we no longer
* require internal temporaries to maintain the index we're evaluating, and we reuse existing code from
* LinearCodeGen to implement this instead of implementing all the control flow constructs using InstrDscs and
* InstrGroups downstream.
*/
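//
// Illustrative sketch (assumed source, not actual JIT output): for a small switch
// such as
//
//     switch (x) { case 0: goto L0; case 1: goto L1; default: goto LDef; }
//
// the compare/branch expansion in (b) is roughly equivalent to:
//
//     tmp = x;
//     if ((unsigned)tmp > 1) goto LDef;   // shared default-case guard (used by both (a) and (b))
//     if (tmp == 0)          goto L0;     // one compare/branch block per case
//     goto L1;                            // the last case needs no compare
//
// with each compare/branch placed in its own BBJ_COND block by LowerSwitch below.
//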
GenTree* Lowering::LowerSwitch(GenTree* node)
{
unsigned jumpCnt;
unsigned targetCnt;
BasicBlock** jumpTab;
assert(node->gtOper == GT_SWITCH);
// The first step is to build the default case conditional construct that is
// shared between both kinds of expansion of the switch node.
// To avoid confusion, we'll alias m_block to originalSwitchBB
// that represents the node we're morphing.
BasicBlock* originalSwitchBB = m_block;
LIR::Range& switchBBRange = LIR::AsRange(originalSwitchBB);
// jumpCnt is the number of elements in the jump table array.
// jumpTab is the actual pointer to the jump table array.
// targetCnt is the number of unique targets in the jump table array.
jumpCnt = originalSwitchBB->bbJumpSwt->bbsCount;
jumpTab = originalSwitchBB->bbJumpSwt->bbsDstTab;
targetCnt = originalSwitchBB->NumSucc(comp);
// GT_SWITCH must be a top-level node with no use.
#ifdef DEBUG
{
LIR::Use use;
assert(!switchBBRange.TryGetUse(node, &use));
}
#endif
JITDUMP("Lowering switch BB%02u, %d cases\n", originalSwitchBB->bbNum, jumpCnt);
// Handle a degenerate case: if the switch has only a default case, just convert it
// to an unconditional branch. This should only happen in minopts or with debuggable
// code.
if (targetCnt == 1)
{
JITDUMP("Lowering switch BB%02u: single target; converting to BBJ_ALWAYS\n", originalSwitchBB->bbNum);
noway_assert(comp->opts.MinOpts() || comp->opts.compDbgCode);
if (originalSwitchBB->bbNext == jumpTab[0])
{
originalSwitchBB->bbJumpKind = BBJ_NONE;
originalSwitchBB->bbJumpDest = nullptr;
}
else
{
originalSwitchBB->bbJumpKind = BBJ_ALWAYS;
originalSwitchBB->bbJumpDest = jumpTab[0];
}
// Remove extra predecessor links if there was more than one case.
for (unsigned i = 1; i < jumpCnt; ++i)
{
(void)comp->fgRemoveRefPred(jumpTab[i], originalSwitchBB);
}
// We have to get rid of the GT_SWITCH node but a child might have side effects so just assign
// the result of the child subtree to a temp.
GenTree* rhs = node->gtOp.gtOp1;
unsigned lclNum = comp->lvaGrabTemp(true DEBUGARG("Lowering is creating a new local variable"));
comp->lvaSortAgain = true;
comp->lvaTable[lclNum].lvType = rhs->TypeGet();
comp->lvaTable[lclNum].setLvRefCnt(1);
GenTreeLclVar* store =
new (comp, GT_STORE_LCL_VAR) GenTreeLclVar(GT_STORE_LCL_VAR, rhs->TypeGet(), lclNum, BAD_IL_OFFSET);
store->gtOp1 = rhs;
store->gtFlags = (rhs->gtFlags & GTF_COMMON_MASK);
store->gtFlags |= GTF_VAR_DEF;
switchBBRange.InsertAfter(node, store);
switchBBRange.Remove(node);
return store;
}
noway_assert(jumpCnt >= 2);
// Spill the argument to the switch node into a local so that it can be used later.
unsigned blockWeight = originalSwitchBB->getBBWeight(comp);
LIR::Use use(switchBBRange, &(node->gtOp.gtOp1), node);
ReplaceWithLclVar(use);
// GT_SWITCH(indexExpression) is now two statements:
// 1. a statement containing 'asg' (for temp = indexExpression)
// 2. and a statement with GT_SWITCH(temp)
assert(node->gtOper == GT_SWITCH);
GenTree* temp = node->gtOp.gtOp1;
assert(temp->gtOper == GT_LCL_VAR);
unsigned tempLclNum = temp->gtLclVarCommon.gtLclNum;
LclVarDsc* tempVarDsc = comp->lvaTable + tempLclNum;
var_types tempLclType = temp->TypeGet();
BasicBlock* defaultBB = jumpTab[jumpCnt - 1];
BasicBlock* followingBB = originalSwitchBB->bbNext;
/* Is the number of cases right for a test and jump switch? */
const bool fFirstCaseFollows = (followingBB == jumpTab[0]);
const bool fDefaultFollows = (followingBB == defaultBB);
unsigned minSwitchTabJumpCnt = 2; // table is better than just 2 cmp/jcc
// This means really just a single cmp/jcc (aka a simple if/else)
if (fFirstCaseFollows || fDefaultFollows)
{
minSwitchTabJumpCnt++;
}
#if defined(_TARGET_ARM_)
// On ARM for small switch tables we will
// generate a sequence of compare and branch instructions
// because the code to load the base of the switch
// table is huge and hideous due to the relocation... :(
minSwitchTabJumpCnt += 2;
#endif // _TARGET_ARM_
// Once we have the temporary variable, we construct the conditional branch for
// the default case. As stated above, this conditional is being shared between
// both GT_SWITCH lowering code paths.
// This condition is of the form: if (temp > jumpTableLength - 2){ goto jumpTable[jumpTableLength - 1]; }
GenTree* gtDefaultCaseCond = comp->gtNewOperNode(GT_GT, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
comp->gtNewIconNode(jumpCnt - 2, genActualType(tempLclType)));
// Make sure we perform an unsigned comparison, just in case the switch index in 'temp'
// is now less than zero (that would also hit the default case).
gtDefaultCaseCond->gtFlags |= GTF_UNSIGNED;
/* Increment the lvRefCnt and lvRefCntWtd for temp */
tempVarDsc->incRefCnts(blockWeight, comp);
GenTree* gtDefaultCaseJump = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtDefaultCaseCond);
gtDefaultCaseJump->gtFlags = node->gtFlags;
LIR::Range condRange = LIR::SeqTree(comp, gtDefaultCaseJump);
switchBBRange.InsertAtEnd(std::move(condRange));
BasicBlock* afterDefaultCondBlock = comp->fgSplitBlockAfterNode(originalSwitchBB, condRange.LastNode());
// afterDefaultCondBlock is now the switch, and all the switch targets have it as a predecessor.
// originalSwitchBB is now a BBJ_NONE, and there is a predecessor edge in afterDefaultCondBlock
// representing the fall-through flow from originalSwitchBB.
assert(originalSwitchBB->bbJumpKind == BBJ_NONE);
assert(originalSwitchBB->bbNext == afterDefaultCondBlock);
assert(afterDefaultCondBlock->bbJumpKind == BBJ_SWITCH);
assert(afterDefaultCondBlock->bbJumpSwt->bbsHasDefault);
assert(afterDefaultCondBlock->isEmpty()); // Nothing here yet.
// The GT_SWITCH code is still in originalSwitchBB (it will be removed later).
// Turn originalSwitchBB into a BBJ_COND.
originalSwitchBB->bbJumpKind = BBJ_COND;
originalSwitchBB->bbJumpDest = jumpTab[jumpCnt - 1];
// Fix the pred for the default case: the default block target still has originalSwitchBB
// as a predecessor, but fgSplitBlockAfterNode() above moved all predecessors to point
// to afterDefaultCondBlock.
flowList* oldEdge = comp->fgRemoveRefPred(jumpTab[jumpCnt - 1], afterDefaultCondBlock);
comp->fgAddRefPred(jumpTab[jumpCnt - 1], originalSwitchBB, oldEdge);
bool useJumpSequence = jumpCnt < minSwitchTabJumpCnt;
#if defined(_TARGET_UNIX_) && defined(_TARGET_ARM_)
// Force using an inlined jump sequence instead of switch table generation.
// The switch jump table is generated with incorrect values in the CoreRT case,
// so any large switch will crash after loading any such value into PC.
// This is likely because we use absolute addressing
// instead of relative, whereas CoreRT as a rule uses relative
// addressing when generating an executable.
// See also https://github.com/dotnet/coreclr/issues/13194
// Also https://github.com/dotnet/coreclr/pull/13197
useJumpSequence = useJumpSequence || comp->IsTargetAbi(CORINFO_CORERT_ABI);
#endif // defined(_TARGET_UNIX_) && defined(_TARGET_ARM_)
// If we originally had 2 unique successors, check to see whether there is a unique
// non-default case, in which case we can eliminate the switch altogether.
// Note that the single unique successor case is handled above.
BasicBlock* uniqueSucc = nullptr;
if (targetCnt == 2)
{
uniqueSucc = jumpTab[0];
noway_assert(jumpCnt >= 2);
for (unsigned i = 1; i < jumpCnt - 1; i++)
{
if (jumpTab[i] != uniqueSucc)
{
uniqueSucc = nullptr;
break;
}
}
}
if (uniqueSucc != nullptr)
{
// If the unique successor immediately follows this block, we have nothing to do -
// it will simply fall-through after we remove the switch, below.
// Otherwise, make this a BBJ_ALWAYS.
// Now, fixup the predecessor links to uniqueSucc. In the original jumpTab:
// jumpTab[jumpCnt - 1] was the default target, which we handled above,
// jumpTab[0] is the first target, and we'll leave that predecessor link.
// Remove any additional predecessor links to uniqueSucc.
for (unsigned i = 1; i < jumpCnt - 1; ++i)
{
assert(jumpTab[i] == uniqueSucc);
(void)comp->fgRemoveRefPred(uniqueSucc, afterDefaultCondBlock);
}
if (afterDefaultCondBlock->bbNext == uniqueSucc)
{
afterDefaultCondBlock->bbJumpKind = BBJ_NONE;
afterDefaultCondBlock->bbJumpDest = nullptr;
}
else
{
afterDefaultCondBlock->bbJumpKind = BBJ_ALWAYS;
afterDefaultCondBlock->bbJumpDest = uniqueSucc;
}
}
// If the number of possible destinations is small enough, we proceed to expand the switch
// into a series of conditional branches, otherwise we follow the jump table based switch
// transformation.
else if (useJumpSequence || comp->compStressCompile(Compiler::STRESS_SWITCH_CMP_BR_EXPANSION, 50))
{
// Lower the switch into a series of compare and branch IR trees.
//
// In this case we will morph the node in the following way:
// 1. Generate a JTRUE statement to evaluate the default case. (This happens above.)
// 2. Start splitting the switch basic block into subsequent basic blocks, each of which will contain
// a statement that is responsible for performing a comparison of the table index and conditional
// branch if equal.
JITDUMP("Lowering switch BB%02u: using compare/branch expansion\n", originalSwitchBB->bbNum);
// We'll use 'afterDefaultCondBlock' for the first conditional. After that, we'll add new
// blocks. If we end up not needing it at all (say, if all the non-default cases just fall through),
// we'll delete it.
bool fUsedAfterDefaultCondBlock = false;
BasicBlock* currentBlock = afterDefaultCondBlock;
LIR::Range* currentBBRange = &LIR::AsRange(currentBlock);
// Walk entries 0 through jumpCnt - 2 (all non-default cases). If a case target follows, ignore it and let it fall through.
// If no case target follows, the last one doesn't need to be a compare/branch: it can be an
// unconditional branch.
bool fAnyTargetFollows = false;
for (unsigned i = 0; i < jumpCnt - 1; ++i)
{
assert(currentBlock != nullptr);
// Remove the switch from the predecessor list of this case target's block.
// We'll add the proper new predecessor edge later.
flowList* oldEdge = comp->fgRemoveRefPred(jumpTab[i], afterDefaultCondBlock);
if (jumpTab[i] == followingBB)
{
// This case label follows the switch; let it fall through.
fAnyTargetFollows = true;
continue;
}
// We need a block to put in the new compare and/or branch.
// If we haven't used the afterDefaultCondBlock yet, then use that.
if (fUsedAfterDefaultCondBlock)
{
BasicBlock* newBlock = comp->fgNewBBafter(BBJ_NONE, currentBlock, true);
comp->fgAddRefPred(newBlock, currentBlock); // The fall-through predecessor.
currentBlock = newBlock;
currentBBRange = &LIR::AsRange(currentBlock);
}
else
{
assert(currentBlock == afterDefaultCondBlock);
fUsedAfterDefaultCondBlock = true;
}
// We're going to have a branch, either a conditional or unconditional,
// to the target. Set the target.
currentBlock->bbJumpDest = jumpTab[i];
// Wire up the predecessor list for the "branch" case.
comp->fgAddRefPred(jumpTab[i], currentBlock, oldEdge);
if (!fAnyTargetFollows && (i == jumpCnt - 2))
{
// We're processing the last one, and there is no fall through from any case
// to the following block, so we can use an unconditional branch to the final
// case: there is no need to compare against the case index, since it's
// guaranteed to be taken (since the default case was handled first, above).
currentBlock->bbJumpKind = BBJ_ALWAYS;
}
else
{
// Otherwise, it's a conditional branch. Set the branch kind, then add the
// condition statement.
currentBlock->bbJumpKind = BBJ_COND;
// Now, build the conditional statement for the current case that is
// being evaluated:
// GT_JTRUE
// |__ GT_COND
// |____GT_EQ
// |____ (switchIndex) (The temp variable)
// |____ (ICon) (The actual case constant)
GenTree* gtCaseCond = comp->gtNewOperNode(GT_EQ, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
comp->gtNewIconNode(i, tempLclType));
/* Increment the lvRefCnt and lvRefCntWtd for temp */
tempVarDsc->incRefCnts(blockWeight, comp);
GenTree* gtCaseBranch = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtCaseCond);
LIR::Range caseRange = LIR::SeqTree(comp, gtCaseBranch);
currentBBRange->InsertAtEnd(std::move(caseRange));
}
}
if (fAnyTargetFollows)
{
// There is a fall-through to the following block. In the loop
// above, we deleted all the predecessor edges from the switch.
// In this case, we need to add one back.
comp->fgAddRefPred(currentBlock->bbNext, currentBlock);
}
if (!fUsedAfterDefaultCondBlock)
{
// All the cases were fall-through! We don't need this block.
// Convert it from BBJ_SWITCH to BBJ_NONE and unset the BBF_DONT_REMOVE flag
// so fgRemoveBlock() doesn't complain.
JITDUMP("Lowering switch BB%02u: all switch cases were fall-through\n", originalSwitchBB->bbNum);
assert(currentBlock == afterDefaultCondBlock);
assert(currentBlock->bbJumpKind == BBJ_SWITCH);
currentBlock->bbJumpKind = BBJ_NONE;
currentBlock->bbFlags &= ~BBF_DONT_REMOVE;
comp->fgRemoveBlock(currentBlock, /* unreachable */ false); // It's an empty block.
}
}
else
{
// At this point the default case has already been handled and we need to generate a jump
// table based switch or a bit test based switch at the end of afterDefaultCondBlock. Both
// switch variants need the switch value so create the necessary LclVar node here.
GenTree* switchValue = comp->gtNewLclvNode(tempLclNum, tempLclType);
LIR::Range& switchBlockRange = LIR::AsRange(afterDefaultCondBlock);
tempVarDsc->incRefCnts(blockWeight, comp);
switchBlockRange.InsertAtEnd(switchValue);
// Try generating a bit test based switch first,
// if that's not possible a jump table based switch will be generated.
if (!TryLowerSwitchToBitTest(jumpTab, jumpCnt, targetCnt, afterDefaultCondBlock, switchValue))
{
JITDUMP("Lowering switch BB%02u: using jump table expansion\n", originalSwitchBB->bbNum);
#ifdef _TARGET_64BIT_
if (tempLclType != TYP_I_IMPL)
{
// SWITCH_TABLE expects the switch value (the index into the jump table) to be TYP_I_IMPL.
// Note that the switch value is unsigned so the cast should be unsigned as well.
switchValue = comp->gtNewCastNode(TYP_I_IMPL, switchValue, true, TYP_U_IMPL);
switchBlockRange.InsertAtEnd(switchValue);
}
#endif
GenTree* switchTable = comp->gtNewJmpTableNode();
GenTree* switchJump = comp->gtNewOperNode(GT_SWITCH_TABLE, TYP_VOID, switchValue, switchTable);
switchBlockRange.InsertAfter(switchValue, switchTable, switchJump);
// this block no longer branches to the default block
afterDefaultCondBlock->bbJumpSwt->removeDefault();
}
comp->fgInvalidateSwitchDescMapEntry(afterDefaultCondBlock);
}
GenTree* next = node->gtNext;
// Get rid of the GT_SWITCH(temp).
switchBBRange.Remove(node->gtOp.gtOp1);
switchBBRange.Remove(node);
return next;
}
//------------------------------------------------------------------------
// TryLowerSwitchToBitTest: Attempts to transform a jump table switch into a bit test.
//
// Arguments:
// jumpTable - The jump table
// jumpCount - The number of blocks in the jump table
// targetCount - The number of distinct blocks in the jump table
// bbSwitch - The switch block
// switchValue - A LclVar node that provides the switch value
//
// Return value:
// true if the switch has been lowered to a bit test
//
// Notes:
// If the jump table contains less than 32 (64 on 64 bit targets) entries and there
// are at most 2 distinct jump targets then the jump table can be converted to a word
// of bits where a 0 bit corresponds to one jump target and a 1 bit corresponds to the
// other jump target. Instead of the indirect jump a BT-JCC sequence is used to jump
// to the appropriate target:
// mov eax, 245 ; jump table converted to a "bit table"
// bt eax, ebx ; ebx is supposed to contain the switch value
// jc target1
// target0:
// ...
// target1:
// Such code is both shorter and faster (in part due to the removal of a memory load)
// than the traditional jump table based code. And of course, it also avoids the need
// to emit the jump table itself that can reach up to 256 bytes (for 64 entries).
//
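// Illustrative reading of the example above (assuming an 8-entry bit table):
// 245 is 0b11110101, so switch values 0, 2, 4, 5, 6 and 7 have their bit set
// and branch to target1, while values 1 and 3 fall through to target0.
// BT copies the selected bit into the carry flag and JC branches on it.
//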
bool Lowering::TryLowerSwitchToBitTest(
BasicBlock* jumpTable[], unsigned jumpCount, unsigned targetCount, BasicBlock* bbSwitch, GenTree* switchValue)
{
#ifndef _TARGET_XARCH_
// Other architectures may use this if they substitute GT_BT with equivalent code.
return false;
#else
assert(jumpCount >= 2);
assert(targetCount >= 2);
assert(bbSwitch->bbJumpKind == BBJ_SWITCH);
assert(switchValue->OperIs(GT_LCL_VAR));
//
// Quick check to see if it's worth going through the jump table. The bit test switch supports
// up to 2 targets but targetCount also includes the default block so we need to allow 3 targets.
// We'll ensure that there are only 2 targets when building the bit table.
//
if (targetCount > 3)
{
return false;
}
//
// The number of bits in the bit table is the same as the number of jump table entries. But the
// jump table also includes the default target (at the end) so we need to ignore it. The default
// has already been handled by a JTRUE(GT(switchValue, jumpCount - 2)) that LowerSwitch generates.
//
const unsigned bitCount = jumpCount - 1;
if (bitCount > (genTypeSize(TYP_I_IMPL) * 8))
{
return false;
}
//
// Build a bit table where a bit set to 0 corresponds to bbCase0 and a bit set to 1 corresponds to
// bbCase1. Simply use the first block in the jump table as bbCase1, later we can invert the bit
// table and/or swap the blocks if it's beneficial.
//
BasicBlock* bbCase0 = nullptr;
BasicBlock* bbCase1 = jumpTable[0];
size_t bitTable = 1;
for (unsigned bitIndex = 1; bitIndex < bitCount; bitIndex++)
{
if (jumpTable[bitIndex] == bbCase1)
{
bitTable |= (size_t(1) << bitIndex);
}
else if (bbCase0 == nullptr)
{
bbCase0 = jumpTable[bitIndex];
}
else if (jumpTable[bitIndex] != bbCase0)
{
// If it's neither bbCase0 nor bbCase1 then it means we have 3 targets. There can't be more
// than 3 because of the check at the start of the function.
assert(targetCount == 3);
return false;
}
}
//
// One of the case blocks has to follow the switch block. This requirement could be avoided
// by adding a BBJ_ALWAYS block after the switch block but doing that sometimes negatively
// impacts register allocation.
//
if ((bbSwitch->bbNext != bbCase0) && (bbSwitch->bbNext != bbCase1))
{
return false;
}
#ifdef _TARGET_64BIT_
//
// See if we can avoid an 8 byte immediate on 64 bit targets. If all upper 32 bits are 1
// then inverting the bit table will make them 0 so that the table now fits in 32 bits.
// Note that this does not change the number of bits in the bit table, it just takes
// advantage of the fact that loading a 32 bit immediate into a 64 bit register zero
// extends the immediate value to 64 bit.
//
if (~bitTable <= UINT32_MAX)
{
bitTable = ~bitTable;
std::swap(bbCase0, bbCase1);
}
#endif
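// Illustrative example (hypothetical values): if bitTable were 0xFFFFFFFFFFFFFFF5,
// then ~bitTable is 0xA, which fits in 32 bits; inverting flips the meaning of
// every bit, so swapping bbCase0 and bbCase1 keeps the branch semantics unchanged.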
//
// Rewire the blocks as needed and figure out the condition to use for JCC.
//
genTreeOps bbSwitchCondition = GT_NONE;
bbSwitch->bbJumpKind = BBJ_COND;
comp->fgRemoveAllRefPreds(bbCase1, bbSwitch);
comp->fgRemoveAllRefPreds(bbCase0, bbSwitch);
if (bbSwitch->bbNext == bbCase0)
{
// GT_LT + GTF_UNSIGNED generates JC so we jump to bbCase1 when the bit is set
bbSwitchCondition = GT_LT;
bbSwitch->bbJumpDest = bbCase1;
comp->fgAddRefPred(bbCase0, bbSwitch);
comp->fgAddRefPred(bbCase1, bbSwitch);
}
else
{
assert(bbSwitch->bbNext == bbCase1);
// GT_GE + GTF_UNSIGNED generates JNC so we jump to bbCase0 when the bit is not set
bbSwitchCondition = GT_GE;
bbSwitch->bbJumpDest = bbCase0;
comp->fgAddRefPred(bbCase0, bbSwitch);
comp->fgAddRefPred(bbCase1, bbSwitch);
}
//
// Append BT(bitTable, switchValue) and JCC(condition) to the switch block.
//
var_types bitTableType = (bitCount <= (genTypeSize(TYP_INT) * 8)) ? TYP_INT : TYP_LONG;
GenTree* bitTableIcon = comp->gtNewIconNode(bitTable, bitTableType);
GenTree* bitTest = comp->gtNewOperNode(GT_BT, TYP_VOID, bitTableIcon, switchValue);
bitTest->gtFlags |= GTF_SET_FLAGS;
GenTreeCC* jcc = new (comp, GT_JCC) GenTreeCC(GT_JCC, bbSwitchCondition);
jcc->gtFlags |= GTF_UNSIGNED | GTF_USE_FLAGS;
LIR::AsRange(bbSwitch).InsertAfter(switchValue, bitTableIcon, bitTest, jcc);
return true;
#endif // _TARGET_XARCH_
}
// NOTE: this method deliberately does not update the call arg table. It must only
// be used by NewPutArg and LowerArg; these functions are responsible for updating
// the call arg table as necessary.
void Lowering::ReplaceArgWithPutArgOrBitcast(GenTree** argSlot, GenTree* putArgOrBitcast)
{
assert(argSlot != nullptr);
assert(*argSlot != nullptr);
assert(putArgOrBitcast->OperIsPutArg() || putArgOrBitcast->OperIs(GT_BITCAST));
GenTree* arg = *argSlot;
// Replace the argument with the putarg/copy
*argSlot = putArgOrBitcast;
putArgOrBitcast->gtOp.gtOp1 = arg;
// Insert the putarg/copy into the block
BlockRange().InsertAfter(arg, putArgOrBitcast);
}
//------------------------------------------------------------------------
// NewPutArg: rewrites the tree to put an arg in a register or on the stack.
//
// Arguments:
// call - the call whose arg is being rewritten.
// arg - the arg being rewritten.
// info - the fgArgTabEntry information for the argument.
// type - the type of the argument.
//
// Return Value:
// The new tree that was created to put the arg in the right place
// or the incoming arg if the arg tree was not rewritten.
//
// Assumptions:
// call, arg, and info must be non-null.
//
// Notes:
// For System V systems with native struct passing (i.e. UNIX_AMD64_ABI defined)
// this method allocates a single GT_PUTARG_REG for one-eightbyte structs and a GT_FIELD_LIST of two GT_PUTARG_REGs
// for two-eightbyte structs.
//
// For stack-passed structs the method generates a GT_PUTARG_STK tree. For System V systems with native struct passing
// (i.e. UNIX_AMD64_ABI defined) this method also sets the GC pointer count and the pointer
// layout object, so the codegen of the GT_PUTARG_STK can use this to optimize copying to the stack by value
// (using block copy primitives for non-GC pointers and a single TARGET_POINTER_SIZE copy with GC info recorded).
//
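// Illustrative example (UNIX_AMD64_ABI, assuming a two-eightbyte struct passed in
// the first two available integer registers): the incoming GT_FIELD_LIST is kept,
// and each field is wrapped in a GT_PUTARG_REG bound to its register:
//
//     GT_FIELD_LIST
//       field 0: GT_PUTARG_REG (first eightbyte)
//       field 1: GT_PUTARG_REG (second eightbyte)
//
// The GT_FIELD_LIST node itself is not replaced in this case.
//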
GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* info, var_types type)
{
assert(call != nullptr);
assert(arg != nullptr);
assert(info != nullptr);
GenTree* putArg = nullptr;
bool updateArgTable = true;
bool isOnStack = true;
isOnStack = info->regNum == REG_STK;
#ifdef _TARGET_ARMARCH_
// Mark the argument contained when we pass a struct.
// GT_FIELD_LIST is always marked contained when it is generated.
if (type == TYP_STRUCT)
{
arg->SetContained();
if ((arg->OperGet() == GT_OBJ) && (arg->AsObj()->Addr()->OperGet() == GT_LCL_VAR_ADDR))
{
MakeSrcContained(arg, arg->AsObj()->Addr());
}
}
#endif
#if FEATURE_ARG_SPLIT
// Struct can be split into register(s) and stack on ARM
if (info->isSplit)
{
assert(arg->OperGet() == GT_OBJ || arg->OperGet() == GT_FIELD_LIST);
// TODO: Need to check correctness for FastTailCall
if (call->IsFastTailCall())
{
#ifdef _TARGET_ARM_
NYI_ARM("lower: struct argument by fast tail call");
#endif // _TARGET_ARM_
}
putArg = new (comp, GT_PUTARG_SPLIT)
GenTreePutArgSplit(arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), info->numRegs,
call->IsFastTailCall(), call);
// If the struct argument is morphed to GT_FIELD_LIST node(s),
// the GC info is known from the type of each GT_FIELD_LIST node,
// so we skip setting GC pointer info.
//
GenTreePutArgSplit* argSplit = putArg->AsPutArgSplit();
for (unsigned regIndex = 0; regIndex < info->numRegs; regIndex++)
{
argSplit->SetRegNumByIdx(info->getRegNum(regIndex), regIndex);
}
if (arg->OperGet() == GT_OBJ)
{
BYTE* gcLayout = nullptr;
unsigned numRefs = 0;
GenTreeObj* argObj = arg->AsObj();
if (argObj->IsGCInfoInitialized())
{
gcLayout = argObj->gtGcPtrs;
numRefs = argObj->GetGcPtrCount();
}
else
{
// Set GC Pointer info
gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots + info->numRegs];
numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
argSplit->setGcPointers(numRefs, gcLayout);
}
// Set type of registers
for (unsigned index = 0; index < info->numRegs; index++)
{
var_types regType = comp->getJitGCType(gcLayout[index]);
argSplit->m_regType[index] = regType;
}
}
else
{
GenTreeFieldList* fieldListPtr = arg->AsFieldList();
for (unsigned index = 0; index < info->numRegs; fieldListPtr = fieldListPtr->Rest(), index++)
{
var_types regType = fieldListPtr->gtGetOp1()->TypeGet();
argSplit->m_regType[index] = regType;
// Clear the register assignments on the fieldList nodes, as these are contained.
fieldListPtr->gtRegNum = REG_NA;
}
}
}
else
#endif // FEATURE_ARG_SPLIT
{
if (!isOnStack)
{
#if FEATURE_MULTIREG_ARGS
if ((info->numRegs > 1) && (arg->OperGet() == GT_FIELD_LIST))
{
assert(arg->OperGet() == GT_FIELD_LIST);
assert(arg->AsFieldList()->IsFieldListHead());
unsigned int regIndex = 0;
for (GenTreeFieldList* fieldListPtr = arg->AsFieldList(); fieldListPtr != nullptr;
fieldListPtr = fieldListPtr->Rest())
{
regNumber argReg = info->getRegNum(regIndex);
GenTree* curOp = fieldListPtr->gtOp.gtOp1;
var_types curTyp = curOp->TypeGet();
// Create a new GT_PUTARG_REG node with op1
GenTree* newOper = comp->gtNewPutArgReg(curTyp, curOp, argReg);
// Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST
ReplaceArgWithPutArgOrBitcast(&fieldListPtr->gtOp.gtOp1, newOper);
regIndex++;
// Initialize all the gtRegNum's since the list won't be traversed in an LIR traversal.
fieldListPtr->gtRegNum = REG_NA;
}
// Just return arg. The GT_FIELD_LIST is not replaced.
// Nothing more to do.
return arg;
}
else
#endif // FEATURE_MULTIREG_ARGS
{
putArg = comp->gtNewPutArgReg(type, arg, info->regNum);
}
}
else
{
// Mark this one as a tail call arg if it is a fast tail call.
// This provides the info to put this argument in the incoming arg area slot
// instead of the outgoing arg area slot.
// Make sure state is correct. The PUTARG_STK has TYP_VOID, as it doesn't produce
// a result. So the type of its operand must be the correct type to push on the stack.
// For a FIELD_LIST, this will be the type of the field (not the type of the arg),
// but otherwise it is generally the type of the operand.
info->checkIsStruct();
if ((arg->OperGet() != GT_FIELD_LIST))
{
#if defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
if (type == TYP_SIMD12)
{
assert(info->numSlots == 3);
}
else
#endif // defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
{
assert(genActualType(arg->TypeGet()) == type);
}
}
putArg =
new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg,
info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots),
call->IsFastTailCall(), call);
#ifdef FEATURE_PUT_STRUCT_ARG_STK
// If the ArgTabEntry indicates that this arg is a struct,
// get and store the number of slots that are references.
// This is later used in the codegen for the PUT_ARG_STK implementation
// for structs to decide whether and how many single eight-byte copies
// should be done (only for reference slots), so gcinfo is emitted.
// For non-reference slots faster/smaller-size instructions are used -
// pair copying using XMM registers or rep mov instructions.
if (info->isStruct)
{
// We use GT_OBJ only for non-lclVar, non-SIMD, non-FIELD_LIST struct arguments.
if (arg->OperIsLocal())
{
// This must have a type with a known size (SIMD or has been morphed to a primitive type).
assert(arg->TypeGet() != TYP_STRUCT);
}
else if (arg->OperIs(GT_OBJ))
{
unsigned numRefs = 0;
BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
assert(!varTypeIsSIMD(arg));
numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
#ifdef _TARGET_X86_
// On x86 the VM lies about the type of a struct containing a pointer-sized
// integer field by returning the type of its field as the type of the struct.
// Such a struct can be passed in a register depending on its position in the
// parameter list. The VM does this unwrapping only one level deep, and therefore
// a type like Struct Foo { Struct Bar { int f } } always needs to be
// passed on the stack. Also, the VM doesn't lie about the type of such a struct
// when it is a field of another struct; that is, the VM doesn't lie about
// the type of Foo.Bar
//
// We now support the promotion of fields that are of type struct.
// However we only support a limited case where the struct field has a
// single field, and that single field must be a scalar type. Say the Foo.Bar
// field is getting passed as a parameter to a call; since it is a TYP_STRUCT,
// per the x86 ABI it should always be passed on the stack. Therefore the GenTree
// node under a PUTARG_STK could be GT_OBJ(GT_LCL_VAR_ADDR(v1)), where
// local v1 could be a promoted field standing for Foo.Bar. Note that
// the type of v1 will be the type of field of Foo.Bar.f when Foo is
// promoted. That is v1 will be a scalar type. In this case we need to
// pass v1 on stack instead of in a register.
//
// TODO-PERF: replace GT_OBJ(GT_LCL_VAR_ADDR(v1)) with v1 if v1 is
// a scalar type and the width of GT_OBJ matches the type size of v1.
// Note that this cannot be done till call node arguments are morphed
// because we should not lose the fact that the type of argument is
// a struct so that the arg gets correctly marked to be passed on stack.
GenTree* objOp1 = arg->gtGetOp1();
if (objOp1->OperGet() == GT_LCL_VAR_ADDR)
{
unsigned lclNum = objOp1->AsLclVarCommon()->GetLclNum();
if (comp->lvaTable[lclNum].lvType != TYP_STRUCT)
{
comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(Compiler::DNER_VMNeedsStackAddr));
}
}
#endif // _TARGET_X86_
}
else if (!arg->OperIs(GT_FIELD_LIST))
{
assert(varTypeIsSIMD(arg) || (info->numSlots == 1));
}
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
}
}
JITDUMP("new node is : ");
DISPNODE(putArg);
JITDUMP("\n");
if (arg->gtFlags & GTF_LATE_ARG)
{
putArg->gtFlags |= GTF_LATE_ARG;
}
else if (updateArgTable)
{
info->node = putArg;
}
return putArg;
}
//------------------------------------------------------------------------
// LowerArg: Lower one argument of a call. This entails splicing a "putarg" node between
// the argument evaluation and the call. This is the point at which the source is
// consumed and the value transitions from control of the register allocator to the calling
// convention.
//
// Arguments:
// call - The call node
// ppArg - Pointer to the call argument pointer. We might replace the call argument by
// changing *ppArg.
//
// Return Value:
// None.
//
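// Illustrative sketch: for a register-passed int argument the effect is to splice
// a GT_PUTARG_REG between the argument value and the call:
//
//     before:  t1 = GT_LCL_VAR V01            ... GT_CALL(..., t1, ...)
//     after:   t1 = GT_LCL_VAR V01
//              t2 = GT_PUTARG_REG(t1)         ... GT_CALL(..., t2, ...)
//
// Stack-passed arguments get a GT_PUTARG_STK node instead (see NewPutArg above).
//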
void Lowering::LowerArg(GenTreeCall* call, GenTree** ppArg)
{
GenTree* arg = *ppArg;
JITDUMP("lowering arg : ");
DISPNODE(arg);
// No assignments should remain by Lowering.
assert(!arg->OperIsAssignment());
assert(!arg->OperIsPutArgStk());
// Assignments/stores at this level are not really placing an argument.
// They are setting up temporary locals that will later be placed into
// outgoing regs or stack.
if (arg->OperIsStore() || arg->IsArgPlaceHolderNode() || arg->IsNothingNode() || arg->OperIsCopyBlkOp())
{
return;
}
fgArgTabEntry* info = comp->gtArgEntryByNode(call, arg);
assert(info->node == arg);
var_types type = arg->TypeGet();
if (varTypeIsSmall(type))
{
// Normalize 'type', it represents the item that we will be storing in the Outgoing Args
type = TYP_INT;
}
#if defined(FEATURE_SIMD)
#if defined(_TARGET_X86_)
// Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their
// allocated size (see lvSize()). However, when passing the variables as arguments, and
// storing the variables to the outgoing argument area on the stack, we must use their
// actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written.
if (type == TYP_SIMD16)
{
if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR))
{
unsigned varNum = arg->AsLclVarCommon()->GetLclNum();
LclVarDsc* varDsc = &comp->lvaTable[varNum];
type = varDsc->lvType;
}
else if (arg->OperGet() == GT_SIMD)
{
assert((arg->AsSIMD()->gtSIMDSize == 16) || (arg->AsSIMD()->gtSIMDSize == 12));
if (arg->AsSIMD()->gtSIMDSize == 12)
{
type = TYP_SIMD12;
}
}
}
#elif defined(_TARGET_AMD64_)
// TYP_SIMD8 parameters that are passed as longs
if (type == TYP_SIMD8 && genIsValidIntReg(info->regNum))
{
GenTreeUnOp* bitcast = new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, TYP_LONG, arg, nullptr);
BlockRange().InsertAfter(arg, bitcast);
info->node = *ppArg = arg = bitcast;
type = TYP_LONG;
}
#endif // defined(_TARGET_X86_)
#endif // defined(FEATURE_SIMD)
// If we hit this we are probably double-lowering.
assert(!arg->OperIsPutArg());
#if !defined(_TARGET_64BIT_)
if (varTypeIsLong(type))
{
bool isReg = (info->regNum != REG_STK);
if (isReg)
{
noway_assert(arg->OperGet() == GT_LONG);
assert(info->numRegs == 2);
GenTree* argLo = arg->gtGetOp1();
GenTree* argHi = arg->gtGetOp2();
GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList(argLo, 0, TYP_INT, nullptr);
// Only the first fieldList node (GTF_FIELD_LIST_HEAD) is in the instruction sequence.
(void)new (comp, GT_FIELD_LIST) GenTreeFieldList(argHi, 4, TYP_INT, fieldList);
GenTree* putArg = NewPutArg(call, fieldList, info, type);
BlockRange().InsertBefore(arg, putArg);
BlockRange().Remove(arg);
*ppArg = fieldList;
info->node = fieldList;
}
else
{
assert(arg->OperGet() == GT_LONG);
// For longs, we will replace the GT_LONG with a GT_FIELD_LIST, and put that under a PUTARG_STK.
// Although the hi argument needs to be pushed first, that will be handled by the general case,
// in which the fields will be reversed.
assert(info->numSlots == 2);
GenTree* argLo = arg->gtGetOp1();
GenTree* argHi = arg->gtGetOp2();
GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList(argLo, 0, TYP_INT, nullptr);
// Only the first fieldList node (GTF_FIELD_LIST_HEAD) is in the instruction sequence.
(void)new (comp, GT_FIELD_LIST) GenTreeFieldList(argHi, 4, TYP_INT, fieldList);
GenTree* putArg = NewPutArg(call, fieldList, info, type);
putArg->gtRegNum = info->regNum;
// We can't call ReplaceArgWithPutArgOrBitcast here because it presumes that we are keeping the original
// arg.
BlockRange().InsertBefore(arg, fieldList, putArg);
BlockRange().Remove(arg);
*ppArg = putArg;
}
}
else
#endif // !defined(_TARGET_64BIT_)
{
#ifdef _TARGET_ARMARCH_
if (call->IsVarargs() || comp->opts.compUseSoftFP)
{
// For a vararg call, or on armel, reg args should all be integer.
// Insert copies as needed to move float values to integer registers.
GenTree* newNode = LowerFloatArg(ppArg, info);
if (newNode != nullptr)
{
type = newNode->TypeGet();
}
}
#endif // _TARGET_ARMARCH_
GenTree* putArg = NewPutArg(call, arg, info, type);
// In the case of a register-passable struct (in one or two registers),
// NewPutArg returns a new node (a GT_PUTARG_REG, or a GT_FIELD_LIST with two GT_PUTARG_REGs).
// If an extra node is returned, splice it in the right place in the tree.
if (arg != putArg)
{
ReplaceArgWithPutArgOrBitcast(ppArg, putArg);
}
}
}
#ifdef _TARGET_ARMARCH_
//------------------------------------------------------------------------
// LowerFloatArg: Lower float call arguments on the arm platform.
//
// Arguments:
// arg - The arg node
// info - call argument info
//
// Return Value:
// Return nullptr if no transformation was done;
// return arg if there was an in-place transformation;
// return a new tree if the root was changed.
//
// Notes:
// This must handle scalar float arguments as well as GT_FIELD_LISTs
// with floating point fields.
//
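// Illustrative example (ARM softFP, assuming a TYP_DOUBLE argument assigned to an
// integer register pair): the float-typed node is wrapped in a GT_BITCAST to an
// integer type (TYP_DOUBLE -> TYP_LONG, TYP_FLOAT -> TYP_INT) so the value is
// produced in integer registers; see LowerFloatArgReg below for the register
// pair handling.
//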
GenTree* Lowering::LowerFloatArg(GenTree** pArg, fgArgTabEntry* info)
{
GenTree* arg = *pArg;
if (info->regNum != REG_STK)
{
if (arg->OperIsFieldList())
{
GenTreeFieldList* currListNode = arg->AsFieldList();
regNumber currRegNumber = info->regNum;
// Transform fields that are passed as registers in place.
unsigned fieldRegCount;
for (unsigned i = 0; i < info->numRegs; i += fieldRegCount)
{
assert(currListNode != nullptr);
GenTree* node = currListNode->Current();
if (varTypeIsFloating(node))
{
GenTree* intNode = LowerFloatArgReg(node, currRegNumber);
assert(intNode != nullptr);
ReplaceArgWithPutArgOrBitcast(currListNode->pCurrent(), intNode);
currListNode->ChangeType(intNode->TypeGet());
}
if (node->TypeGet() == TYP_DOUBLE)
{
currRegNumber = REG_NEXT(REG_NEXT(currRegNumber));
fieldRegCount = 2;
}
else
{
currRegNumber = REG_NEXT(currRegNumber);
fieldRegCount = 1;
}
currListNode = currListNode->Rest();
}
// List fields were replaced in place.
return arg;
}
else if (varTypeIsFloating(arg))
{
GenTree* intNode = LowerFloatArgReg(arg, info->regNum);
assert(intNode != nullptr);
ReplaceArgWithPutArgOrBitcast(pArg, intNode);
return *pArg;
}
}
return nullptr;
}
//------------------------------------------------------------------------
// LowerFloatArgReg: Lower the float call argument node that is passed via register.
//
// Arguments:
// arg - The arg node
// regNum - register number
//
// Return Value:
// Return new bitcast node, that moves float to int register.
//
GenTree* Lowering::LowerFloatArgReg(GenTree* arg, regNumber regNum)
{
var_types floatType = arg->TypeGet();
assert(varTypeIsFloating(floatType));
var_types intType = (floatType == TYP_DOUBLE) ? TYP_LONG : TYP_INT;
GenTree* intArg = comp->gtNewBitCastNode(intType, arg);
intArg->gtRegNum = regNum;
#ifdef _TARGET_ARM_
if (floatType == TYP_DOUBLE)
{
regNumber nextReg = REG_NEXT(regNum);
intArg->AsMultiRegOp()->gtOtherReg = nextReg;
}
#endif
return intArg;
}
#endif
// do lowering steps for each arg of a call
void Lowering::LowerArgsForCall(GenTreeCall* call)
{
JITDUMP("objp:\n======\n");
if (call->gtCallObjp)
{
LowerArg(call, &call->gtCallObjp);
}
GenTreeArgList* args = call->gtCallArgs;
JITDUMP("\nargs:\n======\n");
for (; args; args = args->Rest())
{
LowerArg(call, &args->Current());
}
JITDUMP("\nlate:\n======\n");
for (args = call->gtCallLateArgs; args; args = args->Rest())
{
LowerArg(call, &args->Current());
}
}
// helper that creates a node representing a relocatable physical address computation
GenTree* Lowering::AddrGen(ssize_t addr)
{
// this should end up in codegen as : instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, reg, addr)
GenTree* result = comp->gtNewIconHandleNode(addr, GTF_ICON_FTN_ADDR);
return result;
}
// variant that takes a void*
GenTree* Lowering::AddrGen(void* addr)
{
return AddrGen((ssize_t)addr);
}
// do lowering steps for a call
// this includes:
// - adding the placement nodes (either stack or register variety) for arguments
// - lowering the expression that calculates the target address
// - adding nodes for other operations that occur after the call sequence starts and before
// control transfer occurs (profiling and tail call helpers, pinvoke incantations)
//
void Lowering::LowerCall(GenTree* node)
{
GenTreeCall* call = node->AsCall();
JITDUMP("lowering call (before):\n");
DISPTREERANGE(BlockRange(), call);
JITDUMP("\n");
call->ClearOtherRegs();
LowerArgsForCall(call);
// note that everything generated from this point on runs AFTER the outgoing args are placed
GenTree* result = nullptr;
// for x86, this is where we record ESP for checking later to make sure stack is balanced
// Check for Delegate.Invoke(). If so, we inline it. We get the
// target-object and target-function from the delegate-object, and do
// an indirect call.
if (call->IsDelegateInvoke())
{
result = LowerDelegateInvoke(call);
}
else
{
// Virtual and interface calls
switch (call->gtFlags & GTF_CALL_VIRT_KIND_MASK)
{
case GTF_CALL_VIRT_STUB:
result = LowerVirtualStubCall(call);
break;
case GTF_CALL_VIRT_VTABLE:
// stub dispatching is off or this is not a virtual call (could be a tailcall)
result = LowerVirtualVtableCall(call);
break;
case GTF_CALL_NONVIRT:
if (call->IsUnmanaged())
{
result = LowerNonvirtPinvokeCall(call);
}
else if (call->gtCallType == CT_INDIRECT)
{
result = LowerIndirectNonvirtCall(call);
}
else
{
result = LowerDirectCall(call);
}
break;
default:
noway_assert(!"strange call type");
break;
}
}
if (call->IsTailCallViaHelper())
{
// Either controlExpr or gtCallAddr must contain real call target.
if (result == nullptr)
{
assert(call->gtCallType == CT_INDIRECT);
assert(call->gtCallAddr != nullptr);
result = call->gtCallAddr;
}
result = LowerTailCallViaHelper(call, result);
}
else if (call->IsFastTailCall())
{
LowerFastTailCall(call);
}
if (result != nullptr)
{
LIR::Range resultRange = LIR::SeqTree(comp, result);
JITDUMP("results of lowering call:\n");
DISPRANGE(resultRange);
GenTree* insertionPoint = call;
if (!call->IsTailCallViaHelper())
{
// The controlExpr should go before the gtCallCookie and the gtCallAddr, if they exist
//
// TODO-LIR: find out what's really required here, as this is currently a tree order
// dependency.
if (call->gtCallType == CT_INDIRECT)
{
bool isClosed = false;
if (call->gtCallCookie != nullptr)
{
#ifdef DEBUG
GenTree* firstCallAddrNode = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
assert(isClosed);
assert(call->gtCallCookie->Precedes(firstCallAddrNode));
#endif // DEBUG
insertionPoint = BlockRange().GetTreeRange(call->gtCallCookie, &isClosed).FirstNode();
assert(isClosed);
}
else if (call->gtCallAddr != nullptr)
{
insertionPoint = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
assert(isClosed);
}
}
}
ContainCheckRange(resultRange);
BlockRange().InsertBefore(insertionPoint, std::move(resultRange));
call->gtControlExpr = result;
}
if (comp->opts.IsJit64Compat())
{
CheckVSQuirkStackPaddingNeeded(call);
}
ContainCheckCallOperands(call);
JITDUMP("lowering call (after):\n");
DISPTREERANGE(BlockRange(), call);
JITDUMP("\n");
}
// Though the issue described below is fixed in the IntelliTrace dll of VS2015 (a.k.a. Dev14),
// we still need this quirk for desktop so that older versions of VS (e.g. VS2010/2012)
// continue to work.
// This quirk is excluded from other targets that have no back compat burden.
//
// Quirk for VS debug-launch scenario to work:
// See if this is a PInvoke call with exactly one param that is the address of a struct local.
// In such a case indicate to frame-layout logic to add 16-bytes of padding
// between save-reg area and locals. This is to protect against the buffer
// overrun bug in microsoft.intellitrace.11.0.0.dll!ProfilerInterop.InitInterop().
//
// A work-around to this bug is to disable IntelliTrace debugging
// (VS->Tools->Options->IntelliTrace->Enable IntelliTrace - uncheck this option).
// The reason why this works on Jit64 is that at the point of AV the call stack is
//
// GetSystemInfo() Native call
// IL_Stub generated for PInvoke declaration.
// ProfilerInterface::InitInterop()
// ProfilerInterface.Cctor()
// VM asm worker
//
// The cctor body has just the call to InitInterop(). VM asm worker is holding
// something in rbx that is used immediately after the Cctor call. Jit64 generated
// InitInterop() method is pushing the registers in the following order
//
// rbx
// rbp
// rsi
// rdi
// r12
// r13
// Struct local
//
// Due to buffer overrun, rbx doesn't get impacted. Whereas RyuJIT jitted code of
// the same method is pushing regs in the following order
//
// rbp
// rdi
// rsi
// rbx
// struct local
//
// Therefore as a fix, we add padding between save-reg area and locals to
// make this scenario work against JB.
//
// Note: If this quirk gets broken due to other JIT optimizations, we should consider
// a more tolerant fix. One such fix is to pad the struct.
void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call)
{
assert(comp->opts.IsJit64Compat());
#ifdef _TARGET_AMD64_
// Confine this to IL stub calls which aren't marked as unmanaged.
if (call->IsPInvoke() && !call->IsUnmanaged())
{
bool paddingNeeded = false;
GenTree* firstPutArgReg = nullptr;
for (GenTreeArgList* args = call->gtCallLateArgs; args; args = args->Rest())
{
GenTree* tmp = args->Current();
if (tmp->OperGet() == GT_PUTARG_REG)
{
if (firstPutArgReg == nullptr)
{
firstPutArgReg = tmp;
GenTree* op1 = firstPutArgReg->gtOp.gtOp1;
if (op1->OperGet() == GT_LCL_VAR_ADDR)
{
unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
// TODO-1stClassStructs: This is here to duplicate previous behavior,
// but is not needed because the scenario being quirked did not involve
// a SIMD or enregisterable struct.
// if(comp->lvaTable[lclNum].TypeGet() == TYP_STRUCT)
if (varTypeIsStruct(comp->lvaTable[lclNum].TypeGet()))
{
// First arg is addr of a struct local.
paddingNeeded = true;
}
else
{
// Not a struct local.
assert(paddingNeeded == false);
break;
}
}
else
{
// First arg is not a local var addr.
assert(paddingNeeded == false);
break;
}
}
else
{
// Has more than one arg.
paddingNeeded = false;
break;
}
}
}
if (paddingNeeded)
{
comp->compVSQuirkStackPaddingNeeded = VSQUIRK_STACK_PAD;
}
}
#endif // _TARGET_AMD64_
}
// Inserts profiler hook, GT_PROF_HOOK for a tail call node.
//
// AMD64:
// We need to insert this after all nested calls, but before all the arguments to this call have been set up.
// To do this, we look for the first GT_PUTARG_STK or GT_PUTARG_REG, and insert the hook immediately before
// that. If there are no args, then it should be inserted before the call node.
//
// For example:
// * stmtExpr void (top level) (IL 0x000...0x010)
// arg0 SETUP | /--* argPlace ref REG NA $c5
// this in rcx | | /--* argPlace ref REG NA $c1
// | | | /--* call ref System.Globalization.CultureInfo.get_InvariantCulture $c2
// arg1 SETUP | | +--* st.lclVar ref V02 tmp1 REG NA $c2
// | | | /--* lclVar ref V02 tmp1 u : 2 (last use) REG NA $c2
// arg1 in rdx | | +--* putarg_reg ref REG NA
// | | | /--* lclVar ref V00 arg0 u : 2 (last use) REG NA $80
// this in rcx | | +--* putarg_reg ref REG NA
// | | /--* call nullcheck ref System.String.ToLower $c5
// | | { * stmtExpr void (embedded)(IL 0x000... ? ? ? )
// | | { \--* prof_hook void REG NA
// arg0 in rcx | +--* putarg_reg ref REG NA
// control expr | +--* const(h) long 0x7ffe8e910e98 ftn REG NA
// \--* call void System.Runtime.Remoting.Identity.RemoveAppNameOrAppGuidIfNecessary $VN.Void
//
// In this case, the GT_PUTARG_REG src is a nested call. We need to put the instructions after that call
// (as shown). We assume that of all the GT_PUTARG_*, only the first one can have a nested call.
//
// X86:
// Insert the profiler hook immediately before the call. The profiler hook will preserve
// all argument registers (ECX, EDX), but nothing else.
//
// Params:
// callNode - tail call node
// insertionPoint - if non-null, insert the profiler hook before this point.
// If null, insert the profiler hook before args are setup
// but after all arg side effects are computed.
//
void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint)
{
assert(call->IsTailCall());
assert(comp->compIsProfilerHookNeeded());
#if defined(_TARGET_X86_)
if (insertionPoint == nullptr)
{
insertionPoint = call;
}
#else // !defined(_TARGET_X86_)
if (insertionPoint == nullptr)
{
GenTree* tmp = nullptr;
for (GenTreeArgList* args = call->gtCallArgs; args; args = args->Rest())
{
tmp = args->Current();
assert(tmp->OperGet() != GT_PUTARG_REG); // We don't expect to see these in gtCallArgs
if (tmp->OperGet() == GT_PUTARG_STK)
{
// found it
insertionPoint = tmp;
break;
}
}
if (insertionPoint == nullptr)
{
for (GenTreeArgList* args = call->gtCallLateArgs; args; args = args->Rest())
{
tmp = args->Current();
if ((tmp->OperGet() == GT_PUTARG_REG) || (tmp->OperGet() == GT_PUTARG_STK))
{
// found it
insertionPoint = tmp;
break;
}
}
// If there are no args, insert before the call node
if (insertionPoint == nullptr)
{
insertionPoint = call;
}
}
}
#endif // !defined(_TARGET_X86_)
assert(insertionPoint != nullptr);
GenTree* profHookNode = new (comp, GT_PROF_HOOK) GenTree(GT_PROF_HOOK, TYP_VOID);
BlockRange().InsertBefore(insertionPoint, profHookNode);
}
// Lower fast tail call implemented as epilog+jmp.
// Also inserts PInvoke method epilog if required.
void Lowering::LowerFastTailCall(GenTreeCall* call)
{
#if FEATURE_FASTTAILCALL
// Tail call restrictions i.e. conditions under which tail prefix is ignored.
// Most of these checks are already done by importer or fgMorphTailCall().
// This serves as a double sanity check.
assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check
assert(!call->IsUnmanaged()); // tail calls to unmanaged methods
assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
#ifdef _TARGET_AMD64_
assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check
#endif // _TARGET_AMD64_
// We expect to see a call that meets the following conditions
assert(call->IsFastTailCall());
// The VM cannot use return address hijacking when A() and B() tail call each
// other in mutual recursion. Therefore, this block must either be reachable through
// a GC-safe point or the whole method must be marked as fully interruptible.
//
// TODO-Cleanup:
// optReachWithoutCall() depends on the fact that loop header blocks
// will have a block number > fgLastBB. These loop headers get added
// after dominator computation and get skipped by optReachWithoutCall().
// The below condition cannot be asserted in lower because fgSimpleLowering()
// can add a new basic block for a range check failure, which becomes
// fgLastBB with a block number > the loop header block number.
// assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) ||
// !comp->optReachWithoutCall(comp->fgFirstBB, comp->compCurBB) || comp->genInterruptible);
// If PInvokes are in-lined, we have to remember to execute the PInvoke method epilog anywhere that
// the method returns. This is the case where the caller method has both PInvokes and tail calls.
if (comp->info.compCallUnmanaged)
{
InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
}
// Args for a tail call are set up in the incoming arg area. The GC-ness of the args of the
// caller and the callee (which is being tail called) may not match. Therefore, everything
// from arg setup until the epilog needs to be non-interruptible by GC. This is
// achieved by inserting a GT_START_NONGC node before the very first GT_PUTARG_STK node
// of the call. Note that once a stack arg is set up, it cannot have nested
// calls subsequently in execution order to set up other args, because a nested
// call could overwrite the stack arg that was set up earlier.
GenTree* firstPutArgStk = nullptr;
GenTreeArgList* args;
ArrayStack<GenTree*> putargs(comp->getAllocator(CMK_ArrayStack));
for (args = call->gtCallArgs; args; args = args->Rest())
{
GenTree* tmp = args->Current();
if (tmp->OperGet() == GT_PUTARG_STK)
{
putargs.Push(tmp);
}
}
for (args = call->gtCallLateArgs; args; args = args->Rest())
{
GenTree* tmp = args->Current();
if (tmp->OperGet() == GT_PUTARG_STK)
{
putargs.Push(tmp);
}
}
if (putargs.Height() > 0)
{
firstPutArgStk = putargs.Bottom();
}
// If we have a putarg_stk node, also count the number of non-standard args the
// call node has. Note that while determining whether a tail call can be fast
// tail called, we don't count non-standard args (passed in R10 or R11) since they
// don't contribute to outgoing arg space. These non-standard args are not accounted
// for in the caller's arg count but are accounted for in the callee's arg count after
// fgMorphArgs(). Therefore, exclude the callee's non-standard args while mapping
// callee's stack arg num to corresponding caller's stack arg num.
unsigned calleeNonStandardArgCount = call->GetNonStandardAddedArgCount(comp);
// Say Caller(a, b, c, d, e) fast tail calls Callee(e, d, c, b, a)
// i.e. passes its arguments in reverse to Callee. During call site
// setup, after computing argument side effects, stack args are set up
// first and reg args next. In the above example, both the Caller's and
// the Callee's stack args (e and a respectively) share the same stack slot
// and are alive at the same time. The act of setting up the Callee's
// stack arg will overwrite the stack arg of the Caller, and if there are
// further uses of the Caller's stack arg we have to make sure that we move
// it to a temp before overwriting its slot and use the temp in place of
// the corresponding Caller stack arg.
//
// For the above example, conceptually this is what is done
// tmp = e;
// Stack slot of e = a
// R9 = b, R8 = c, RDX = d
// RCX = tmp
//
// The below logic is meant to detect cases like this and introduce
// temps to set up args correctly for Callee.
for (int i = 0; i < putargs.Height(); i++)
{
GenTree* putArgStkNode = putargs.Bottom(i);
assert(putArgStkNode->OperGet() == GT_PUTARG_STK);
// Get the caller arg num corresponding to this callee arg.
// Note that these two args share the same stack slot. Therefore,
// if there are further uses of corresponding caller arg, we need
// to move it to a temp and use the temp in this call tree.
//
// Note that Caller is guaranteed to have a param corresponding to
// this Callee's arg since fast tail call mechanism counts the
// stack slots required for both Caller and Callee for passing params
// and allows a fast tail call only if the stack slots required by the Caller >=
// those required by the Callee.
fgArgTabEntry* argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode);
assert(argTabEntry);
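// For example, if the callee has one non-standard added arg (calleeNonStandardArgCount == 1),
// then the callee's stack arg with argNum 3 maps back to the caller's param index 2 (3 - 1).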
unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount;
noway_assert(callerArgNum < comp->info.compArgsCount);
unsigned callerArgLclNum = callerArgNum;
LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum;
if (callerArgDsc->lvPromoted)
{
callerArgLclNum =
callerArgDsc->lvFieldLclStart; // update the callerArgNum to the promoted struct field's lclNum
callerArgDsc = comp->lvaTable + callerArgLclNum;
}
noway_assert(callerArgDsc->lvIsParam);
// Start searching in execution order list till we encounter call node
unsigned tmpLclNum = BAD_VAR_NUM;
var_types tmpType = TYP_UNDEF;
for (GenTree* treeNode = putArgStkNode->gtNext; treeNode != call; treeNode = treeNode->gtNext)
{
if (treeNode->OperIsLocal() || treeNode->OperIsLocalAddr())
{
// This should neither be a GT_REG_VAR nor GT_PHI_ARG.
assert((treeNode->OperGet() != GT_REG_VAR) && (treeNode->OperGet() != GT_PHI_ARG));
GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon();
LclVarDsc* lclVar = &comp->lvaTable[lcl->gtLclNum];
// The fast tail calling criteria permit passing structs of size 1, 2, 4 and 8 as args.
// It is possible that callerArgLclNum corresponds to such a struct whose stack slot
// is getting overwritten by the setup of a stack arg, and that there are further uses of
// its fields if such a struct is type-dependently promoted. In this case too
// we need to introduce a temp.
if ((lcl->gtLclNum == callerArgNum) || (lcl->gtLclNum == callerArgLclNum))
{
// Create tmp and use it in place of callerArgDsc
if (tmpLclNum == BAD_VAR_NUM)
{
// Set tmpType first before calling lvaGrabTemp, as that call invalidates callerArgDsc
tmpType = genActualType(callerArgDsc->lvaArgType());
tmpLclNum = comp->lvaGrabTemp(
true DEBUGARG("Fast tail call lowering is creating a new local variable"));
comp->lvaSortAgain = true;
comp->lvaTable[tmpLclNum].lvType = tmpType;
comp->lvaTable[tmpLclNum].setLvRefCnt(1);
comp->lvaTable[tmpLclNum].lvDoNotEnregister = comp->lvaTable[lcl->gtLclNum].lvDoNotEnregister;
}
lcl->SetLclNum(tmpLclNum);
}
}
}
// If we have created a temp, insert an embedded assignment stmnt before
// the first putargStkNode i.e.
// tmpLcl = CallerArg
if (tmpLclNum != BAD_VAR_NUM)
{
assert(tmpType != TYP_UNDEF);
GenTreeLclVar* local =
new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, tmpType, callerArgLclNum, BAD_IL_OFFSET);
GenTree* assignExpr = comp->gtNewTempAssign(tmpLclNum, local);
ContainCheckRange(local, assignExpr);
BlockRange().InsertBefore(firstPutArgStk, LIR::SeqTree(comp, assignExpr));
}
}
// Insert GT_START_NONGC node before the first GT_PUTARG_STK node.
// Note that if there are no args to be set up on the stack, there is no need to
// insert a GT_START_NONGC node.
GenTree* startNonGCNode = nullptr;
if (firstPutArgStk != nullptr)
{
startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID);
BlockRange().InsertBefore(firstPutArgStk, startNonGCNode);
// GC interruptibility in the following case:
// foo(a, b, c, d, e) { bar(a, b, c, d, e); }
// bar(a, b, c, d, e) { foo(a, b, d, d, e); }
//
// Since the instruction group starting from the instruction that sets up the first
// stack arg to the end of the tail call is marked as non-GC-interruptible,
// this will form a non-interruptible tight loop causing GC starvation. To fix
// this we insert a GT_NO_OP as an embedded stmt before GT_START_NONGC, if the method
// has a single basic block and is not a GC-safe point. The presence of a single
// nop outside the non-GC-interruptible region will prevent GC starvation.
if ((comp->fgBBcount == 1) && !(comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT))
{
assert(comp->fgFirstBB == comp->compCurBB);
GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID);
BlockRange().InsertBefore(startNonGCNode, noOp);
}
}
// Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be
// inserted before the args are set up but after the side effects of the args are
// computed. That is, the GT_PROF_HOOK node needs to be inserted before the GT_START_NONGC
// node if one exists.
if (comp->compIsProfilerHookNeeded())
{
InsertProfTailCallHook(call, startNonGCNode);
}
#else // !FEATURE_FASTTAILCALL
// The platform chose not to implement the fast tail call mechanism.
// In such a case we should never reach this method, as
// the expectation is that IsTailCallViaHelper() will always
// be true on such a platform.
unreached();
#endif
}
//------------------------------------------------------------------------
// LowerTailCallViaHelper: lower a call via the tailcall helper. Morph
// has already inserted tailcall helper special arguments. This function
// inserts actual data for some placeholders.
//
// For ARM32, AMD64, lower
// tail.call(void* copyRoutine, void* dummyArg, ...)
// as
// Jit_TailCall(void* copyRoutine, void* callTarget, ...)
//
// For x86, lower
// tail.call(<function args>, int numberOfOldStackArgs, int dummyNumberOfNewStackArgs, int flags, void* dummyArg)
// as
// JIT_TailCall(<function args>, int numberOfOldStackArgsWords, int numberOfNewStackArgsWords, int flags, void*
// callTarget)
// Note that the special arguments are on the stack, whereas the function arguments follow the normal convention.
//
// Also inserts PInvoke method epilog if required.
//
// Arguments:
// call - The call node
// callTarget - The real call target. This is used to replace the dummyArg during lowering.
//
// Return Value:
// Returns control expression tree for making a call to helper Jit_TailCall.
//
GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget)
{
// Tail call restrictions i.e. conditions under which tail prefix is ignored.
// Most of these checks are already done by importer or fgMorphTailCall().
// This serves as a double sanity check.
assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check
assert(!call->IsUnmanaged()); // tail calls to unmanaged methods
assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
#ifdef _TARGET_AMD64_
assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check
#endif // _TARGET_AMD64_
// We expect to see a call that meets the following conditions
assert(call->IsTailCallViaHelper());
assert(callTarget != nullptr);
// The TailCall helper call never returns to the caller and is not GC interruptible.
// Therefore the block containing the tail call should be a GC safe point to avoid
// GC starvation. It is legal for the block to be unmarked iff the entry block is a
// GC safe point, as the entry block trivially dominates every reachable block.
assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) || (comp->fgFirstBB->bbFlags & BBF_GC_SAFE_POINT));
// If PInvokes are in-lined, we have to remember to execute the PInvoke method epilog anywhere that
// the method returns. This is the case where the caller method has both PInvokes and tail calls.
if (comp->info.compCallUnmanaged)
{
InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
}
// Remove gtCallAddr from execution order if present.
if (call->gtCallType == CT_INDIRECT)
{
assert(call->gtCallAddr != nullptr);
bool isClosed;
LIR::ReadOnlyRange callAddrRange = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed);
assert(isClosed);
BlockRange().Remove(std::move(callAddrRange));
}
// The callTarget tree needs to be sequenced.
LIR::Range callTargetRange = LIR::SeqTree(comp, callTarget);
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM_)
// For ARM32 and AMD64, the first argument is the CopyRoutine and the second argument is a placeholder node.
fgArgTabEntry* argEntry;
#ifdef DEBUG
argEntry = comp->gtArgEntryByArgNum(call, 0);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_REG);
GenTree* firstArg = argEntry->node->gtOp.gtOp1;
assert(firstArg->gtOper == GT_CNS_INT);
#endif
// Replace second arg by callTarget.
argEntry = comp->gtArgEntryByArgNum(call, 1);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_REG);
GenTree* secondArg = argEntry->node->gtOp.gtOp1;
ContainCheckRange(callTargetRange);
BlockRange().InsertAfter(secondArg, std::move(callTargetRange));
bool isClosed;
LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(secondArg, &isClosed);
assert(isClosed);
BlockRange().Remove(std::move(secondArgRange));
argEntry->node->gtOp.gtOp1 = callTarget;
#elif defined(_TARGET_X86_)
// Verify the special args are what we expect, and replace the dummy args with real values.
// We need to figure out the size of the outgoing stack arguments, not including the special args.
// The number of 4-byte words is passed to the helper for the incoming and outgoing argument sizes.
// This number is exactly the next slot number in the call's argument info struct.
unsigned nNewStkArgsWords = call->fgArgInfo->GetNextSlotNum();
assert(nNewStkArgsWords >= 4); // There must be at least the four special stack args.
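// The last four slots are occupied by the special args (callTarget, flags, and the counts of
// new and old stack arg words; see the replacements below), so exclude them from the count of
// outgoing arg words passed to the helper.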
nNewStkArgsWords -= 4;
unsigned numArgs = call->fgArgInfo->ArgCount();
fgArgTabEntry* argEntry;
// arg 0 == callTarget.
argEntry = comp->gtArgEntryByArgNum(call, numArgs - 1);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_STK);
GenTree* arg0 = argEntry->node->gtOp.gtOp1;
ContainCheckRange(callTargetRange);
BlockRange().InsertAfter(arg0, std::move(callTargetRange));
bool isClosed;
LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(arg0, &isClosed);
assert(isClosed);
BlockRange().Remove(std::move(secondArgRange));
argEntry->node->gtOp.gtOp1 = callTarget;
// arg 1 == flags
argEntry = comp->gtArgEntryByArgNum(call, numArgs - 2);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_STK);
GenTree* arg1 = argEntry->node->gtOp.gtOp1;
assert(arg1->gtOper == GT_CNS_INT);
ssize_t tailCallHelperFlags = 1 | // always restore EDI,ESI,EBX
(call->IsVirtualStub() ? 0x2 : 0x0); // Stub dispatch flag
arg1->gtIntCon.gtIconVal = tailCallHelperFlags;
// arg 2 == numberOfNewStackArgsWords
argEntry = comp->gtArgEntryByArgNum(call, numArgs - 3);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_STK);
GenTree* arg2 = argEntry->node->gtOp.gtOp1;
assert(arg2->gtOper == GT_CNS_INT);
arg2->gtIntCon.gtIconVal = nNewStkArgsWords;
#ifdef DEBUG
// arg 3 == numberOfOldStackArgsWords
argEntry = comp->gtArgEntryByArgNum(call, numArgs - 4);
assert(argEntry != nullptr);
assert(argEntry->node->gtOper == GT_PUTARG_STK);
GenTree* arg3 = argEntry->node->gtOp.gtOp1;
assert(arg3->gtOper == GT_CNS_INT);
#endif // DEBUG
#else
NYI("LowerTailCallViaHelper");
#endif // _TARGET_*
// Transform this call node into a call to Jit tail call helper.
call->gtCallType = CT_HELPER;
call->gtCallMethHnd = comp->eeFindHelper(CORINFO_HELP_TAILCALL);
call->gtFlags &= ~GTF_CALL_VIRT_KIND_MASK;
// Lower this as if it were a pure helper call.
call->gtCallMoreFlags &= ~(GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER);
GenTree* result = LowerDirectCall(call);
// Now add back tail call flags for identifying this node as tail call dispatched via helper.
call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER;
#ifdef PROFILING_SUPPORTED
// Insert profiler tail call hook if needed.
// Since we don't know the insertion point, pass null for second param.
if (comp->compIsProfilerHookNeeded())
{
InsertProfTailCallHook(call, nullptr);
}
#endif // PROFILING_SUPPORTED
assert(call->IsTailCallViaHelper());
return result;
}
#ifndef _TARGET_64BIT_
//------------------------------------------------------------------------
// Lowering::DecomposeLongCompare: Decomposes a TYP_LONG compare node.
//
// Arguments:
// cmp - the compare node
//
// Return Value:
// The next node to lower.
//
// Notes:
// This is done during lowering because DecomposeLongs handles only nodes
// that produce TYP_LONG values. Compare nodes may consume TYP_LONG values
// but produce TYP_INT values.
//
GenTree* Lowering::DecomposeLongCompare(GenTree* cmp)
{
assert(cmp->gtGetOp1()->TypeGet() == TYP_LONG);
GenTree* src1 = cmp->gtGetOp1();
GenTree* src2 = cmp->gtGetOp2();
assert(src1->OperIs(GT_LONG));
assert(src2->OperIs(GT_LONG));
GenTree* loSrc1 = src1->gtGetOp1();
GenTree* hiSrc1 = src1->gtGetOp2();
GenTree* loSrc2 = src2->gtGetOp1();
GenTree* hiSrc2 = src2->gtGetOp2();
BlockRange().Remove(src1);
BlockRange().Remove(src2);
genTreeOps condition = cmp->OperGet();
GenTree* loCmp;
GenTree* hiCmp;
if (cmp->OperIs(GT_EQ, GT_NE))
{
//
// Transform (x EQ|NE y) into (((x.lo XOR y.lo) OR (x.hi XOR y.hi)) EQ|NE 0). If y is 0 then this can
// be reduced to just ((x.lo OR x.hi) EQ|NE 0). The OR is expected to set the condition flags so we
// don't need to generate a redundant compare against 0, we only generate a SETCC|JCC instruction.
//
// XOR is used rather than SUB because it is commutative and thus allows swapping the operands when
// the first happens to be a constant. Usually only the second compare operand is a constant but it's
// still possible to have a constant on the left side. For example, when src1 is a uint->ulong cast
// then hiSrc1 would be 0.
//
if (loSrc1->OperIs(GT_CNS_INT))
{
std::swap(loSrc1, loSrc2);
}
if (loSrc2->IsIntegralConst(0))
{
BlockRange().Remove(loSrc2);
loCmp = loSrc1;
}
else
{
loCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, loSrc1, loSrc2);
BlockRange().InsertBefore(cmp, loCmp);
ContainCheckBinary(loCmp->AsOp());
}
if (hiSrc1->OperIs(GT_CNS_INT))
{
std::swap(hiSrc1, hiSrc2);
}
if (hiSrc2->IsIntegralConst(0))
{
BlockRange().Remove(hiSrc2);
hiCmp = hiSrc1;
}
else
{
hiCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, hiSrc1, hiSrc2);
BlockRange().InsertBefore(cmp, hiCmp);
ContainCheckBinary(hiCmp->AsOp());
}
hiCmp = comp->gtNewOperNode(GT_OR, TYP_INT, loCmp, hiCmp);
BlockRange().InsertBefore(cmp, hiCmp);
ContainCheckBinary(hiCmp->AsOp());
}
else
{
assert(cmp->OperIs(GT_LT, GT_LE, GT_GE, GT_GT));
//
// If the compare is signed then (x LT|GE y) can be transformed into ((x SUB y) LT|GE 0).
// If the compare is unsigned we can still use SUB but we need to check the Carry flag,
// not the actual result. In both cases we can simply check the appropriate condition flags
// and ignore the actual result:
// SUB_LO loSrc1, loSrc2
// SUB_HI hiSrc1, hiSrc2
// SETCC|JCC (signed|unsigned LT|GE)
// If loSrc2 happens to be 0 then the first SUB can be eliminated and the second one can
// be turned into a CMP because the first SUB would have set carry to 0. This effectively
// transforms a long compare against 0 into an int compare of the high part against 0.
//
// (x LE|GT y) can be transformed into ((x SUB y) LE|GT 0) but checking that a long value
// is greater than 0 is not so easy. We need to turn this into a positive/negative check
// like the one we get for LT|GE compares, this can be achieved by swapping the compare:
// (x LE|GT y) becomes (y GE|LT x)
//
// Having to swap operands is problematic when the second operand is a constant. The constant
// moves to the first operand where it cannot be contained and thus needs a register. This can
// be avoided by changing the constant such that LE|GT becomes LT|GE:
// (x LE|GT 41) becomes (x LT|GE 42)
//
if (cmp->OperIs(GT_LE, GT_GT))
{
bool mustSwap = true;
if (loSrc2->OperIs(GT_CNS_INT) && hiSrc2->OperIs(GT_CNS_INT))
{
uint32_t loValue = static_cast<uint32_t>(loSrc2->AsIntCon()->IconValue());
uint32_t hiValue = static_cast<uint32_t>(hiSrc2->AsIntCon()->IconValue());
uint64_t value = static_cast<uint64_t>(loValue) | (static_cast<uint64_t>(hiValue) << 32);
uint64_t maxValue = cmp->IsUnsigned() ? UINT64_MAX : INT64_MAX;
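// If the constant is already the largest representable value, incrementing it would wrap
// around, so it cannot be adjusted and we must fall back to swapping the operands.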
if (value != maxValue)
{
value++;
loValue = value & UINT32_MAX;
hiValue = (value >> 32) & UINT32_MAX;
loSrc2->AsIntCon()->SetIconValue(loValue);
hiSrc2->AsIntCon()->SetIconValue(hiValue);
condition = cmp->OperIs(GT_LE) ? GT_LT : GT_GE;
mustSwap = false;
}
}
if (mustSwap)
{
std::swap(loSrc1, loSrc2);
std::swap(hiSrc1, hiSrc2);
condition = GenTree::SwapRelop(condition);
}
}
assert((condition == GT_LT) || (condition == GT_GE));
if (loSrc2->IsIntegralConst(0))
{
BlockRange().Remove(loSrc2);
// Very conservative dead code removal... but it helps.
if (loSrc1->OperIs(GT_CNS_INT, GT_LCL_VAR, GT_LCL_FLD))
{
BlockRange().Remove(loSrc1);
if (loSrc1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
{
comp->lvaDecRefCnts(m_block, loSrc1);
}
}
else
{
loSrc1->SetUnusedValue();
}
hiCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, hiSrc1, hiSrc2);
BlockRange().InsertBefore(cmp, hiCmp);
ContainCheckCompare(hiCmp->AsOp());
}
else
{
loCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, loSrc1, loSrc2);
hiCmp = comp->gtNewOperNode(GT_SUB_HI, TYP_INT, hiSrc1, hiSrc2);
BlockRange().InsertBefore(cmp, loCmp, hiCmp);
ContainCheckCompare(loCmp->AsOp());
ContainCheckBinary(hiCmp->AsOp());
//
// Try to move the first SUB_HI operand right in front of it; this allows using
// a single temporary register instead of 2 (one for CMP and one for SUB_HI). Do
// this only for locals as they won't change condition flags. Note that we could
// move constants (except 0, which generates XOR reg, reg) but it's extremely rare
// to have a constant as the first operand.
//
if (hiSrc1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
{
BlockRange().Remove(hiSrc1);
BlockRange().InsertBefore(hiCmp, hiSrc1);
}
}
}
hiCmp->gtFlags |= GTF_SET_FLAGS;
if (hiCmp->IsValue())
{
hiCmp->SetUnusedValue();
}
LIR::Use cmpUse;
if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE))
{
BlockRange().Remove(cmp);
GenTree* jcc = cmpUse.User();
jcc->gtOp.gtOp1 = nullptr;
jcc->ChangeOper(GT_JCC);
jcc->gtFlags |= (cmp->gtFlags & GTF_UNSIGNED) | GTF_USE_FLAGS;
jcc->AsCC()->gtCondition = condition;
}
else
{
cmp->gtOp.gtOp1 = nullptr;
cmp->gtOp.gtOp2 = nullptr;
cmp->ChangeOper(GT_SETCC);
cmp->gtFlags |= GTF_USE_FLAGS;
cmp->AsCC()->gtCondition = condition;
}
return cmp->gtNext;
}
#endif // !_TARGET_64BIT_
//------------------------------------------------------------------------
// Lowering::OptimizeConstCompare: Performs various "compare with const" optimizations.
//
// Arguments:
// cmp - the compare node
//
// Return Value:
// The original compare node if lowering should proceed as usual or the next node
// to lower if the compare node was changed in such a way that lowering is no
// longer needed.
//
// Notes:
// - Narrow operands to enable memory operand containment (XARCH specific).
// - Transform cmp(and(x, y), 0) into test(x, y) (XARCH/Arm64 specific but could
// be used for ARM as well if support for GT_TEST_EQ/GT_TEST_NE is added).
// - Transform TEST(x, LSH(1, y)) into BT(x, y) (XARCH specific)
// - Transform RELOP(OP, 0) into SETCC(OP) or JCC(OP) if OP can set the
// condition flags appropriately (XARCH/ARM64 specific but could be extended
// to ARM32 as well if ARM32 codegen supports GTF_SET_FLAGS).
//
GenTree* Lowering::OptimizeConstCompare(GenTree* cmp)
{
assert(cmp->gtGetOp2()->IsIntegralConst());
#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
GenTree* op1 = cmp->gtGetOp1();
var_types op1Type = op1->TypeGet();
GenTreeIntCon* op2 = cmp->gtGetOp2()->AsIntCon();
ssize_t op2Value = op2->IconValue();
#ifdef _TARGET_XARCH_
if (IsContainableMemoryOp(op1) && varTypeIsSmall(op1Type) && genSmallTypeCanRepresentValue(op1Type, op2Value))
{
//
// If op1's type is small then try to narrow op2 so it has the same type as op1.
// Small types are usually used by memory loads and if both compare operands have
// the same type then the memory load can be contained. In certain situations
// (e.g "cmp ubyte, 200") we also get a smaller instruction encoding.
//
op2->gtType = op1Type;
}
else
#endif
if (op1->OperIs(GT_CAST) && !op1->gtOverflow())
{
GenTreeCast* cast = op1->AsCast();
var_types castToType = cast->CastToType();
GenTree* castOp = cast->gtGetOp1();
if (((castToType == TYP_BOOL) || (castToType == TYP_UBYTE)) && FitsIn<UINT8>(op2Value))
{
//
// Since we're going to remove the cast we need to be able to narrow the cast operand
// to the cast type. This can be done safely only for certain opers (e.g. AND, OR, XOR).
// Some opers just can't be narrowed (e.g. DIV, MUL) while others could be narrowed but
// doing so would produce incorrect results (e.g. RSZ, RSH).
//
// The below list of handled opers is conservative but enough to handle the most common
// situations. In particular this includes CALL; sometimes the JIT unnecessarily widens
// the result of bool-returning calls.
//
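// Note: on ARM64 the cast is removed only for EQ|NE|GT compares against 0. Because the cast
// type is unsigned, (x GT 0) is equivalent to (x NE 0), and the compare is rewritten below
// into a TEST against the 0xff mask.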
bool removeCast =
#ifdef _TARGET_ARM64_
(op2Value == 0) && cmp->OperIs(GT_EQ, GT_NE, GT_GT) &&
#endif
(castOp->OperIs(GT_CALL, GT_LCL_VAR) || castOp->OperIsLogical()
#ifdef _TARGET_XARCH_
|| IsContainableMemoryOp(castOp)
#endif
);
if (removeCast)
{
assert(!castOp->gtOverflowEx()); // Must not be an overflow checking operation
#ifdef _TARGET_ARM64_
bool cmpEq = cmp->OperIs(GT_EQ);
cmp->SetOperRaw(cmpEq ? GT_TEST_EQ : GT_TEST_NE);
op2->SetIconValue(0xff);
op2->gtType = castOp->gtType;
#else
castOp->gtType = castToType;
op2->gtType = castToType;
#endif
// If we have any contained memory ops on castOp, they must now not be contained.
if (castOp->OperIsLogical())
{
GenTree* op1 = castOp->gtGetOp1();
if ((op1 != nullptr) && !op1->IsCnsIntOrI())
{
op1->ClearContained();
}
GenTree* op2 = castOp->gtGetOp2();
if ((op2 != nullptr) && !op2->IsCnsIntOrI())
{
op2->ClearContained();
}
}
cmp->gtOp.gtOp1 = castOp;
BlockRange().Remove(cast);
}
}
}
else if (op1->OperIs(GT_AND) && cmp->OperIs(GT_EQ, GT_NE))
{
//
// Transform ((x AND y) EQ|NE 0) into (x TEST_EQ|TEST_NE y) when possible.
//
GenTree* andOp1 = op1->gtGetOp1();
GenTree* andOp2 = op1->gtGetOp2();
if (op2Value != 0)
{
//
// If we don't have a 0 compare we can get one by transforming ((x AND mask) EQ|NE mask)
// into ((x AND mask) NE|EQ 0) when mask is a single bit.
//
if (isPow2(static_cast<size_t>(op2Value)) && andOp2->IsIntegralConst(op2Value))
{
op2Value = 0;
op2->SetIconValue(0);
cmp->SetOperRaw(GenTree::ReverseRelop(cmp->OperGet()));
}
}
if (op2Value == 0)
{
BlockRange().Remove(op1);
BlockRange().Remove(op2);
cmp->SetOperRaw(cmp->OperIs(GT_EQ) ? GT_TEST_EQ : GT_TEST_NE);
cmp->gtOp.gtOp1 = andOp1;
cmp->gtOp.gtOp2 = andOp2;
// We will re-evaluate containment below
andOp1->ClearContained();
andOp2->ClearContained();
#ifdef _TARGET_XARCH_
if (IsContainableMemoryOp(andOp1) && andOp2->IsIntegralConst())
{
//
// For "test" we only care about the bits that are set in the second operand (mask).
// If the mask fits in a small type then we can narrow both operands to generate a "test"
// instruction with a smaller encoding ("test" does not have a r/m32, imm8 form) and avoid
// a widening load in some cases.
//
// For 16 bit operands we narrow only if the memory operand is already 16 bit. This matches
// the behavior of a previous implementation and avoids adding more cases where we generate
// 16 bit instructions that require a length changing prefix (0x66). These suffer from
// significant decoder stalls on Intel CPUs.
//
// We could also do this for 64 bit masks that fit into 32 bit but it doesn't help.
// In such cases morph narrows down the existing GT_AND by inserting a cast between it and
// the memory operand so we'd need to add more code to recognize and eliminate that cast.
//
size_t mask = static_cast<size_t>(andOp2->AsIntCon()->IconValue());
if (FitsIn<UINT8>(mask))
{
andOp1->gtType = TYP_UBYTE;
andOp2->gtType = TYP_UBYTE;
}
else if (FitsIn<UINT16>(mask) && genTypeSize(andOp1) == 2)
{
andOp1->gtType = TYP_USHORT;
andOp2->gtType = TYP_USHORT;
}
}
#endif
}
}
if (cmp->OperIs(GT_TEST_EQ, GT_TEST_NE))
{
#ifdef _TARGET_XARCH_
//
// Transform TEST_EQ|NE(x, LSH(1, y)) into BT(x, y) when possible. Using BT
// results in smaller and faster code. It also doesn't have special register
// requirements, unlike LSH that requires the shift count to be in ECX.
// Note that BT has the same behavior as LSH when the bit index exceeds the
// operand bit size - it uses (bit_index MOD bit_size).
//
GenTree* lsh = cmp->gtGetOp2();
LIR::Use cmpUse;
if (lsh->OperIs(GT_LSH) && varTypeIsIntOrI(lsh->TypeGet()) && lsh->gtGetOp1()->IsIntegralConst(1) &&
BlockRange().TryGetUse(cmp, &cmpUse))
{
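// BT leaves the tested bit in the carry flag, so a bit-set check maps to an unsigned LT
// (carry set) and a bit-clear check to an unsigned GE (carry clear); GTF_UNSIGNED is set
// on the resulting SETCC/JCC below.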
genTreeOps condition = cmp->OperIs(GT_TEST_NE) ? GT_LT : GT_GE;
cmp->SetOper(GT_BT);
cmp->gtType = TYP_VOID;
cmp->gtFlags |= GTF_SET_FLAGS;
cmp->gtOp.gtOp2 = lsh->gtGetOp2();
cmp->gtGetOp2()->ClearContained();
BlockRange().Remove(lsh->gtGetOp1());
BlockRange().Remove(lsh);
GenTreeCC* cc;
if (cmpUse.User()->OperIs(GT_JTRUE))
{
cmpUse.User()->ChangeOper(GT_JCC);
cc = cmpUse.User()->AsCC();
cc->gtCondition = condition;
}
else
{
cc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, TYP_INT);
BlockRange().InsertAfter(cmp, cc);
cmpUse.ReplaceWith(comp, cc);
}
cc->gtFlags |= GTF_USE_FLAGS | GTF_UNSIGNED;
return cmp->gtNext;
}
#endif // _TARGET_XARCH_
}
else if (cmp->OperIs(GT_EQ, GT_NE))
{
GenTree* op1 = cmp->gtGetOp1();
GenTree* op2 = cmp->gtGetOp2();
// TODO-CQ: right now the below peep is inexpensive and gets the benefit in most
// cases because in the majority of cases op1, op2 and cmp would be in that order in
// execution. In general we should be able to check that all the nodes that come
// after op1 do not modify the flags so that it is safe to avoid generating a
// test instruction.
if (op2->IsIntegralConst(0) && (op1->gtNext == op2) && (op2->gtNext == cmp) &&
#ifdef _TARGET_XARCH_
op1->OperIs(GT_AND, GT_OR, GT_XOR, GT_ADD, GT_SUB, GT_NEG))
#else // _TARGET_ARM64_
op1->OperIs(GT_AND, GT_ADD, GT_SUB))
#endif
{
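// Have op1 itself set the condition flags and mark its value as unused; the compare
// against 0 then becomes redundant and only a SETCC/JCC consuming the flags is needed.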
op1->gtFlags |= GTF_SET_FLAGS;
op1->SetUnusedValue();
BlockRange().Remove(op2);
GenTree* next = cmp->gtNext;
GenTree* cc;
genTreeOps ccOp;
LIR::Use cmpUse;
// Fast check for the common case - relop used by a JTRUE that immediately follows it.
if ((next != nullptr) && next->OperIs(GT_JTRUE) && (next->gtGetOp1() == cmp))
{
cc = next;
ccOp = GT_JCC;
next = nullptr;
BlockRange().Remove(cmp);
}
else if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE))
{
cc = cmpUse.User();
ccOp = GT_JCC;
next = nullptr;
BlockRange().Remove(cmp);
}
else // The relop is not used by a JTRUE or it is not used at all.
{
// Transform the relop node into a SETCC. If it's not used we could remove
// it completely but that means doing more work to handle a rare case.
cc = cmp;
ccOp = GT_SETCC;
}
genTreeOps condition = cmp->OperGet();
cc->ChangeOper(ccOp);
cc->AsCC()->gtCondition = condition;
cc->gtFlags |= GTF_USE_FLAGS | (cmp->gtFlags & GTF_UNSIGNED);
return next;
}
}
#endif // defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)
return cmp;
}
//------------------------------------------------------------------------
// Lowering::LowerCompare: Lowers a compare node.
//
// Arguments:
// cmp - the compare node
//
// Return Value:
// The next node to lower.
//
GenTree* Lowering::LowerCompare(GenTree* cmp)
{
#ifndef _TARGET_64BIT_
if (cmp->gtGetOp1()->TypeGet() == TYP_LONG)
{
return DecomposeLongCompare(cmp);
}
#endif
if (cmp->gtGetOp2()->IsIntegralConst() && !comp->opts.MinOpts())
{
GenTree* next = OptimizeConstCompare(cmp);
// If OptimizeConstCompare returns the compare node as "next" then we need to continue lowering.
if (next != cmp)
{
return next;
}
}
#ifdef _TARGET_XARCH_
if (cmp->gtGetOp1()->TypeGet() == cmp->gtGetOp2()->TypeGet())
{
if (varTypeIsSmall(cmp->gtGetOp1()->TypeGet()) && varTypeIsUnsigned(cmp->gtGetOp1()->TypeGet()))
{
//
// If both operands have the same type then codegen will use the common operand type to
// determine the instruction type. For small types this would result in performing a
// signed comparison of two small unsigned values without zero extending them to TYP_INT
// which is incorrect. Note that making the comparison unsigned doesn't imply that codegen
// has to generate a small comparison, it can still correctly generate a TYP_INT comparison.
//
cmp->gtFlags |= GTF_UNSIGNED;
}
}
#endif // _TARGET_XARCH_
ContainCheckCompare(cmp->AsOp());
return cmp->gtNext;
}
//------------------------------------------------------------------------
// Lowering::LowerJTrue: Lowers a JTRUE node.
//
// Arguments:
// jtrue - the JTRUE node
//
// Return Value:
// The next node to lower (usually nullptr).
//
// Notes:
// On ARM64 this may remove the JTRUE node and transform its associated
// relop into a JCMP node.
//
GenTree* Lowering::LowerJTrue(GenTreeOp* jtrue)
{
#ifdef _TARGET_ARM64_
GenTree* relop = jtrue->gtGetOp1();
GenTree* relopOp2 = relop->gtOp.gtGetOp2();
if ((relop->gtNext == jtrue) && relopOp2->IsCnsIntOrI())
{
bool useJCMP = false;
unsigned flags = 0;
if (relop->OperIs(GT_EQ, GT_NE) && relopOp2->IsIntegralConst(0))
{
// Codegen will use cbz or cbnz, which do not affect the flag register
flags = relop->OperIs(GT_EQ) ? GTF_JCMP_EQ : 0;
useJCMP = true;
}
else if (relop->OperIs(GT_TEST_EQ, GT_TEST_NE) && isPow2(relopOp2->AsIntCon()->IconValue()))
{
// Codegen will use tbz or tbnz, which do not affect the flag register
flags = GTF_JCMP_TST | (relop->OperIs(GT_TEST_EQ) ? GTF_JCMP_EQ : 0);
useJCMP = true;
}
if (useJCMP)
{
relop->SetOper(GT_JCMP);
relop->gtFlags &= ~(GTF_JCMP_TST | GTF_JCMP_EQ);
relop->gtFlags |= flags;
relop->gtType = TYP_VOID;
relopOp2->SetContained();
BlockRange().Remove(jtrue);
assert(relop->gtNext == nullptr);
return nullptr;
}
}
#endif // _TARGET_ARM64_
ContainCheckJTrue(jtrue);
assert(jtrue->gtNext == nullptr);
return nullptr;
}
// Lower "jmp <method>" tail call to insert PInvoke method epilog if required.
void Lowering::LowerJmpMethod(GenTree* jmp)
{
assert(jmp->OperGet() == GT_JMP);
JITDUMP("lowering GT_JMP\n");
DISPNODE(jmp);
JITDUMP("============");
// If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that
// a method returns.
if (comp->info.compCallUnmanaged)
{
InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(jmp));
}
}
// Lower GT_RETURN node to insert PInvoke method epilog if required.
void Lowering::LowerRet(GenTree* ret)
{
assert(ret->OperGet() == GT_RETURN);
JITDUMP("lowering GT_RETURN\n");
DISPNODE(ret);
JITDUMP("============");
#if defined(_TARGET_AMD64_) && defined(FEATURE_SIMD)
GenTreeUnOp* const unOp = ret->AsUnOp();
if ((unOp->TypeGet() == TYP_LONG) && (unOp->gtOp1->TypeGet() == TYP_SIMD8))
{
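// The return is typed TYP_LONG but the value is TYP_SIMD8; insert a GT_BITCAST so that
// the SIMD value is reinterpreted as a long and returned in the integer return register.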
GenTreeUnOp* bitcast = new (comp, GT_BITCAST) GenTreeOp(GT_BITCAST, TYP_LONG, unOp->gtOp1, nullptr);
unOp->gtOp1 = bitcast;
BlockRange().InsertBefore(unOp, bitcast);
}
#endif // _TARGET_AMD64_
// Method doing PInvokes has exactly one return block unless it has tail calls.
if (comp->info.compCallUnmanaged && (comp->compCurBB == comp->genReturnBB))
{
InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(ret));
}
ContainCheckRet(ret->AsOp());
}
GenTree* Lowering::LowerDirectCall(GenTreeCall* call)
{
noway_assert(call->gtCallType == CT_USER_FUNC || call->gtCallType == CT_HELPER);
// We don't support tail calling helper methods.
// But we might encounter tail calls dispatched via the JIT helper, which appear as tail calls to a helper.
noway_assert(!call->IsTailCall() || call->IsTailCallViaHelper() || call->gtCallType == CT_USER_FUNC);
// Non-virtual direct/indirect calls: Work out if the address of the
// call is known at JIT time. If not it is either an indirect call
// or the address must be accessed via a single/double indirection.
void* addr;
InfoAccessType accessType;
CorInfoHelpFunc helperNum = comp->eeGetHelperNum(call->gtCallMethHnd);
#ifdef FEATURE_READYTORUN_COMPILER
if (call->gtEntryPoint.addr != nullptr)
{
accessType = call->gtEntryPoint.accessType;
addr = call->gtEntryPoint.addr;
}
else
#endif
if (call->gtCallType == CT_HELPER)
{
noway_assert(helperNum != CORINFO_HELP_UNDEF);
// The convention on getHelperFtn seems to be (it's not documented)
// that it either returns an address or, if it returns null, sets pAddr to
// another address, which requires an indirection.
void* pAddr;
addr = comp->info.compCompHnd->getHelperFtn(helperNum, (void**)&pAddr);
if (addr != nullptr)
{
assert(pAddr == nullptr);
accessType = IAT_VALUE;
}
else
{
accessType = IAT_PVALUE;
addr = pAddr;
}
}
else
{
noway_assert(helperNum == CORINFO_HELP_UNDEF);
CORINFO_ACCESS_FLAGS aflags = CORINFO_ACCESS_ANY;
if (call->IsSameThis())
{
aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_THIS);
}
if (!call->NeedsNullCheck())
{
aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_NONNULL);
}
CORINFO_CONST_LOOKUP addrInfo;
comp->info.compCompHnd->getFunctionEntryPoint(call->gtCallMethHnd, &addrInfo, aflags);
accessType = addrInfo.accessType;
addr = addrInfo.addr;
}
GenTree* result = nullptr;
switch (accessType)
{
case IAT_VALUE:
// Non-virtual direct call to known address
if (!IsCallTargetInRange(addr) || call->IsTailCall())
{
result = AddrGen(addr);
}
else
{
// a direct call within range of hardware relative call instruction
// stash the address for codegen
call->gtDirectCallAddress = addr;
}
break;
case IAT_PVALUE:
{
// Non-virtual direct calls to addresses accessed by
// a single indirection.
GenTree* cellAddr = AddrGen(addr);
GenTree* indir = Ind(cellAddr);
result = indir;
break;
}
case IAT_PPVALUE:
// Non-virtual direct calls to addresses accessed by
// a double indirection.
//
// Double-indirection. Load the address into a register
// and call indirectly through the register
noway_assert(helperNum == CORINFO_HELP_UNDEF);
result = AddrGen(addr);
result = Ind(Ind(result));
break;
case IAT_RELPVALUE:
{
// Non-virtual direct calls to addresses accessed by
// a single relative indirection.
GenTree* cellAddr = AddrGen(addr);
GenTree* indir = Ind(cellAddr);
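// For IAT_RELPVALUE the cell holds a value relative to the cell's own address, so add the
// cell address back to the loaded value to form the final call target.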
result = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, indir, AddrGen(addr));
break;
}
default:
noway_assert(!"Bad accessType");
break;
}
return result;
}
GenTree* Lowering::LowerDelegateInvoke(GenTreeCall* call)
{
noway_assert(call->gtCallType == CT_USER_FUNC);
assert((comp->info.compCompHnd->getMethodAttribs(call->gtCallMethHnd) &
(CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL)) == (CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL));
GenTree* thisArgNode;
if (call->IsTailCallViaHelper())
{
#ifdef _TARGET_X86_ // x86 tailcall via helper follows normal calling convention, but with extra stack args.
const unsigned argNum = 0;
#else // !_TARGET_X86_
// In case of helper dispatched tail calls, "thisptr" will be the third arg.
// The first two args are: real call target and addr of args copy routine.
const unsigned argNum = 2;
#endif // !_TARGET_X86_
fgArgTabEntry* thisArgTabEntry = comp->gtArgEntryByArgNum(call, argNum);
thisArgNode = thisArgTabEntry->node;
}
else
{
thisArgNode = comp->gtGetThisArg(call);
}
assert(thisArgNode->gtOper == GT_PUTARG_REG);
GenTree* originalThisExpr = thisArgNode->gtOp.gtOp1;
GenTree* thisExpr = originalThisExpr;
// We're going to use the 'this' expression multiple times, so make a local to copy it.
unsigned lclNum;
#ifdef _TARGET_X86_
if (call->IsTailCallViaHelper() && originalThisExpr->IsLocal())
{
// For ordering purposes for the special tailcall arguments on x86, we forced the
// 'this' pointer in this case to a local in Compiler::fgMorphTailCall().
// We could possibly use this case to remove copies for all architectures and non-tailcall
// calls by creating a new lcl var or lcl field reference, as is done in the
// LowerVirtualVtableCall() code.
assert(originalThisExpr->OperGet() == GT_LCL_VAR);
lclNum = originalThisExpr->AsLclVarCommon()->GetLclNum();
}
else
#endif // _TARGET_X86_
{
unsigned delegateInvokeTmp = comp->lvaGrabTemp(true DEBUGARG("delegate invoke call"));
LIR::Use thisExprUse(BlockRange(), &thisArgNode->gtOp.gtOp1, thisArgNode);
ReplaceWithLclVar(thisExprUse, delegateInvokeTmp);
thisExpr = thisExprUse.Def(); // it's changed; reload it.
lclNum = delegateInvokeTmp;
}
// replace original expression feeding into thisPtr with
// [originalThis + offsetOfDelegateInstance]
GenTree* newThisAddr = new (comp, GT_LEA)
GenTreeAddrMode(TYP_BYREF, thisExpr, nullptr, 0, comp->eeGetEEInfo()->offsetOfDelegateInstance);
GenTree* newThis = comp->gtNewOperNode(GT_IND, TYP_REF, newThisAddr);
BlockRange().InsertAfter(thisExpr, newThisAddr, newThis);
thisArgNode->gtOp.gtOp1 = newThis;
ContainCheckIndir(newThis->AsIndir());
// the control target is
// [originalThis + firstTgtOffs]
GenTree* base = new (comp, GT_LCL_VAR) GenTreeLclVar(originalThisExpr->TypeGet(), lclNum, BAD_IL_OFFSET);
unsigned targetOffs = comp->eeGetEEInfo()->offsetOfDelegateFirstTarget;
GenTree* result = new (comp, GT_LEA) GenTreeAddrMode(TYP_REF, base, nullptr, 0, targetOffs);
GenTree* callTarget = Ind(result);
// don't need to sequence and insert this tree, caller will do it
return callTarget;
}
GenTree* Lowering::LowerIndirectNonvirtCall(GenTreeCall* call)
{
#ifdef _TARGET_X86_
if (call->gtCallCookie != nullptr)
{
NYI_X86("Morphing indirect non-virtual call with non-standard args");
}
#endif
// Indirect cookie calls get transformed by fgMorphArgs into an indirect call with non-standard args.
// Hence we should never see this type of call in lower.
noway_assert(call->gtCallCookie == nullptr);
return nullptr;
}
//------------------------------------------------------------------------
// CreateReturnTrapSeq: Create a tree to perform a "return trap", used in PInvoke
// epilogs to invoke a GC under a condition. The return trap checks some global
// location (the runtime tells us where that is and how many indirections to make),
// then, based on the result, conditionally calls a GC helper. We use a special node
// for this because at this time (late in the compilation phases), introducing flow
// is tedious/difficult.
//
// This is used for PInvoke inlining.
//
// Return Value:
// Code tree to perform the action.
//
GenTree* Lowering::CreateReturnTrapSeq()
{
// The GT_RETURNTRAP node expands to this:
// if (g_TrapReturningThreads)
// {
// RareDisablePreemptiveGC();
// }
// The only thing to do here is build up the expression that evaluates 'g_TrapReturningThreads'.
void* pAddrOfCaptureThreadGlobal = nullptr;
LONG* addrOfCaptureThreadGlobal = comp->info.compCompHnd->getAddrOfCaptureThreadGlobal(&pAddrOfCaptureThreadGlobal);
GenTree* testTree;
if (addrOfCaptureThreadGlobal != nullptr)
{
testTree = Ind(AddrGen(addrOfCaptureThreadGlobal));
}
else
{
testTree = Ind(Ind(AddrGen(pAddrOfCaptureThreadGlobal)));
}
return comp->gtNewOperNode(GT_RETURNTRAP, TYP_INT, testTree);
}
//------------------------------------------------------------------------
// SetGCState: Create a tree that stores the given constant (0 or 1) into the
// thread's GC state field.
//
// This is used for PInvoke inlining.
//
// Arguments:
// state - constant (0 or 1) to store into the thread's GC state field.
//
// Return Value:
// Code tree to perform the action.
//
GenTree* Lowering::SetGCState(int state)
{
// Thread.offsetOfGcState = 0/1
assert(state == 0 || state == 1);
const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
GenTree* base = new (comp, GT_LCL_VAR) GenTreeLclVar(TYP_I_IMPL, comp->info.compLvFrameListRoot, -1);
GenTree* stateNode = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_BYTE, state);
GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, base, nullptr, 1, pInfo->offsetOfGCState);
GenTree* storeGcState = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_BYTE, addr, stateNode);
return storeGcState;
}
//------------------------------------------------------------------------
// CreateFrameLinkUpdate: Create a tree that either links or unlinks the
// locally-allocated InlinedCallFrame from the Frame list.
//
// This is used for PInvoke inlining.
//
// Arguments:
// action - whether to link (push) or unlink (pop) the Frame
//
// Return Value:
// Code tree to perform the action.
//
GenTree* Lowering::CreateFrameLinkUpdate(FrameLinkAction action)
{
const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
GenTree* TCB = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot,
(IL_OFFSET)-1); // cast to resolve ambiguity.
// Thread->m_pFrame
GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, TCB, nullptr, 1, pInfo->offsetOfThreadFrame);
GenTree* data = nullptr;
if (action == PushFrame)
{
// Thread->m_pFrame = &inlinedCallFrame;
data = new (comp, GT_LCL_FLD_ADDR)
GenTreeLclFld(GT_LCL_FLD_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
}
else
{
assert(action == PopFrame);
// Thread->m_pFrame = inlinedCallFrame.m_pNext;
data = new (comp, GT_LCL_FLD) GenTreeLclFld(GT_LCL_FLD, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar,
pInfo->inlinedCallFrameInfo.offsetOfFrameLink);
}
GenTree* storeInd = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_I_IMPL, addr, data);
return storeInd;
}
//------------------------------------------------------------------------
// InsertPInvokeMethodProlog: Create the code that runs at the start of
// every method that has PInvoke calls.
//
// Initialize the TCB local and the InlinedCallFrame object. Then link ("push")
// the InlinedCallFrame object on the Frame chain. The layout of InlinedCallFrame
// is defined in vm/frames.h. See also vm/jitinterface.cpp for more information.
// The offsets of these fields are returned by the VM in a call to ICorStaticInfo::getEEInfo().
//
// The (current) layout is as follows:
//
// 64-bit 32-bit CORINFO_EE_INFO
// offset offset field name offset when set
// -----------------------------------------------------------------------------------------
// +00h +00h GS cookie offsetOfGSCookie
// +08h +04h vptr for class InlinedCallFrame offsetOfFrameVptr method prolog
// +10h +08h m_Next offsetOfFrameLink method prolog
// +18h +0Ch m_Datum offsetOfCallTarget call site
// +20h n/a m_StubSecretArg not set by JIT
// +28h +10h m_pCallSiteSP offsetOfCallSiteSP x86: call site, and zeroed in method
// prolog;
// non-x86: method prolog (SP remains
// constant in function, after prolog: no
// localloc and PInvoke in same function)
// +30h +14h m_pCallerReturnAddress offsetOfReturnAddress call site
// +38h +18h m_pCalleeSavedFP offsetOfCalleeSavedFP not set by JIT
// +1Ch JIT retval spill area (int) before call_gc ???
// +20h JIT retval spill area (long) before call_gc ???
// +24h Saved value of EBP method prolog ???
//
// Note that in the VM, InlinedCallFrame is a C++ class whose objects have a 'this' pointer that points
// to the InlinedCallFrame vptr (the 2nd field listed above), and the GS cookie is stored *before*
// the object. When we link the InlinedCallFrame onto the Frame chain, we must point at this location,
// and not at the beginning of the InlinedCallFrame local, which is actually the GS cookie.
//
// Return Value:
// none
//
void Lowering::InsertPInvokeMethodProlog()
{
noway_assert(comp->info.compCallUnmanaged);
noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
if (comp->opts.ShouldUsePInvokeHelpers())
{
return;
}
JITDUMP("======= Inserting PInvoke method prolog\n");
// The first BB must be a scratch BB in order for us to be able to safely insert the P/Invoke prolog.
assert(comp->fgFirstBBisScratch());
LIR::Range& firstBlockRange = LIR::AsRange(comp->fgFirstBB);
const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
// First arg: &compiler->lvaInlinedPInvokeFrameVar + callFrameInfo.offsetOfFrameVptr
GenTree* frameAddr = new (comp, GT_LCL_FLD_ADDR)
GenTreeLclFld(GT_LCL_FLD_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
// Call runtime helper to fill in our InlinedCallFrame and push it on the Frame list:
// TCB = CORINFO_HELP_INIT_PINVOKE_FRAME(&symFrameStart, secretArg);
// for x86, don't pass the secretArg.
CLANG_FORMAT_COMMENT_ANCHOR;
#if defined(_TARGET_X86_) || defined(_TARGET_ARM_)
GenTreeArgList* argList = comp->gtNewArgList(frameAddr);
#else
GenTreeArgList* argList = comp->gtNewArgList(frameAddr, PhysReg(REG_SECRET_STUB_PARAM));
#endif
GenTree* call = comp->gtNewHelperCallNode(CORINFO_HELP_INIT_PINVOKE_FRAME, TYP_I_IMPL, argList);
// some sanity checks on the frame list root vardsc
LclVarDsc* varDsc = &comp->lvaTable[comp->info.compLvFrameListRoot];
noway_assert(!varDsc->lvIsParam);
noway_assert(varDsc->lvType == TYP_I_IMPL);
GenTree* store =
new (comp, GT_STORE_LCL_VAR) GenTreeLclVar(GT_STORE_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot,
(IL_OFFSET)-1); // cast to resolve ambiguity.
store->gtOp.gtOp1 = call;
store->gtFlags |= GTF_VAR_DEF;
GenTree* const insertionPoint = firstBlockRange.FirstNonPhiOrCatchArgNode();
comp->fgMorphTree(store);
firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, store));
DISPTREERANGE(firstBlockRange, store);
#if !defined(_TARGET_X86_) && !defined(_TARGET_ARM_)
// For x86, this step is done at the call site (due to stack pointer not being static in the function).
// For arm32, CallSiteSP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
// --------------------------------------------------------
// InlinedCallFrame.m_pCallSiteSP = @RSP;
GenTreeLclFld* storeSP = new (comp, GT_STORE_LCL_FLD)
GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfCallSiteSP);
storeSP->gtOp1 = PhysReg(REG_SPBASE);
storeSP->gtFlags |= GTF_VAR_DEF;
firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeSP));
DISPTREERANGE(firstBlockRange, storeSP);
#endif // !defined(_TARGET_X86_) && !defined(_TARGET_ARM_)
#if !defined(_TARGET_ARM_)
// For arm32, CalleeSavedFP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
// --------------------------------------------------------
// InlinedCallFrame.m_pCalleeSavedEBP = @RBP;
GenTreeLclFld* storeFP =
new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
callFrameInfo.offsetOfCalleeSavedFP);
storeFP->gtOp1 = PhysReg(REG_FPBASE);
storeFP->gtFlags |= GTF_VAR_DEF;
firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeFP));
DISPTREERANGE(firstBlockRange, storeFP);
#endif // !defined(_TARGET_ARM_)
// --------------------------------------------------------
// On 32-bit targets, CORINFO_HELP_INIT_PINVOKE_FRAME initializes the PInvoke frame and then pushes it onto
// the current thread's Frame stack. On 64-bit targets, it only initializes the PInvoke frame.
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_64BIT_
if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
{
// Push a frame - if we are NOT in an IL stub, this is done right before the call.
// The init routine sets InlinedCallFrame's m_pNext, so we just set the thread's top-of-stack
GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
ContainCheckStoreIndir(frameUpd->AsIndir());
DISPTREERANGE(firstBlockRange, frameUpd);
}
#endif // _TARGET_64BIT_
}
//------------------------------------------------------------------------
// InsertPInvokeMethodEpilog: Code that needs to be run when exiting any method
// that has PInvoke inlines. This needs to be inserted any place you can exit the
// function: returns, tailcalls and jmps.
//
// Arguments:
// returnBB - basic block from which a method can return
// lastExpr - GenTree of the last top level stmnt of returnBB (debug only arg)
//
// Return Value:
// Code tree to perform the action.
//
void Lowering::InsertPInvokeMethodEpilog(BasicBlock* returnBB DEBUGARG(GenTree* lastExpr))
{
assert(returnBB != nullptr);
assert(comp->info.compCallUnmanaged);
if (comp->opts.ShouldUsePInvokeHelpers())
{
return;
}
JITDUMP("======= Inserting PInvoke method epilog\n");
// Method doing PInvoke calls has exactly one return block unless it has "jmp" or tail calls.
assert(((returnBB == comp->genReturnBB) && (returnBB->bbJumpKind == BBJ_RETURN)) ||
returnBB->endsWithTailCallOrJmp(comp));
LIR::Range& returnBlockRange = LIR::AsRange(returnBB);
GenTree* insertionPoint = returnBlockRange.LastNode();
assert(insertionPoint == lastExpr);
// Note: PInvoke Method Epilog (PME) needs to be inserted just before GT_RETURN, GT_JMP or GT_CALL node in execution
// order so that it is guaranteed that there will be no further PInvokes after that point in the method.
//
// Example1: GT_RETURN(op1) - say execution order is: Op1, GT_RETURN. After inserting PME, execution order would be
// Op1, PME, GT_RETURN
//
// Example2: GT_CALL(arg side effect computing nodes, Stk Args Setup, Reg Args setup). The execution order would be
// arg side effect computing nodes, Stk Args setup, Reg Args setup, GT_CALL
// After inserting PME execution order would be:
// arg side effect computing nodes, Stk Args setup, Reg Args setup, PME, GT_CALL
//
// Example3: GT_JMP. After inserting PME execution order would be: PME, GT_JMP
// That is, after the PME, args for the GT_JMP call will be set up.
// TODO-Cleanup: setting GCState to 1 seems to be redundant as InsertPInvokeCallProlog will set it to zero before a
// PInvoke call and InsertPInvokeCallEpilog() will set it back to 1 after the PInvoke. Though this is redundant,
// it is harmless.
// Note that liveness is artificially extending the life of compLvFrameListRoot var if the method being compiled has
// PInvokes. Deleting the below stmnt would cause an assert in lsra.cpp::SetLastUses() since compLvFrameListRoot
// will be live-in to a BBJ_RETURN block without any uses. Long term we need to fix liveness for x64 case to
// properly extend the life of compLvFrameListRoot var.
//
// Thread.offsetOfGcState = 0/1
// That is [tcb + offsetOfGcState] = 1
GenTree* storeGCState = SetGCState(1);
returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeGCState));
ContainCheckStoreIndir(storeGCState->AsIndir());
// Pop the frame if necessary. This always happens in the epilog on 32-bit targets. For 64-bit targets, we only do
// this in the epilog for IL stubs; for non-IL stubs the frame is popped after every PInvoke call.
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_64BIT_
if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
#endif // _TARGET_64BIT_
{
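// Unlink the InlinedCallFrame from the thread's Frame chain (the reverse of the push done by the prolog).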
GenTree* frameUpd = CreateFrameLinkUpdate(PopFrame);
returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
ContainCheckStoreIndir(frameUpd->AsIndir());
}
}
//------------------------------------------------------------------------
// InsertPInvokeCallProlog: Emit the call-site prolog for direct calls to unmanaged code.
// It does all the necessary call-site setup of the InlinedCallFrame.
//
// Arguments:
// call - the call for which we are inserting the PInvoke prolog.
//
// Return Value:
// None.
//
void Lowering::InsertPInvokeCallProlog(GenTreeCall* call)
{
JITDUMP("======= Inserting PInvoke call prolog\n");
GenTree* insertBefore = call;
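// For indirect calls, the entire prolog (including the switch to preemptive mode) is inserted before the
// first node of the call target computation rather than immediately before the call node itself.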
if (call->gtCallType == CT_INDIRECT)
{
bool isClosed;
insertBefore = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
assert(isClosed);
}
const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
gtCallTypes callType = (gtCallTypes)call->gtCallType;
noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
if (comp->opts.ShouldUsePInvokeHelpers())
{
// First argument is the address of the frame variable.
GenTree* frameAddr = new (comp, GT_LCL_VAR_ADDR)
GenTreeLclVar(GT_LCL_VAR_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET);
// Insert call to CORINFO_HELP_JIT_PINVOKE_BEGIN
GenTree* helperCall =
comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID, comp->gtNewArgList(frameAddr));
comp->fgMorphTree(helperCall);
BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, helperCall));
LowerNode(helperCall); // The helper call is inserted before the current node, so it must be lowered here.
return;
}
// Emit the following sequence:
//
// InlinedCallFrame.callTarget = methodHandle // stored in m_Datum
// InlinedCallFrame.m_pCallSiteSP = SP // x86 only
// InlinedCallFrame.m_pCallerReturnAddress = return address
// Thread.gcState = 0
// (non-stub) - update top Frame on TCB // 64-bit targets only
// ----------------------------------------------------------------------------------
// Setup InlinedCallFrame.callSiteTarget (which is how the JIT refers to it).
// The actual field is InlinedCallFrame.m_Datum which has many different uses and meanings.
GenTree* src = nullptr;
if (callType == CT_INDIRECT)
{
#if !defined(_TARGET_64BIT_)
// On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum.
const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE;
src = comp->gtNewIconNode(numStkArgBytes, TYP_INT);
#else
// On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum.
// If the stub parameter value is not needed, m_Datum will be initialized by the VM.
if (comp->info.compPublishStubParam)
{
src = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL);
}
#endif // !defined(_TARGET_64BIT_)
}
else
{
assert(callType == CT_USER_FUNC);
void* pEmbedMethodHandle = nullptr;
CORINFO_METHOD_HANDLE embedMethodHandle =
comp->info.compCompHnd->embedMethodHandle(call->gtCallMethHnd, &pEmbedMethodHandle);
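// Exactly one of the direct handle or the indirection cell pointer is returned by the EE.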
noway_assert((!embedMethodHandle) != (!pEmbedMethodHandle));
if (embedMethodHandle != nullptr)
{
// InlinedCallFrame.callSiteTarget = methodHandle
src = AddrGen(embedMethodHandle);
}
else
{
// InlinedCallFrame.callSiteTarget = *pEmbedMethodHandle
src = Ind(AddrGen(pEmbedMethodHandle));
}
}
if (src != nullptr)
{
// Store into InlinedCallFrame.m_Datum, the offset of which is given by offsetOfCallTarget.
GenTreeLclFld* store =
new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
callFrameInfo.offsetOfCallTarget);
store->gtOp1 = src;
store->gtFlags |= GTF_VAR_DEF;
InsertTreeBeforeAndContainCheck(insertBefore, store);
}
#ifdef _TARGET_X86_
// ----------------------------------------------------------------------------------
// InlinedCallFrame.m_pCallSiteSP = SP
GenTreeLclFld* storeCallSiteSP = new (comp, GT_STORE_LCL_FLD)
GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfCallSiteSP);
storeCallSiteSP->gtOp1 = PhysReg(REG_SPBASE);
storeCallSiteSP->gtFlags |= GTF_VAR_DEF;
InsertTreeBeforeAndContainCheck(insertBefore, storeCallSiteSP);
#endif
// ----------------------------------------------------------------------------------
// InlinedCallFrame.m_pCallerReturnAddress = &label (the address of the instruction immediately following the call)
GenTreeLclFld* storeLab =
new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,
callFrameInfo.offsetOfReturnAddress);
// We don't have a real label, and inserting one is hard (even if we made a special node),
// so for now we will just 'know' what this means in codegen.
GenTreeLabel* labelRef = new (comp, GT_LABEL) GenTreeLabel(nullptr);
labelRef->gtType = TYP_I_IMPL;
storeLab->gtOp1 = labelRef;
storeLab->gtFlags |= GTF_VAR_DEF;
InsertTreeBeforeAndContainCheck(insertBefore, storeLab);
// Push the PInvoke frame if necessary. On 32-bit targets this only happens in the method prolog if a method
// contains PInvokes; on 64-bit targets this is done at every call site for non-IL stubs (IL stubs push the frame
// once in the method prolog).
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_64BIT_
if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
{
// Set the TCB's frame to be the one we just created.
// Note the init routine for the InlinedCallFrame (CORINFO_HELP_INIT_PINVOKE_FRAME)
// has prepended it to the linked list to maintain the stack of Frames.
//
// Stubs do this once per stub, not once per call.
GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, frameUpd));
ContainCheckStoreIndir(frameUpd->AsIndir());
}
#endif // _TARGET_64BIT_
// IMPORTANT **** This instruction must come last!!! ****
// It changes the thread's state to Preemptive mode
// ----------------------------------------------------------------------------------
// [tcb + offsetOfGcState] = 0
GenTree* storeGCState = SetGCState(0);
BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, storeGCState));
ContainCheckStoreIndir(storeGCState->AsIndir());
}
//------------------------------------------------------------------------
// InsertPInvokeCallEpilog: Insert the code that goes after every inlined PInvoke call.
//
// Arguments:
// call - the call for which we are inserting the PInvoke epilog.
//
// Return Value:
// None.
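//
// Notes:
// When PInvoke helpers are in use, a single call to CORINFO_HELP_JIT_PINVOKE_END is inserted instead of the
// sequence below. Otherwise, the code emitted after the call is, in execution order:
// Thread.gcState = 1 (back to cooperative mode)
// return trap check (let the VM suspend the thread if it has requested it, e.g. for GC)
// pop the InlinedCallFrame (64-bit non-IL-stub targets only)
// InlinedCallFrame.m_pCallerReturnAddress = nullptr (32-bit targets only; marks the frame as inactive)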
//
void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call)
{
JITDUMP("======= Inserting PInvoke call epilog\n");
if (comp->opts.ShouldUsePInvokeHelpers())
{
noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
// First argument is the address of the frame variable.
GenTree* frameAddr =
new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET);
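// The node is created as a GT_LCL_VAR and then retyped to GT_LCL_VAR_ADDR so the helper receives the
// address of the frame variable.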
frameAddr->SetOperRaw(GT_LCL_VAR_ADDR);
// Insert call to CORINFO_HELP_JIT_PINVOKE_END
GenTreeCall* helperCall =
comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_END, TYP_VOID, comp->gtNewArgList(frameAddr));
comp->fgMorphTree(helperCall);
BlockRange().InsertAfter(call, LIR::SeqTree(comp, helperCall));
ContainCheckCallOperands(helperCall);
return;
}
// gcstate = 1
GenTree* insertionPoint = call->gtNext;
GenTree* tree = SetGCState(1);
BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
ContainCheckStoreIndir(tree->AsIndir());
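// Insert the return trap: a GT_RETURNTRAP node that checks a runtime flag and, if the VM has requested a
// suspension (e.g. for GC), calls back into the runtime before managed execution resumes.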
tree = CreateReturnTrapSeq();
BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
ContainCheckReturnTrap(tree->AsOp());
// Pop the frame if necessary. On 32-bit targets this only happens in the method epilog; on 64-bit targets this
// happens after every PInvoke call in non-IL stubs. 32-bit targets instead mark the frame as inactive.
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_64BIT_
if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
{
tree = CreateFrameLinkUpdate(PopFrame);
BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
ContainCheckStoreIndir(tree->AsIndir());
}
#else
const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
// ----------------------------------------------------------------------------------
// InlinedCallFrame.m_pCallerReturnAddress = nullptr
GenTreeLclFld* const storeCallSiteTracker =
new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar,