// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX XX
XX Amd64/x86 Code Generator XX
XX XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif
#ifdef _TARGET_XARCH_
#include "emit.h"
#include "codegen.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"
/*****************************************************************************
*
* Generate code that will set the given register to the integer constant.
*/
void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags)
{
// Reg cannot be a FP reg
assert(!genIsValidFloatReg(reg));
// The only TYP_REF constant that can come down this path is a managed 'null', since it is not
// relocatable. Other ref type constants (e.g. string objects) go through a different
// code path.
noway_assert(type != TYP_REF || val == 0);
if (val == 0)
{
instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags);
}
else
{
// TODO-XArch-CQ: needs all the optimized cases
getEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val);
}
}
/*****************************************************************************
*
* Generate code to check that the GS cookie wasn't thrashed by a buffer
* overrun. If pushReg is true, preserve all registers around code sequence.
* Otherwise ECX could be modified.
*
* Implementation Note: pushReg = true, in case of tail calls.
*/
void CodeGen::genEmitGSCookieCheck(bool pushReg)
{
noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal);
// Make sure that EAX is reported as live GC-ref so that any GC that kicks in while
// executing GS cookie check will not collect the object pointed to by EAX.
//
// For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX
// In such a case, make sure that the correct GC-ness of RDX is reported as well, so
// a GC object pointed to by RDX will not be collected.
if (!pushReg)
{
// Handle multi-reg return type values
if (compiler->compMethodReturnsMultiRegRetType())
{
ReturnTypeDesc retTypeDesc;
if (varTypeIsLong(compiler->info.compRetNativeType))
{
retTypeDesc.InitializeLongReturnType(compiler);
}
else // we must have a struct return type
{
retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass);
}
unsigned regCount = retTypeDesc.GetReturnRegCount();
// Only the x86 and x64 Unix ABIs allow multi-reg returns, and the
// number of result regs should be equal to MAX_RET_REG_COUNT.
assert(regCount == MAX_RET_REG_COUNT);
for (unsigned i = 0; i < regCount; ++i)
{
gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i));
}
}
else if (compiler->compMethodReturnsRetBufAddr())
{
// This is for returning in an implicit RetBuf.
// If the address of the buffer is returned in REG_INTRET, mark the content of INTRET as ByRef.
// In case the return is in an implicit RetBuf, the native return type should be a struct
assert(varTypeIsStruct(compiler->info.compRetNativeType));
gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF);
}
// ... all other cases.
else
{
#ifdef _TARGET_AMD64_
// For x64, structs that are not returned in registers are always
// returned in implicit RetBuf. If we reached here, we should not have
// a RetBuf and the return type should not be a struct.
assert(compiler->info.compRetBuffArg == BAD_VAR_NUM);
assert(!varTypeIsStruct(compiler->info.compRetNativeType));
#endif // _TARGET_AMD64_
// For x86 Windows we can't make such assertions since we generate code for returning
// the RetBuf in REG_INTRET only when the ProfilerHook is enabled. Otherwise
// compRetNativeType could be TYP_STRUCT.
gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType);
}
}
regNumber regGSCheck;
regMaskTP regMaskGSCheck = RBM_NONE;
if (!pushReg)
{
// Non-tail call: we can use any callee trash register that is not
// a return register and does not contain the 'this' pointer (which must be kept alive), since
// we are generating the GS cookie check after a GT_RETURN block.
// Note: On Amd64 System V RDX is an arg register - REG_ARG_2 - as well
// as return register for two-register-returned structs.
if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister &&
(compiler->lvaTable[compiler->info.compThisArg].lvRegNum == REG_ARG_0))
{
regGSCheck = REG_ARG_1;
}
else
{
regGSCheck = REG_ARG_0;
}
}
else
{
#ifdef _TARGET_X86_
// It doesn't matter which register we pick, since we're going to save and restore it
// around the check.
// TODO-CQ: Can we optimize the choice of register to avoid doing the push/pop sometimes?
regGSCheck = REG_EAX;
regMaskGSCheck = RBM_EAX;
#else // !_TARGET_X86_
// Tail calls from methods that need GS check: We need to preserve registers while
// emitting GS cookie check for a tail prefixed call or a jmp. To emit GS cookie
// check, we might need a register. This won't be an issue for jmp calls for the
// reason mentioned below (see comment starting with "Jmp Calls:").
//
// The following are the possible solutions in case of tail prefixed calls:
// 1) Use R11 - ignore tail prefix on calls that need to pass a param in R11 when
// present in methods that require GS cookie check. Rest of the tail calls that
// do not require R11 will be honored.
// 2) Internal register - GT_CALL node reserves an internal register and emits GS
// cookie check as part of tail call codegen. GenExitCode() needs to special case
// fast tail calls implemented as epilog+jmp or such tail calls should always get
// dispatched via helper.
// 3) Materialize GS cookie check as a separate node hanging off GT_CALL node in
// right execution order during rationalization.
//
// There are two calls that use R11: VSD and calli pinvokes with cookie param. Tail
// prefix on pinvokes is ignored. That is, options 2 and 3 will allow tail prefixed
// VSD calls from methods that need GS check.
//
// Tail prefixed calls: Right now for Jit64 compat, a method requiring a GS cookie check
// ignores the tail prefix. In future, if we intend to support tail calls from such a method,
// consider one of the options mentioned above. For now we add an assert that we don't
// expect to see a tail call in a method that requires a GS check.
noway_assert(!compiler->compTailCallUsed);
// Jmp calls: these specify a method handle, which the JIT uses to query the VM for the
// entry point address, and hence a jmp call can be neither a VSD call nor a PInvoke calli
// with a cookie parameter. Therefore, in the case of jmp calls it is safe to use R11.
regGSCheck = REG_R11;
#endif // !_TARGET_X86_
}
regMaskTP byrefPushedRegs = RBM_NONE;
regMaskTP norefPushedRegs = RBM_NONE;
regMaskTP pushedRegs = RBM_NONE;
if (compiler->gsGlobalSecurityCookieAddr == nullptr)
{
#if defined(_TARGET_AMD64_)
// If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'.
// Otherwise, load the value into a reg and use 'cmp mem64, reg64'.
if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal)
{
genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL);
getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
}
else
#endif // defined(_TARGET_AMD64_)
{
assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal);
getEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0,
(int)compiler->gsGlobalSecurityCookieVal);
}
}
else
{
// Ngen case - GS cookie value needs to be accessed through an indirection.
pushedRegs = genPushRegs(regMaskGSCheck, &byrefPushedRegs, &norefPushedRegs);
instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr);
getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0);
getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
}
BasicBlock* gsCheckBlk = genCreateTempLabel();
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, gsCheckBlk);
genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN);
genDefineTempLabel(gsCheckBlk);
genPopRegs(pushedRegs, byrefPushedRegs, norefPushedRegs);
}
BasicBlock* CodeGen::genCallFinally(BasicBlock* block)
{
#if FEATURE_EH_FUNCLETS
// Generate a call to the finally, like this:
// mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym
// call finally-funclet
// jmp finally-return // Only for non-retless finally calls
// The jmp can be a NOP if we're going to the next block.
// If we're generating code for the main function (not a funclet), and there is no localloc,
// then RSP at this point is the same value as that stored in the PSPSym. So just copy RSP
// instead of loading the PSPSym in this case, or if PSPSym is not used (CoreRT ABI).
if ((compiler->lvaPSPSym == BAD_VAR_NUM) ||
(!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT)))
{
#ifndef UNIX_X86_ABI
inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL);
#endif // !UNIX_X86_ABI
}
else
{
getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0);
}
getEmitter()->emitIns_J(INS_call, block->bbJumpDest);
if (block->bbFlags & BBF_RETLESS_CALL)
{
// We have a retless call, and the last instruction generated was a call.
// If the next block is in a different EH region (or is the end of the code
// block), then we need to generate a breakpoint here (since it will never
// get executed) to get proper unwind behavior.
if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
{
instGen(INS_BREAKPOINT); // This should never get executed
}
}
else
{
// TODO-Linux-x86: Do we need to handle the GC information for this NOP or JMP specially, as is done for other
// architectures?
#ifndef JIT32_GCENCODER
// Because of the way the flowgraph is connected, the liveness info for this one instruction
// after the call is not (can not be) correct in cases where a variable has a last use in the
// handler. So turn off GC reporting for this single instruction.
getEmitter()->emitDisableGC();
#endif // JIT32_GCENCODER
// Now go to where the finally funclet needs to return to.
if (block->bbNext->bbJumpDest == block->bbNext->bbNext)
{
// Fall-through.
// TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly
// to the next instruction? This would depend on stack walking from within the finally
// handler working without this instruction being in this special EH region.
instGen(INS_nop);
}
else
{
inst_JMP(EJ_jmp, block->bbNext->bbJumpDest);
}
#ifndef JIT32_GCENCODER
getEmitter()->emitEnableGC();
#endif // JIT32_GCENCODER
}
#else // !FEATURE_EH_FUNCLETS
// If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot
// corresponding to the finally's nesting level. When invoked in response to an exception, the
// EE does this.
//
// We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS.
//
// We will emit :
// mov [ebp - (n + 1)], 0
// mov [ebp - n ], 0xFC
// push &step
// jmp finallyBlock
// ...
// step:
// mov [ebp - n ], 0
// jmp leaveTarget
// ...
// leaveTarget:
noway_assert(isFramePointerUsed());
// Get the nesting level which contains the finally
unsigned finallyNesting = 0;
compiler->fgGetNestingLevel(block, &finallyNesting);
// The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
unsigned filterEndOffsetSlotOffs;
filterEndOffsetSlotOffs = (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
unsigned curNestingSlotOffs;
curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE));
// Zero out the slot for the next nesting level
instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar,
curNestingSlotOffs - TARGET_POINTER_SIZE);
instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, LCL_FINALLY_MARK, compiler->lvaShadowSPslotsVar,
curNestingSlotOffs);
// Now push the address where the finally funclet should return to directly.
if (!(block->bbFlags & BBF_RETLESS_CALL))
{
assert(block->isBBCallAlwaysPair());
getEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest);
}
else
{
// EE expects a DWORD, so we give it 0
inst_IV(INS_push_hide, 0);
}
// Jump to the finally BB
inst_JMP(EJ_jmp, block->bbJumpDest);
#endif // !FEATURE_EH_FUNCLETS
// The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the
// jump target using bbJumpDest - that is already used to point
// to the finally block. So just skip past the BBJ_ALWAYS unless the
// block is RETLESS.
if (!(block->bbFlags & BBF_RETLESS_CALL))
{
assert(block->isBBCallAlwaysPair());
block = block->bbNext;
}
return block;
}
#if FEATURE_EH_FUNCLETS
void CodeGen::genEHCatchRet(BasicBlock* block)
{
// Set RAX to the address the VM should return to after the catch.
// Generate a RIP-relative
// lea reg, [rip + disp32] ; the RIP is implicit
// which will be position-independent.
getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET);
}
#else // !FEATURE_EH_FUNCLETS
void CodeGen::genEHFinallyOrFilterRet(BasicBlock* block)
{
// The last statement of the block must be a GT_RETFILT, which has already been generated.
assert(block->lastNode() != nullptr);
assert(block->lastNode()->OperGet() == GT_RETFILT);
if (block->bbJumpKind == BBJ_EHFINALLYRET)
{
assert(block->lastNode()->gtOp.gtOp1 == nullptr); // op1 == nullptr means endfinally
// Return using a pop-jmp sequence. As the "try" block calls
// the finally with a jmp, this leaves the x86 call-ret stack
// balanced in the normal flow path.
noway_assert(isFramePointerRequired());
inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
}
else
{
assert(block->bbJumpKind == BBJ_EHFILTERRET);
// The return value has already been computed.
instGen_Return(0);
}
}
#endif // !FEATURE_EH_FUNCLETS
// Move an immediate value into an integer register
void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags)
{
// reg cannot be a FP register
assert(!genIsValidFloatReg(reg));
if (!compiler->opts.compReloc)
{
size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
}
if ((imm == 0) && !EA_IS_RELOC(size))
{
instGen_Set_Reg_To_Zero(size, reg, flags);
}
else
{
if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm))
{
getEmitter()->emitIns_R_AI(INS_lea, EA_PTR_DSP_RELOC, reg, imm);
}
else
{
getEmitter()->emitIns_R_I(INS_mov, size, reg, imm);
}
}
regSet.verifyRegUsed(reg);
}
/***********************************************************************************
*
* Generate code to set a register 'targetReg' of type 'targetType' to the constant
* specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call
* genProduceReg() on the target register.
*/
void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree)
{
switch (tree->gtOper)
{
case GT_CNS_INT:
{
// relocatable values tend to come down as a CNS_INT of native int type
// so the line between these two opcodes is kind of blurry
GenTreeIntConCommon* con = tree->AsIntConCommon();
ssize_t cnsVal = con->IconValue();
if (con->ImmedValNeedsReloc(compiler))
{
instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, targetReg, cnsVal);
regSet.verifyRegUsed(targetReg);
}
else
{
genSetRegToIcon(targetReg, cnsVal, targetType);
}
}
break;
case GT_CNS_DBL:
{
emitter* emit = getEmitter();
emitAttr size = emitTypeSize(targetType);
double constValue = tree->gtDblCon.gtDconVal;
// Make sure we use "xorps reg, reg" only for +ve zero constant (0.0) and not for -ve zero (-0.0)
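// (A bit-level sketch: +0.0 has the all-zero pattern 0x0000000000000000, while -0.0 is
// 0x8000000000000000 with the sign bit set, so reinterpreting the bits and comparing
// against 0 below accepts positive zero only.)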
if (*(__int64*)&constValue == 0)
{
// A faster/smaller way to generate 0
emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg);
}
else
{
CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size);
emit->emitIns_R_C(ins_Load(targetType), size, targetReg, hnd, 0);
}
}
break;
default:
unreached();
}
}
//------------------------------------------------------------------------
// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForNegNot(GenTree* tree)
{
assert(tree->OperIs(GT_NEG, GT_NOT));
regNumber targetReg = tree->gtRegNum;
var_types targetType = tree->TypeGet();
if (varTypeIsFloating(targetType))
{
assert(tree->gtOper == GT_NEG);
genSSE2BitwiseOp(tree);
}
else
{
GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);
if (operandReg != targetReg)
{
inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
}
instruction ins = genGetInsForOper(tree->OperGet(), targetType);
inst_RV(ins, targetReg, targetType);
}
genProduceReg(tree);
}
// Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
assert(!treeNode->gtOverflowEx());
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
emitAttr size = emitTypeSize(treeNode);
GenTree* op1 = treeNode->gtOp.gtOp1;
GenTree* op2 = treeNode->gtOp.gtOp2;
// to get the high bits of the multiply, we are constrained to using the
// 1-op form: RDX:RAX = RAX * rm
// The 3-op form (Rx=Ry*Rz) does not support it.
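// Illustrative sequence (register names are only an example of what the code below
// may produce for an unsigned MULHI):
//   mov rax, op1      ; regOp copied into RAX if it is not already there
//   mul rcx           ; RDX:RAX = RAX * RCX (rmOp)
//   mov dst, rdx      ; the high half is the result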
genConsumeOperands(treeNode->AsOp());
GenTree* regOp = op1;
GenTree* rmOp = op2;
// Set rmOp to the memory operand (if any)
if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == REG_RAX)))
{
regOp = op2;
rmOp = op1;
}
assert(regOp->isUsedFromReg());
// Set up targetReg when neither of the source operands was a matching register
if (regOp->gtRegNum != REG_RAX)
{
inst_RV_RV(ins_Copy(targetType), REG_RAX, regOp->gtRegNum, targetType);
}
instruction ins;
if ((treeNode->gtFlags & GTF_UNSIGNED) == 0)
{
ins = INS_imulEAX;
}
else
{
ins = INS_mulEAX;
}
emit->emitInsBinary(ins, size, treeNode, rmOp);
// Move the result to the desired register, if necessary
if (treeNode->OperGet() == GT_MULHI && targetReg != REG_RDX)
{
inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
}
genProduceReg(treeNode);
}
#ifdef _TARGET_X86_
//------------------------------------------------------------------------
// genCodeForLongUMod: Generate code for a tree of the form
// `(umod (gt_long x y) (const int))`
//
// Arguments:
// node - the node for which to generate code
//
void CodeGen::genCodeForLongUMod(GenTreeOp* node)
{
assert(node != nullptr);
assert(node->OperGet() == GT_UMOD);
assert(node->TypeGet() == TYP_INT);
GenTreeOp* const dividend = node->gtOp1->AsOp();
assert(dividend->OperGet() == GT_LONG);
assert(varTypeIsLong(dividend));
genConsumeOperands(node);
GenTree* const dividendLo = dividend->gtOp1;
GenTree* const dividendHi = dividend->gtOp2;
assert(dividendLo->isUsedFromReg());
assert(dividendHi->isUsedFromReg());
GenTree* const divisor = node->gtOp2;
assert(divisor->gtSkipReloadOrCopy()->OperGet() == GT_CNS_INT);
assert(divisor->gtSkipReloadOrCopy()->isUsedFromReg());
assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal >= 2);
assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal <= 0x3fffffff);
// dividendLo must be in RAX; dividendHi must be in RDX
genCopyRegIfNeeded(dividendLo, REG_EAX);
genCopyRegIfNeeded(dividendHi, REG_EDX);
// At this point, EDX:EAX contains the 64-bit dividend and op2->gtRegNum
// contains the 32-bit divisor. We want to generate the following code:
//
// cmp edx, divisor->gtRegNum
// jb noOverflow
//
// mov temp, eax
// mov eax, edx
// xor edx, edx
// div divisor->gtRegNum
// mov eax, temp
//
// noOverflow:
// div divisor->gtRegNum
//
// This works because (a * 2^32 + b) % c = ((a % c) * 2^32 + b) % c.
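// A short sketch of that identity: write a = q*c + r with r = a % c. Then
//   a * 2^32 + b = (q*c) * 2^32 + (r * 2^32 + b),
// and the first term is a multiple of c, so
//   (a * 2^32 + b) % c = (r * 2^32 + b) % c = ((a % c) * 2^32 + b) % c.
// Because r = a % c < c, the high half of the second dividend is below the divisor,
// so the second div cannot fault.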
BasicBlock* const noOverflow = genCreateTempLabel();
// cmp edx, divisor->gtRegNum
// jb noOverflow
inst_RV_RV(INS_cmp, REG_EDX, divisor->gtRegNum);
inst_JMP(EJ_jb, noOverflow);
// mov temp, eax
// mov eax, edx
// xor edx, edx
// div divisor->gtRegNum
// mov eax, temp
const regNumber tempReg = node->GetSingleTempReg();
inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT);
inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT);
instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
inst_RV_RV(INS_mov, REG_EAX, tempReg, TYP_INT);
// noOverflow:
// div divisor->gtRegNum
genDefineTempLabel(noOverflow);
inst_RV(INS_div, divisor->gtRegNum, TYP_INT);
const regNumber targetReg = node->gtRegNum;
if (targetReg != REG_EDX)
{
inst_RV_RV(INS_mov, targetReg, REG_RDX, TYP_INT);
}
genProduceReg(node);
}
#endif // _TARGET_X86_
//------------------------------------------------------------------------
// genCodeForDivMod: Generate code for a DIV or MOD operation.
//
// Arguments:
// treeNode - the node to generate the code for
//
void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
{
assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD));
// We shouldn't be seeing GT_MOD on float/double args as it should get morphed into a
// helper call by front-end. Similarly we shouldn't be seeing GT_UDIV and GT_UMOD
// on float/double args.
assert(treeNode->OperIs(GT_DIV) || !varTypeIsFloating(treeNode));
GenTree* dividend = treeNode->gtOp1;
#ifdef _TARGET_X86_
if (varTypeIsLong(dividend->TypeGet()))
{
genCodeForLongUMod(treeNode);
return;
}
#endif // _TARGET_X86_
GenTree* divisor = treeNode->gtOp2;
genTreeOps oper = treeNode->OperGet();
emitAttr size = emitTypeSize(treeNode);
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
// dividend is in a register.
assert(dividend->isUsedFromReg());
genConsumeOperands(treeNode->AsOp());
if (varTypeIsFloating(targetType))
{
// Floating point div/rem operation
assert(oper == GT_DIV || oper == GT_MOD);
if (dividend->gtRegNum == targetReg)
{
emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor);
}
else if (divisor->isUsedFromReg() && divisor->gtRegNum == targetReg)
{
// It is not possible to generate 2-operand divss or divsd where reg2 = reg1 / reg2
// because divss/divsd reg1, reg2 will over-write reg1. Therefore, in case of AMD64
// LSRA has to make sure that such a register assignment is not generated for floating
// point div/rem operations.
noway_assert(
!"GT_DIV/GT_MOD (float): case of reg2 = reg1 / reg2, LSRA should never generate such a reg assignment");
}
else
{
inst_RV_RV(ins_Copy(targetType), targetReg, dividend->gtRegNum, targetType);
emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor);
}
}
else
{
// dividend must be in RAX
genCopyRegIfNeeded(dividend, REG_RAX);
// zero or sign extend rax to rdx
if (oper == GT_UMOD || oper == GT_UDIV)
{
instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
}
else
{
emit->emitIns(INS_cdq, size);
// The cdq instruction writes RDX, so clear the gcInfo for RDX
gcInfo.gcMarkRegSetNpt(RBM_RDX);
}
// Perform the 'targetType' (64-bit or 32-bit) divide instruction
instruction ins;
if (oper == GT_UMOD || oper == GT_UDIV)
{
ins = INS_div;
}
else
{
ins = INS_idiv;
}
emit->emitInsBinary(ins, size, treeNode, divisor);
// DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX.
// Move the result to the desired register, if necessary
if (oper == GT_DIV || oper == GT_UDIV)
{
if (targetReg != REG_RAX)
{
inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
}
}
else
{
assert((oper == GT_MOD) || (oper == GT_UMOD));
if (targetReg != REG_RDX)
{
inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
}
}
}
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCodeForBinary: Generate code for many binary arithmetic operators
// This method is expected to have called genConsumeOperands() before calling it.
//
// Arguments:
// treeNode - The binary operation for which we are generating code.
//
// Return Value:
// None.
//
// Notes:
// Mul and div variants have special constraints on x64 and so are not handled here.
// See the assert below for the operators that are handled.
void CodeGen::genCodeForBinary(GenTree* treeNode)
{
const genTreeOps oper = treeNode->OperGet();
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
#if defined(_TARGET_64BIT_)
assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD || oper == GT_SUB);
#else // !defined(_TARGET_64BIT_)
assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD_LO || oper == GT_ADD_HI ||
oper == GT_SUB_LO || oper == GT_SUB_HI || oper == GT_ADD || oper == GT_SUB);
#endif // !defined(_TARGET_64BIT_)
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
// Commutative operations can mark op1 as contained or reg-optional to generate "op reg, memop/immed"
if (!op1->isUsedFromReg())
{
assert(treeNode->OperIsCommutative());
assert(op1->isMemoryOp() || op1->IsLocal() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() ||
op1->IsRegOptional());
op1 = treeNode->gtGetOp2();
op2 = treeNode->gtGetOp1();
}
instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
// The arithmetic node must be sitting in a register (since it's not contained)
noway_assert(targetReg != REG_NA);
regNumber op1reg = op1->isUsedFromReg() ? op1->gtRegNum : REG_NA;
regNumber op2reg = op2->isUsedFromReg() ? op2->gtRegNum : REG_NA;
GenTree* dst;
GenTree* src;
// This is the case of reg1 = reg1 op reg2
// We're ready to emit the instruction without any moves
if (op1reg == targetReg)
{
dst = op1;
src = op2;
}
// We have reg1 = reg2 op reg1
// In order for this operation to be correct
// we need that op is a commutative operation so
// we can convert it into reg1 = reg1 op reg2 and emit
// the same code as above
else if (op2reg == targetReg)
{
noway_assert(GenTree::OperIsCommutative(oper));
dst = op2;
src = op1;
}
// now we know there are 3 different operands so attempt to use LEA
else if (oper == GT_ADD && !varTypeIsFloating(treeNode) && !treeNode->gtOverflowEx() // LEA does not set flags
&& (op2->isContainedIntOrIImmed() || op2->isUsedFromReg()) && !treeNode->gtSetFlags())
{
if (op2->isContainedIntOrIImmed())
{
emit->emitIns_R_AR(INS_lea, emitTypeSize(treeNode), targetReg, op1reg,
(int)op2->AsIntConCommon()->IconValue());
}
else
{
assert(op2reg != REG_NA);
emit->emitIns_R_ARX(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, op2reg, 1, 0);
}
genProduceReg(treeNode);
return;
}
// dest, op1 and op2 registers are different:
// reg3 = reg1 op reg2
// We can implement this by issuing a mov:
// reg3 = reg1
// reg3 = reg3 op reg2
else
{
inst_RV_RV(ins_Copy(targetType), targetReg, op1reg, targetType);
regSet.verifyRegUsed(targetReg);
gcInfo.gcMarkRegPtrVal(targetReg, targetType);
dst = treeNode;
src = op2;
}
// try to use an inc or dec
if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
{
if (src->IsIntegralConst(1))
{
emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
genProduceReg(treeNode);
return;
}
else if (src->IsIntegralConst(-1))
{
emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
genProduceReg(treeNode);
return;
}
}
regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
noway_assert(r == targetReg);
if (treeNode->gtOverflowEx())
{
#if !defined(_TARGET_64BIT_)
assert(oper == GT_ADD || oper == GT_SUB || oper == GT_ADD_HI || oper == GT_SUB_HI);
#else
assert(oper == GT_ADD || oper == GT_SUB);
#endif
genCheckOverflow(treeNode);
}
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCodeForMul: Generate code for a MUL operation.
//
// Arguments:
// treeNode - the node to generate the code for
//
void CodeGen::genCodeForMul(GenTreeOp* treeNode)
{
assert(treeNode->OperIs(GT_MUL));
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
instruction ins;
emitAttr size = emitTypeSize(treeNode);
bool isUnsignedMultiply = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
bool requiresOverflowCheck = treeNode->gtOverflowEx();
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
// there are 3 forms of x64 multiply:
// 1-op form with 128-bit result: RDX:RAX = RAX * rm
// 2-op form: reg *= rm
// 3-op form: reg = rm * imm
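// For reference, the three forms look roughly like this (operands illustrative):
//   mul  rcx          ; 1-op form: RDX:RAX = RAX * RCX
//   imul rax, rcx     ; 2-op form: RAX = RAX * RCX (low bits only)
//   imul rax, rcx, 9  ; 3-op form: RAX = RCX * 9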
genConsumeOperands(treeNode->AsOp());
// This matches the 'mul' lowering in Lowering::SetMulOpCounts()
//
// immOp :: Only one operand can be an immediate
// rmOp :: Only one operand can be a memory op.
// regOp :: A register op (especially the operand that matches 'targetReg')
// (can be nullptr when we have both a memory op and an immediate op)
GenTree* immOp = nullptr;
GenTree* rmOp = op1;
GenTree* regOp;
if (op2->isContainedIntOrIImmed())
{
immOp = op2;
}
else if (op1->isContainedIntOrIImmed())
{
immOp = op1;
rmOp = op2;
}
if (immOp != nullptr)
{
// This must be a non-floating point operation.
assert(!varTypeIsFloating(treeNode));
// CQ: When possible use LEA for mul by imm 3, 5 or 9
ssize_t imm = immOp->AsIntConCommon()->IconValue();
if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
{
// We will use the LEA instruction to perform this multiply
// Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
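// For example (illustrative encodings only; the emitter call below picks the actual
// registers):
//   lea rax, [rcx + rcx*2]   ; rax = rcx * 3
//   lea rax, [rcx + rcx*4]   ; rax = rcx * 5
//   lea rax, [rcx + rcx*8]   ; rax = rcx * 9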
unsigned int scale = (unsigned int)(imm - 1);
getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
}
else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0))
{
// Use shift for constant multiply when legal
uint64_t zextImm = static_cast<uint64_t>(static_cast<size_t>(imm));
unsigned int shiftAmount = genLog2(zextImm);
if (targetReg != rmOp->gtRegNum)
{
// Copy reg src to dest register
inst_RV_RV(ins_Copy(targetType), targetReg, rmOp->gtRegNum, targetType);
}
inst_RV_SH(INS_shl, size, targetReg, shiftAmount);
}
else
{
// use the 3-op form with immediate
ins = getEmitter()->inst3opImulForReg(targetReg);
emit->emitInsBinary(ins, size, rmOp, immOp);
}
}
else // we have no contained immediate operand
{
regOp = op1;
rmOp = op2;
regNumber mulTargetReg = targetReg;
if (isUnsignedMultiply && requiresOverflowCheck)
{
ins = INS_mulEAX;
mulTargetReg = REG_RAX;
}
else
{
ins = genGetInsForOper(GT_MUL, targetType);
}
// Set rmOp to the memory operand (if any)
// or set regOp to the op2 when it has the matching target register for our multiply op
//
if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == mulTargetReg)))
{
regOp = op2;
rmOp = op1;
}
assert(regOp->isUsedFromReg());
// Set up targetReg when neither of the source operands was a matching register
if (regOp->gtRegNum != mulTargetReg)
{
inst_RV_RV(ins_Copy(targetType), mulTargetReg, regOp->gtRegNum, targetType);
}
emit->emitInsBinary(ins, size, treeNode, rmOp);
// Move the result to the desired register, if necessary
if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
{
inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
}
}
if (requiresOverflowCheck)
{
// Overflow checking is only used for non-floating point types
noway_assert(!varTypeIsFloating(treeNode));
genCheckOverflow(treeNode);
}
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// isStructReturn: Returns whether the 'treeNode' is returning a struct.
//
// Arguments:
// treeNode - The tree node to evaluate whether is a struct return.
//
// Return Value:
// For AMD64 *nix: returns true if the 'treeNode' is a GT_RETURN node of type struct.
// Otherwise returns false.
// For other platforms always returns false.
//
bool CodeGen::isStructReturn(GenTree* treeNode)
{
// This method could be called for 'treeNode' of GT_RET_FILT or GT_RETURN.
// For the GT_RET_FILT, the return is always
// a bool or a void, for the end of a finally block.
noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
if (treeNode->OperGet() != GT_RETURN)
{
return false;
}
#ifdef UNIX_AMD64_ABI
return varTypeIsStruct(treeNode);
#else // !UNIX_AMD64_ABI
assert(!varTypeIsStruct(treeNode));
return false;
#endif // UNIX_AMD64_ABI
}
//------------------------------------------------------------------------
// genStructReturn: Generates code for returning a struct.
//
// Arguments:
// treeNode - The GT_RETURN tree node.
//
// Return Value:
// None
//
// Assumption:
// op1 of GT_RETURN node is either GT_LCL_VAR or multi-reg GT_CALL
void CodeGen::genStructReturn(GenTree* treeNode)
{
assert(treeNode->OperGet() == GT_RETURN);
GenTree* op1 = treeNode->gtGetOp1();
#ifdef UNIX_AMD64_ABI
if (op1->OperGet() == GT_LCL_VAR)
{
GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon();
LclVarDsc* varDsc = &(compiler->lvaTable[lclVar->gtLclNum]);
assert(varDsc->lvIsMultiRegRet);
ReturnTypeDesc retTypeDesc;
retTypeDesc.InitializeStructReturnType(compiler, varDsc->lvVerTypeInfo.GetClassHandle());
unsigned regCount = retTypeDesc.GetReturnRegCount();
assert(regCount == MAX_RET_REG_COUNT);
if (varTypeIsEnregisterableStruct(op1))
{
// Right now the only enregistrable structs supported are SIMD vector types.
assert(varTypeIsSIMD(op1));
assert(op1->isUsedFromReg());
// This is the case where the operand is in a single reg and needs to be
// returned in multiple ABI return registers.
regNumber opReg = genConsumeReg(op1);
regNumber reg0 = retTypeDesc.GetABIReturnReg(0);
regNumber reg1 = retTypeDesc.GetABIReturnReg(1);
if (opReg != reg0 && opReg != reg1)
{
// Operand reg is different from return regs.
// Copy opReg to reg0 and let it be handled by one of the
// two cases below.
inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
opReg = reg0;
}
if (opReg == reg0)
{
assert(opReg != reg1);
// reg0 - already has required 8-byte in bit position [63:0].
// reg1 = opReg.
// swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
inst_RV_RV(ins_Copy(TYP_DOUBLE), reg1, opReg, TYP_DOUBLE);
}
else
{
assert(opReg == reg1);
// reg0 = opReg.
// swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
}
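// A note on the shufpd below: with both operands being the same register and an
// immediate of 0x01, the low qword is taken from the register's upper half and the
// upper qword from its old lower half, i.e. the two 8-byte lanes of reg1 are swapped.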
inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01);
}
else
{
assert(op1->isUsedFromMemory());
// Copy var on stack into ABI return registers
int offset = 0;
for (unsigned i = 0; i < regCount; ++i)
{
var_types type = retTypeDesc.GetReturnRegType(i);
regNumber reg = retTypeDesc.GetABIReturnReg(i);
getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), reg, lclVar->gtLclNum, offset);
offset += genTypeSize(type);
}
}
}
else
{
assert(op1->IsMultiRegCall() || op1->IsCopyOrReloadOfMultiRegCall());
genConsumeRegs(op1);
GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
GenTreeCall* call = actualOp1->AsCall();
ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
unsigned regCount = retTypeDesc->GetReturnRegCount();
assert(regCount == MAX_RET_REG_COUNT);
// Handle circular dependency between call allocated regs and ABI return regs.
//
// It is possible under LSRA stress that originally allocated regs of call node,
// say rax and rdx, are spilled and reloaded to rdx and rax respectively. But
// GT_RETURN needs to move values as follows: rdx->rax, rax->rdx. A similar
// kind of circular dependency could arise between xmm0 and xmm1 return regs.
// Codegen is expected to handle such circular dependency.
//
var_types regType0 = retTypeDesc->GetReturnRegType(0);
regNumber returnReg0 = retTypeDesc->GetABIReturnReg(0);
regNumber allocatedReg0 = call->GetRegNumByIdx(0);
var_types regType1 = retTypeDesc->GetReturnRegType(1);
regNumber returnReg1 = retTypeDesc->GetABIReturnReg(1);
regNumber allocatedReg1 = call->GetRegNumByIdx(1);
if (op1->IsCopyOrReload())
{
// GT_COPY/GT_RELOAD will have valid reg for those positions
// that need to be copied or reloaded.
regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
if (reloadReg != REG_NA)
{
allocatedReg0 = reloadReg;
}
reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
if (reloadReg != REG_NA)
{
allocatedReg1 = reloadReg;
}
}
if (allocatedReg0 == returnReg1 && allocatedReg1 == returnReg0)
{
// Circular dependency - swap allocatedReg0 and allocatedReg1
if (varTypeIsFloating(regType0))
{
assert(varTypeIsFloating(regType1));
// The fastest way to swap two XMM regs is using PXOR
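// The three PXORs below are the classic XOR-swap idiom (a ^= b; b ^= a; a ^= b)
// applied to the full 128-bit registers, exchanging the two values without a
// scratch register.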
inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
inst_RV_RV(INS_pxor, allocatedReg1, allocatedReg0, TYP_DOUBLE);
inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
}
else
{
assert(varTypeIsIntegral(regType0));
assert(varTypeIsIntegral(regType1));
inst_RV_RV(INS_xchg, allocatedReg1, allocatedReg0, TYP_I_IMPL);
}
}
else if (allocatedReg1 == returnReg0)
{
// Change the order of moves to correctly handle dependency.
if (allocatedReg1 != returnReg1)
{
inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
}
if (allocatedReg0 != returnReg0)
{
inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
}
}
else
{
// No circular dependency case.
if (allocatedReg0 != returnReg0)
{
inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
}
if (allocatedReg1 != returnReg1)
{
inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
}
}
}
#else
unreached();
#endif
}
#if defined(_TARGET_X86_)
//------------------------------------------------------------------------
// genFloatReturn: Generates code for float return statement for x86.
//
// Note: treeNode's and op1's registers are already consumed.
//
// Arguments:
// treeNode - The GT_RETURN or GT_RETFILT tree node with float type.
//
// Return Value:
// None
//
void CodeGen::genFloatReturn(GenTree* treeNode)
{
assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
assert(varTypeIsFloating(treeNode));
GenTree* op1 = treeNode->gtGetOp1();
// Spill the return value register from an XMM register to the stack, then load it on the x87 stack.
// If it already has a home location, use that. Otherwise, we need a temp.
if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvOnFrame)
{
if (compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegNum != REG_STK)
{
op1->gtFlags |= GTF_SPILL;
inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1,
op1->gtRegNum);
}
// Now, load it to the fp stack.
getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
}
else
{
// Spill the value, which should be in a register, then load it to the fp stack.
// TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
op1->gtFlags |= GTF_SPILL;
regSet.rsSpillTree(op1->gtRegNum, op1);
op1->gtFlags |= GTF_SPILLED;
op1->gtFlags &= ~GTF_SPILL;
TempDsc* t = regSet.rsUnspillInPlace(op1, op1->gtRegNum);
inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
op1->gtFlags &= ~GTF_SPILLED;
regSet.tmpRlsTemp(t);
}
}
#endif // _TARGET_X86_
//------------------------------------------------------------------------
// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForCompare(GenTreeOp* tree)
{
assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP));
// TODO-XArch-CQ: Check if we can use the currently set flags.
// TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
// (signed < or >= where targetReg != REG_NA)
GenTree* op1 = tree->gtOp1;
var_types op1Type = op1->TypeGet();
if (varTypeIsFloating(op1Type))
{
genCompareFloat(tree);
}
else
{
genCompareInt(tree);
}
}
//------------------------------------------------------------------------
// genCodeForBT: Generates code for a GT_BT node.
//
// Arguments:
// tree - The node.
//
void CodeGen::genCodeForBT(GenTreeOp* bt)
{
assert(bt->OperIs(GT_BT));
GenTree* op1 = bt->gtGetOp1();
GenTree* op2 = bt->gtGetOp2();
var_types type = genActualType(op1->TypeGet());
assert(op1->isUsedFromReg() && op2->isUsedFromReg());
assert((genTypeSize(type) >= genTypeSize(TYP_INT)) && (genTypeSize(type) <= genTypeSize(TYP_I_IMPL)));
genConsumeOperands(bt);
// Note that the emitter doesn't fully support INS_bt; it only supports the reg,reg
// form and encodes the registers in reverse order. To get the correct order we need
// to reverse the operands when calling emitIns_R_R.
getEmitter()->emitIns_R_R(INS_bt, emitTypeSize(type), op2->gtRegNum, op1->gtRegNum);
}
//------------------------------------------------------------------------
// genCodeForJumpTrue: Generates code for jmpTrue statement.
//
// Arguments:
// tree - The GT_JTRUE tree node.
//
// Return Value:
// None
//
void CodeGen::genCodeForJumpTrue(GenTree* tree)
{
GenTree* cmp = tree->gtOp.gtOp1;
assert(cmp->OperIsCompare());
assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
#if !defined(_TARGET_64BIT_)
// Long-typed compares should have been handled by Lowering::LowerCompare.
assert(!varTypeIsLong(cmp->gtGetOp1()));
#endif
// Get the "kind" and type of the comparison. Note that whether it is an unsigned cmp
// is governed by a flag NOT by the inherent type of the node
// TODO-XArch-CQ: Check if we can use the currently set flags.
emitJumpKind jumpKind[2];
bool branchToTrueLabel[2];
genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
BasicBlock* skipLabel = nullptr;
if (jumpKind[0] != EJ_NONE)
{
BasicBlock* jmpTarget;
if (branchToTrueLabel[0])
{
jmpTarget = compiler->compCurBB->bbJumpDest;
}
else
{
// This case arises only for ordered GT_EQ right now
assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
skipLabel = genCreateTempLabel();
jmpTarget = skipLabel;
}
inst_JMP(jumpKind[0], jmpTarget);
}
if (jumpKind[1] != EJ_NONE)
{
// the second conditional branch always has to be to the true label
assert(branchToTrueLabel[1]);
inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
}
if (skipLabel != nullptr)
{
genDefineTempLabel(skipLabel);
}
}
//------------------------------------------------------------------------
// genCodeForJcc: Produce code for a GT_JCC node.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForJcc(GenTreeCC* tree)
{
assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
emitJumpKind jumpKind = genJumpKindForOper(tree->gtCondition, compareKind);
inst_JMP(jumpKind, compiler->compCurBB->bbJumpDest);
}
//------------------------------------------------------------------------
// genCodeForSetcc: Generates a setcc instruction for a GT_SETCC node.
//
// Arguments:
// tree - the GT_SETCC node
//
// Assumptions:
// The condition represents an integer comparison. This code doesn't
// have the necessary logic to deal with floating point comparisons,
// in fact it doesn't even know if the comparison is integer or floating
// point because SETCC nodes do not have any operands.
//
void CodeGen::genCodeForSetcc(GenTreeCC* setcc)
{
regNumber dstReg = setcc->gtRegNum;
CompareKind compareKind = setcc->IsUnsigned() ? CK_UNSIGNED : CK_SIGNED;
emitJumpKind jumpKind = genJumpKindForOper(setcc->gtCondition, compareKind);
assert(genIsValidIntReg(dstReg) && isByteReg(dstReg));
// Make sure nobody is setting GTF_RELOP_NAN_UN on this node as it is ignored.
assert((setcc->gtFlags & GTF_RELOP_NAN_UN) == 0);
inst_SET(jumpKind, dstReg);
inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
genProduceReg(setcc);
}
//------------------------------------------------------------------------
// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node.
//
// Arguments:
// tree - the GT_RETURNTRAP node
//
void CodeGen::genCodeForReturnTrap(GenTreeOp* tree)
{
assert(tree->OperGet() == GT_RETURNTRAP);
// this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
// based on the contents of 'data'
GenTree* data = tree->gtOp1;
genConsumeRegs(data);
GenTreeIntCon cns = intForm(TYP_INT, 0);
cns.SetContained();
getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
BasicBlock* skipLabel = genCreateTempLabel();
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, skipLabel);
// emit the call to the EE-helper that stops for GC (or other reasons)
regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
assert(genIsValidIntReg(tmpReg));
genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
genDefineTempLabel(skipLabel);
}
/*****************************************************************************
*
* Generate code for a single node in the tree.
* Preconditions: All operands have been evaluated
*
*/
void CodeGen::genCodeForTreeNode(GenTree* treeNode)
{
regNumber targetReg;
#if !defined(_TARGET_64BIT_)
if (treeNode->TypeGet() == TYP_LONG)
{
// All long enregistered nodes will have been decomposed into their
// constituent lo and hi nodes.
targetReg = REG_NA;
}
else
#endif // !defined(_TARGET_64BIT_)
{
targetReg = treeNode->gtRegNum;
}
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
#ifdef DEBUG
// Validate that all the operands for the current node are consumed in order.
// This is important because LSRA ensures that any necessary copies will be
// handled correctly.
lastConsumedNode = nullptr;
if (compiler->verbose)
{
unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio
compiler->gtDispLIRNode(treeNode, "Generating: ");
}
#endif // DEBUG
// Is this a node whose value is already in a register? LSRA denotes this by
// setting the GTF_REUSE_REG_VAL flag.
if (treeNode->IsReuseRegVal())
{
// For now, this is only used for constant nodes.
assert((treeNode->OperIsConst()));
JITDUMP(" TreeNode is marked ReuseReg\n");
return;
}
// contained nodes are part of their parents for codegen purposes
// ex : immediates, most LEAs
if (treeNode->isContained())
{
return;
}
switch (treeNode->gtOper)
{
#ifndef JIT32_GCENCODER
case GT_START_NONGC:
getEmitter()->emitDisableGC();
break;
#endif // !defined(JIT32_GCENCODER)
case GT_PROF_HOOK:
#ifdef PROFILING_SUPPORTED
// We should be seeing this only if profiler hook is needed
noway_assert(compiler->compIsProfilerHookNeeded());
// Right now this node is used only for tail calls. In future if
// we intend to use it for Enter or Leave hooks, add a data member
// to this node indicating the kind of profiler hook. For example,
// helper number can be used.
genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
#endif // PROFILING_SUPPORTED
break;
case GT_LCLHEAP:
genLclHeap(treeNode);
break;
case GT_CNS_INT:
#ifdef _TARGET_X86_
assert(!treeNode->IsIconHandle(GTF_ICON_TLS_HDL));
#endif // _TARGET_X86_
__fallthrough;
case GT_CNS_DBL:
genSetRegToConst(targetReg, targetType, treeNode);
genProduceReg(treeNode);
break;
case GT_NOT:
case GT_NEG:
genCodeForNegNot(treeNode);
break;
case GT_MOD:
case GT_UMOD:
case GT_DIV:
case GT_UDIV:
genCodeForDivMod(treeNode->AsOp());
break;
case GT_OR:
case GT_XOR:
case GT_AND:
assert(varTypeIsIntegralOrI(treeNode));
__fallthrough;
#if !defined(_TARGET_64BIT_)
case GT_ADD_LO:
case GT_ADD_HI:
case GT_SUB_LO:
case GT_SUB_HI:
#endif // !defined(_TARGET_64BIT_)
case GT_ADD:
case GT_SUB:
genConsumeOperands(treeNode->AsOp());
genCodeForBinary(treeNode);
break;
case GT_MUL:
genCodeForMul(treeNode->AsOp());
break;
case GT_LSH:
case GT_RSH:
case GT_RSZ:
case GT_ROL:
case GT_ROR:
genCodeForShift(treeNode);
break;
#if !defined(_TARGET_64BIT_)
case GT_LSH_HI:
case GT_RSH_LO:
genCodeForShiftLong(treeNode);
break;
#endif // !defined(_TARGET_64BIT_)
case GT_CAST:
genCodeForCast(treeNode->AsOp());
break;
case GT_BITCAST:
{
GenTree* const op1 = treeNode->AsOp()->gtOp1;
genConsumeReg(op1);
const bool srcFltReg = varTypeIsFloating(op1) || varTypeIsSIMD(op1);
const bool dstFltReg = varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode);
if (srcFltReg != dstFltReg)
{
instruction ins;
regNumber fltReg;
regNumber intReg;
if (dstFltReg)
{
ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
fltReg = treeNode->gtRegNum;
intReg = op1->gtRegNum;
}
else
{
ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
intReg = treeNode->gtRegNum;
fltReg = op1->gtRegNum;
}
inst_RV_RV(ins, fltReg, intReg, treeNode->TypeGet());
}
else if (treeNode->gtRegNum != op1->gtRegNum)
{
inst_RV_RV(ins_Copy(treeNode->TypeGet()), treeNode->gtRegNum, op1->gtRegNum, treeNode->TypeGet());
}
genProduceReg(treeNode);
break;
}
case GT_LCL_FLD_ADDR:
case GT_LCL_VAR_ADDR:
genCodeForLclAddr(treeNode);
break;
case GT_LCL_FLD:
genCodeForLclFld(treeNode->AsLclFld());
break;
case GT_LCL_VAR:
genCodeForLclVar(treeNode->AsLclVar());
break;
case GT_STORE_LCL_FLD:
genCodeForStoreLclFld(treeNode->AsLclFld());
break;
case GT_STORE_LCL_VAR:
genCodeForStoreLclVar(treeNode->AsLclVar());
break;
case GT_RETFILT:
case GT_RETURN:
genReturn(treeNode);
break;
case GT_LEA:
// If we are here, it is the case where there is an LEA that cannot be folded into a parent instruction.
genLeaInstruction(treeNode->AsAddrMode());
break;
case GT_INDEX_ADDR:
genCodeForIndexAddr(treeNode->AsIndexAddr());
break;
case GT_IND:
genCodeForIndir(treeNode->AsIndir());
break;
case GT_MULHI:
#ifdef _TARGET_X86_
case GT_MUL_LONG:
#endif
genCodeForMulHi(treeNode->AsOp());
break;
case GT_INTRINSIC:
genIntrinsic(treeNode);
break;
#ifdef FEATURE_SIMD
case GT_SIMD:
genSIMDIntrinsic(treeNode->AsSIMD());
break;
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
case GT_HWIntrinsic:
genHWIntrinsic(treeNode->AsHWIntrinsic());
break;
#endif // FEATURE_HW_INTRINSICS
case GT_CKFINITE:
genCkfinite(treeNode);
break;
case GT_EQ:
case GT_NE:
case GT_LT:
case GT_LE:
case GT_GE:
case GT_GT:
case GT_TEST_EQ:
case GT_TEST_NE:
case GT_CMP:
genCodeForCompare(treeNode->AsOp());
break;
case GT_JTRUE:
genCodeForJumpTrue(treeNode);
break;
case GT_JCC:
genCodeForJcc(treeNode->AsCC());
break;
case GT_SETCC:
genCodeForSetcc(treeNode->AsCC());
break;
case GT_BT:
genCodeForBT(treeNode->AsOp());
break;
case GT_RETURNTRAP:
genCodeForReturnTrap(treeNode->AsOp());
break;
case GT_STOREIND:
genCodeForStoreInd(treeNode->AsStoreInd());
break;
case GT_COPY:
// This is handled at the time we call genConsumeReg() on the GT_COPY
break;
case GT_LIST:
case GT_FIELD_LIST:
// Should always be marked contained.
assert(!"LIST, FIELD_LIST nodes should always be marked contained.");
break;
case GT_SWAP:
genCodeForSwap(treeNode->AsOp());
break;
case GT_PUTARG_STK:
genPutArgStk(treeNode->AsPutArgStk());
break;
case GT_PUTARG_REG:
genPutArgReg(treeNode->AsOp());
break;
case GT_CALL:
genCallInstruction(treeNode->AsCall());
break;
case GT_JMP:
genJmpMethod(treeNode);
break;
case GT_LOCKADD:
genCodeForLockAdd(treeNode->AsOp());
break;
case GT_XCHG:
case GT_XADD:
genLockedInstructions(treeNode->AsOp());
break;
case GT_MEMORYBARRIER:
instGen_MemoryBarrier();
break;
case GT_CMPXCHG:
genCodeForCmpXchg(treeNode->AsCmpXchg());
break;
case GT_RELOAD:
// do nothing - reload is just a marker.
// The parent node will call genConsumeReg on this which will trigger the unspill of this node's child
// into the register specified in this node.
break;
case GT_NOP:
break;
case GT_NO_OP:
getEmitter()->emitIns_Nop(1);
break;
case GT_ARR_BOUNDS_CHECK:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
case GT_HW_INTRINSIC_CHK:
#endif // FEATURE_HW_INTRINSICS
genRangeCheck(treeNode);
break;
case GT_PHYSREG:
genCodeForPhysReg(treeNode->AsPhysReg());
break;
case GT_NULLCHECK:
genCodeForNullCheck(treeNode->AsOp());
break;
case GT_CATCH_ARG:
noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp));
/* Catch arguments get passed in a register. genCodeForBBlist()
would have marked it as holding a GC object, but not used. */
noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT);
genConsumeReg(treeNode);
break;
#if !FEATURE_EH_FUNCLETS
case GT_END_LFIN:
// Have to clear the ShadowSP of the nesting level which encloses the finally. Generates:
// mov dword ptr [ebp-0xC], 0 // for some slot of the ShadowSP local var
unsigned finallyNesting;
finallyNesting = treeNode->gtVal.gtVal1;
noway_assert(treeNode->gtVal.gtVal1 < compiler->compHndBBtabCount);
noway_assert(finallyNesting < compiler->compHndBBtabCount);
// The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
unsigned filterEndOffsetSlotOffs;
PREFIX_ASSUME(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) >
TARGET_POINTER_SIZE); // below doesn't underflow.
filterEndOffsetSlotOffs =
(unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
unsigned curNestingSlotOffs;
curNestingSlotOffs = filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE);
instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, curNestingSlotOffs);
break;
#endif // !FEATURE_EH_FUNCLETS
case GT_PINVOKE_PROLOG:
noway_assert(((gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur) & ~fullIntArgRegMask()) == 0);
// the runtime side requires the codegen here to be consistent
emit->emitDisableRandomNops();
break;
case GT_LABEL:
genPendingCallLabel = genCreateTempLabel();
treeNode->gtLabel.gtLabBB = genPendingCallLabel;
emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, genPendingCallLabel, treeNode->gtRegNum);
break;
case GT_STORE_OBJ:
case GT_STORE_DYN_BLK:
case GT_STORE_BLK:
genCodeForStoreBlk(treeNode->AsBlk());
break;
case GT_JMPTABLE:
genJumpTable(treeNode);
break;
case GT_SWITCH_TABLE:
genTableBasedSwitch(treeNode);
break;
case GT_ARR_INDEX:
genCodeForArrIndex(treeNode->AsArrIndex());
break;
case GT_ARR_OFFSET:
genCodeForArrOffset(treeNode->AsArrOffs());
break;
case GT_CLS_VAR_ADDR:
emit->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
genProduceReg(treeNode);
break;
#if !defined(_TARGET_64BIT_)
case GT_LONG:
assert(treeNode->isUsedFromReg());
genConsumeRegs(treeNode);
break;
#endif
case GT_IL_OFFSET:
// Do nothing; these nodes are simply markers for debug info.
break;
default:
{
#ifdef DEBUG
char message[256];
_snprintf_s(message, _countof(message), _TRUNCATE, "NYI: Unimplemented node type %s\n",
GenTree::OpName(treeNode->OperGet()));
NYIRAW(message);
#endif
assert(!"Unknown node in codegen");
}
break;
}
}
//----------------------------------------------------------------------------------
// genMultiRegCallStoreToLocal: store multi-reg return value of a call node to a local
//
// Arguments:
// treeNode - Gentree of GT_STORE_LCL_VAR
//
// Return Value:
// None
//
// Assumption:
// The child of store is a multi-reg call node.
// genProduceReg() on treeNode is made by caller of this routine.
//
void CodeGen::genMultiRegCallStoreToLocal(GenTree* treeNode)
{
assert(treeNode->OperGet() == GT_STORE_LCL_VAR);
#ifdef UNIX_AMD64_ABI
// Structs of size >=9 and <=16 are returned in two return registers on x64 Unix.
assert(varTypeIsStruct(treeNode));
// Assumption: current x64 Unix implementation requires that a multi-reg struct
// var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
// being struct promoted.
unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
noway_assert(varDsc->lvIsMultiRegRet);
GenTree* op1 = treeNode->gtGetOp1();
GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
GenTreeCall* call = actualOp1->AsCall();
assert(call->HasMultiRegRetVal());
genConsumeRegs(op1);
ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT);
unsigned regCount = retTypeDesc->GetReturnRegCount();
if (treeNode->gtRegNum != REG_NA)
{
// Right now the only enregistrable structs supported are SIMD types.
assert(varTypeIsSIMD(treeNode));
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));
// This is the case where the two 8-bytes that comprise the operand are in
// two different xmm registers and need to be assembled into a single
// xmm register.
regNumber targetReg = treeNode->gtRegNum;
regNumber reg0 = call->GetRegNumByIdx(0);
regNumber reg1 = call->GetRegNumByIdx(1);
if (op1->IsCopyOrReload())
{
// GT_COPY/GT_RELOAD will have valid reg for those positions
// that need to be copied or reloaded.
regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
if (reloadReg != REG_NA)
{
reg0 = reloadReg;
}
reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
if (reloadReg != REG_NA)
{
reg1 = reloadReg;
}
}
if (targetReg != reg0 && targetReg != reg1)
{
// Copy reg0 into targetReg and let it be handled by one
// of the cases below.
inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE);
targetReg = reg0;
}
if (targetReg == reg0)
{
// targetReg[63:0] = targetReg[63:0]
// targetReg[127:64] = reg1[63:0]
inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00);
}
else
{
assert(targetReg == reg1);
// We need two shuffles to achieve this
// First:
// targetReg[63:0] = targetReg[63:0]
// targetReg[127:64] = reg0[63:0]
//
// Second:
// targetReg[63:0] = targetReg[127:64]
// targetReg[127:64] = targetReg[63:0]
//
// Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg
// and next swap low and high 8-bytes of targetReg to have them
// rearranged in the right order.
inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00);
inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01);
}
}
else
{
// Stack store
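// For example, a 16-byte struct returned in RAX/RDX under the SysV ABI stores as roughly
// (V<n> denotes the local's frame slot; names are illustrative):
//   mov qword ptr [V<n>+0], rax
//   mov qword ptr [V<n>+8], rdx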
int offset = 0;
for (unsigned i = 0; i < regCount; ++i)
{
var_types type = retTypeDesc->GetReturnRegType(i);
regNumber reg = call->GetRegNumByIdx(i);
if (op1->IsCopyOrReload())
{
// GT_COPY/GT_RELOAD will have valid reg for those positions
// that need to be copied or reloaded.
regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
if (reloadReg != REG_NA)
{
reg = reloadReg;
}
}
assert(reg != REG_NA);
getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
offset += genTypeSize(type);
}
varDsc->lvRegNum = REG_STK;
}
#elif defined(_TARGET_X86_)
// Longs are returned in two return registers on x86.
assert(varTypeIsLong(treeNode));
// Assumption: current x86 implementation requires that a multi-reg long
// var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
// being promoted.
unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
noway_assert(varDsc->lvIsMultiRegRet);
GenTree* op1 = treeNode->gtGetOp1();
GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
GenTreeCall* call = actualOp1->AsCall();
assert(call->HasMultiRegRetVal());
genConsumeRegs(op1);
ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
unsigned regCount = retTypeDesc->GetReturnRegCount();
assert(regCount == MAX_RET_REG_COUNT);
// Stack store
int offset = 0;
for (unsigned i = 0; i < regCount; ++i)
{
var_types type = retTypeDesc->GetReturnRegType(i);
regNumber reg = call->GetRegNumByIdx(i);
if (op1->IsCopyOrReload())
{
// GT_COPY/GT_RELOAD will have valid reg for those positions
// that need to be copied or reloaded.
regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
if (reloadReg != REG_NA)
{
reg = reloadReg;
}
}
assert(reg != REG_NA);
getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
offset += genTypeSize(type);
}
varDsc->lvRegNum = REG_STK;
#else // !UNIX_AMD64_ABI && !_TARGET_X86_
assert(!"Unreached");
#endif // !UNIX_AMD64_ABI && !_TARGET_X86_
}
//------------------------------------------------------------------------
// genLclHeap: Generate code for localloc.
//
// Arguments:
// tree - the localloc tree to generate.
//
// Notes:
// Note that for x86, we don't track ESP movements while generating the localloc code.
// The ESP tracking is used to report stack pointer-relative GC info, which is not
// interesting while doing the localloc construction. Also, for functions with localloc,
// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
// call arguments. We store the ESP after the localloc is complete in the LocAllocSP
// variable. This variable is implicitly reported to the VM in the GC info (its position
// is defined by convention relative to other items), and is used by the GC to find the
// "base" stack pointer in functions with localloc.
//
void CodeGen::genLclHeap(GenTree* tree)
{
assert(tree->OperGet() == GT_LCLHEAP);
assert(compiler->compLocallocUsed);
GenTree* size = tree->gtOp.gtOp1;
noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
regNumber targetReg = tree->gtRegNum;
regNumber regCnt = REG_NA;
var_types type = genActualType(size->gtType);
emitAttr easz = emitTypeSize(type);
BasicBlock* endLabel = nullptr;
#ifdef DEBUG
// Verify ESP
if (compiler->opts.compStackCheckOnRet)
{
noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
BasicBlock* esp_check = genCreateTempLabel();
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, esp_check);
getEmitter()->emitIns(INS_BREAKPOINT);
genDefineTempLabel(esp_check);
}
#endif
noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
noway_assert(genStackLevel == 0); // Can't have anything on the stack
unsigned stackAdjustment = 0;
BasicBlock* loop = nullptr;
// Compute the amount of memory to allocate, rounded up to STACK_ALIGN.
size_t amount = 0;
if (size->IsCnsIntOrI())
{
// If size is a constant, then it must be contained.
assert(size->isContained());
// If amount is zero then return null in targetReg
amount = size->gtIntCon.gtIconVal;
if (amount == 0)
{
instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
goto BAILOUT;
}
// 'amount' is the total number of bytes to localloc, rounded up to STACK_ALIGN
amount = AlignUp(amount, STACK_ALIGN);
}
else
{
// The localloc requested memory size is non-constant.
// Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
genConsumeRegAndCopy(size, targetReg);
endLabel = genCreateTempLabel();
getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
inst_JMP(EJ_je, endLabel);
// Compute the size of the block to allocate and perform alignment.
// If compInitMem=true, we can reuse targetReg as regcnt,
// since we don't need any internal registers.
if (compiler->info.compInitMem)
{
assert(tree->AvailableTempRegCount() == 0);
regCnt = targetReg;
}
else
{
regCnt = tree->ExtractTempReg();
if (regCnt != targetReg)
{
// Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
}
}
// Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
// by code like:
// add reg, 15
// and reg, -16
// However, in the initialized memory case, we need the count of STACK_ALIGN-sized
// elements, not a byte count, after the alignment. So instead of the "and", which
// becomes unnecessary, generate a shift, e.g.:
// add reg, 15
// shr reg, 4
inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
if (compiler->info.compInitMem)
{
// Convert the count from a count of bytes to a loop count. We will loop once per
// stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes
// on x64 and Linux/x86.
//
// Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
// words per iteration on x64. We will shift off all the stack alignment bits
// added above, so there is no need for an 'and' instruction.
// --- shr regCnt, 2 (or 4) ---
inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
}
else
{
// Otherwise, mask off the low bits to align the byte count.
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
}
}
#if FEATURE_FIXED_OUT_ARGS
// If we have an outgoing arg area then we must adjust the SP by popping off the
// outgoing arg area. We will restore it right before we return from this method.
//
// Localloc returns stack space that is aligned to STACK_ALIGN bytes. The following
// are the cases that need to be handled:
// i) Method has out-going arg area.
// It is guaranteed that size of out-going arg area is STACK_ALIGN'ed (see fgMorphArgs).
// Therefore, we will pop off the out-going arg area from RSP before allocating the localloc space.
// ii) Method has no out-going arg area.
// Nothing to pop off from the stack.
if (compiler->lvaOutgoingArgSpaceSize > 0)
{
assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain
// aligned
inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
}
#endif
if (size->IsCnsIntOrI())
{
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);
assert((amount % STACK_ALIGN) == 0);
assert((amount % REGSIZE_BYTES) == 0);
// For small allocations we will generate up to six push 0 inline
size_t cntRegSizedWords = amount / REGSIZE_BYTES;
if (cntRegSizedWords <= 6)
{
for (; cntRegSizedWords != 0; cntRegSizedWords--)
{
inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
}
goto ALLOC_DONE;
}
bool doNoInitLessThanOnePageAlloc =
!compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <=
#ifdef _TARGET_X86_
bool needRegCntRegister = true;
#else // !_TARGET_X86_
bool needRegCntRegister = !doNoInitLessThanOnePageAlloc;
#endif // !_TARGET_X86_
if (needRegCntRegister)
{
// If compInitMem=true, we can reuse targetReg as regcnt.
// Since size is a constant, regCnt is not yet initialized.
assert(regCnt == REG_NA);
if (compiler->info.compInitMem)
{
assert(tree->AvailableTempRegCount() == 0);
regCnt = targetReg;
}
else
{
regCnt = tree->ExtractTempReg();
}
}
if (doNoInitLessThanOnePageAlloc)
{
// Since the size is less than a page, simply adjust ESP.
// ESP might already be in the guard page, so we must touch it BEFORE
// the alloc, not after.
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_X86_
// For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment
// to ESP. So do the work in the count register.
// TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require
// creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't
// track".
inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL);
getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE);
inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL);
#else // !_TARGET_X86_
getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);
#endif // !_TARGET_X86_
goto ALLOC_DONE;
}
// else, "mov regCnt, amount"
if (compiler->info.compInitMem)
{
// When initializing memory, we want 'amount' to be the loop count.
assert((amount % STACK_ALIGN) == 0);
amount /= STACK_ALIGN;
}
genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG);
}
loop = genCreateTempLabel();
if (compiler->info.compInitMem)
{
// At this point 'regCnt' is set to the number of loop iterations for this loop, if each
// iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
// Since we have to zero out the allocated memory AND ensure that RSP is always valid
// by tickling the pages, we will just push 0's on the stack.
assert(genIsValidIntReg(regCnt));
// Loop:
genDefineTempLabel(loop);
static_assert_no_msg((STACK_ALIGN % REGSIZE_BYTES) == 0);
unsigned const count = (STACK_ALIGN / REGSIZE_BYTES);
for (unsigned i = 0; i < count; i++)
{
inst_IV(INS_push_hide, 0); // --- push REG_SIZE bytes of 0
}
// Note that the stack must always be aligned to STACK_ALIGN bytes
// Decrement the loop counter and loop if not done.
inst_RV(INS_dec, regCnt, TYP_I_IMPL);
inst_JMP(EJ_jne, loop);
}
else
{
// At this point 'regCnt' is set to the total number of bytes to localloc.
//
// We don't need to zero out the allocated memory. However, we do have
// to tickle the pages to ensure that ESP is always valid and is
// in sync with the "stack guard page". Note that in the worst
// case ESP is on the last byte of the guard page. Thus you must
// touch ESP+0 first, not ESP+0x1000.
//
// Another subtlety is that you don't want ESP to be exactly on the
// boundary of the guard page because PUSH is predecrement, thus
// call setup would not touch the guard page but just beyond it
//
// Note that we go through a few hoops so that ESP never points to
// illegal pages at any time during the tickling process
//
// neg REGCNT
// add REGCNT, ESP // reg now holds ultimate ESP
// jb loop // result is smaller than original ESP (no wrap around)
// xor REGCNT, REGCNT // Overflow, pick lowest possible number
// loop:
// test ESP, [ESP+0] // tickle the page
// mov REGTMP, ESP
// sub REGTMP, GetOsPageSize()
// mov ESP, REGTMP
// cmp ESP, REGCNT
// jae loop
//
// mov ESP, REGCNT
// end:
inst_RV(INS_NEG, regCnt, TYP_I_IMPL);
inst_RV_RV(INS_add, regCnt, REG_SPBASE, TYP_I_IMPL);
inst_JMP(EJ_jb, loop);
instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
genDefineTempLabel(loop);
// Tickle the decremented value, and move back to ESP,
// note that it has to be done BEFORE the update of ESP since
// ESP might already be on the guard page. It is OK to leave
// the final value of ESP on the guard page
getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
// This is a harmless trick to avoid the emitter trying to track the
// decrement of the ESP - we do the subtraction in another reg instead
// of adjusting ESP directly.
regNumber regTmp = tree->GetSingleTempReg();
inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE);
inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL);
inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL);
inst_JMP(EJ_jae, loop);
// Move the final value to ESP
inst_RV_RV(INS_mov, REG_SPBASE, regCnt);
}
ALLOC_DONE:
// Re-adjust SP to allocate out-going arg area
if (stackAdjustment > 0)
{
assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
inst_RV_IV(INS_sub, REG_SPBASE, stackAdjustment, EA_PTRSIZE);
}
// Return the stackalloc'ed address in result register.
// TargetReg = RSP + stackAdjustment.
getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, targetReg, REG_SPBASE, stackAdjustment);
if (endLabel != nullptr)
{
genDefineTempLabel(endLabel);
}
BAILOUT:
// Write the lvaLocAllocSPvar stack frame slot
if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
{
getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0);
}
#if STACK_PROBES
if (compiler->opts.compNeedStackProbes)
{
genGenerateStackProbe();
}
#endif
#ifdef DEBUG
// Update new ESP
if (compiler->opts.compStackCheckOnRet)
{
noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
}
#endif
genProduceReg(tree);
}
void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
{
assert(storeBlkNode->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
if (storeBlkNode->OperIs(GT_STORE_OBJ) && storeBlkNode->OperIsCopyBlkOp() && !storeBlkNode->gtBlkOpGcUnsafe)
{
assert(storeBlkNode->AsObj()->gtGcPtrCount != 0);
genCodeForCpObj(storeBlkNode->AsObj());
return;
}
#ifdef JIT32_GCENCODER
assert(!storeBlkNode->gtBlkOpGcUnsafe);
#else
if (storeBlkNode->gtBlkOpGcUnsafe)
{
getEmitter()->emitDisableGC();
}
#endif // JIT32_GCENCODER
bool isCopyBlk = storeBlkNode->OperIsCopyBlkOp();
switch (storeBlkNode->gtBlkOpKind)
{
#ifdef _TARGET_AMD64_
case GenTreeBlk::BlkOpKindHelper:
if (isCopyBlk)
{
genCodeForCpBlk(storeBlkNode);
}
else
{
genCodeForInitBlk(storeBlkNode);
}
break;
#endif // _TARGET_AMD64_
case GenTreeBlk::BlkOpKindRepInstr:
if (isCopyBlk)
{
genCodeForCpBlkRepMovs(storeBlkNode);
}
else
{
genCodeForInitBlkRepStos(storeBlkNode);
}
break;
case GenTreeBlk::BlkOpKindUnroll:
if (isCopyBlk)
{
genCodeForCpBlkUnroll(storeBlkNode);
}
else
{
genCodeForInitBlkUnroll(storeBlkNode);
}
break;
default:
unreached();
}
#ifndef JIT32_GCENCODER
if (storeBlkNode->gtBlkOpGcUnsafe)
{
getEmitter()->emitEnableGC();
}
#endif // !defined(JIT32_GCENCODER)
}
//
//------------------------------------------------------------------------
// genCodeForInitBlkRepStos: Generate code for InitBlk using rep stos.
//
// Arguments:
// initBlkNode - The Block store for which we are generating code.
//
// Preconditions:
// On x64:
// The size of the buffer must be a constant and also less than INITBLK_STOS_LIMIT bytes.
// For any size larger than that, we'll use the helper even if both the fill byte and the
// size are integer constants.
// On x86:
// The size must either be a non-constant or less than INITBLK_STOS_LIMIT bytes.
//
void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
{
// Make sure we got the arguments of the initblk/initobj operation in the right registers.
unsigned size = initBlkNode->Size();
GenTree* dstAddr = initBlkNode->Addr();
GenTree* initVal = initBlkNode->Data();
if (initVal->OperIsInitVal())
{
initVal = initVal->gtGetOp1();
}
#ifdef DEBUG
assert(dstAddr->isUsedFromReg());
assert(initVal->isUsedFromReg());
#ifdef _TARGET_AMD64_
assert(size != 0);
#endif
if (initVal->IsCnsIntOrI())
{
#ifdef _TARGET_AMD64_
assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
#else
// Note that a size of zero means a non-constant size.
assert((size == 0) || (size > CPBLK_UNROLL_LIMIT));
#endif
}
#endif // DEBUG
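// The emitted sequence is roughly (register setup is done by genConsumeBlockOp, which may
// omit moves for operands that are already in the right registers):
//   mov RDI, <dstAddr>
//   mov RAX, <initVal>
//   mov RCX, <size>
//   rep stosb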
genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
instGen(INS_r_stosb);
}
// Generate code for InitBlk by performing a loop unroll
// Preconditions:
// a) Both the size and fill byte value are integer constants.
// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
//
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
{
// Make sure we got the arguments of the initblk/initobj operation in the right registers
unsigned size = initBlkNode->Size();
GenTree* dstAddr = initBlkNode->Addr();
GenTree* initVal = initBlkNode->Data();
if (initVal->OperIsInitVal())
{
initVal = initVal->gtGetOp1();
}
assert(dstAddr->isUsedFromReg());
assert(initVal->isUsedFromReg() || (initVal->IsIntegralConst(0) && ((size & 0xf) == 0)));
assert(size != 0);
assert(size <= INITBLK_UNROLL_LIMIT);
assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI());
emitter* emit = getEmitter();
genConsumeOperands(initBlkNode);
// If the initVal was moved, or spilled and reloaded to a different register,
// get the original initVal from below the GT_RELOAD, but only after capturing the valReg,
// which needs to be the new register.
regNumber valReg = initVal->gtRegNum;
initVal = initVal->gtSkipReloadOrCopy();
unsigned offset = 0;
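// Illustrative sketch: a 24-byte zero-init on x64 unrolls to roughly
// (register names are hypothetical):
//   xorps  xmm0, xmm0
//   movdqu xmmword ptr [rdi+0], xmm0
//   mov    qword ptr [rdi+16], rax      ; rax = valReg, holding the (zero) fill value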
// Perform an unroll using SSE2 loads and stores.
if (size >= XMM_REGSIZE_BYTES)
{
regNumber tmpReg = initBlkNode->GetSingleTempReg();
assert(genIsValidFloatReg(tmpReg));
if (initVal->gtIntCon.gtIconVal != 0)
{
emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, tmpReg, valReg);
emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
#ifdef _TARGET_X86_
// For x86, we need one more to convert it from 8 bytes to 16 bytes.
emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
#endif // _TARGET_X86_
}
else
{
emit->emitIns_R_R(INS_xorps, EA_8BYTE, tmpReg, tmpReg);
}
// Determine how many 16 byte slots we're going to fill using SSE movs.
size_t slots = size / XMM_REGSIZE_BYTES;
while (slots-- > 0)
{
emit->emitIns_AR_R(INS_movdqu, EA_8BYTE, tmpReg, dstAddr->gtRegNum, offset);
offset += XMM_REGSIZE_BYTES;
}
}
// Fill the remainder (or a < 16 byte sized struct)
if ((size & 8) != 0)
{
#ifdef _TARGET_X86_
// TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 4;
emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 4;
#else // !_TARGET_X86_
emit->emitIns_AR_R(INS_mov, EA_8BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 8;
#endif // !_TARGET_X86_
}
if ((size & 4) != 0)
{
emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 4;
}
if ((size & 2) != 0)
{
emit->emitIns_AR_R(INS_mov, EA_2BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 2;
}
if ((size & 1) != 0)
{
emit->emitIns_AR_R(INS_mov, EA_1BYTE, valReg, dstAddr->gtRegNum, offset);
}
}
// Generates code for InitBlk by calling the VM memset helper function.
// Preconditions:
// a) The size argument of the InitBlk is not an integer constant.
// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes.
void CodeGen::genCodeForInitBlk(GenTreeBlk* initBlkNode)
{
#ifdef _TARGET_AMD64_
// Make sure we got the arguments of the initblk operation in the right registers
unsigned blockSize = initBlkNode->Size();
GenTree* dstAddr = initBlkNode->Addr();
GenTree* initVal = initBlkNode->Data();
if (initVal->OperIsInitVal())
{
initVal = initVal->gtGetOp1();
}
assert(dstAddr->isUsedFromReg());
assert(initVal->isUsedFromReg());
if (blockSize != 0)
{
assert(blockSize >= CPBLK_MOVS_LIMIT);
}
genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
#else // !_TARGET_AMD64_
NYI_X86("Helper call for InitBlk");
#endif // !_TARGET_AMD64_
}
// Generate code for a load from some address + offset
// baseNode: tree node which can be either a local address or arbitrary node
// offset: distance from the baseNode from which to load
void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset)
{
emitter* emit = getEmitter();
if (baseNode->OperIsLocalAddr())
{
if (baseNode->gtOper == GT_LCL_FLD_ADDR)
{
offset += baseNode->gtLclFld.gtLclOffs;
}
emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset);
}
else
{
emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset);
}
}
//------------------------------------------------------------------------
// genCodeForStoreOffset: Generate code to store a reg to [base + offset].
//
// Arguments:
// ins - the instruction to generate.
// size - the size that needs to be stored.
// src - the register which needs to be stored.
// baseNode - the base, relative to which to store the src register.
// offset - the offset that is added to the baseNode to calculate the address to store into.
//
void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset)
{
emitter* emit = getEmitter();
if (baseNode->OperIsLocalAddr())
{
if (baseNode->gtOper == GT_LCL_FLD_ADDR)
{
offset += baseNode->gtLclFld.gtLclOffs;
}
emit->emitIns_S_R(ins, size, src, baseNode->AsLclVarCommon()->GetLclNum(), offset);
}
else
{
emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset);
}
}
// Generates CpBlk code by performing a loop unroll
// Preconditions:
// The size argument of the CpBlk node is a constant and <= 64 bytes.
// This may seem small but covers >95% of the cases in several framework assemblies.
//
void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
{
// Make sure we got the arguments of the cpblk operation in the right registers
unsigned size = cpBlkNode->Size();
GenTree* dstAddr = cpBlkNode->Addr();
GenTree* source = cpBlkNode->Data();
GenTree* srcAddr = nullptr;
assert(size <= CPBLK_UNROLL_LIMIT);
emitter* emit = getEmitter();
if (dstAddr->isUsedFromReg())
{
genConsumeReg(dstAddr);
}
if (source->gtOper == GT_IND)
{
srcAddr = source->gtGetOp1();
if (srcAddr->isUsedFromReg())
{
genConsumeReg(srcAddr);
}
}
else
{
noway_assert(source->IsLocal());
// TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree.
// OR: transform source to GT_IND(GT_LCL_VAR_ADDR)
if (source->OperGet() == GT_LCL_VAR)
{
source->SetOper(GT_LCL_VAR_ADDR);
}
else
{
assert(source->OperGet() == GT_LCL_FLD);
source->SetOper(GT_LCL_FLD_ADDR);
}
srcAddr = source;
}
unsigned offset = 0;
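// Illustrative sketch: a 24-byte copy on x64 unrolls to roughly
// (register names are hypothetical):
//   movdqu xmm0, xmmword ptr [rsi+0]
//   movdqu xmmword ptr [rdi+0], xmm0
//   mov    rax, qword ptr [rsi+16]
//   mov    qword ptr [rdi+16], rax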
// If the size of this struct is larger than 16 bytes
// let's use SSE2 to be able to do 16 byte at a time
// loads and stores.
if (size >= XMM_REGSIZE_BYTES)
{
regNumber xmmReg = cpBlkNode->GetSingleTempReg(RBM_ALLFLOAT);
assert(genIsValidFloatReg(xmmReg));
size_t slots = size / XMM_REGSIZE_BYTES;
// TODO: In the below code the load and store instructions are for 16 bytes, but the
// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
// this probably needs to be changed.
while (slots-- > 0)
{
// Load
genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset);
// Store
genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
offset += XMM_REGSIZE_BYTES;
}
}
// Fill the remainder (15 bytes or less) if there's one.
if ((size & 0xf) != 0)
{
// Grab the integer temp register to emit the remaining loads and stores.
regNumber tmpReg = cpBlkNode->GetSingleTempReg(RBM_ALLINT);
if ((size & 8) != 0)
{
#ifdef _TARGET_X86_
// TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4)
{
genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
}
#else // !_TARGET_X86_
genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset);
genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
offset += 8;
#endif // !_TARGET_X86_
}
if ((size & 4) != 0)
{
genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
offset += 4;
}
if ((size & 2) != 0)
{
genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset);
genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
offset += 2;
}
if ((size & 1) != 0)
{
genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset);
genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
}
}
}
// Generate code for CpBlk by using rep movs
// Preconditions:
// The size argument of the CpBlk is a constant and is between
// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode)
{
// Make sure we got the arguments of the cpblk operation in the right registers
unsigned size = cpBlkNode->Size();
GenTree* dstAddr = cpBlkNode->Addr();
GenTree* source = cpBlkNode->Data();
GenTree* srcAddr = nullptr;
#ifdef DEBUG
assert(dstAddr->isUsedFromReg());
assert(source->isContained());
#ifdef _TARGET_X86_
if (size == 0)
{
noway_assert(cpBlkNode->OperGet() == GT_STORE_DYN_BLK);
}
else
#endif
{
#ifdef _TARGET_AMD64_
assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
#else
assert(size > CPBLK_UNROLL_LIMIT);
#endif
}
#endif // DEBUG
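// The emitted sequence is roughly (register setup is done by genConsumeBlockOp, which may
// omit moves for operands that are already in the right registers):
//   mov RDI, <dstAddr>
//   mov RSI, <srcAddr>
//   mov RCX, <size>
//   rep movsb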
genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX);
instGen(INS_r_movsb);
}
#ifdef FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// CodeGen::genMove8IfNeeded: Conditionally move 8 bytes of a struct to the argument area
//
// Arguments:
// size - The size of bytes remaining to be moved
// longTmpReg - The tmp register to be used for the long value
// srcAddr - The address of the source struct
// offset - The current offset being copied
//
// Return Value:
// Returns the number of bytes moved (8 or 0).
//
// Notes:
// This is used in the PutArgStkKindUnroll case, to move any bytes that are
// not an even multiple of 16.
// On x86, longTmpReg must be an xmm reg; on x64 it must be an integer register.
// This is checked by genStoreRegToStackArg.
//
unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree* srcAddr, unsigned offset)
{
#ifdef _TARGET_X86_
instruction longMovIns = INS_movq;
#else // !_TARGET_X86_
instruction longMovIns = INS_mov;
#endif // !_TARGET_X86_
if ((size & 8) != 0)
{
genCodeForLoadOffset(longMovIns, EA_8BYTE, longTmpReg, srcAddr, offset);
genStoreRegToStackArg(TYP_LONG, longTmpReg, offset);
return 8;
}
return 0;
}
//------------------------------------------------------------------------
// CodeGen::genMove4IfNeeded: Conditionally move 4 bytes of a struct to the argument area
//
// Arguments:
// size - The size of bytes remaining to be moved
// intTmpReg - The tmp register to be used for the 4-byte value
// srcAddr - The address of the source struct
// offset - The current offset being copied
//
// Return Value:
// Returns the number of bytes moved (4 or 0).
//
// Notes:
// This is used in the PutArgStkKindUnroll case, to move any bytes that are
// not an even multiple of 16.
// intTmpReg must be an integer register.
// This is checked by genStoreRegToStackArg.
//
unsigned CodeGen::genMove4IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
{
if ((size & 4) != 0)
{
genCodeForLoadOffset(INS_mov, EA_4BYTE, intTmpReg, srcAddr, offset);
genStoreRegToStackArg(TYP_INT, intTmpReg, offset);
return 4;
}
return 0;
}
//------------------------------------------------------------------------
// CodeGen::genMove2IfNeeded: Conditionally move 2 bytes of a struct to the argument area
//
// Arguments:
// size - The size of bytes remaining to be moved
// intTmpReg - The tmp register to be used for the 2-byte value
// srcAddr - The address of the source struct
// offset - The current offset being copied
//
// Return Value:
// Returns the number of bytes moved (2 or 0).
//
// Notes:
// This is used in the PutArgStkKindUnroll case, to move any bytes that are
// not an even multiple of 16.
// intTmpReg must be an integer register.
// This is checked by genStoreRegToStackArg.
//
unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
{
if ((size & 2) != 0)
{
genCodeForLoadOffset(INS_mov, EA_2BYTE, intTmpReg, srcAddr, offset);
genStoreRegToStackArg(TYP_SHORT, intTmpReg, offset);
return 2;
}
return 0;
}
//------------------------------------------------------------------------
// CodeGen::genMove1IfNeeded: Conditionally move 1 byte of a struct to the argument area
//
// Arguments:
// size - The size of bytes remaining to be moved
// intTmpReg - The tmp register to be used for the 1-byte value
// srcAddr - The address of the source struct
// offset - The current offset being copied
//
// Return Value:
// Returns the number of bytes moved (1 or 0).
//
// Notes:
// This is used in the PutArgStkKindUnroll case, to move any bytes that are
// not an even multiple of 16.
// intTmpReg must be an integer register.
// This is checked by genStoreRegToStackArg.
//
unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
{
if ((size & 1) != 0)
{
genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset);
genStoreRegToStackArg(TYP_BYTE, intTmpReg, offset);
return 1;
}
return 0;
}
//---------------------------------------------------------------------------------------------------------------//
// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling.
//
// Arguments:
// putArgNode - the PutArgStk tree.
//
// Notes:
// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct will be copied to the
// stack.
//
// TODO-Amd64-Unix: Try to share code with copyblk.
// Need refactoring of copyblk before it could be used for putarg_stk.
// The difference for now is that a putarg_stk contains its children, while cpblk does not.
// This creates differences in code. After some significant refactoring it could be reused.
//
void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
{
GenTree* src = putArgNode->gtOp.gtOp1;
// We will never call this method for SIMD types, which are stored directly
// in genPutStructArgStk().
noway_assert(src->TypeGet() == TYP_STRUCT);
unsigned size = putArgNode->getArgSize();
assert(size <= CPBLK_UNROLL_LIMIT);
emitter* emit = getEmitter();
unsigned putArgOffset = putArgNode->getArgOffset();
assert(src->isContained());
assert(src->gtOper == GT_OBJ);
if (src->gtOp.gtOp1->isUsedFromReg())
{
genConsumeReg(src->gtOp.gtOp1);
}
unsigned offset = 0;
regNumber xmmTmpReg = REG_NA;
regNumber intTmpReg = REG_NA;
regNumber longTmpReg = REG_NA;
#ifdef _TARGET_X86_
// On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
// less than 16 bytes, we will just be using pushes
if (size >= 8)
{
xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
longTmpReg = xmmTmpReg;
}
if ((size & 0x7) != 0)
{
intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
}
#else // !_TARGET_X86_
// On x64 we use an XMM register only for 16-byte chunks.
if (size >= XMM_REGSIZE_BYTES)
{
xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
}
if ((size & 0xf) != 0)
{
intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
longTmpReg = intTmpReg;
}
#endif // !_TARGET_X86_
// If the size of this struct is larger than 16 bytes
// let's use SSE2 to be able to do 16 byte at a time
// loads and stores.
if (size >= XMM_REGSIZE_BYTES)
{
#ifdef _TARGET_X86_
assert(!m_pushStkArg);
#endif // _TARGET_X86_
size_t slots = size / XMM_REGSIZE_BYTES;
assert(putArgNode->gtGetOp1()->isContained());
assert(putArgNode->gtGetOp1()->gtOp.gtOper == GT_OBJ);
// TODO: In the below code the load and store instructions are for 16 bytes, but the
// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
// this probably needs to be changed.
while (slots-- > 0)
{
// Load
genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
// Store
genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
offset += XMM_REGSIZE_BYTES;
}
}
// Fill the remainder (15 bytes or less) if there's one.
if ((size & 0xf) != 0)
{
#ifdef _TARGET_X86_
if (m_pushStkArg)
{
// This case is currently supported only for the case where the total size is
// less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
// order. However, morph has ensured that we have a struct that is an even
// multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
assert(((size & 0xc) == size) && (offset == 0));
// If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
// whether we've got an 8 byte chunk, and then push it on the stack.
unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, size & 0x8);
// Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
// and push it on the stack.
pushedBytes += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, 0);
}
else
#endif // _TARGET_X86_
{
offset += genMove8IfNeeded(size, longTmpReg, src->gtOp.gtOp1, offset);
offset += genMove4IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
offset += genMove2IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
offset += genMove1IfNeeded(size, intTmpReg, src->gtOp.gtOp1, offset);
assert(offset == size);
}
}
}
//------------------------------------------------------------------------
// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs.
//
// Arguments:
// putArgNode - the PutArgStk tree.
//
// Preconditions:
// The size argument of the PutArgStk (for structs) is a constant and is between
// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go.
//
void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
{
GenTree* srcAddr = putArgNode->gtGetOp1();
assert(srcAddr->TypeGet() == TYP_STRUCT);
assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT);
// Make sure we got the arguments of the cpblk operation in the right registers, and that
// 'srcAddr' is contained as expected.
assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
assert(srcAddr->isContained());
genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
instGen(INS_r_movsb);
}
//------------------------------------------------------------------------
// If any Vector3 args are on the stack and they are not pass-by-ref, the upper 32 bits
// must be cleared to zero. The native compiler doesn't clear the upper bits
// and there is no way to know if the caller is native or not. So, the upper
// 32 bits of a Vector3 argument on the stack are always cleared to zero.
#if defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
void CodeGen::genClearStackVec3ArgUpperBits()
{
#ifdef DEBUG
if (verbose)
{
printf("*************** In genClearStackVec3ArgUpperBits()\n");
}
#endif
assert(compiler->compGeneratingProlog);
for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++)
{
LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
assert(varDsc->lvIsParam);
// Does the var have a SIMD12 type?
if (varDsc->lvType != TYP_SIMD12)
{
continue;
}
if (!varDsc->lvIsRegArg)
{
// Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0
getEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0);
}
else
{
// Assume that for x64 linux, an argument is fully in registers
// or fully on stack.
regNumber argReg = varDsc->GetOtherArgReg();
// Clear the upper 32 bits by two shift instructions.
// argReg = argReg << 96
getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
// argReg = argReg >> 96
getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
}
}
}
#endif // defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD)
#endif // FEATURE_PUT_STRUCT_ARG_STK
// Generate code for CpObj nodes which copy structs that have interleaved
// GC pointers.
// This will generate a sequence of movsp instructions for the non-gc members,
// and calls to the CORINFO_HELP_ASSIGN_BYREF helper for the GC pointers.
// Note that movsp is an alias for movsd on x86 and movsq on x64.
void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
{
// Make sure we got the arguments of the cpobj operation in the right registers
GenTree* dstAddr = cpObjNode->Addr();
GenTree* source = cpObjNode->Data();
GenTree* srcAddr = nullptr;
var_types srcAddrType = TYP_BYREF;
bool sourceIsLocal = false;
assert(source->isContained());
if (source->gtOper == GT_IND)
{
srcAddr = source->gtGetOp1();
assert(srcAddr->isUsedFromReg());
}
else
{
noway_assert(source->IsLocal());
sourceIsLocal = true;
}
bool dstOnStack = dstAddr->gtSkipReloadOrCopy()->OperIsLocalAddr();
#ifdef DEBUG
assert(dstAddr->isUsedFromReg());
// If the GenTree node has data about GC pointers, this means we're dealing
// with CpObj, so this requires special logic.
assert(cpObjNode->gtGcPtrCount > 0);
// MovSp (alias for movsq on x64 and movsd on x86) instruction is used for copying non-gcref fields
// and it needs src = RSI and dst = RDI.
// Either these registers must not contain lclVars, or they must be dying or marked for spill.
// This is because these registers are incremented as we go through the struct.
if (!sourceIsLocal)
{
GenTree* actualSrcAddr = srcAddr->gtSkipReloadOrCopy();
GenTree* actualDstAddr = dstAddr->gtSkipReloadOrCopy();
unsigned srcLclVarNum = BAD_VAR_NUM;
unsigned dstLclVarNum = BAD_VAR_NUM;
bool isSrcAddrLiveOut = false;
bool isDstAddrLiveOut = false;
if (genIsRegCandidateLocal(actualSrcAddr))
{
srcLclVarNum = actualSrcAddr->AsLclVarCommon()->gtLclNum;
isSrcAddrLiveOut = ((actualSrcAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
}
if (genIsRegCandidateLocal(actualDstAddr))
{
dstLclVarNum = actualDstAddr->AsLclVarCommon()->gtLclNum;
isDstAddrLiveOut = ((actualDstAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
}
assert((actualSrcAddr->gtRegNum != REG_RSI) || !isSrcAddrLiveOut ||
((srcLclVarNum == dstLclVarNum) && !isDstAddrLiveOut));
assert((actualDstAddr->gtRegNum != REG_RDI) || !isDstAddrLiveOut ||
((srcLclVarNum == dstLclVarNum) && !isSrcAddrLiveOut));
srcAddrType = srcAddr->TypeGet();
}
#endif // DEBUG
// Consume the operands and get them into the right registers.
// They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
genConsumeBlockOp(cpObjNode, REG_RDI, REG_RSI, REG_NA);
gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddrType);
gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet());
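// For example, copying a struct whose pointer-sized slots are [gc, non-gc, non-gc, gc]
// to a destination that is not on the stack emits roughly:
//   call CORINFO_HELP_ASSIGN_BYREF   ; copies the first gc slot and advances RSI/RDI
//   movsp                            ; movsq on x64, movsd on x86
//   movsp
//   call CORINFO_HELP_ASSIGN_BYREF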
unsigned slots = cpObjNode->gtSlots;
// If we can prove it's on the stack we don't need to use the write barrier.
if (dstOnStack)
{
if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
{
// If the destination of the CpObj is on the stack, make sure we allocated
// RCX to emit the movsp (alias for movsd or movsq for 32 and 64 bits respectively).
assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
instGen(INS_r_movsp);
}
else
{
// For small structs, it's better to emit a sequence of movsp than to
// emit a rep movsp instruction.
while (slots > 0)
{
instGen(INS_movsp);
slots--;
}
}
}
else
{
BYTE* gcPtrs = cpObjNode->gtGcPtrs;
unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
unsigned i = 0;
while (i < slots)
{
switch (gcPtrs[i])
{
case TYPE_GC_NONE:
// Let's see if we can use rep movsp instead of a sequence of movsp instructions
// to save cycles and code size.
{
unsigned nonGcSlotCount = 0;
do
{
nonGcSlotCount++;
i++;
} while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
// If we have a very small contiguous non-gc region, it's better just to
// emit a sequence of movsp instructions
if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
{
while (nonGcSlotCount > 0)
{
instGen(INS_movsp);
nonGcSlotCount--;
}
}
else
{
// Otherwise, we can save code-size and improve CQ by emitting
// rep movsp (alias for movsd/movsq for x86/x64)
assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
instGen(INS_r_movsp);
}
}
break;
default:
// We have a GC pointer, call the write barrier helper.
genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
gcPtrCount--;
i++;
}
}
assert(gcPtrCount == 0);
}
// Clear the gcInfo for RSI and RDI.
// While we normally update GC info prior to the last instruction that uses them,
// these actually live into the helper call.
gcInfo.gcMarkRegSetNpt(RBM_RSI);
gcInfo.gcMarkRegSetNpt(RBM_RDI);
}
// Generate code for a CpBlk node by the means of the VM memcpy helper call
// Preconditions:
// a) The size argument of the CpBlk is not an integer constant
// b) The size argument is a constant but is larger than CPBLK_MOVS_LIMIT bytes.
void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode)
{
#ifdef _TARGET_AMD64_
// Make sure we got the arguments of the cpblk operation in the right registers
unsigned blockSize = cpBlkNode->Size();
GenTree* dstAddr = cpBlkNode->Addr();
GenTree* source = cpBlkNode->Data();
GenTree* srcAddr = nullptr;
// Size goes in arg2
if (blockSize != 0)
{
assert(blockSize >= CPBLK_MOVS_LIMIT);
assert((cpBlkNode->gtRsvdRegs & RBM_ARG_2) != 0);
}
else
{
noway_assert(cpBlkNode->gtOper == GT_STORE_DYN_BLK);
}
// Source address goes in arg1
if (source->gtOper == GT_IND)
{
srcAddr = source->gtGetOp1();
assert(srcAddr->isUsedFromReg());
}
else
{
noway_assert(source->IsLocal());
assert((cpBlkNode->gtRsvdRegs & RBM_ARG_1) != 0);
inst_RV_TT(INS_lea, REG_ARG_1, source, 0, EA_BYREF);
}
genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
#else // !_TARGET_AMD64_
noway_assert(false && "Helper call for CpBlk is not needed.");
#endif // !_TARGET_AMD64_
}
// generate code to do a switch statement based on a table of ip-relative offsets
void CodeGen::genTableBasedSwitch(GenTree* treeNode)
{
genConsumeOperands(treeNode->AsOp());
regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
regNumber tmpReg = treeNode->GetSingleTempReg();
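// The emitted sequence is roughly:
//   mov  baseReg, dword ptr [baseReg + idxReg*4]   ; load the ip-relative offset from the table
//   lea  tmpReg, [fgFirstBB]                       ; absolute address the offsets are relative to
//   add  baseReg, tmpReg
//   jmp  baseReg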
// load the ip-relative offset (which is relative to start of fgFirstBB)
getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);
// add it to the absolute address of fgFirstBB
compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg);
getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg);
// jmp baseReg
getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg);
}
// emits the table and an instruction to get the address of the first element
void CodeGen::genJumpTable(GenTree* treeNode)
{
noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
assert(treeNode->OperGet() == GT_JMPTABLE);
unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
unsigned jmpTabOffs;
unsigned jmpTabBase;
jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);
jmpTabOffs = 0;
JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);
for (unsigned i = 0; i < jumpCount; i++)
{
BasicBlock* target = *jumpTable++;
noway_assert(target->bbFlags & BBF_JMP_TARGET);
JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, target->bbNum);
getEmitter()->emitDataGenData(i, target);
}
getEmitter()->emitDataGenEnd();
// Access to inline data is 'abstracted' by a special type of static member
// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
// to constant data, not a real static field.
getEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->gtRegNum,
compiler->eeFindJitDataOffs(jmpTabBase), 0);
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCodeForLockAdd: Generate code for a GT_LOCKADD node
//
// Arguments:
// node - the GT_LOCKADD node
//
void CodeGen::genCodeForLockAdd(GenTreeOp* node)
{
assert(node->OperIs(GT_LOCKADD));
GenTree* addr = node->gtGetOp1();
GenTree* data = node->gtGetOp2();
emitAttr size = emitActualTypeSize(data->TypeGet());
assert(addr->isUsedFromReg());
assert(data->isUsedFromReg() || data->isContainedIntOrIImmed());
assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
genConsumeOperands(node);
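// The emitted sequence is "lock add [addr], data", e.g. roughly (operands are illustrative):
//   lock add dword ptr [rax], 1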
instGen(INS_lock);
if (data->isContainedIntOrIImmed())
{
int imm = static_cast<int>(data->AsIntCon()->IconValue());
assert(imm == data->AsIntCon()->IconValue());
getEmitter()->emitIns_I_AR(INS_add, size, imm, addr->gtRegNum, 0);
}
else
{
getEmitter()->emitIns_AR_R(INS_add, size, data->gtRegNum, addr->gtRegNum, 0);
}
}
//------------------------------------------------------------------------
// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node.
//
// Arguments:
// node - the GT_XADD/XCHG node
//
void CodeGen::genLockedInstructions(GenTreeOp* node)
{
assert(node->OperIs(GT_XADD, GT_XCHG));
GenTree* addr = node->gtGetOp1();
GenTree* data = node->gtGetOp2();
emitAttr size = emitTypeSize(node->TypeGet());
assert(addr->isUsedFromReg());
assert(data->isUsedFromReg());
assert((size == EA_4BYTE) || (size == EA_PTRSIZE));
genConsumeOperands(node);
if (node->gtRegNum != data->gtRegNum)
{
// If the destination register is different from the data register then we need
// to first move the data to the target register. Make sure we don't overwrite
// the address, the register allocator should have taken care of this.
assert(node->gtRegNum != addr->gtRegNum);
getEmitter()->emitIns_R_R(INS_mov, size, node->gtRegNum, data->gtRegNum);
}
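// The emitted core sequence is roughly (shown for GT_XADD; register names are illustrative):
//   lock xadd [addrReg], targetReg   ; targetReg receives the original memory value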
instruction ins = node->OperIs(GT_XADD) ? INS_xadd : INS_xchg;
// XCHG has an implied lock prefix when the first operand is a memory operand.
if (ins != INS_xchg)
{
instGen(INS_lock);
}
getEmitter()->emitIns_AR_R(ins, size, node->gtRegNum, addr->gtRegNum, 0);
genProduceReg(node);
}
//------------------------------------------------------------------------
// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node.
//
// Arguments:
// tree - the GT_CMPXCHG node
//
void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree)
{
assert(tree->OperIs(GT_CMPXCHG));
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->gtRegNum;
GenTree* location = tree->gtOpLocation; // arg1
GenTree* value = tree->gtOpValue; // arg2
GenTree* comparand = tree->gtOpComparand; // arg3
assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
genConsumeReg(location);
genConsumeReg(value);
genConsumeReg(comparand);
// comparand goes to RAX;
// Note that we must issue this move after the genConsumeRegs(), in case any of the above
// have a GT_COPY from RAX.
if (comparand->gtRegNum != REG_RAX)
{
inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
}
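// What follows emits roughly:
//   lock cmpxchg [<location>], <value>   ; compares RAX with [location]; stores value on match
//   mov  <targetReg>, rax                ; RAX holds the original memory value (if targetReg != RAX)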
// location is Rm
instGen(INS_lock);
getEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
// Result is in RAX
if (targetReg != REG_RAX)
{
inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
}
genProduceReg(tree);
}
// generate code for BoundsCheck nodes
void CodeGen::genRangeCheck(GenTree* oper)
{
noway_assert(oper->OperIsBoundsCheck());
GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();
GenTree* arrIndex = bndsChk->gtIndex;
GenTree* arrLen = bndsChk->gtArrLen;
GenTree* arrRef = nullptr;
int lenOffset = 0;
GenTree * src1, *src2;
emitJumpKind jmpKind;
genConsumeRegs(arrIndex);
genConsumeRegs(arrLen);
if (arrIndex->isContainedIntOrIImmed())
{
// arrIndex is a contained constant. In this case
// we will generate one of the following
// cmp [mem], immed (if arrLen is a memory op)
// cmp reg, immed (if arrLen is in a reg)
//
// That is, arrLen cannot be a contained immediate.
assert(!arrLen->isContainedIntOrIImmed());
src1 = arrLen;
src2 = arrIndex;
jmpKind = EJ_jbe;
}
else
{
// arrIndex could either be a contained memory op or a reg
// In this case we will generate one of the following
// cmp [mem], immed (if arrLen is a constant)
// cmp [mem], reg (if arrLen is in a reg)
// cmp reg, immed (if arrIndex is in a reg)
// cmp reg1, reg2 (if arrIndex is in reg1)
// cmp reg, [mem] (if arrLen is a memory op)
//
// That is, only one of arrIndex or arrLen can be a memory op.
assert(!arrIndex->isUsedFromMemory() || !arrLen->isUsedFromMemory());
src1 = arrIndex;
src2 = arrLen;
jmpKind = EJ_jae;
}
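// What follows is a compare and a conditional jump to the range-check throw block, e.g. roughly
// (when the index is in a register and the length is a memory operand):
//   cmp <indexReg>, <length operand>
//   jae <range-check-failure throw block>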
var_types bndsChkType = src2->TypeGet();
#if DEBUG
// Bounds checks can only be 32 or 64 bit sized comparisons.
assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG);
// The type of the bounds check should always be wide enough to compare against the index.
assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet()));
#endif // DEBUG
getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(bndsChkType), src1, src2);
genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB);
}
//---------------------------------------------------------------------
// genCodeForPhysReg - generate code for a GT_PHYSREG node
//
// Arguments
// tree - the GT_PHYSREG node
//
// Return value:
// None
//
void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree)
{
assert(tree->OperIs(GT_PHYSREG));
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->gtRegNum;
if (targetReg != tree->gtSrcReg)
{
inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType);
genTransferRegGCState(targetReg, tree->gtSrcReg);
}
genProduceReg(tree);
}
//---------------------------------------------------------------------
// genCodeForNullCheck - generate code for a GT_NULLCHECK node
//
// Arguments
// tree - the GT_NULLCHECK node
//
// Return value:
// None
//
void CodeGen::genCodeForNullCheck(GenTreeOp* tree)
{
assert(tree->OperIs(GT_NULLCHECK));
assert(tree->gtOp1->isUsedFromReg());
regNumber reg = genConsumeReg(tree->gtOp1);
getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
}
//------------------------------------------------------------------------
// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
// lower bound for the given dimension.
//
// Arguments:
// elemType - the element type of the array
// rank - the rank of the array
// dimension - the dimension for which the lower bound offset will be returned.
//
// Return Value:
// The offset.
unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension)
{
// Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank);
}
//------------------------------------------------------------------------
// genOffsetOfMDArrayDimensionSize: Returns the offset from the Array object to the
// size for the given dimension.
//
// Arguments:
// elemType - the element type of the array
// rank - the rank of the array
// dimension - the dimension for which the size offset will be returned.
//
// Return Value:
// The offset.
unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension)
{
// Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension;
}
//------------------------------------------------------------------------
// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference,
// producing the effective index by subtracting the lower bound.
//
// Arguments:
// arrIndex - the node for which we're generating code
//
// Return Value:
// None.
//
void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex)
{
GenTree* arrObj = arrIndex->ArrObj();
GenTree* indexNode = arrIndex->IndexExpr();
regNumber arrReg = genConsumeReg(arrObj);
regNumber indexReg = genConsumeReg(indexNode);
regNumber tgtReg = arrIndex->gtRegNum;
unsigned dim = arrIndex->gtCurrDim;
unsigned rank = arrIndex->gtArrRank;
var_types elemType = arrIndex->gtArrElemType;
noway_assert(tgtReg != REG_NA);
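// The emitted sequence is roughly:
//   mov tgtReg, indexReg                                  ; only if they differ
//   sub tgtReg, dword ptr [arrReg + <lower-bound offset>]
//   cmp tgtReg, dword ptr [arrReg + <dimension-size offset>]
//   jae <range-check-failure throw block>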
// Subtract the lower bound for this dimension.
// TODO-XArch-CQ: make this contained if it's an immediate that fits.
if (tgtReg != indexReg)
{
inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet());
}
getEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
genOffsetOfMDArrayLowerBound(elemType, rank, dim));
getEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL);
genProduceReg(arrIndex);
}
//------------------------------------------------------------------------
// genCodeForArrOffset: Generates code to compute the flattened array offset for
// one dimension of an array reference:
// result = (prevDimOffset * dimSize) + effectiveIndex
// where dimSize is obtained from the arrObj operand
//
// Arguments:
// arrOffset - the node for which we're generating code
//
// Return Value:
// None.
//
// Notes:
// dimSize and effectiveIndex are always non-negative, the former by design,
// and the latter because it has been normalized to be zero-based.
void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset)
{
GenTree* offsetNode = arrOffset->gtOffset;
GenTree* indexNode = arrOffset->gtIndex;
GenTree* arrObj = arrOffset->gtArrObj;
regNumber tgtReg = arrOffset->gtRegNum;
assert(tgtReg != REG_NA);
unsigned dim = arrOffset->gtCurrDim;
unsigned rank = arrOffset->gtArrRank;
var_types elemType = arrOffset->gtArrElemType;
// First, consume the operands in the correct order.
regNumber offsetReg = REG_NA;
regNumber tmpReg = REG_NA;
if (!offsetNode->IsIntegralConst(0))
{
offsetReg = genConsumeReg(offsetNode);
// We will use a temp register for the offset*scale+effectiveIndex computation.
tmpReg = arrOffset->GetSingleTempReg();
}
else
{
assert(offsetNode->isContained());
}
regNumber indexReg = genConsumeReg(indexNode);
// Although arrReg may not be used in the constant-index case, if we have generated
// the value into a register, we must consume it, otherwise we will fail to end the
// live range of the gc ptr.
// TODO-CQ: Currently arrObj will always have a register allocated to it.
// We could avoid allocating a register for it, which would be of value if the arrObj
// is an on-stack lclVar.
regNumber arrReg = REG_NA;