// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX XX
XX Arm64 Code Generator XX
XX XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif
#ifdef _TARGET_ARM64_
#include "emit.h"
#include "codegen.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"
/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX XX
XX Prolog / Epilog XX
XX XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
//------------------------------------------------------------------------
// genInstrWithConstant: we will typically generate one instruction
//
// ins reg1, reg2, imm
//
// However, the imm might not fit as a directly encodable immediate. When it
// doesn't fit, we generate extra instruction(s) that set up 'tmpReg' with the
// proper immediate value.
//
//     mov  tmpReg, imm
//     ins  reg1, reg2, tmpReg
//
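// For illustration (a hypothetical immediate, not from an actual compilation):
// 'add reg1, reg2, #0x987654' is not encodable as an ARM64 add immediate, so we
// would instead emit something like:
//
//     movz tmpReg, #0x7654
//     movk tmpReg, #0x98, LSL #16
//     add  reg1, reg2, tmpReg
//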
// Arguments:
// ins - instruction
// attr - operation size and GC attribute
// reg1, reg2 - first and second register operands
// imm - immediate value (third operand when it fits)
// tmpReg - temp register to use when the 'imm' doesn't fit
// inUnwindRegion - true if we are in a prolog/epilog region with unwind codes
//
// Return Value:
// returns true if the immediate was small enough to be encoded inside instruction. If not,
// returns false meaning the immediate was too large and tmpReg was used and modified.
//
bool CodeGen::genInstrWithConstant(instruction ins,
emitAttr attr,
regNumber reg1,
regNumber reg2,
ssize_t imm,
regNumber tmpReg,
bool inUnwindRegion /* = false */)
{
bool immFitsInIns = false;
emitAttr size = EA_SIZE(attr);
// reg1 is usually a dest register
// reg2 is always a source register
assert(tmpReg != reg2); // tmpReg cannot match any source register
switch (ins)
{
case INS_add:
case INS_sub:
if (imm < 0)
{
imm = -imm;
ins = (ins == INS_add) ? INS_sub : INS_add;
}
immFitsInIns = emitter::emitIns_valid_imm_for_add(imm, size);
break;
case INS_strb:
case INS_strh:
case INS_str:
// reg1 is a source register for store instructions
assert(tmpReg != reg1); // tmpReg cannot match any source register
immFitsInIns = emitter::emitIns_valid_imm_for_ldst_offset(imm, size);
break;
case INS_ldrsb:
case INS_ldrsh:
case INS_ldrsw:
case INS_ldrb:
case INS_ldrh:
case INS_ldr:
immFitsInIns = emitter::emitIns_valid_imm_for_ldst_offset(imm, size);
break;
default:
assert(!"Unexpected instruction in genInstrWithConstant");
break;
}
if (immFitsInIns)
{
// generate a single instruction that encodes the immediate directly
getEmitter()->emitIns_R_R_I(ins, attr, reg1, reg2, imm);
}
else
{
// caller can specify REG_NA for tmpReg, when it "knows" that the immediate will always fit
assert(tmpReg != REG_NA);
// generate two or more instructions
// first we load the immediate into tmpReg
instGen_Set_Reg_To_Imm(size, tmpReg, imm);
regSet.verifyRegUsed(tmpReg);
// when we are in an unwind code region
// we record the extra instructions using unwindPadding()
if (inUnwindRegion)
{
compiler->unwindPadding();
}
// generate the instruction using a three register encoding with the immediate in tmpReg
getEmitter()->emitIns_R_R_R(ins, attr, reg1, reg2, tmpReg);
}
return immFitsInIns;
}
//------------------------------------------------------------------------
// genStackPointerAdjustment: add a specified constant value to the stack pointer in either the prolog
// or the epilog. The unwind codes for the generated instructions are produced. An available temporary
// register is required to be specified, in case the constant is too large to encode in an "add"
// instruction (or "sub" instruction if we choose to use one), such that we need to load the constant
// into a register first, before using it.
//
// Arguments:
// spDelta - the value to add to SP (can be negative)
// tmpReg - an available temporary register
// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
// Otherwise, we don't touch it.
//
// Return Value:
// None.
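//
// For illustration (hypothetical values, not from an actual compilation), spDelta == -16
// encodes directly as:
//
//     sub sp, sp, #16
//
// while a delta too large for the immediate field, say -0x98765, goes through tmpReg:
//
//     movz tmpReg, #0x8765
//     movk tmpReg, #0x9, LSL #16
//     sub  sp, sp, tmpReg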
void CodeGen::genStackPointerAdjustment(ssize_t spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
{
// Even though INS_add is specified here, the encoder will choose either
// an INS_add or an INS_sub and encode the immediate as a positive value
//
if (!genInstrWithConstant(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, spDelta, tmpReg, true))
{
// the immediate was too large, so tmpReg was used and modified
if (pTmpRegIsZero != nullptr)
{
*pTmpRegIsZero = false;
}
}
// spDelta is negative in the prolog, positive in the epilog, but we always tell the unwind codes the positive
// value.
ssize_t spDeltaAbs = abs(spDelta);
unsigned unwindSpDelta = (unsigned)spDeltaAbs;
assert((ssize_t)unwindSpDelta == spDeltaAbs); // make sure that it fits in an unsigned
compiler->unwindAllocStack(unwindSpDelta);
}
//------------------------------------------------------------------------
// genPrologSaveRegPair: Save a pair of general-purpose or floating-point/SIMD registers in a function or funclet
// prolog. If possible, we use pre-indexed addressing to adjust SP and store the registers with a single instruction.
// The caller must ensure that we can use the STP instruction, and that spOffset will be in the legal range for that
// instruction.
//
// Arguments:
// reg1 - First register of pair to save.
// reg2 - Second register of pair to save.
// spOffset - The offset from SP to store reg1 (must be positive or zero).
// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or
// zero).
// lastSavedWasPreviousPair - True if the last prolog instruction was to save the previous register pair. This
// allows us to emit the "save_next" unwind code.
// tmpReg - An available temporary register. Needed for the case of large frames.
// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
// Otherwise, we don't touch it.
//
// Return Value:
// None.
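//
// For illustration (hypothetical values, not from an actual compilation), saving x19/x20
// while establishing a 96-byte frame (spOffset == 0, spDelta == -96) folds into one
// pre-indexed store:
//
//     stp x19, x20, [sp, #-96]!
//
// whereas for a large frame (say spDelta == -0x1000) the SP adjustment is split out:
//
//     sub sp, sp, #0x1000
//     stp x19, x20, [sp]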
void CodeGen::genPrologSaveRegPair(regNumber reg1,
regNumber reg2,
int spOffset,
int spDelta,
bool lastSavedWasPreviousPair,
regNumber tmpReg,
bool* pTmpRegIsZero)
{
assert(spOffset >= 0);
assert(spDelta <= 0);
assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
assert(genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)); // registers must be both general-purpose, or both
// FP/SIMD
bool needToSaveRegs = true;
if (spDelta != 0)
{
if ((spOffset == 0) && (spDelta >= -512))
{
// We can use pre-indexed addressing.
// stp REG, REG + 1, [SP, #spDelta]!
// 64-bit STP offset range: -512 to 504, multiple of 8.
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spDelta, INS_OPTS_PRE_INDEX);
compiler->unwindSaveRegPairPreindexed(reg1, reg2, spDelta);
needToSaveRegs = false;
}
else // (spOffset != 0) || (spDelta < -512)
{
// We need to do the SP adjustment separately from the store; we can't combine pre-indexed
// addressing with a non-zero offset.
// generate sub SP,SP,imm
genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
}
}
if (needToSaveRegs)
{
// stp REG, REG + 1, [SP, #offset]
// 64-bit STP offset range: -512 to 504, multiple of 8.
assert(spOffset <= 504);
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);
if (lastSavedWasPreviousPair)
{
// This works as long as we've only been saving pairs, in order, and we've saved the previous one just
// before this one.
compiler->unwindSaveNext();
}
else
{
compiler->unwindSaveRegPair(reg1, reg2, spOffset);
}
}
}
//------------------------------------------------------------------------
// genPrologSaveReg: Like genPrologSaveRegPair, but for a single register. Save a single general-purpose or
// floating-point/SIMD register in a function or funclet prolog. Note that if we wish to change SP (i.e., spDelta != 0),
// then spOffset must be 8. This is because otherwise we would create an alignment hole above the saved register, not
// below it, which we currently don't support. This restriction could be loosened if the callers change to handle it
// (and this function changes to support using pre-indexed STR addressing). The caller must ensure that we can use the
// STR instruction, and that spOffset will be in the legal range for that instruction.
//
// Arguments:
// reg1 - Register to save.
// spOffset - The offset from SP to store reg1 (must be positive or zero).
// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or
// zero).
// tmpReg - An available temporary register. Needed for the case of large frames.
// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
// Otherwise, we don't touch it.
//
// Return Value:
// None.
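//
// For illustration (hypothetical values, not from an actual compilation), saving a lone
// x19 with spDelta == -16 requires spOffset == 8, so the alignment hole sits below the
// saved register:
//
//     sub sp, sp, #16
//     str x19, [sp, #8]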
void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
{
assert(spOffset >= 0);
assert(spDelta <= 0);
assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
if (spDelta != 0)
{
// generate sub SP,SP,imm
genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
}
// str REG, [SP, #offset]
// 64-bit STR offset range: 0 to 32760, multiple of 8.
getEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
compiler->unwindSaveReg(reg1, spOffset);
}
//------------------------------------------------------------------------
// genEpilogRestoreRegPair: This is the opposite of genPrologSaveRegPair(), run in the epilog instead of the prolog.
// The stack pointer adjustment, if requested, is done after the register restore, using post-index addressing.
// The caller must ensure that we can use the LDP instruction, and that spOffset will be in the legal range for that
// instruction.
//
// Arguments:
// reg1 - First register of pair to restore.
// reg2 - Second register of pair to restore.
// spOffset - The offset from SP to load reg1 (must be positive or zero).
// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or
// zero).
// tmpReg - An available temporary register. Needed for the case of large frames.
// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
// Otherwise, we don't touch it.
//
// Return Value:
// None.
void CodeGen::genEpilogRestoreRegPair(
regNumber reg1, regNumber reg2, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
{
assert(spOffset >= 0);
assert(spDelta >= 0);
assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
if (spDelta != 0)
{
if ((spOffset == 0) && (spDelta <= 504))
{
// Fold the SP change into this instruction.
// ldp reg1, reg2, [SP], #spDelta
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spDelta, INS_OPTS_POST_INDEX);
compiler->unwindSaveRegPairPreindexed(reg1, reg2, -spDelta);
}
else // (spOffset != 0) || (spDelta > 504)
{
// Can't fold in the SP change; need to use a separate ADD instruction.
// ldp reg1, reg2, [SP, #offset]
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);
compiler->unwindSaveRegPair(reg1, reg2, spOffset);
// generate add SP,SP,imm
genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
}
}
else
{
// ldp reg1, reg2, [SP, #offset]
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);
compiler->unwindSaveRegPair(reg1, reg2, spOffset);
}
}
//------------------------------------------------------------------------
// genEpilogRestoreReg: The opposite of genPrologSaveReg(), run in the epilog instead of the prolog.
//
// Arguments:
// reg1 - Register to restore.
// spOffset - The offset from SP to restore reg1 (must be positive or zero).
// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or
// zero).
// tmpReg - An available temporary register. Needed for the case of large frames.
// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
// Otherwise, we don't touch it.
//
// Return Value:
// None.
void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
{
assert(spOffset >= 0);
assert(spDelta >= 0);
assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
// ldr reg1, [SP, #offset]
getEmitter()->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
compiler->unwindSaveReg(reg1, spOffset);
if (spDelta != 0)
{
// generate add SP,SP,imm
genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
}
}
//------------------------------------------------------------------------
// genSaveCalleeSavedRegistersHelp: Save the callee-saved registers in 'regsToSaveMask' to the stack frame
// in the function or funclet prolog. The save set does not contain FP, since that is
// guaranteed to be saved separately, so we can set up chaining. We can only use the instructions
// that are allowed by the unwind codes. Integer registers are stored at lower addresses,
// FP/SIMD registers are stored at higher addresses. There are no gaps. The caller ensures that
// there is enough space on the frame to store these registers, and that the store instructions
// we need to use (STR or STP) are encodable with the stack-pointer immediate offsets we need to
// use. Note that the save set can contain LR if this is a frame without a frame pointer, in
// which case LR is saved along with the other callee-saved registers. The caller can tell us
// to fold in a stack pointer adjustment, which we will do with the first instruction. Note that
// the stack pointer adjustment must be by a multiple of 16 to preserve the invariant that the
// stack pointer is always 16 byte aligned. If we are saving an odd number of callee-saved
// registers, though, we will have an empty alignment slot somewhere. It turns out we will put
// it below (at a lower address than) the callee-saved registers, as that is currently how we
// do frame layout. This means that the first stack offset will be 8, and the stack pointer
// adjustment must be done by a SUB, and not folded into a pre-indexed store.
//
// Arguments:
// regsToSaveMask - The mask of callee-saved registers to save. If empty, this function does nothing.
// lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. Note that
// if non-zero spDelta, then this is the offset of the first save *after* that
// SP adjustment.
// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or
// zero).
//
// Return Value:
// None.
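//
// For illustration, a save sequence mirroring the example restore sequence shown in
// genRestoreCalleeSavedRegistersHelp() (hypothetical offsets) would be:
//
//     stp x19, x20, [sp, #32]
//     stp x21, x22, [sp, #48]
//     stp x23, x24, [sp, #64]
//     stp x25, x26, [sp, #80]
//     stp x27, x28, [sp, #96]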
void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowestCalleeSavedOffset, int spDelta)
{
assert(spDelta <= 0);
unsigned regsToSaveCount = genCountBits(regsToSaveMask);
if (regsToSaveCount == 0)
{
if (spDelta != 0)
{
// Currently this is the case for varargs only
// whose size is MAX_REG_ARG * REGSIZE_BYTES = 64 bytes.
genStackPointerAdjustment(spDelta, REG_NA, nullptr);
}
return;
}
assert((spDelta % 16) == 0);
assert((regsToSaveMask & RBM_FP) == 0); // we never save FP here
assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED | RBM_LR)); // We also save LR, even though it is not in
// RBM_CALLEE_SAVED.
regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT;
regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat;
int spOffset = lowestCalleeSavedOffset; // this is the offset *after* we change SP.
unsigned intRegsToSaveCount = genCountBits(maskSaveRegsInt);
unsigned floatRegsToSaveCount = genCountBits(maskSaveRegsFloat);
bool isPairSave = false;
#ifdef DEBUG
bool isRegsToSaveCountOdd = ((intRegsToSaveCount + floatRegsToSaveCount) % 2 != 0);
#endif
// Save the integer registers
bool lastSavedWasPair = false;
while (maskSaveRegsInt != RBM_NONE)
{
// If this is the first store that needs to change SP (spDelta != 0),
// then the offset must be 8 to account for alignment for the odd count
// or it must be 0 for the even count.
assert((spDelta == 0) || (isRegsToSaveCountOdd && spOffset == REGSIZE_BYTES) ||
(!isRegsToSaveCountOdd && spOffset == 0));
isPairSave = (intRegsToSaveCount >= 2);
regMaskTP reg1Mask = genFindLowestBit(maskSaveRegsInt);
regNumber reg1 = genRegNumFromMask(reg1Mask);
maskSaveRegsInt &= ~reg1Mask;
intRegsToSaveCount -= 1;
if (isPairSave)
{
// We can use a STP instruction.
regMaskTP reg2Mask = genFindLowestBit(maskSaveRegsInt);
regNumber reg2 = genRegNumFromMask(reg2Mask);
assert((reg2 == REG_NEXT(reg1)) || (reg2 == REG_LR));
maskSaveRegsInt &= ~reg2Mask;
intRegsToSaveCount -= 1;
genPrologSaveRegPair(reg1, reg2, spOffset, spDelta, lastSavedWasPair, REG_IP0, nullptr);
// TODO-ARM64-CQ: this code works in the prolog, but it's a bit weird to think about "next" when generating
// this epilog, to get the codes to match. Turn this off until that is better understood.
// lastSavedWasPair = true;
spOffset += 2 * REGSIZE_BYTES;
}
else
{
// No register pair; we use a STR instruction.
genPrologSaveReg(reg1, spOffset, spDelta, REG_IP0, nullptr);
lastSavedWasPair = false;
spOffset += REGSIZE_BYTES;
}
spDelta = 0; // We've now changed SP already, if necessary; don't do it again.
}
assert(intRegsToSaveCount == 0);
// Save the floating-point/SIMD registers
lastSavedWasPair = false;
while (maskSaveRegsFloat != RBM_NONE)
{
// If this is the first store that needs to change SP (spDelta != 0),
// then the offset must be 8 to account for alignment for the odd count
// or it must be 0 for the even count.
assert((spDelta == 0) || (isRegsToSaveCountOdd && spOffset == REGSIZE_BYTES) ||
(!isRegsToSaveCountOdd && spOffset == 0));
isPairSave = (floatRegsToSaveCount >= 2);
regMaskTP reg1Mask = genFindLowestBit(maskSaveRegsFloat);
regNumber reg1 = genRegNumFromMask(reg1Mask);
maskSaveRegsFloat &= ~reg1Mask;
floatRegsToSaveCount -= 1;
if (isPairSave)
{
// We can use a STP instruction.
regMaskTP reg2Mask = genFindLowestBit(maskSaveRegsFloat);
regNumber reg2 = genRegNumFromMask(reg2Mask);
assert(reg2 == REG_NEXT(reg1));
maskSaveRegsFloat &= ~reg2Mask;
floatRegsToSaveCount -= 1;
genPrologSaveRegPair(reg1, reg2, spOffset, spDelta, lastSavedWasPair, REG_IP0, nullptr);
// TODO-ARM64-CQ: this code works in the prolog, but it's a bit weird to think about "next" when generating
// this epilog, to get the codes to match. Turn this off until that is better understood.
// lastSavedWasPair = true;
spOffset += 2 * FPSAVE_REGSIZE_BYTES;
}
else
{
// No register pair; we use a STR instruction.
genPrologSaveReg(reg1, spOffset, spDelta, REG_IP0, nullptr);
lastSavedWasPair = false;
spOffset += FPSAVE_REGSIZE_BYTES;
}
spDelta = 0; // We've now changed SP already, if necessary; don't do it again.
}
assert(floatRegsToSaveCount == 0);
}
//------------------------------------------------------------------------
// genRestoreCalleeSavedRegistersHelp: Restore the callee-saved registers in 'regsToRestoreMask' from the stack frame
// in the function or funclet epilog. This exactly reverses the actions of genSaveCalleeSavedRegistersHelp().
//
// Arguments:
// regsToRestoreMask - The mask of callee-saved registers to restore. If empty, this function does nothing.
// lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area.
// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or
// zero).
//
// Here's an example restore sequence:
// ldp x27, x28, [sp,#96]
// ldp x25, x26, [sp,#80]
// ldp x23, x24, [sp,#64]
// ldp x21, x22, [sp,#48]
// ldp x19, x20, [sp,#32]
//
// For the case of non-zero spDelta, we assume the base of the callee-save registers to restore is at SP, and
// the last restore adjusts SP by the specified amount. For example:
// ldp x27, x28, [sp,#64]
// ldp x25, x26, [sp,#48]
// ldp x23, x24, [sp,#32]
// ldp x21, x22, [sp,#16]
// ldp x19, x20, [sp], #80
//
// Note that you call the unwind functions specifying the prolog operation that is being un-done. So, for example,
// when generating a post-indexed load, you call the unwind function specifying the corresponding pre-indexed store.
//
// Return Value:
// None.
void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, int lowestCalleeSavedOffset, int spDelta)
{
assert(spDelta >= 0);
unsigned regsToRestoreCount = genCountBits(regsToRestoreMask);
if (regsToRestoreCount == 0)
{
if (spDelta != 0)
{
// Currently this is the case for varargs only
// whose size is MAX_REG_ARG * REGSIZE_BYTES = 64 bytes.
genStackPointerAdjustment(spDelta, REG_NA, nullptr);
}
return;
}
assert((spDelta % 16) == 0);
assert((regsToRestoreMask & RBM_FP) == 0); // we never restore FP here
assert(regsToRestoreCount <=
genCountBits(RBM_CALLEE_SAVED | RBM_LR)); // We also save LR, even though it is not in RBM_CALLEE_SAVED.
regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT;
regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat;
assert(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES);
int spOffset = lowestCalleeSavedOffset + regsToRestoreCount * REGSIZE_BYTES; // Point past the end, to start. We
// predecrement to find the offset to
// load from.
unsigned floatRegsToRestoreCount = genCountBits(maskRestoreRegsFloat);
unsigned intRegsToRestoreCount = genCountBits(maskRestoreRegsInt);
int stackDelta = 0;
bool isPairRestore = false;
bool thisIsTheLastRestoreInstruction = false;
#ifdef DEBUG
bool isRegsToRestoreCountOdd = ((floatRegsToRestoreCount + intRegsToRestoreCount) % 2 != 0);
#endif
// We want to restore in the opposite order we saved, so the unwind codes match. Be careful to handle odd numbers of
// callee-saved registers properly.
// Restore the floating-point/SIMD registers
while (maskRestoreRegsFloat != RBM_NONE)
{
thisIsTheLastRestoreInstruction = (floatRegsToRestoreCount <= 2) && (maskRestoreRegsInt == RBM_NONE);
isPairRestore = (floatRegsToRestoreCount % 2) == 0;
// Update stack delta only if it is the last restore (the first save).
if (thisIsTheLastRestoreInstruction)
{
assert(stackDelta == 0);
stackDelta = spDelta;
}
// Update stack offset.
if (isPairRestore)
{
spOffset -= 2 * FPSAVE_REGSIZE_BYTES;
}
else
{
spOffset -= FPSAVE_REGSIZE_BYTES;
}
// If this is the last restore (the first save) that needs to change SP (stackDelta != 0),
// then the offset must be 8 to account for alignment for the odd count
// or it must be 0 for the even count.
assert((stackDelta == 0) || (isRegsToRestoreCountOdd && spOffset == FPSAVE_REGSIZE_BYTES) ||
(!isRegsToRestoreCountOdd && spOffset == 0));
regMaskTP reg2Mask = genFindHighestBit(maskRestoreRegsFloat);
regNumber reg2 = genRegNumFromMask(reg2Mask);
maskRestoreRegsFloat &= ~reg2Mask;
floatRegsToRestoreCount -= 1;
if (isPairRestore)
{
regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsFloat);
regNumber reg1 = genRegNumFromMask(reg1Mask);
maskRestoreRegsFloat &= ~reg1Mask;
floatRegsToRestoreCount -= 1;
genEpilogRestoreRegPair(reg1, reg2, spOffset, stackDelta, REG_IP1, nullptr);
}
else
{
genEpilogRestoreReg(reg2, spOffset, stackDelta, REG_IP1, nullptr);
}
}
assert(floatRegsToRestoreCount == 0);
// Restore the integer registers
while (maskRestoreRegsInt != RBM_NONE)
{
thisIsTheLastRestoreInstruction = (intRegsToRestoreCount <= 2);
isPairRestore = (intRegsToRestoreCount % 2) == 0;
// Update stack delta only if it is the last restore (the first save).
if (thisIsTheLastRestoreInstruction)
{
assert(stackDelta == 0);
stackDelta = spDelta;
}
// Update stack offset.
spOffset -= REGSIZE_BYTES;
if (isPairRestore)
{
spOffset -= REGSIZE_BYTES;
}
// If this is the last restore (the first save) that needs to change SP (stackDelta != 0),
// then the offset must be 8 to account for alignment for the odd count
// or it must be 0 for the even count.
assert((stackDelta == 0) || (isRegsToRestoreCountOdd && spOffset == REGSIZE_BYTES) ||
(!isRegsToRestoreCountOdd && spOffset == 0));
regMaskTP reg2Mask = genFindHighestBit(maskRestoreRegsInt);
regNumber reg2 = genRegNumFromMask(reg2Mask);
maskRestoreRegsInt &= ~reg2Mask;
intRegsToRestoreCount -= 1;
if (isPairRestore)
{
regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsInt);
regNumber reg1 = genRegNumFromMask(reg1Mask);
maskRestoreRegsInt &= ~reg1Mask;
intRegsToRestoreCount -= 1;
genEpilogRestoreRegPair(reg1, reg2, spOffset, stackDelta, REG_IP1, nullptr);
}
else
{
genEpilogRestoreReg(reg2, spOffset, stackDelta, REG_IP1, nullptr);
}
}
assert(intRegsToRestoreCount == 0);
}
// clang-format off
/*****************************************************************************
*
* Generates code for an EH funclet prolog.
*
* Funclets have the following incoming arguments:
*
* catch: x0 = the exception object that was caught (see GT_CATCH_ARG)
* filter: x0 = the exception object to filter (see GT_CATCH_ARG), x1 = CallerSP of the containing function
* finally/fault: none
*
* Funclets set the following registers on exit:
*
* catch: x0 = the address at which execution should resume (see BBJ_EHCATCHRET)
* filter: x0 = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT)
* finally/fault: none
*
* The ARM64 funclet prolog sequence is one of the following (Note: #framesz is total funclet frame size,
* including everything; #outsz is outgoing argument space. #framesz must be a multiple of 16):
*
* Frame type 1:
* For #outsz == 0 and #framesz <= 512:
* stp fp,lr,[sp,-#framesz]! ; establish the frame, save FP/LR
* stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary
*
* The funclet frame is thus:
*
* | |
* |-----------------------|
* | incoming |
* | arguments |
* +=======================+ <---- Caller's SP
* |Callee saved registers | // multiple of 8 bytes
* |-----------------------|
* | PSP slot | // 8 bytes (omitted in CoreRT ABI)
* |-----------------------|
* ~ alignment padding ~ // To make the whole frame 16 byte aligned.
* |-----------------------|
* | Saved FP, LR | // 16 bytes
* |-----------------------| <---- Ambient SP
* | | |
* ~ | Stack grows ~
* | | downward |
* V
*
* Frame type 2:
* For #outsz != 0 and #framesz <= 512:
* sub sp,sp,#framesz ; establish the frame
* stp fp,lr,[sp,#outsz] ; save FP/LR.
* stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary
*
* The funclet frame is thus:
*
* | |
* |-----------------------|
* | incoming |
* | arguments |
* +=======================+ <---- Caller's SP
* |Callee saved registers | // multiple of 8 bytes
* |-----------------------|
* | PSP slot | // 8 bytes (omitted in CoreRT ABI)
* |-----------------------|
* ~ alignment padding ~ // To make the whole frame 16 byte aligned.
* |-----------------------|
* | Saved FP, LR | // 16 bytes
* |-----------------------|
* | Outgoing arg space | // multiple of 8 bytes
* |-----------------------| <---- Ambient SP
* | | |
* ~ | Stack grows ~
* | | downward |
* V
*
* Frame type 3:
* For #framesz > 512:
* stp fp,lr,[sp,- (#framesz - #outsz)]! ; establish the frame, save FP/LR: note that it is guaranteed here that (#framesz - #outsz) <= 168
* stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary
* sub sp,sp,#outsz ; create space for outgoing argument space
*
* The funclet frame is thus:
*
* | |
* |-----------------------|
* | incoming |
* | arguments |
* +=======================+ <---- Caller's SP
* |Callee saved registers | // multiple of 8 bytes
* |-----------------------|
* | PSP slot | // 8 bytes (omitted in CoreRT ABI)
* |-----------------------|
* ~ alignment padding ~ // To make the first SP subtraction 16 byte aligned
* |-----------------------|
* | Saved FP, LR | // 16 bytes
* |-----------------------|
* ~ alignment padding ~ // To make the whole frame 16 byte aligned (specifically, to 16-byte align the outgoing argument space).
* |-----------------------|
* | Outgoing arg space | // multiple of 8 bytes
* |-----------------------| <---- Ambient SP
* | | |
* ~ | Stack grows ~
* | | downward |
* V
*
* Both #1 and #2 only change SP once. That means that there will be a maximum of one alignment slot needed. For the general case, #3,
* it is possible that we will need to add alignment to both changes to SP, leading to 16 bytes of alignment. Remember that the stack
* pointer needs to be 16 byte aligned at all times. The size of the PSP slot plus callee-saved registers space is a maximum of 168 bytes:
* (1 PSP slot + 12 integer registers + 8 FP/SIMD registers) * 8 bytes. The outgoing argument size, however, can be very large, if we call a
* function that takes a large number of arguments (note that we currently use the same outgoing argument space size in the funclet as for the main
* function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of outgoing arguments for any call).
* In that case, we need to 16-byte align the initial change to SP, before saving off the callee-saved registers and establishing the PSPsym,
* so we can use the limited immediate offset encodings we have available, before doing another 16-byte aligned SP adjustment to create the
* outgoing argument space. Both changes to SP might need to add alignment padding.
*
* Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP, and that location is the same relative to Caller-SP
* as in the main function.
*
* ; After this header, fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested filters.
* ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet epilog.
*
* if (this is a filter funclet)
* {
* // x1 on entry to a filter funclet is CallerSP of the containing function:
* // either the main function, or the funclet for a handler that this filter is dynamically nested within.
* // Note that a filter can be dynamically nested within a funclet even if it is not statically within
* // a funclet. Consider:
* //
* // try {
* // try {
* // throw new Exception();
* // } catch(Exception) {
* // throw new Exception(); // The exception thrown here ...
* // }
* // } filter { // ... will be processed here, while the "catch" funclet frame is still on the stack
* // } filter-handler {
* // }
* //
* // Because of this, we need a PSP in the main function anytime a filter funclet doesn't know whether the enclosing frame will
* // be a funclet or main function. We won't know any time there is a filter protecting nested EH. To simplify, we just always
* // create a main function PSP for any function with a filter.
*
* ldr x1, [x1, #CallerSP_to_PSP_slot_delta] ; Load the CallerSP of the main function (stored in the PSP of the dynamically containing funclet or function)
* str x1, [sp, #SP_to_PSP_slot_delta] ; store the PSP
* add fp, x1, #Function_CallerSP_to_FP_delta ; re-establish the frame pointer
* }
* else
* {
* // This is NOT a filter funclet. The VM re-establishes the frame pointer on entry.
* // TODO-ARM64-CQ: if VM set x1 to CallerSP on entry, like for filters, we could save an instruction.
*
* add x3, fp, #Function_FP_to_CallerSP_delta ; compute the CallerSP, given the frame pointer. x3 is scratch.
* str x3, [sp, #SP_to_PSP_slot_delta] ; store the PSP
* }
*
* An example epilog sequence is then:
*
* add sp,sp,#outsz ; if any outgoing argument space
* ... ; restore callee-saved registers
* ldp x19,x20,[sp,#xxx]
* ldp fp,lr,[sp],#framesz
* ret lr
*
* The funclet frame is thus:
*
* | |
* |-----------------------|
* | incoming |
* | arguments |
* +=======================+ <---- Caller's SP
* |Callee saved registers | // multiple of 8 bytes
* |-----------------------|
* | PSP slot | // 8 bytes (omitted in CoreRT ABI)
* |-----------------------|
* | Saved FP, LR | // 16 bytes
* |-----------------------|
* ~ alignment padding ~ // To make the whole frame 16 byte aligned.
* |-----------------------|
* | Outgoing arg space | // multiple of 8 bytes
* |-----------------------| <---- Ambient SP
* | | |
* ~ | Stack grows ~
* | | downward |
* V
*/
// clang-format on
void CodeGen::genFuncletProlog(BasicBlock* block)
{
#ifdef DEBUG
if (verbose)
printf("*************** In genFuncletProlog()\n");
#endif
assert(block != NULL);
assert(block->bbFlags & BBF_FUNCLET_BEG);
ScopedSetVariable<bool> _setGeneratingProlog(&compiler->compGeneratingProlog, true);
gcInfo.gcResetForBB();
compiler->unwindBegProlog();
regMaskTP maskSaveRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT;
regMaskTP maskSaveRegsInt = genFuncletInfo.fiSaveRegs & ~maskSaveRegsFloat;
// Funclets must always save LR and FP, since when we have funclets we must have an FP frame.
assert((maskSaveRegsInt & RBM_LR) != 0);
assert((maskSaveRegsInt & RBM_FP) != 0);
bool isFilter = (block->bbCatchTyp == BBCT_FILTER);
regMaskTP maskArgRegsLiveIn;
if (isFilter)
{
maskArgRegsLiveIn = RBM_R0 | RBM_R1;
}
else if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT))
{
maskArgRegsLiveIn = RBM_NONE;
}
else
{
maskArgRegsLiveIn = RBM_R0;
}
int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta;
if (genFuncletInfo.fiFrameType == 1)
{
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1,
INS_OPTS_PRE_INDEX);
compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1);
assert(genFuncletInfo.fiSpDelta2 == 0);
assert(genFuncletInfo.fiSP_to_FPLR_save_delta == 0);
}
else if (genFuncletInfo.fiFrameType == 2)
{
// fiFrameType==2 constraints:
assert(genFuncletInfo.fiSpDelta1 < 0);
assert(genFuncletInfo.fiSpDelta1 >= -512);
// generate sub SP,SP,imm
genStackPointerAdjustment(genFuncletInfo.fiSpDelta1, REG_NA, nullptr);
assert(genFuncletInfo.fiSpDelta2 == 0);
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,
genFuncletInfo.fiSP_to_FPLR_save_delta);
compiler->unwindSaveRegPair(REG_FP, REG_LR, genFuncletInfo.fiSP_to_FPLR_save_delta);
}
else
{
assert(genFuncletInfo.fiFrameType == 3);
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1,
INS_OPTS_PRE_INDEX);
compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1);
lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2; // We haven't done the second adjustment of SP yet.
}
maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now
genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, lowestCalleeSavedOffset, 0);
if (genFuncletInfo.fiFrameType == 3)
{
// Note that genFuncletInfo.fiSpDelta2 is always a negative value
assert(genFuncletInfo.fiSpDelta2 < 0);
// generate sub SP,SP,imm
genStackPointerAdjustment(genFuncletInfo.fiSpDelta2, REG_R2, nullptr);
}
// This is the end of the OS-reported prolog for purposes of unwinding
compiler->unwindEndProlog();
// If there is no PSPSym (CoreRT ABI), we are done.
if (compiler->lvaPSPSym == BAD_VAR_NUM)
{
return;
}
if (isFilter)
{
// This is the first block of a filter
// Note that register x1 = CallerSP of the containing function
// X1 is overwritten by the first Load (new callerSP)
// X2 is scratch when we have a large constant offset
// Load the CallerSP of the main function (stored in the PSP of the dynamically containing funclet or function)
genInstrWithConstant(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_R1,
genFuncletInfo.fiCallerSP_to_PSP_slot_delta, REG_R2, false);
regSet.verifyRegUsed(REG_R1);
// Store the PSP value (aka CallerSP)
genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_SPBASE,
genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false);
// re-establish the frame pointer
genInstrWithConstant(INS_add, EA_PTRSIZE, REG_FPBASE, REG_R1, genFuncletInfo.fiFunction_CallerSP_to_FP_delta,
REG_R2, false);
}
else // This is a non-filter funclet
{
// X3 is scratch, X2 can also become scratch
// compute the CallerSP, given the frame pointer. x3 is scratch.
genInstrWithConstant(INS_add, EA_PTRSIZE, REG_R3, REG_FPBASE, -genFuncletInfo.fiFunction_CallerSP_to_FP_delta,
REG_R2, false);
regSet.verifyRegUsed(REG_R3);
genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R3, REG_SPBASE,
genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false);
}
}
/*****************************************************************************
*
* Generates code for an EH funclet epilog.
*/
void CodeGen::genFuncletEpilog()
{
#ifdef DEBUG
if (verbose)
printf("*************** In genFuncletEpilog()\n");
#endif
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
bool unwindStarted = false;
if (!unwindStarted)
{
// We can delay this until we know we'll generate an unwindable instruction, if necessary.
compiler->unwindBegEpilog();
unwindStarted = true;
}
regMaskTP maskRestoreRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT;
regMaskTP maskRestoreRegsInt = genFuncletInfo.fiSaveRegs & ~maskRestoreRegsFloat;
// Funclets must always save LR and FP, since when we have funclets we must have an FP frame.
assert((maskRestoreRegsInt & RBM_LR) != 0);
assert((maskRestoreRegsInt & RBM_FP) != 0);
maskRestoreRegsInt &= ~(RBM_LR | RBM_FP); // We restore FP/LR at the end
int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta;
if (genFuncletInfo.fiFrameType == 3)
{
// Note that genFuncletInfo.fiSpDelta2 is always a negative value
assert(genFuncletInfo.fiSpDelta2 < 0);
// generate add SP,SP,imm
genStackPointerAdjustment(-genFuncletInfo.fiSpDelta2, REG_R2, nullptr);
lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2;
}
regMaskTP regsToRestoreMask = maskRestoreRegsInt | maskRestoreRegsFloat;
genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, lowestCalleeSavedOffset, 0);
if (genFuncletInfo.fiFrameType == 1)
{
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -genFuncletInfo.fiSpDelta1,
INS_OPTS_POST_INDEX);
compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1);
assert(genFuncletInfo.fiSpDelta2 == 0);
assert(genFuncletInfo.fiSP_to_FPLR_save_delta == 0);
}
else if (genFuncletInfo.fiFrameType == 2)
{
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,
genFuncletInfo.fiSP_to_FPLR_save_delta);
compiler->unwindSaveRegPair(REG_FP, REG_LR, genFuncletInfo.fiSP_to_FPLR_save_delta);
// fiFrameType==2 constraints:
assert(genFuncletInfo.fiSpDelta1 < 0);
assert(genFuncletInfo.fiSpDelta1 >= -512);
// generate add SP,SP,imm
genStackPointerAdjustment(-genFuncletInfo.fiSpDelta1, REG_NA, nullptr);
assert(genFuncletInfo.fiSpDelta2 == 0);
}
else
{
assert(genFuncletInfo.fiFrameType == 3);
getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -genFuncletInfo.fiSpDelta1,
INS_OPTS_POST_INDEX);
compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1);
}
inst_RV(INS_ret, REG_LR, TYP_I_IMPL);
compiler->unwindReturn(REG_LR);
compiler->unwindEndEpilog();
}
/*****************************************************************************
*
* Capture the information used to generate the funclet prologs and epilogs.
* Note that all funclet prologs are identical, and all funclet epilogs are
* identical (per type: filters are identical, and non-filters are identical).
* Thus, we compute the data used for these just once.
*
* See genFuncletProlog() for more information about the prolog/epilog sequences.
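*
* For illustration (hypothetical numbers): with a PSP slot, 10 saved registers (including
* FP/LR), and no outgoing argument space, saveRegsPlusPSPSize is 10 * 8 + 8 = 88 bytes,
* which rounds up to 96. Since 96 <= 512 and there is no outgoing argument space, this
* produces frame type 1 with fiSpDelta1 == -96 and fiSpDelta2 == 0.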
*/
void CodeGen::genCaptureFuncletPrologEpilogInfo()
{
if (!compiler->ehAnyFunclets())
return;
assert(isFramePointerUsed());
assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be
// finalized
genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta();
regMaskTP rsMaskSaveRegs = regSet.rsMaskCalleeSaved;
assert((rsMaskSaveRegs & RBM_LR) != 0);
assert((rsMaskSaveRegs & RBM_FP) != 0);
unsigned PSPSize = (compiler->lvaPSPSym != BAD_VAR_NUM) ? REGSIZE_BYTES : 0;
unsigned saveRegsCount = genCountBits(rsMaskSaveRegs);
unsigned saveRegsPlusPSPSize = saveRegsCount * REGSIZE_BYTES + PSPSize;
if (compiler->info.compIsVarArgs)
{
// For varargs we always save all of the integer register arguments
// so that they are contiguous with the incoming stack arguments.
saveRegsPlusPSPSize += MAX_REG_ARG * REGSIZE_BYTES;
}
unsigned saveRegsPlusPSPSizeAligned = (unsigned)roundUp(saveRegsPlusPSPSize, STACK_ALIGN);
assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0);
unsigned outgoingArgSpaceAligned = (unsigned)roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN);
unsigned maxFuncletFrameSizeAligned = saveRegsPlusPSPSizeAligned + outgoingArgSpaceAligned;
assert((maxFuncletFrameSizeAligned % STACK_ALIGN) == 0);
int SP_to_FPLR_save_delta;
int SP_to_PSP_slot_delta;
int CallerSP_to_PSP_slot_delta;
if (maxFuncletFrameSizeAligned <= 512)
{
unsigned funcletFrameSize = saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize;
unsigned funcletFrameSizeAligned = (unsigned)roundUp(funcletFrameSize, STACK_ALIGN);
assert(funcletFrameSizeAligned <= maxFuncletFrameSizeAligned);
unsigned funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize;
assert((funcletFrameAlignmentPad == 0) || (funcletFrameAlignmentPad == REGSIZE_BYTES));
SP_to_FPLR_save_delta = compiler->lvaOutgoingArgSpaceSize;
SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + funcletFrameAlignmentPad;
CallerSP_to_PSP_slot_delta = -(int)(saveRegsPlusPSPSize - 2 /* FP, LR */ * REGSIZE_BYTES);
if (compiler->lvaOutgoingArgSpaceSize == 0)
{
genFuncletInfo.fiFrameType = 1;
}
else
{
genFuncletInfo.fiFrameType = 2;
}
genFuncletInfo.fiSpDelta1 = -(int)funcletFrameSizeAligned;
genFuncletInfo.fiSpDelta2 = 0;
assert(genFuncletInfo.fiSpDelta1 + genFuncletInfo.fiSpDelta2 == -(int)funcletFrameSizeAligned);
}
else
{
unsigned saveRegsPlusPSPAlignmentPad = saveRegsPlusPSPSizeAligned - saveRegsPlusPSPSize;
assert((saveRegsPlusPSPAlignmentPad == 0) || (saveRegsPlusPSPAlignmentPad == REGSIZE_BYTES));
SP_to_FPLR_save_delta = outgoingArgSpaceAligned;
SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + saveRegsPlusPSPAlignmentPad;
CallerSP_to_PSP_slot_delta =
-(int)(saveRegsPlusPSPSizeAligned - 2 /* FP, LR */ * REGSIZE_BYTES - saveRegsPlusPSPAlignmentPad);
genFuncletInfo.fiFrameType = 3;
genFuncletInfo.fiSpDelta1 = -(int)saveRegsPlusPSPSizeAligned;
genFuncletInfo.fiSpDelta2 = -(int)outgoingArgSpaceAligned;
assert(genFuncletInfo.fiSpDelta1 + genFuncletInfo.fiSpDelta2 == -(int)maxFuncletFrameSizeAligned);
}
/* Now save it for future use */
genFuncletInfo.fiSaveRegs = rsMaskSaveRegs;
genFuncletInfo.fiSP_to_FPLR_save_delta = SP_to_FPLR_save_delta;
genFuncletInfo.fiSP_to_PSP_slot_delta = SP_to_PSP_slot_delta;
genFuncletInfo.fiSP_to_CalleeSave_delta = SP_to_PSP_slot_delta + REGSIZE_BYTES;
genFuncletInfo.fiCallerSP_to_PSP_slot_delta = CallerSP_to_PSP_slot_delta;
#ifdef DEBUG
if (verbose)
{
printf("\n");
printf("Funclet prolog / epilog info\n");
printf(" Save regs: ");
dspRegMask(genFuncletInfo.fiSaveRegs);
printf("\n");
printf(" Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta);
printf(" SP to FP/LR save location delta: %d\n", genFuncletInfo.fiSP_to_FPLR_save_delta);
printf(" SP to PSP slot delta: %d\n", genFuncletInfo.fiSP_to_PSP_slot_delta);
printf(" SP to callee-saved area delta: %d\n", genFuncletInfo.fiSP_to_CalleeSave_delta);
printf(" Caller SP to PSP slot delta: %d\n", genFuncletInfo.fiCallerSP_to_PSP_slot_delta);
printf(" Frame type: %d\n", genFuncletInfo.fiFrameType);
printf(" SP delta 1: %d\n", genFuncletInfo.fiSpDelta1);
printf(" SP delta 2: %d\n", genFuncletInfo.fiSpDelta2);
if (compiler->lvaPSPSym != BAD_VAR_NUM)
{
if (CallerSP_to_PSP_slot_delta !=
compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) // for debugging
{
printf("lvaGetCallerSPRelativeOffset(lvaPSPSym): %d\n",
compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym));
}
}
}
assert(genFuncletInfo.fiSP_to_FPLR_save_delta >= 0);
assert(genFuncletInfo.fiSP_to_PSP_slot_delta >= 0);
assert(genFuncletInfo.fiSP_to_CalleeSave_delta >= 0);
assert(genFuncletInfo.fiCallerSP_to_PSP_slot_delta <= 0);
if (compiler->lvaPSPSym != BAD_VAR_NUM)
{
assert(genFuncletInfo.fiCallerSP_to_PSP_slot_delta ==
compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); // same offset used in main function and
// funclet!
}
#endif // DEBUG
}
/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX XX
XX End Prolog / Epilog XX
XX XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
BasicBlock* CodeGen::genCallFinally(BasicBlock* block)
{
// Generate a call to the finally, like this:
//      ldr x0,[fp,#0x10] / mov x0,sp    // Load x0 with PSPSym, or with SP if PSPSym is not used
//      bl  finally-funclet
//      b   finally-return               // Only for non-retless finally calls
// The 'b' can be a NOP if we're going to the next block.
if (compiler->lvaPSPSym != BAD_VAR_NUM)
{
getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R0, compiler->lvaPSPSym, 0);
}
else
{
getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_R0, REG_SPBASE);
}
getEmitter()->emitIns_J(INS_bl_local, block->bbJumpDest);
if (block->bbFlags & BBF_RETLESS_CALL)
{
// We have a retless call, and the last instruction generated was a call.
// If the next block is in a different EH region (or is the end of the code
// block), then we need to generate a breakpoint here (since it will never
// get executed) to get proper unwind behavior.
if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
{
instGen(INS_bkpt); // This should never get executed
}
}
else
{
// Because of the way the flowgraph is connected, the liveness info for this one instruction
// after the call is not (can not be) correct in cases where a variable has a last use in the
// handler. So turn off GC reporting for this single instruction.
getEmitter()->emitDisableGC();
// Now go to where the finally funclet needs to return to.
if (block->bbNext->bbJumpDest == block->bbNext->bbNext)
{
// Fall-through.
// TODO-ARM64-CQ: Can we get rid of this instruction, and just have the call return directly
// to the next instruction? This would depend on stack walking from within the finally
// handler working without this instruction being in this special EH region.
instGen(INS_nop);
}
else
{
inst_JMP(EJ_jmp, block->bbNext->bbJumpDest);
}
getEmitter()->emitEnableGC();
}
// The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the
// jump target using bbJumpDest - that is already used to point
// to the finally block. So just skip past the BBJ_ALWAYS unless the
// block is RETLESS.
if (!(block->bbFlags & BBF_RETLESS_CALL))
{
assert(block->isBBCallAlwaysPair());
block = block->bbNext;
}
return block;
}
void CodeGen::genEHCatchRet(BasicBlock* block)
{
// For long address (default): `adrp + add` will be emitted.
// For short address (proven later): `adr` will be emitted.
getEmitter()->emitIns_R_L(INS_adr, EA_PTRSIZE, block->bbJumpDest, REG_INTRET);
}
// move an immediate value into an integer register
void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags)
{
// reg cannot be a FP register
assert(!genIsValidFloatReg(reg));
if (!compiler->opts.compReloc)
{
size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
}
if (EA_IS_RELOC(size))
{
// This emits a pair of adrp/add (two instructions) with fix-ups.
getEmitter()->emitIns_R_AI(INS_adrp, size, reg, imm);
}
else if (imm == 0)
{
instGen_Set_Reg_To_Zero(size, reg, flags);
}
else
{
if (emitter::emitIns_valid_imm_for_mov(imm, size))
{
getEmitter()->emitIns_R_I(INS_mov, size, reg, imm);
}
else
{
// Arm64 allows any arbitrary 16-bit constant to be loaded into a register halfword
// There are three forms
// movk which loads into any halfword preserving the remaining halfwords
// movz which loads into any halfword zeroing the remaining halfwords
// movn which loads into any halfword zeroing the remaining halfwords then bitwise inverting the register
// In some cases it is preferable to use movn, because it has the side effect of filling the other halfwords
// with ones
// Determine whether movn or movz will require the fewest instructions to populate the immediate
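// For example (hypothetical immediate): imm == 0xFFFF1234FFFF5678 contains two 0xFFFF
// halfwords, so movn wins and we emit:
//
//     movn reg, #0xA987           // ~0xA987 == 0x5678; the other halfwords start as 0xFFFF
//     movk reg, #0x1234, LSL #32
//
// whereas starting with movz would require an extra movk for each 0xFFFF halfword.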
int preferMovn = 0;
for (int i = (size == EA_8BYTE) ? 48 : 16; i >= 0; i -= 16)
{
if (uint16_t(imm >> i) == 0xffff)
++preferMovn; // a single movk 0xffff could be skipped if movn was used
else if (uint16_t(imm >> i) == 0x0000)
--preferMovn; // a single movk 0 could be skipped if movz was used
}
// Select the first instruction. Any additional instruction will use movk
instruction ins = (preferMovn > 0) ? INS_movn : INS_movz;
// The initial movz or movn will fill the remaining halfwords with skipVal,
// which can allow us to skip explicitly filling a halfword
uint16_t skipVal = (preferMovn > 0) ? 0xffff : 0;
unsigned bits = (size == EA_8BYTE) ? 64 : 32;
// Iterate over imm examining 16 bits at a time
for (unsigned i = 0; i < bits; i += 16)
{
uint16_t imm16 = uint16_t(imm >> i);
if (imm16 != skipVal)
{
if (ins == INS_movn)
{
// For the movn case, we need to bitwise invert the immediate. This is because
// (movn x0, ~imm16) === (movz x0, imm16; or x0, x0, #0xffff`ffff`ffff`0000)
imm16 = ~imm16;
}
getEmitter()->emitIns_R_I_I(ins, size, reg, imm16, i, INS_OPTS_LSL);
// Once the initial movz/movn is emitted the remaining instructions will all use movk
ins = INS_movk;
}
}
// We must emit a movn or movz or we have not done anything
// The cases which hit this assert should be (emitIns_valid_imm_for_mov() == true) and
// should not be in this else condition
assert(ins == INS_movk);
}
// The caller may have requested that the flags be set on this mov (rarely/never)
if (flags == INS_FLAGS_SET)
{
getEmitter()->emitIns_R_I(INS_tst, size, reg, 0);
}
}
regSet.verifyRegUsed(reg);
}
/***********************************************************************************
*
* Generate code to set a register 'targetReg' of type 'targetType' to the constant
* specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call
* genProduceReg() on the target register.
*/
void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree)
{
switch (tree->gtOper)
{
case GT_CNS_INT:
{
// relocatable values tend to come down as a CNS_INT of native int type
// so the line between these two opcodes is kind of blurry
GenTreeIntConCommon* con = tree->AsIntConCommon();
ssize_t cnsVal = con->IconValue();
if (con->ImmedValNeedsReloc(compiler))
{
instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, targetReg, cnsVal);
regSet.verifyRegUsed(targetReg);
}
else
{
genSetRegToIcon(targetReg, cnsVal, targetType);
}
}
break;
case GT_CNS_DBL:
{
emitter* emit = getEmitter();
emitAttr size = emitActualTypeSize(tree);
double constValue = tree->AsDblCon()->gtDconVal;
// Make sure we use "movi reg, 0x00" only for positive zero (0.0) and not for negative zero (-0.0)
if (*(__int64*)&constValue == 0)
{
// A faster/smaller way to generate 0.0
// We will just zero out the entire vector register for both float and double
emit->emitIns_R_I(INS_movi, EA_16BYTE, targetReg, 0x00, INS_OPTS_16B);
}
else if (emitter::emitIns_valid_imm_for_fmov(constValue))
{
// We can load the FP constant using the fmov FP-immediate for this constValue
emit->emitIns_R_F(INS_fmov, size, targetReg, constValue);
}
else
{
// Get a temp integer register to compute long address.
regNumber addrReg = tree->GetSingleTempReg();
// We must load the FP constant from the constant pool
// Emit a data section constant for the float or double constant.
CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size);
// For long address (default): `adrp + ldr + fmov` will be emitted.
// For short address (proven later), `ldr` will be emitted.
emit->emitIns_R_C(INS_ldr, size, targetReg, addrReg, hnd, 0);
}
}
break;
default:
unreached();
}
}
// Generate code to get the high N bits of a N*N=2N bit multiplication result
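// For illustration (not from an actual compilation): a 64-bit signed GT_MULHI becomes
// 'smulh xd, xn, xm', while a 32-bit unsigned one becomes 'umull xd, wn, wm' followed by
// 'lsr xd, xd, #32', leaving the high 32 bits of the product in the low half of xd.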
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
assert(!treeNode->gtOverflowEx());
genConsumeOperands(treeNode);
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
emitAttr attr = emitActualTypeSize(treeNode);
unsigned isUnsigned = (treeNode->gtFlags & GTF_UNSIGNED);
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
assert(!varTypeIsFloating(targetType));
// The arithmetic node must be sitting in a register (since it's not contained)
assert(targetReg != REG_NA);
if (EA_SIZE(attr) == EA_8BYTE)
{
instruction ins = isUnsigned ? INS_umulh : INS_smulh;
regNumber r = emit->emitInsTernary(ins, attr, treeNode, op1, op2);
assert(r == targetReg);
}
else
{
assert(EA_SIZE(attr) == EA_4BYTE);
instruction ins = isUnsigned ? INS_umull : INS_smull;
regNumber r = emit->emitInsTernary(ins, EA_4BYTE, treeNode, op1, op2);
emit->emitIns_R_R_I(isUnsigned ? INS_lsr : INS_asr, EA_8BYTE, targetReg, targetReg, 32);
}
genProduceReg(treeNode);
}
// Generate code for ADD, SUB, MUL, DIV, UDIV, AND, OR and XOR
// This method is expected to have called genConsumeOperands() before calling it.
void CodeGen::genCodeForBinary(GenTree* treeNode)
{
const genTreeOps oper = treeNode->OperGet();
regNumber targetReg = treeNode->gtRegNum;
var_types targetType = treeNode->TypeGet();
emitter* emit = getEmitter();
assert(oper == GT_ADD || oper == GT_SUB || oper == GT_MUL || oper == GT_DIV || oper == GT_UDIV || oper == GT_AND ||
oper == GT_OR || oper == GT_XOR);
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
if ((treeNode->gtFlags & GTF_SET_FLAGS) != 0)
{
switch (oper)
{
case GT_ADD:
ins = INS_adds;
break;
case GT_SUB:
ins = INS_subs;
break;
case GT_AND:
ins = INS_ands;
break;
default:
noway_assert(!"Unexpected BinaryOp with GTF_SET_FLAGS set");
}
}
// The arithmetic node must be sitting in a register (since it's not contained)
assert(targetReg != REG_NA);
regNumber r = emit->emitInsTernary(ins, emitActualTypeSize(treeNode), treeNode, op1, op2);
assert(r == targetReg);
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCodeForLclVar: Produce code for a GT_LCL_VAR node.
//
// Arguments:
// tree - the GT_LCL_VAR node
//
void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
{
var_types targetType = tree->TypeGet();
emitter* emit = getEmitter();
unsigned varNum = tree->gtLclNum;
assert(varNum < compiler->lvaCount);
LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
bool isRegCandidate = varDsc->lvIsRegCandidate();
// lcl_vars are not defs
assert((tree->gtFlags & GTF_VAR_DEF) == 0);
// If this is a register candidate that has been spilled, genConsumeReg() will
// reload it at the point of use. Otherwise, if it's not in a register, we load it here.
if (!isRegCandidate && !(tree->gtFlags & GTF_SPILLED))
{
// targetType must be a normal scalar type and not a TYP_STRUCT
assert(targetType != TYP_STRUCT);
instruction ins = ins_Load(targetType);
emitAttr attr = emitTypeSize(targetType);
attr = varTypeIsFloating(targetType) ? attr : emit->emitInsAdjustLoadStoreAttr(ins, attr);
emit->emitIns_R_S(ins, attr, tree->gtRegNum, varNum, 0);
genProduceReg(tree);
}
}
//------------------------------------------------------------------------
// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node.
//
// Arguments:
// tree - the GT_STORE_LCL_FLD node
//
void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
{
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->gtRegNum;
emitter* emit = getEmitter();
noway_assert(targetType != TYP_STRUCT);
#ifdef FEATURE_SIMD
// storing of TYP_SIMD12 (i.e. Vector3) field
if (tree->TypeGet() == TYP_SIMD12)
{
genStoreLclTypeSIMD12(tree);
return;
}
#endif // FEATURE_SIMD
// record the offset
unsigned offset = tree->gtLclOffs;
// We must have a stack store with GT_STORE_LCL_FLD
noway_assert(targetReg == REG_NA);
unsigned varNum = tree->gtLclNum;
assert(varNum < compiler->lvaCount);
LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
// Ensure that lclVar nodes are typed correctly.
assert(!varDsc->lvNormalizeOnStore() || targetType == genActualType(varDsc->TypeGet()));
GenTree* data = tree->gtOp1;
genConsumeRegs(data);
regNumber dataReg = REG_NA;
if (data->isContainedIntOrIImmed())
{
assert(data->IsIntegralConst(0));
dataReg = REG_ZR;
}
else
{
assert(!data->isContained());
dataReg = data->gtRegNum;
}
assert(dataReg != REG_NA);
instruction ins = ins_Store(targetType);
emitAttr attr = emitTypeSize(targetType);
attr = varTypeIsFloating(targetType) ? attr : emit->emitInsAdjustLoadStoreAttr(ins, attr);
emit->emitIns_S_R(ins, attr, dataReg, varNum, offset);
genUpdateLife(tree);
varDsc->lvRegNum = REG_STK;
}
//------------------------------------------------------------------------
// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node.
//
// Arguments:
// tree - the GT_STORE_LCL_VAR node
//
void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* tree)
{
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->gtRegNum;
emitter* emit = getEmitter();
unsigned varNum = tree->gtLclNum;
assert(varNum < compiler->lvaCount);
LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
// Ensure that lclVar nodes are typed correctly.
assert(!varDsc->lvNormalizeOnStore() || targetType == genActualType(varDsc->TypeGet()));
GenTree* data = tree->gtOp1;
// The 'var = call' case, where the call returns a multi-reg return value,
// is handled separately.
if (data->gtSkipReloadOrCopy()->IsMultiRegCall())
{
genMultiRegCallStoreToLocal(tree);
}
else
{
#ifdef FEATURE_SIMD
// storing of TYP_SIMD12 (i.e. Vector3) field
if (tree->TypeGet() == TYP_SIMD12)
{
genStoreLclTypeSIMD12(tree);
return;
}
#endif // FEATURE_SIMD
genConsumeRegs(data);
regNumber dataReg = REG_NA;
if (data->isContainedIntOrIImmed())
{
// This is only possible for a zero-init.
assert(data->IsIntegralConst(0));
if (varTypeIsSIMD(targetType))
{
assert(targetReg != REG_NA);
getEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, targetReg, 0x00, INS_OPTS_16B);
genProduceReg(tree);
return;
}
dataReg = REG_ZR;
}
else
{
assert(!data->isContained());
dataReg = data->gtRegNum;
}
assert(dataReg != REG_NA);
if (targetReg == REG_NA) // store into stack based LclVar
{
inst_set_SV_var(tree);
instruction ins = ins_Store(targetType);
emitAttr attr = emitTypeSize(targetType);
attr = varTypeIsFloating(targetType) ? attr : emit->emitInsAdjustLoadStoreAttr(ins, attr);
emit->emitIns_S_R(ins, attr, dataReg, varNum, /* offset */ 0);
genUpdateLife(tree);
varDsc->lvRegNum = REG_STK;
}
else // store into register (i.e. move into register)
{
if (dataReg != targetReg)
{
// Assign into targetReg when dataReg (from op1) is not the same register
inst_RV_RV(ins_Copy(targetType), targetReg, dataReg, targetType);
}
genProduceReg(tree);
}
}
}
//------------------------------------------------------------------------
// genSimpleReturn: Generates code for simple return statement for arm64.
//
// Note: treeNode's and op1's registers are already consumed.
//
// Arguments:
// treeNode - The GT_RETURN or GT_RETFILT tree node with non-struct and non-void type
//
// Return Value:
// None
//
void CodeGen::genSimpleReturn(GenTree* treeNode)
{
assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
GenTree* op1 = treeNode->gtGetOp1();
var_types targetType = treeNode->TypeGet();
assert(!isStructReturn(treeNode));
assert(targetType != TYP_VOID);
regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET;
bool movRequired = (op1->gtRegNum != retReg);
if (!movRequired)
{
if (op1->OperGet() == GT_LCL_VAR)
{
GenTreeLclVarCommon* lcl = op1->AsLclVarCommon();
bool isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate();
if (isRegCandidate && ((op1->gtFlags & GTF_SPILLED) == 0))
{
// We may need to generate a zero-extending mov instruction to load the value from this GT_LCL_VAR
unsigned lclNum = lcl->gtLclNum;
LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
var_types op1Type = genActualType(op1->TypeGet());
var_types lclType = genActualType(varDsc->TypeGet());
if (genTypeSize(op1Type) < genTypeSize(lclType))
{
movRequired = true;
}
}
}
}
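// For example, returning a TYP_INT value that lives in a TYP_LONG enregistered
// lclVar emits (illustrative registers):
// mov w0, wN
// which implicitly zero-extends the upper 32 bits.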
if (movRequired)
{
emitAttr attr = emitActualTypeSize(targetType);
getEmitter()->emitIns_R_R(INS_mov, attr, retReg, op1->gtRegNum);
}
}
/***********************************************************************************************
* Generate code for localloc
*/
void CodeGen::genLclHeap(GenTree* tree)
{
assert(tree->OperGet() == GT_LCLHEAP);
GenTree* size = tree->gtOp.gtOp1;
noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
regNumber targetReg = tree->gtRegNum;
regNumber regCnt = REG_NA;
regNumber pspSymReg = REG_NA;
var_types type = genActualType(size->gtType);
emitAttr easz = emitTypeSize(type);
BasicBlock* endLabel = nullptr;
BasicBlock* loop = nullptr;
unsigned stackAdjustment = 0;
#ifdef DEBUG
// Verify SP
if (compiler->opts.compStackCheckOnRet)
{
noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
BasicBlock* esp_check = genCreateTempLabel();
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, esp_check);
getEmitter()->emitIns(INS_bkpt);
genDefineTempLabel(esp_check);
}
#endif
noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
noway_assert(genStackLevel == 0); // Can't have anything on the stack
// Whether method has PSPSym.
bool hasPspSym;
#if FEATURE_EH_FUNCLETS
hasPspSym = (compiler->lvaPSPSym != BAD_VAR_NUM);
#else
hasPspSym = false;
#endif
// Compute the amount of memory to allocate, properly aligned to STACK_ALIGN.
size_t amount = 0;
if (size->IsCnsIntOrI())
{
// If size is a constant, then it must be contained.
assert(size->isContained());
// If amount is zero then return null in targetReg
amount = size->gtIntCon.gtIconVal;
if (amount == 0)
{
instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
goto BAILOUT;
}
// 'amount' is the total number of bytes to localloc, aligned up to STACK_ALIGN
amount = AlignUp(amount, STACK_ALIGN);
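// e.g. with STACK_ALIGN == 16, a request for 24 bytes is rounded up to 32.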
}
else
{
// If the size is zero, bail out by returning null in targetReg
genConsumeRegAndCopy(size, targetReg);
endLabel = genCreateTempLabel();
getEmitter()->emitIns_R_R(INS_tst, easz, targetReg, targetReg);
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, endLabel);
// Compute the size of the block to allocate and perform alignment.
// If the method has no PSPSym and compInitMem=true, we can reuse targetReg as regCnt,
// since we don't need any internal registers.
if (!hasPspSym && compiler->info.compInitMem)
{
assert(tree->AvailableTempRegCount() == 0);
regCnt = targetReg;
}
else
{
regCnt = tree->ExtractTempReg();
if (regCnt != targetReg)
{
inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
}
}
// Align to STACK_ALIGN
// regCnt will be the total number of bytes to localloc
inst_RV_IV(INS_add, regCnt, (STACK_ALIGN - 1), emitActualTypeSize(type));
inst_RV_IV(INS_and, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
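// e.g. with STACK_ALIGN == 16 this emits (illustrative registers):
// add xCnt, xCnt, #15
// and xCnt, xCnt, #0xFFFFFFFFFFFFFFF0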
}
stackAdjustment = 0;
#if FEATURE_EH_FUNCLETS
// If we have a PSPSym, we need to re-locate it after localloc.
if (hasPspSym)
{
stackAdjustment += STACK_ALIGN;
// Save a copy of PSPSym
pspSymReg = tree->ExtractTempReg();
getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, pspSymReg, compiler->lvaPSPSym, 0);
}
#endif
#if FEATURE_FIXED_OUT_ARGS
// If we have an outgoing arg area then we must adjust the SP by popping off the
// outgoing arg area. We will restore it right before we return from this method.
//
// Localloc is supposed to return stack space that is STACK_ALIGN'ed. The following
// are the cases that need to be handled:
// i) Method has PSPSym + out-going arg area.
// It is guaranteed that the size of the out-going arg area is STACK_ALIGN'ed (see fgMorphArgs).
// Therefore, we will pop off the out-going arg area from SP before locallocating.
// We need to add padding to ensure SP is STACK_ALIGN'ed while re-locating PSPSym + arg area.
// ii) Method has no PSPSym but an out-going arg area.
// Almost the same case as above, without the requirement to pad for the final SP to be STACK_ALIGN'ed.
// iii) Method has PSPSym but no out-going arg area.
// Nothing to pop off from the stack, but we need to relocate PSPSym with SP padded.
// iv) Method has neither PSPSym nor an out-going arg area.
// Nothing needs to be popped off from the stack or relocated.
if (compiler->lvaOutgoingArgSpaceSize > 0)
{
assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain
// aligned
inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
}
#endif
if (size->IsCnsIntOrI())
{
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);
// For small allocations we will generate up to four stp instructions
size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT);
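// e.g. a 48-byte constant localloc (3 slots) emits (illustrative):
// stp xzr, xzr, [sp, #-16]!
// stp xzr, xzr, [sp, #-16]!
// stp xzr, xzr, [sp, #-16]!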
if (cntStackAlignedWidthItems <= 4)
{
while (cntStackAlignedWidthItems != 0)
{
// We can use pre-indexed addressing.
// stp ZR, ZR, [SP, #-16]!
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
cntStackAlignedWidthItems -= 1;
}
goto ALLOC_DONE;
}
else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
{
// Since the size is a page or less, simply adjust the SP value.
// The SP might already be in the guard page, so we must touch it BEFORE
// the alloc, not after.
// ldr wzr, [SP, #0]
getEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
inst_RV_IV(INS_sub, REG_SP, amount, EA_PTRSIZE);
goto ALLOC_DONE;
}
// else, "mov regCnt, amount"
// If the method has no PSPSym and compInitMem=true, we can reuse targetReg as regCnt.
// Since size is a constant, regCnt is not yet initialized.
assert(regCnt == REG_NA);
if (!hasPspSym && compiler->info.compInitMem)
{
assert(tree->AvailableTempRegCount() == 0);
regCnt = targetReg;
}
else
{
regCnt = tree->ExtractTempReg();
}
genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG);
}
if (compiler->info.compInitMem)
{
BasicBlock* loop = genCreateTempLabel();
// At this point 'regCnt' is set to the total number of bytes to locAlloc.
// Since we have to zero out the allocated memory AND ensure that SP is always valid
// by tickling the pages, we will just store 0's on the stack.
//
// Note: regCnt is guaranteed to be a multiple of 16 on Arm64, since STACK_ALIGN is 16
// and the localloc size is a multiple of STACK_ALIGN.
// Loop:
genDefineTempLabel(loop);
// We can use pre-indexed addressing.
// stp ZR, ZR, [SP, #-16]!
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
// If not done, loop.
// Note that regCnt is the number of bytes to stack allocate,
// so we subtract 16 from regCnt here.
assert(genIsValidIntReg(regCnt));
inst_RV_IV(INS_subs, regCnt, 16, emitActualTypeSize(type));
emitJumpKind jmpNotEqual = genJumpKindForOper(GT_NE, CK_SIGNED);
inst_JMP(jmpNotEqual, loop);
}
else
{
// At this point 'regCnt' is set to the total number of bytes to locAlloc.
//
// We don't need to zero out the allocated memory. However, we do have
// to tickle the pages to ensure that SP is always valid and is
// in sync with the "stack guard page". Note that in the worst
// case SP is on the last byte of the guard page. Thus you must
// touch SP-0 first not SP-0x1000.
//
// Another subtlety is that you don't want SP to be exactly on the
// boundary of the guard page because PUSH is predecrement, thus
// call setup would not touch the guard page but just beyond it
//
// Note that we go through a few hoops so that SP never points to
// illegal pages at any time during the tickling process
//
// subs regCnt, SP, regCnt // regCnt now holds ultimate SP
// b.vc Loop // result is smaller than original SP (no wrap around)
// mov regCnt, #0 // Overflow, pick lowest possible value
//
// Loop:
// ldr wzr, [SP + 0] // tickle the page - read from the page
// sub regTmp, SP, PAGE_SIZE // decrement SP by eeGetPageSize()
// cmp regTmp, regCnt
// b.lo Done
// mov SP, regTmp
// b Loop
//
// Done:
// mov SP, regCnt
//
// Setup the regTmp
regNumber regTmp = tree->GetSingleTempReg();
BasicBlock* loop = genCreateTempLabel();
BasicBlock* done = genCreateTempLabel();
// subs regCnt, SP, regCnt // regCnt now holds ultimate SP
getEmitter()->emitIns_R_R_R(INS_subs, EA_PTRSIZE, regCnt, REG_SPBASE, regCnt);
inst_JMP(EJ_vc, loop); // branch if the V flag is not set
// Overflow, set regCnt to lowest possible value
instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
genDefineTempLabel(loop);
// tickle the page - Read from the updated SP - this triggers a page fault when on the guard page
getEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, 0);
// decrement SP by eeGetPageSize()
getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, regTmp, REG_SPBASE, compiler->eeGetPageSize());
getEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, regTmp, regCnt);
emitJumpKind jmpLTU = genJumpKindForOper(GT_LT, CK_UNSIGNED);
inst_JMP(jmpLTU, done);
// Update SP to be at the next page of stack that we will tickle
getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, regTmp);
// Jump to loop and tickle new stack address
inst_JMP(EJ_jmp, loop);
// Done with stack tickle loop
genDefineTempLabel(done);
// Now just move the final value to SP
getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, regCnt);
}
ALLOC_DONE:
// Re-adjust SP to allocate PSPSym and out-going arg area
if (stackAdjustment != 0)
{
assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
assert(stackAdjustment > 0);
getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, (int)stackAdjustment);
#if FEATURE_EH_FUNCLETS
// Write PSPSym to its new location.
if (hasPspSym)
{
assert(genIsValidIntReg(pspSymReg));
getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, pspSymReg, compiler->lvaPSPSym, 0);
}
#endif
// Return the stackalloc'ed address in result register.
// TargetReg = SP + stackAdjustment.
//
getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, targetReg, REG_SPBASE, (int)stackAdjustment);
}
else // stackAdjustment == 0
{
// Move the final value of SP to targetReg
inst_RV_RV(INS_mov, targetReg, REG_SPBASE);
}
BAILOUT:
if (endLabel != nullptr)
genDefineTempLabel(endLabel);
// Write the lvaLocAllocSPvar stack frame slot
if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
{
getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, targetReg, compiler->lvaLocAllocSPvar, 0);
}
#if STACK_PROBES
if (compiler->opts.compNeedStackProbes)
{
genGenerateStackProbe();
}
#endif
#ifdef DEBUG
// Update new SP
if (compiler->opts.compStackCheckOnRet)
{
noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, targetReg, compiler->lvaReturnEspCheck, 0);
}
#endif
genProduceReg(tree);
}
//------------------------------------------------------------------------
// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForNegNot(GenTree* tree)
{
assert(tree->OperIs(GT_NEG, GT_NOT));
var_types targetType = tree->TypeGet();
assert(!tree->OperIs(GT_NOT) || !varTypeIsFloating(targetType));
regNumber targetReg = tree->gtRegNum;
instruction ins = genGetInsForOper(tree->OperGet(), targetType);
// The arithmetic node must be sitting in a register (since it's not contained)
assert(!tree->isContained());
// The dst can only be a register.
assert(targetReg != REG_NA);
GenTree* operand = tree->gtGetOp1();
assert(!operand->isContained());
// The src must be a register.
regNumber operandReg = genConsumeReg(operand);
getEmitter()->emitIns_R_R(ins, emitActualTypeSize(tree), targetReg, operandReg);
genProduceReg(tree);
}
//------------------------------------------------------------------------
// genCodeForDivMod: Produce code for a GT_DIV/GT_UDIV node. We don't see MOD:
// (1) integer MOD is morphed into a sequence of sub, mul, div in fgMorph;
// (2) float/double MOD is morphed into a helper call by front-end.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForDivMod(GenTreeOp* tree)
{
assert(tree->OperIs(GT_DIV, GT_UDIV));
var_types targetType = tree->TypeGet();
emitter* emit = getEmitter();
genConsumeOperands(tree);
if (varTypeIsFloating(targetType))
{
// Floating point divide never raises an exception
genCodeForBinary(tree);
}
else // an integer divide operation
{
GenTree* divisorOp = tree->gtGetOp2();
emitAttr size = EA_ATTR(genTypeSize(genActualType(tree->TypeGet())));
if (divisorOp->IsIntegralConst(0))
{
// We unconditionally throw a divide by zero exception
genJumpToThrowHlpBlk(EJ_jmp, SCK_DIV_BY_ZERO);
// We still need to call genProduceReg
genProduceReg(tree);
}
else // the divisor is not the constant zero
{
regNumber divisorReg = divisorOp->gtRegNum;
// Generate the required runtime checks for GT_DIV or GT_UDIV
if (tree->gtOper == GT_DIV)
{
BasicBlock* sdivLabel = genCreateTempLabel();
// Two possible exceptions:
// (AnyVal / 0) => DivideByZeroException
// (MinInt / -1) => ArithmeticException
//
bool checkDividend = true;
// Do we have an immediate for the 'divisorOp'?
//
if (divisorOp->IsCnsIntOrI())
{
GenTreeIntConCommon* intConstTree = divisorOp->AsIntConCommon();
ssize_t intConstValue = intConstTree->IconValue();
assert(intConstValue != 0); // already checked above by IsIntegralConst(0)
if (intConstValue != -1)
{
checkDividend = false; // We statically know that the divisor is not -1
}
}
else // insert check for division by zero
{
// If the divisor is zero, throw a DivideByZeroException
emit->emitIns_R_I(INS_cmp, size, divisorReg, 0);
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
genJumpToThrowHlpBlk(jmpEqual, SCK_DIV_BY_ZERO);
}
if (checkDividend)
{
// If the divisor is not -1, branch to 'sdivLabel'
emit->emitIns_R_I(INS_cmp, size, divisorReg, -1);
emitJumpKind jmpNotEqual = genJumpKindForOper(GT_NE, CK_SIGNED);
inst_JMP(jmpNotEqual, sdivLabel);
// If control flow continues past here the divisor is known to be -1
regNumber dividendReg = tree->gtGetOp1()->gtRegNum;
//
// Issue the 'adds zr, dividendReg, dividendReg' instruction;
// this will set both the Z and V flags only when dividendReg is MinInt
//
emit->emitIns_R_R_R(INS_adds, size, REG_ZR, dividendReg, dividendReg);
inst_JMP(jmpNotEqual, sdivLabel); // goto sdiv if the Z flag is clear
genJumpToThrowHlpBlk(EJ_vs, SCK_ARITH_EXCPN); // if the V flag is set, throw
// ArithmeticException
genDefineTempLabel(sdivLabel);
}
genCodeForBinary(tree); // Generate the sdiv instruction
}
else // (tree->gtOper == GT_UDIV)
{
// Only one possible exception
// (AnyVal / 0) => DivideByZeroException
//
// Note that division by the constant 0 was already checked for above by the
// op2->IsIntegralConst(0) check
//
if (!divisorOp->IsCnsIntOrI())
{
// divisorOp is not a constant, so it could be zero
//
emit->emitIns_R_I(INS_cmp, size, divisorReg, 0);
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
genJumpToThrowHlpBlk(jmpEqual, SCK_DIV_BY_ZERO);
}
genCodeForBinary(tree);
}
}
}
}
// Generate code for InitBlk by performing a loop unroll
// Preconditions:
// a) Both the size and fill byte value are integer constants.
// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
{
// Make sure we got the arguments of the initblk/initobj operation in the right registers
unsigned size = initBlkNode->Size();
GenTree* dstAddr = initBlkNode->Addr();
GenTree* initVal = initBlkNode->Data();
if (initVal->OperIsInitVal())
{
initVal = initVal->gtGetOp1();
}
assert(dstAddr->isUsedFromReg());
assert((initVal->isUsedFromReg() && !initVal->IsIntegralConst(0)) || initVal->IsIntegralConst(0));
assert(size != 0);
assert(size <= INITBLK_UNROLL_LIMIT);
emitter* emit = getEmitter();
genConsumeOperands(initBlkNode);
if (initBlkNode->gtFlags & GTF_BLK_VOLATILE)
{
// issue a full memory barrier before a volatile initBlockUnroll operation
instGen_MemoryBarrier();
}
regNumber valReg = initVal->IsIntegralConst(0) ? REG_ZR : initVal->gtRegNum;
assert(!initVal->IsIntegralConst(0) || (valReg == REG_ZR));
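// For example, zero-initializing 25 bytes (valReg == xzr, dstAddr in x0,
// illustrative registers) unrolls to:
// stp xzr, xzr, [x0]
// str xzr, [x0, #16]
// strb wzr, [x0, #24]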
unsigned offset = 0;
// Perform an unroll using stp.
if (size >= 2 * REGSIZE_BYTES)
{
// Determine how many 16-byte slots to store
size_t slots = size / (2 * REGSIZE_BYTES);
while (slots-- > 0)
{
emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, valReg, valReg, dstAddr->gtRegNum, offset);
offset += (2 * REGSIZE_BYTES);
}
}
// Fill the remainder (15 bytes or less) if there's any.
if ((size & 0xf) != 0)
{
if ((size & 8) != 0)
{
emit->emitIns_R_R_I(INS_str, EA_8BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 8;
}
if ((size & 4) != 0)
{
emit->emitIns_R_R_I(INS_str, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 4;
}
if ((size & 2) != 0)
{
emit->emitIns_R_R_I(INS_strh, EA_2BYTE, valReg, dstAddr->gtRegNum, offset);
offset += 2;
}
if ((size & 1) != 0)
{
emit->emitIns_R_R_I(INS_strb, EA_1BYTE, valReg, dstAddr->gtRegNum, offset);
}
}
}
// Generate code for a load pair from some address + offset
// base: tree node which can be either a local address or arbitrary node
// offset: distance from the base from which to load
void CodeGen::genCodeForLoadPairOffset(regNumber dst, regNumber dst2, GenTree* base, unsigned offset)
{
emitter* emit = getEmitter();
if (base->OperIsLocalAddr())
{
if (base->gtOper == GT_LCL_FLD_ADDR)
offset += base->gtLclFld.gtLclOffs;
emit->emitIns_R_R_S_S(INS_ldp, EA_8BYTE, EA_8BYTE, dst, dst2, base->gtLclVarCommon.gtLclNum, offset);
}
else
{
emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, dst, dst2, base->gtRegNum, offset);
}
}
// Generate code for a store pair to some address + offset
// base: tree node which can be either a local address or arbitrary node
// offset: distance from the base at which to store
void CodeGen::genCodeForStorePairOffset(regNumber src, regNumber src2, GenTree* base, unsigned offset)
{
emitter* emit = getEmitter();
if (base->OperIsLocalAddr())
{
if (base->gtOper == GT_LCL_FLD_ADDR)
offset += base->gtLclFld.gtLclOffs;
emit->emitIns_S_S_R_R(INS_stp, EA_8BYTE, EA_8BYTE, src, src2, base->gtLclVarCommon.gtLclNum, offset);
}
else
{
emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, src, src2, base->gtRegNum, offset);
}
}
// Generate code for CpObj nodes which copy structs that have interleaved
// GC pointers.
// For struct slots that don't contain GC pointers we generate a sequence
// of loads/stores. The generated code will look like:
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
//
// In the case of a GC-Pointer we'll call the ByRef write barrier helper,
// which happens to use the same registers as the previous call, maintaining
// the same register requirements and register killsets:
// bl CORINFO_HELP_ASSIGN_BYREF
//
// So finally an example would look like this:
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
// bl CORINFO_HELP_ASSIGN_BYREF
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
// bl CORINFO_HELP_ASSIGN_BYREF
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
{
GenTree* dstAddr = cpObjNode->Addr();
GenTree* source = cpObjNode->Data();
var_types srcAddrType = TYP_BYREF;
bool sourceIsLocal = false;
assert(source->isContained());
if (source->gtOper == GT_IND)
{
GenTree* srcAddr = source->gtGetOp1();
assert(!srcAddr->isContained());
srcAddrType = srcAddr->TypeGet();
}
else
{
noway_assert(source->IsLocal());
sourceIsLocal = true;
}
bool dstOnStack = dstAddr->OperIsLocalAddr();
#ifdef DEBUG
assert(!dstAddr->isContained());
// This GenTree node has data about GC pointers; this means we're dealing
// with CpObj.
assert(cpObjNode->gtGcPtrCount > 0);
#endif // DEBUG
// Consume the operands and get them into the right registers.
// They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
genConsumeBlockOp(cpObjNode, REG_WRITE_BARRIER_DST_BYREF, REG_WRITE_BARRIER_SRC_BYREF, REG_NA);
gcInfo.gcMarkRegPtrVal(REG_WRITE_BARRIER_SRC_BYREF, srcAddrType);
gcInfo.gcMarkRegPtrVal(REG_WRITE_BARRIER_DST_BYREF, dstAddr->TypeGet());
unsigned slots = cpObjNode->gtSlots;
// Temp register(s) used to perform the sequence of loads and stores.
regNumber tmpReg = cpObjNode->ExtractTempReg();
regNumber tmpReg2 = REG_NA;
assert(genIsValidIntReg(tmpReg));
assert(tmpReg != REG_WRITE_BARRIER_SRC_BYREF);
assert(tmpReg != REG_WRITE_BARRIER_DST_BYREF);
if (slots > 1)
{
tmpReg2 = cpObjNode->GetSingleTempReg();
assert(tmpReg2 != tmpReg);
assert(genIsValidIntReg(tmpReg2));
assert(tmpReg2 != REG_WRITE_BARRIER_DST_BYREF);
assert(tmpReg2 != REG_WRITE_BARRIER_SRC_BYREF);
}
if (cpObjNode->gtFlags & GTF_BLK_VOLATILE)
{
// issue a full memory barrier before a volatile CpObj operation
instGen_MemoryBarrier();
}
emitter* emit = getEmitter();
BYTE* gcPtrs = cpObjNode->gtGcPtrs;
// If we can prove it's on the stack we don't need to use the write barrier.
if (dstOnStack)
{
unsigned i = 0;
// While two or more slots remain, use a ldp/stp sequence
while (i < slots - 1)
{
emitAttr attr0 = emitTypeSize(compiler->getJitGCType(gcPtrs[i + 0]));
emitAttr attr1 = emitTypeSize(compiler->getJitGCType(gcPtrs[i + 1]));
emit->emitIns_R_R_R_I(INS_ldp, attr0, tmpReg, tmpReg2, REG_WRITE_BARRIER_SRC_BYREF, 2 * TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX, attr1);
emit->emitIns_R_R_R_I(INS_stp, attr0, tmpReg, tmpReg2, REG_WRITE_BARRIER_DST_BYREF, 2 * TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX, attr1);
i += 2;
}
// Use a ldr/str sequence for the last remaining slot
if (i < slots)
{
emitAttr attr0 = emitTypeSize(compiler->getJitGCType(gcPtrs[i + 0]));
emit->emitIns_R_R_I(INS_ldr, attr0, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
emit->emitIns_R_R_I(INS_str, attr0, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
}
}
else
{
unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
unsigned i = 0;
while (i < slots)
{
switch (gcPtrs[i])
{
case TYPE_GC_NONE:
// Check if the next slot's type is also TYPE_GC_NONE and use ldp/stp
if ((i + 1 < slots) && (gcPtrs[i + 1] == TYPE_GC_NONE))
{
emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_SRC_BYREF,
2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_DST_BYREF,
2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
++i; // extra increment of i, since we are copying two items
}
else
{
emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
}
break;
default:
// In the case of a GC-Pointer we'll call the ByRef write barrier helper
genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
gcPtrCount--;
break;
}
++i;
}
assert(gcPtrCount == 0);
}
if (cpObjNode->gtFlags & GTF_BLK_VOLATILE)
{
// issue a INS_BARRIER_ISHLD after a volatile CpObj operation
instGen_MemoryBarrier(INS_BARRIER_ISHLD);
}
// Clear the gcInfo for REG_WRITE_BARRIER_SRC_BYREF and REG_WRITE_BARRIER_DST_BYREF.
// While we normally update GC info prior to the last instruction that uses them,
// these actually live into the helper call.
gcInfo.gcMarkRegSetNpt(RBM_WRITE_BARRIER_SRC_BYREF | RBM_WRITE_BARRIER_DST_BYREF);
}
// Generate code to do a switch statement based on a table of ip-relative offsets
void CodeGen::genTableBasedSwitch(GenTree* treeNode)
{
genConsumeOperands(treeNode->AsOp());
regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
regNumber tmpReg = treeNode->GetSingleTempReg();
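// The emitted sequence looks like (illustrative registers):
// ldr wBase, [xBase, xIdx, LSL #2] // load the ip-relative offset from the table
// adr xTmp, <fgFirstBB> // the address the offsets are relative to
// add xBase, xBase, xTmp
// br xBase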
// load the ip-relative offset (which is relative to start of fgFirstBB)
getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, baseReg, baseReg, idxReg, INS_OPTS_LSL);
// add it to the absolute address of fgFirstBB
compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
getEmitter()->emitIns_R_L(INS_adr, EA_PTRSIZE, compiler->fgFirstBB, tmpReg);
getEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, baseReg, baseReg, tmpReg);
// br baseReg
getEmitter()->emitIns_R(INS_br, emitActualTypeSize(TYP_I_IMPL), baseReg);
}
// emits the table and an instruction to get the address of the first element
void CodeGen::genJumpTable(GenTree* treeNode)
{
noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
assert(treeNode->OperGet() == GT_JMPTABLE);
unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
unsigned jmpTabOffs;
unsigned jmpTabBase;
jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);
jmpTabOffs = 0;
JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);
for (unsigned i = 0; i < jumpCount; i++)
{
BasicBlock* target = *jumpTable++;
noway_assert(target->bbFlags & BBF_JMP_TARGET);
JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, target->bbNum);
getEmitter()->emitDataGenData(i, target);
}
getEmitter()->emitDataGenEnd();
// Access to inline data is 'abstracted' by a special type of static member
// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
// to constant data, not a real static field.
getEmitter()->emitIns_R_C(INS_adr, emitActualTypeSize(TYP_I_IMPL), treeNode->gtRegNum, REG_NA,
compiler->eeFindJitDataOffs(jmpTabBase), 0);
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node.
//
// Arguments:
// treeNode - the GT_XADD/XCHG node
//
void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
{
GenTree* data = treeNode->gtOp.gtOp2;
GenTree* addr = treeNode->gtOp.gtOp1;
regNumber targetReg = treeNode->gtRegNum;
regNumber dataReg = data->gtRegNum;
regNumber addrReg = addr->gtRegNum;
genConsumeAddress(addr);
genConsumeRegs(data);
emitAttr dataSize = emitActualTypeSize(data);
if (compiler->compSupports(InstructionSet_Atomics))
{
assert(!data->isContainedIntOrIImmed());
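// With ARMv8.1 LSE atomics a single instruction is emitted, followed by
// 'dmb ish' (illustrative registers):
// GT_XCHG: swpal xData, xResult, [xAddr]
// GT_XADD: ldaddal xData, xResult, [xAddr]
// or staddl xData, [xAddr] when the result is unused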
switch (treeNode->gtOper)
{
case GT_XCHG:
getEmitter()->emitIns_R_R_R(INS_swpal, dataSize, dataReg, targetReg, addrReg);
break;
case GT_XADD:
if ((targetReg == REG_NA) || (targetReg == REG_ZR))
{
getEmitter()->emitIns_R_R(INS_staddl, dataSize, dataReg, addrReg);
}
else
{
getEmitter()->emitIns_R_R_R(INS_ldaddal, dataSize, dataReg, targetReg, addrReg);
}
break;
default:
assert(!"Unexpected treeNode->gtOper");
}
instGen_MemoryBarrier(INS_BARRIER_ISH);
}
else
{
regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
regNumber storeDataReg = (treeNode->OperGet() == GT_XCHG) ? dataReg : treeNode->ExtractTempReg(RBM_ALLINT);
regNumber loadReg = (targetReg != REG_NA) ? targetReg : storeDataReg;
// Check allocator assumptions
//
// The register allocator should have extended the lifetimes of all input and internal registers so that
// none interfere with the target.
noway_assert(addrReg != targetReg);
noway_assert(addrReg != loadReg);
noway_assert(dataReg != loadReg);
noway_assert(addrReg != storeDataReg);
noway_assert((treeNode->OperGet() == GT_XCHG) || (addrReg != dataReg));
assert(addr->isUsedFromReg());
noway_assert(exResultReg != REG_NA);
noway_assert(exResultReg != targetReg);
noway_assert((targetReg != REG_NA) || (treeNode->OperGet() != GT_XCHG));
// Store exclusive unpredictable cases must be avoided
noway_assert(exResultReg != storeDataReg);
noway_assert(exResultReg != addrReg);
// NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the
// input registers die at the first instruction generated by the node. This is not the case for these
// atomics as the input registers are multiply-used. As such, we need to mark the addr register as
// containing a GC pointer until we are finished generating the code for this node.
gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
// Emit code like this:
// retry:
// ldxr loadReg, [addrReg]
// add storeDataReg, loadReg, dataReg # Only for GT_XADD
// # GT_XCHG storeDataReg === dataReg
// stxr exResult, storeDataReg, [addrReg]
// cbnz exResult, retry
// dmb ish
BasicBlock* labelRetry = genCreateTempLabel();
genDefineTempLabel(labelRetry);
// The following instruction includes an acquire half barrier
getEmitter()->emitIns_R_R(INS_ldaxr, dataSize, loadReg, addrReg);
switch (treeNode->OperGet())
{
case GT_XADD:
if (data->isContainedIntOrIImmed())
{
// Even though INS_add is specified here, the encoder will choose either
// an INS_add or an INS_sub and encode the immediate as a positive value
genInstrWithConstant(INS_add, dataSize, storeDataReg, loadReg, data->AsIntConCommon()->IconValue(),
REG_NA);
}
else
{
getEmitter()->emitIns_R_R_R(INS_add, dataSize, storeDataReg, loadReg, dataReg);
}
break;
case GT_XCHG:
assert(!data->isContained());
storeDataReg = dataReg;
break;
default:
unreached();
}
// The following instruction includes a release half barrier
getEmitter()->emitIns_R_R_R(INS_stlxr, dataSize, exResultReg, storeDataReg, addrReg);
getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
instGen_MemoryBarrier(INS_BARRIER_ISH);
gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
}
if (treeNode->gtRegNum != REG_NA)
{
genProduceReg(treeNode);
}
}
//------------------------------------------------------------------------
// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node.
//
// Arguments:
// tree - the GT_CMPXCHG node
//
void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
{
assert(treeNode->OperIs(GT_CMPXCHG));
GenTree* addr = treeNode->gtOpLocation; // arg1
GenTree* data = treeNode->gtOpValue; // arg2
GenTree* comparand = treeNode->gtOpComparand; // arg3
regNumber targetReg = treeNode->gtRegNum;
regNumber dataReg = data->gtRegNum;
regNumber addrReg = addr->gtRegNum;
regNumber comparandReg = comparand->gtRegNum;
genConsumeAddress(addr);
genConsumeRegs(data);
genConsumeRegs(comparand);
if (compiler->compSupports(InstructionSet_Atomics))
{
emitAttr dataSize = emitActualTypeSize(data);
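// With ARMv8.1 LSE atomics a single compare-and-swap is emitted, followed by
// 'dmb ish' (illustrative registers):
// mov xTarget, xComparand // only when targetReg != comparandReg
// casal xTarget, xData, [xAddr]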
// casal uses the comparand as the target reg
if (targetReg != comparandReg)
{
getEmitter()->emitIns_R_R(INS_mov, dataSize, targetReg, comparandReg);
// Catch the case where we destroyed the data or address before use
noway_assert(addrReg != targetReg);
noway_assert(dataReg != targetReg);
}
getEmitter()->emitIns_R_R_R(INS_casal, dataSize, targetReg, dataReg, addrReg);
instGen_MemoryBarrier(INS_BARRIER_ISH);
}
else
{
regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
// Check allocator assumptions
//
// The register allocator should have extended the lifetimes of all input and internal registers so that
// none interfere with the target.
noway_assert(addrReg != targetReg);
noway_assert(dataReg != targetReg);
noway_assert(comparandReg != targetReg);
noway_assert(addrReg != dataReg);
noway_assert(targetReg != REG_NA);
noway_assert(exResultReg != REG_NA);
noway_assert(exResultReg != targetReg);
assert(addr->isUsedFromReg());
assert(data->isUsedFromReg());
assert(!comparand->isUsedFromMemory());
// Store exclusive unpredictable cases must be avoided
noway_assert(exResultReg != dataReg);
noway_assert(exResultReg != addrReg);
// NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the
// input registers die at the first instruction generated by the node. This is not the case for these
// atomics as the input registers are multiply-used. As such, we need to mark the addr register as
// containing a GC pointer until we are finished generating the code for this node.
gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
// TODO-ARM64-CQ Use ARMv8.1 atomics if available
// https://github.com/dotnet/coreclr/issues/11881
// Emit code like this:
// retry:
// ldxr targetReg, [addrReg]
// cmp targetReg, comparandReg
// bne compareFail
// stxr exResult, dataReg, [addrReg]
// cbnz exResult, retry
// compareFail:
// dmb ish
BasicBlock* labelRetry = genCreateTempLabel();
BasicBlock* labelCompareFail = genCreateTempLabel();
genDefineTempLabel(labelRetry);
// The following instruction includes an acquire half barrier
getEmitter()->emitIns_R_R(INS_ldaxr, emitTypeSize(treeNode), targetReg, addrReg);
if (comparand->isContainedIntOrIImmed())
{
if (comparand->IsIntegralConst(0))
{
getEmitter()->emitIns_J_R(INS_cbnz, emitActualTypeSize(treeNode), labelCompareFail, targetReg);
}
else
{
getEmitter()->emitIns_R_I(INS_cmp, emitActualTypeSize(treeNode), targetReg,
comparand->AsIntConCommon()->IconValue());
getEmitter()->emitIns_J(INS_bne, labelCompareFail);
}
}
else
{
getEmitter()->emitIns_R_R(INS_cmp, emitActualTypeSize(treeNode), targetReg, comparandReg);
getEmitter()->emitIns_J(INS_bne, labelCompareFail);
}
// The following instruction includes a release half barrier
getEmitter()->emitIns_R_R_R(INS_stlxr, emitTypeSize(treeNode), exResultReg, dataReg, addrReg);
getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
genDefineTempLabel(labelCompareFail);
instGen_MemoryBarrier(INS_BARRIER_ISH);
gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
}
genProduceReg(treeNode);
}
instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
{
instruction ins = INS_brk;
if (varTypeIsFloating(type))
{
switch (oper)
{
case GT_ADD:
ins = INS_fadd;
break;
case GT_SUB:
ins = INS_fsub;
break;
case GT_MUL:
ins = INS_fmul;
break;
case GT_DIV:
ins = INS_fdiv;
break;
case GT_NEG:
ins = INS_fneg;
break;
default:
NYI("Unhandled oper in genGetInsForOper() - float");
unreached();
break;
}
}
else
{
switch (oper)
{
case GT_ADD:
ins = INS_add;
break;
case GT_AND:
ins = INS_and;
break;
case GT_DIV:
ins = INS_sdiv;
break;
case GT_UDIV:
ins = INS_udiv;
break;
case GT_MUL:
ins = INS_mul;
break;
case GT_LSH:
ins = INS_lsl;
break;
case GT_NEG:
ins = INS_neg;
break;
case GT_NOT:
ins = INS_mvn;
break;
case GT_OR:
ins = INS_orr;
break;
case GT_ROR:
ins = INS_ror;
break;
case GT_RSH:
ins = INS_asr;
break;
case GT_RSZ:
ins = INS_lsr;
break;
case GT_SUB:
ins = INS_sub;
break;
case GT_XOR:
ins = INS_eor;
break;
default:
NYI("Unhandled oper in genGetInsForOper() - integer");
unreached();
break;
}
}
return ins;
}
//------------------------------------------------------------------------
// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node.
//
// Arguments:
// tree - the GT_RETURNTRAP node
//
void CodeGen::genCodeForReturnTrap(GenTreeOp* tree)
{
assert(tree->OperGet() == GT_RETURNTRAP);
// this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
// based on the contents of 'data'
GenTree* data = tree->gtOp1;
genConsumeRegs(data);
getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, data->gtRegNum, 0);
BasicBlock* skipLabel = genCreateTempLabel();
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
inst_JMP(jmpEqual, skipLabel);
// emit the call to the EE-helper that stops for GC (or other reasons)
genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN);
genDefineTempLabel(skipLabel);
}
//------------------------------------------------------------------------
// genCodeForStoreInd: Produce code for a GT_STOREIND node.
//
// Arguments:
// tree - the GT_STOREIND node
//
void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
{
GenTree* data = tree->Data();
GenTree* addr = tree->Addr();
var_types targetType = tree->TypeGet();
emitter* emit = getEmitter();
emitAttr attr = emitTypeSize(tree);
instruction ins = ins_Store(targetType);
#ifdef FEATURE_SIMD
// Storing Vector3 of size 12 bytes through indirection
if (tree->TypeGet() == TYP_SIMD12)
{
genStoreIndTypeSIMD12(tree);
return;
}
#endif // FEATURE_SIMD
GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(tree, data);
if (writeBarrierForm != GCInfo::WBF_NoBarrier)
{
// data and addr must be in registers.
// Consume both registers so that any copies of interfering
// registers are taken care of.
genConsumeOperands(tree);
// At this point, we should not have any interference.
// That is, 'data' must not be in REG_WRITE_BARRIER_DST_BYREF,
// as that is where 'addr' must go.
noway_assert(data->gtRegNum != REG_WRITE_BARRIER_DST_BYREF);
// 'addr' goes into x14 (REG_WRITE_BARRIER_DST)
genCopyRegIfNeeded(addr, REG_WRITE_BARRIER_DST);
// 'data' goes into x15 (REG_WRITE_BARRIER_SRC)
genCopyRegIfNeeded(data, REG_WRITE_BARRIER_SRC);
genGCWriteBarrier(tree, writeBarrierForm);
}
else // A normal store, not a WriteBarrier store
{
bool dataIsUnary = false;
GenTree* nonRMWsrc = nullptr;
// We must consume the operands in the proper execution order,
// so that liveness is updated appropriately.
genConsumeAddress(addr);
if (!data->isContained())
{
genConsumeRegs(data);
}
regNumber dataReg = REG_NA;
if (data->isContainedIntOrIImmed())
{
assert(data->IsIntegralConst(0));
dataReg = REG_ZR;
}
else // data is not contained, so evaluate it into a register
{
assert(!data->isContained());
dataReg = data->gtRegNum;
}
assert((attr != EA_1BYTE) || !(tree->gtFlags & GTF_IND_UNALIGNED));
if (tree->gtFlags & GTF_IND_VOLATILE)
{
bool useStoreRelease =
genIsValidIntReg(dataReg) && !addr->isContained() && !(tree->gtFlags & GTF_IND_UNALIGNED);
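// For example, a volatile TYP_INT store to a plain register address can be a
// single store-release (illustrative): stlr w1, [x0]
// Otherwise we fall back to a full 'dmb' barrier followed by an ordinary str.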
if (useStoreRelease)
{
switch (EA_SIZE(attr))
{
case EA_1BYTE:
assert(ins == INS_strb);
ins = INS_stlrb;
break;
case EA_2BYTE:
assert(ins == INS_strh);
ins = INS_stlrh;
break;
case EA_4BYTE:
case EA_8BYTE:
assert(ins == INS_str);
ins = INS_stlr;
break;
default:
assert(false); // We should not get here
}
}
else
{
// issue a full memory barrier before a volatile StInd
instGen_MemoryBarrier();
}
}
emit->emitInsLoadStoreOp(ins, attr, dataReg, tree);
}
}
//------------------------------------------------------------------------
// genCodeForSwap: Produce code for a GT_SWAP node.
//
// Arguments:
// tree - the GT_SWAP node
//
void CodeGen::genCodeForSwap(GenTreeOp* tree)
{
assert(tree->OperIs(GT_SWAP));
// Swap is only supported for lclVar operands that are enregistered
// We do not consume or produce any registers. Both operands remain enregistered.
// However, the gc-ness may change.
assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2));
GenTreeLclVarCommon* lcl1 = tree->gtOp1->AsLclVarCommon();
LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
var_types type1 = varDsc1->TypeGet();
GenTreeLclVarCommon* lcl2 = tree->gtOp2->AsLclVarCommon();
LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
var_types type2 = varDsc2->TypeGet();
// We must have either both int regs or both fp regs
assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
// FP swap is not yet implemented (and should have NYI'd in LSRA)
assert(!varTypeIsFloating(type1));
regNumber oldOp1Reg = lcl1->gtRegNum;
regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
regNumber oldOp2Reg = lcl2->gtRegNum;
regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
// We don't call genUpdateVarReg because we don't have a tree node with the new register.
varDsc1->lvRegNum = oldOp2Reg;
varDsc2->lvRegNum = oldOp1Reg;
// Do the xchg
emitAttr size = EA_PTRSIZE;
if (varTypeGCtype(type1) != varTypeGCtype(type2))
{
// If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
// Otherwise it will leave them alone, which is correct if they have the same GC-ness.
size = EA_GCREF;
}
NYI("register swap");
// inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
// Update the gcInfo.
// Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output)
gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
// gcMarkRegPtrVal will do the appropriate thing for non-gc types.
// It will also dump the updates.
gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
}
//-------------------------------------------------------------------------------------------
// genSetRegToCond: Set a register 'dstReg' to the appropriate one or zero value
// corresponding to a binary Relational operator result.
//
// Arguments:
// dstReg - The target register to set to 1 or 0
// tree - The GenTree Relop node that was used to set the Condition codes
//
// Return Value: none
//
// Notes:
// A full 64-bit value of either 1 or 0 is set up in 'dstReg'
//-------------------------------------------------------------------------------------------
void CodeGen::genSetRegToCond(regNumber dstReg, GenTree* tree)
{
emitJumpKind jumpKind[2];
bool branchToTrueLabel[2];
genJumpKindsForTree(tree, jumpKind, branchToTrueLabel);
assert(jumpKind[0] != EJ_NONE);
// Set the reg according to the flags
inst_SET(jumpKind[0], dstReg);
// Do we need to use two operations to set the result?
//
if (jumpKind[1] != EJ_NONE)
{
emitter* emit = getEmitter();
bool ordered = ((tree->gtFlags & GTF_RELOP_NAN_UN) == 0);
insCond secondCond;
// The only ones that require two operations are the
// floating point compare operations of BEQ or BNE.UN
//
if (tree->gtOper == GT_EQ)
{
// This must be an ordered comparison.
assert(ordered);
assert(jumpKind[1] == EJ_vs); // We complement this value
secondCond = INS_COND_VC; // for the secondCond
}
else // gtOper == GT_NE
{
// This must be BNE.UN (unordered comparison)
assert((tree->gtOper == GT_NE) && !ordered);
assert(jumpKind[1] == EJ_lo); // We complement this value
secondCond = INS_COND_HS; // for the secondCond
}
// The second instruction is a 'csinc' instruction that either selects the previous dstReg
// or increments the ZR register, which produces a 1 result.
emit->emitIns_R_R_R_COND(INS_csinc, EA_8BYTE, dstReg, dstReg, REG_ZR, secondCond);
}
}
//------------------------------------------------------------------------
// genIntToFloatCast: Generate code to cast an int/long to float/double
//
// Arguments:
// treeNode - The GT_CAST node
//
// Return Value:
// None.
//
// Assumptions:
// Cast is a non-overflow conversion.
// The treeNode must have an assigned register.
// SrcType= int32/uint32/int64/uint64 and DstType=float/double.
//
void CodeGen::genIntToFloatCast(GenTree* treeNode)
{
// int type --> float/double conversions are always non-overflow ones
assert(treeNode->OperGet() == GT_CAST);
assert(!treeNode->gtOverflow());
regNumber targetReg = treeNode->gtRegNum;
assert(genIsValidFloatReg(targetReg));
GenTree* op1 = treeNode->gtOp.gtOp1;
assert(!op1->isContained()); // Cannot be contained
assert(genIsValidIntReg(op1->gtRegNum)); // Must be a valid int reg.
var_types dstType = treeNode->CastToType();
var_types srcType = genActualType(op1->TypeGet());
assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
// force the srcType to unsigned if GT_UNSIGNED flag is set
if (treeNode->gtFlags & GTF_UNSIGNED)
{
srcType = genUnsignedType(srcType);
}
// We should never see a srcType whose size is neither EA_4BYTE nor EA_8BYTE
emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
noway_assert((srcSize == EA_4BYTE) || (srcSize == EA_8BYTE));
instruction ins = varTypeIsUnsigned(srcType) ? INS_ucvtf : INS_scvtf;
insOpts cvtOption = INS_OPTS_NONE; // invalid value
if (dstType == TYP_DOUBLE)
{
if (srcSize == EA_4BYTE)
{
cvtOption = INS_OPTS_4BYTE_TO_D;
}
else
{
assert(srcSize == EA_8BYTE);
cvtOption = INS_OPTS_8BYTE_TO_D;
}
}
else
{
assert(dstType == TYP_FLOAT);
if (srcSize == EA_4BYTE)
{
cvtOption = INS_OPTS_4BYTE_TO_S;
}
else
{
assert(srcSize == EA_8BYTE);
cvtOption = INS_OPTS_8BYTE_TO_S;
}
}
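// e.g. an unsigned 32-bit int to double conversion emits (illustrative):
// ucvtf d0, w0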
genConsumeOperands(treeNode->AsOp());
getEmitter()->emitIns_R_R(ins, emitActualTypeSize(dstType), treeNode->gtRegNum, op1->gtRegNum, cvtOption);
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genFloatToIntCast: Generate code to cast float/double to int/long
//
// Arguments:
// treeNode - The GT_CAST node
//
// Return Value:
// None.
//
// Assumptions:
// Cast is a non-overflow conversion.
// The treeNode must have an assigned register.
// SrcType=float/double and DstType= int32/uint32/int64/uint64
//
void CodeGen::genFloatToIntCast(GenTree* treeNode)
{
// We don't expect to see overflow-detecting float/double --> int type conversions here
// as they should have been converted into helper calls by the front-end.
assert(treeNode->OperGet() == GT_CAST);
assert(!treeNode->gtOverflow());
regNumber targetReg = treeNode->gtRegNum;
assert(genIsValidIntReg(targetReg)); // Must be a valid int reg.
GenTree* op1 = treeNode->gtOp.gtOp1;
assert(!op1->isContained()); // Cannot be contained
assert(genIsValidFloatReg(op1->gtRegNum)); // Must be a valid float reg.
var_types dstType = treeNode->CastToType();
var_types srcType = op1->TypeGet();
assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
// We should never see a dstType whose size is neither EA_4BYTE nor EA_8BYTE
// For conversions to small types (byte/sbyte/int16/uint16) from float/double,
// we expect the front-end or lowering phase to have generated two levels of cast.
//
emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
noway_assert((dstSize == EA_4BYTE) || (dstSize == EA_8BYTE));
instruction ins = INS_fcvtzs; // default to signed converts
insOpts cvtOption = INS_OPTS_NONE; // invalid value
if (varTypeIsUnsigned(dstType))
{
ins = INS_fcvtzu; // use unsigned converts
}
if (srcType == TYP_DOUBLE)
{
if (dstSize == EA_4BYTE)
{
cvtOption = INS_OPTS_D_TO_4BYTE;
}
else
{
assert(dstSize == EA_8BYTE);
cvtOption = INS_OPTS_D_TO_8BYTE;
}
}
else
{
assert(srcType == TYP_FLOAT);
if (dstSize == EA_4BYTE)
{
cvtOption = INS_OPTS_S_TO_4BYTE;
}
else
{
assert(dstSize == EA_8BYTE);
cvtOption = INS_OPTS_S_TO_8BYTE;
}
}
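// e.g. a double to signed 32-bit int conversion emits (illustrative):
// fcvtzs w0, d0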
genConsumeOperands(treeNode->AsOp());
getEmitter()->emitIns_R_R(ins, dstSize, treeNode->gtRegNum, op1->gtRegNum, cvtOption);
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCkfinite: Generate code for ckfinite opcode.
//
// Arguments:
// treeNode - The GT_CKFINITE node
//
// Return Value:
// None.
//
// Assumptions:
// GT_CKFINITE node has reserved an internal register.
//
void CodeGen::genCkfinite(GenTree* treeNode)
{
assert(treeNode->OperGet() == GT_CKFINITE);
GenTree* op1 = treeNode->gtOp.gtOp1;
var_types targetType = treeNode->TypeGet();
int expMask = (targetType == TYP_FLOAT) ? 0x7F8 : 0x7FF; // Bit mask to extract exponent.
int shiftAmount = targetType == TYP_FLOAT ? 20 : 52;
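// For a TYP_DOUBLE operand the sequence below is (illustrative registers):
// fmov x1, d0
// lsr x1, x1, #52
// and w1, w1, #0x7FF
// cmp w1, #0x7FF
// b.eq <throw ArithmeticException>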
emitter* emit = getEmitter();
// Extract exponent into a register.
regNumber intReg = treeNode->GetSingleTempReg();
regNumber fpReg = genConsumeReg(op1);
emit->emitIns_R_R(ins_Copy(targetType), emitActualTypeSize(treeNode), intReg, fpReg);
emit->emitIns_R_R_I(INS_lsr, emitActualTypeSize(targetType), intReg, intReg, shiftAmount);
// Mask off the exponent and check if it is all 1's
emit->emitIns_R_R_I(INS_and, EA_4BYTE, intReg, intReg, expMask);
emit->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, expMask);
// If exponent is all 1's, throw ArithmeticException
emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
genJumpToThrowHlpBlk(jmpEqual, SCK_ARITH_EXCPN);
// If it is a finite value, copy it to targetReg
if (treeNode->gtRegNum != fpReg)
{
emit->emitIns_R_R(ins_Copy(targetType), emitActualTypeSize(treeNode), treeNode->gtRegNum, fpReg);
}
genProduceReg(treeNode);
}
//------------------------------------------------------------------------
// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE node.
//
// Arguments:
// tree - the node
//
void CodeGen::genCodeForCompare(GenTreeOp* tree)
{
regNumber targetReg = tree->gtRegNum;
emitter* emit = getEmitter();
GenTree* op1 = tree->gtOp1;
GenTree* op2 = tree->gtOp2;
var_types op1Type = genActualType(op1->TypeGet());
var_types op2Type = genActualType(op2->TypeGet());
assert(!op1->isUsedFromMemory());
assert(!op2->isUsedFromMemory());
genConsumeOperands(tree);
emitAttr cmpSize = EA_ATTR(genTypeSize(op1Type));
assert(genTypeSize(op1Type) == genTypeSize(op2Type));
if (varTypeIsFloating(op1Type))
{
assert(varTypeIsFloating(op2Type));
assert(!op1->isContained());
assert(op1Type == op2Type);
if (op2->IsIntegralConst(0))
{
assert(op2->isContained());
emit->emitIns_R_F(INS_fcmp, cmpSize, op1->gtRegNum, 0.0);
}
else
{
assert(!op2->isContained());
emit->emitIns_R_R(INS_fcmp, cmpSize, op1->gtRegNum, op2->gtRegNum);
}
}
else
{
assert(!varTypeIsFloating(op2Type));
// We don't support swapping op1 and op2 to generate cmp reg, imm
assert(!op1->isContainedIntOrIImmed());
instruction ins = tree->OperIs(GT_TEST_EQ, GT_TEST_NE) ? INS_tst : INS_cmp;
if (op2->isContainedIntOrIImmed())
{
GenTreeIntConCommon* intConst = op2->AsIntConCommon();
emit->emitIns_R_I(ins, cmpSize, op1->gtRegNum, intConst->IconValue());
}
else
{
emit->emitIns_R_R(ins, cmpSize, op1->gtRegNum, op2->gtRegNum);
}
}
// Are we evaluating this into a register?
if (targetReg != REG_NA)
{
genSetRegToCond(targetReg, tree);
genProduceReg(tree);
}
}
//------------------------------------------------------------------------
// genCodeForJumpCompare: Generates code for jmpCompare statement.
//
// A GT_JCMP node is created when a comparison and conditional branch
// can be executed in a single instruction.
//
// Arm64 has a few instructions with this behavior.
// - cbz/cbnz -- Compare and branch register zero/not zero
// - tbz/tbnz -- Test and branch register bit zero/not zero
//
// cbz/cbnz support the normal +/- 1MB branch range for conditional branches
// tbz/tbnz support a smaller +/- 32KB branch range
//
// A GT_JCMP cbz/cbnz node is created when there is a GT_EQ or GT_NE
// integer/unsigned comparison against #0 which is used by a GT_JTRUE
// condition jump node.
//
// A GT_JCMP tbz/tbnz node is created when there is a GT_TEST_EQ or GT_TEST_NE
// integer/unsigned comparison against a mask with a single bit set
// which is used by a GT_JTRUE condition jump node.
//
// This node is responsible for consuming the register, and emitting the
// appropriate fused compare/test and branch instruction
//
// Two flags guide code generation
// GTF_JCMP_TST -- Set if this is a tbz/tbnz rather than cbz/cbnz
// GTF_JCMP_EQ -- Set if this is cbz/tbz rather than cbnz/tbnz
//
// Arguments:
// tree - The GT_JCMP tree node.
//
// Return Value:
// None
//
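// For example (illustrative):
// if (x == 0) => cbz x0, <target>
// if ((x & 8) != 0) => tbnz x0, #3, <target>
//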
void CodeGen::genCodeForJumpCompare(GenTreeOp* tree)
{
assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
GenTree* op1 = tree->gtGetOp1();
GenTree* op2 = tree->gtGetOp2();
assert(tree->OperIs(GT_JCMP));
assert(!varTypeIsFloating(tree));
assert(!op1->isUsedFromMemory());
assert(!op2->isUsedFromMemory());
assert(op2->IsCnsIntOrI());
assert(op2->isContained());
genConsumeOperands(tree);
regNumber reg = op1->gtRegNum;
emitAttr attr = emitActualTypeSize(op1->TypeGet());
if (tree->gtFlags & GTF_JCMP_TST)
{
ssize_t compareImm = op2->gtIntCon.IconValue();
assert(isPow2(compareImm));
instruction ins = (tree->gtFlags & GTF_JCMP_EQ) ? INS_tbz : INS_tbnz;
int imm = genLog2((size_t)compareImm);
getEmitter()->emitIns_J_R_I(ins, attr, compiler->compCurBB->bbJumpDest, reg, imm);
}
else
{
assert(op2->IsIntegralConst(0));
instruction ins = (tree->gtFlags & GTF_JCMP_EQ) ? INS_cbz : INS_cbnz;
getEmitter()->emitIns_J_R(ins, attr, compiler->compCurBB->bbJumpDest, reg);
}
}
int CodeGenInterface::genSPtoFPdelta()
{
int delta;
// We place the saved frame pointer immediately above the outgoing argument space.
delta = (int)compiler->lvaOutgoingArgSpaceSize;
assert(delta >= 0);
return delta;
}
//---------------------------------------------------------------------
// genTotalFrameSize - return the total size of the stack frame, including local size,
// callee-saved register size, etc.
//
// Return value:
// Total frame size
//
int CodeGenInterface::genTotalFrameSize()
{
// For varargs functions, we home all the incoming register arguments. They are not
// included in the compCalleeRegsPushed count. This is like prespill on ARM32, but
// since we don't use "push" instructions to save them, we don't have to do the
// save of these varargs register arguments as the first thing in the prolog.
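// For example, assuming MAX_REG_ARG == 8 and REGSIZE_BYTES == 8, a varargs method
// that pushes 6 callee-saved registers and has 0x20 bytes of locals has a total
// frame size of 8*8 + 6*8 + 0x20 = 0x90 bytes.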
assert(!IsUninitialized(compiler->compCalleeRegsPushed));
int totalFrameSize = (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) +
compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize;
assert(totalFrameSize >= 0);
return totalFrameSize;
}
//---------------------------------------------------------------------
// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer.
// This number is going to be negative, since the Caller-SP is at a higher
// address than the frame pointer.
//
// There must be a frame pointer to call this function!
int CodeGenInterface::genCallerSPtoFPdelta()
{
assert(isFramePointerUsed());
int callerSPtoFPdelta;
callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta();
assert(callerSPtoFPdelta <= 0);
return callerSPtoFPdelta;
}
//---------------------------------------------------------------------
// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP.
//
// This number will be negative.
int CodeGenInterface::genCallerSPtoInitialSPdelta()
{
int callerSPtoSPdelta = 0;
callerSPtoSPdelta -= genTotalFrameSize();
assert(callerSPtoSPdelta <= 0);
return callerSPtoSPdelta;
}

/*****************************************************************************
* Emit a call to a helper function.
*
*/
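//
// For example (an assumed call site, for illustration only): a GC poll that
// stops for GC might be emitted as
//
//    genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN);
//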
void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg /*= REG_NA */)
{
void* addr = nullptr;
void* pAddr = nullptr;
emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN;
addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr);
regNumber callTarget = REG_NA;
if (addr == nullptr)
{
        // This is a call to a runtime helper.
// adrp x, [reloc:rel page addr]
// add x, x, [reloc:page offset]
// ldr x, [x]
// br x
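        //
        // In the code below, the adrp/add pair comes from the single emitIns_R_AI
        // call (with EA_PTR_DSP_RELOC it expands to both relocated instructions),
        // the ldr from emitIns_R_R, and the final indirect transfer from the
        // emitIns_Call issued with EC_INDIR_R at the end of this method.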
if (callTargetReg == REG_NA)
{
// If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but
// this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
}
regMaskTP callTargetMask = genRegMask(callTargetReg);
regMaskTP callKillSet = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
// assert that all registers in callTargetMask are in the callKillSet
noway_assert((callTargetMask & callKillSet) == callTargetMask);
callTarget = callTargetReg;
// adrp + add with relocations will be emitted
getEmitter()->emitIns_R_AI(INS_adrp, EA_PTR_DSP_RELOC, callTarget, (ssize_t)pAddr);
getEmitter()->emitIns_R_R(INS_ldr, EA_PTRSIZE, callTarget, callTarget);
callType = emitter::EC_INDIR_R;
}
getEmitter()->emitIns_Call(callType, compiler->eeFindHelper(helper), INDEBUG_LDISASM_COMMA(nullptr) addr, argSize,
retSize, EA_UNKNOWN, gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
gcInfo.gcRegByrefSetCur, BAD_IL_OFFSET, /* IL offset */
callTarget, /* ireg */
REG_NA, 0, 0, /* xreg, xmul, disp */
false, /* isJump */
emitter::emitNoGChelper(helper));
regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
regSet.verifyRegistersUsed(killMask);
}

#ifdef FEATURE_SIMD
//------------------------------------------------------------------------
// genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
// routine which in turn calls the appropriate genSIMDIntrinsicXXX() routine.
//
// Arguments:
// simdNode - The GT_SIMD node
//
// Return Value:
// None.
//
// Notes:
// Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
// a limited set of methods.
//
// TODO-CLEANUP Merge all versions of this function and move to new file simdcodegencommon.cpp.
void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
{
// NYI for unsupported base types
if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG &&
simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE &&
simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE &&
simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE &&
simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG)
{
noway_assert(!"SIMD intrinsic with unsupported base type.");
}
switch (simdNode->gtSIMDIntrinsicID)
{
case SIMDIntrinsicInit:
genSIMDIntrinsicInit(simdNode);
break;
case SIMDIntrinsicInitN:
genSIMDIntrinsicInitN(simdNode);
break;
case SIMDIntrinsicSqrt:
case SIMDIntrinsicAbs:
case SIMDIntrinsicCast:
case SIMDIntrinsicConvertToSingle:
case SIMDIntrinsicConvertToInt32:
case SIMDIntrinsicConvertToDouble:
case SIMDIntrinsicConvertToInt64:
genSIMDIntrinsicUnOp(simdNode);
break;
case SIMDIntrinsicWidenLo:
case SIMDIntrinsicWidenHi:
genSIMDIntrinsicWiden(simdNode);
break;
case SIMDIntrinsicNarrow:
genSIMDIntrinsicNarrow(simdNode);
break;
case SIMDIntrinsicAdd:
case SIMDIntrinsicSub:
case SIMDIntrinsicMul:
case SIMDIntrinsicDiv:
case SIMDIntrinsicBitwiseAnd:
case SIMDIntrinsicBitwiseAndNot:
case SIMDIntrinsicBitwiseOr:
case SIMDIntrinsicBitwiseXor:
case SIMDIntrinsicMin:
case SIMDIntrinsicMax:
case SIMDIntrinsicEqual:
case SIMDIntrinsicLessThan:
case SIMDIntrinsicGreaterThan:
case SIMDIntrinsicLessThanOrEqual:
case SIMDIntrinsicGreaterThanOrEqual:
genSIMDIntrinsicBinOp(simdNode);
break;
case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
genSIMDIntrinsicRelOp(simdNode);
break;
case SIMDIntrinsicDotProduct:
genSIMDIntrinsicDotProduct(simdNode);
break;
case SIMDIntrinsicGetItem:
genSIMDIntrinsicGetItem(simdNode);
break;
case SIMDIntrinsicSetX:
case SIMDIntrinsicSetY:
case SIMDIntrinsicSetZ:
case SIMDIntrinsicSetW:
genSIMDIntrinsicSetItem(simdNode);
break;
case SIMDIntrinsicUpperSave:
genSIMDIntrinsicUpperSave(simdNode);
break;
case SIMDIntrinsicUpperRestore:
genSIMDIntrinsicUpperRestore(simdNode);
break;
case SIMDIntrinsicSelect:
NYI("SIMDIntrinsicSelect lowered during import to (a & sel) | (b & ~sel)");
break;
default:
noway_assert(!"Unimplemented SIMD intrinsic.");
unreached();
}
}
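
//------------------------------------------------------------------------
// genGetSimdInsOpt: Map a vector register size and element type to the
// arrangement specifier (insOpts) used when emitting ARM64 SIMD
// instructions; for example, a 16-byte vector of TYP_LONG maps to
// INS_OPTS_2D.
//
// Arguments:
//    size        - EA_8BYTE or EA_16BYTE
//    elementType - element type of the vector
//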
insOpts CodeGen::genGetSimdInsOpt(emitAttr size, var_types elementType)
{
assert((size == EA_16BYTE) || (size == EA_8BYTE));
insOpts result = INS_OPTS_NONE;
switch (elementType)
{
case TYP_DOUBLE:
case TYP_ULONG:
case TYP_LONG:
result = (size == EA_16BYTE) ? INS_OPTS_2D : INS_OPTS_1D;
break;