Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 58 additions & 46 deletions src/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3046,9 +3046,22 @@ CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
}


/***********************************************************************************************
* Generate code for localloc
*/
//------------------------------------------------------------------------
// genLclHeap: Generate code for localloc.
//
// Arguments:
// tree - the localloc tree to generate.
//
// Notes:
// Note that for x86, we don't track ESP movements while generating the localloc code.
// The ESP tracking is used to report stack pointer-relative GC info, which is not
// interesting while doing the localloc construction. Also, for functions with localloc,
// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
// call arguments. We store the ESP after the localloc is complete in the LocAllocSP
// variable. This variable is implicitly reported to the VM in the GC info (its position
// is defined by convention relative to other items), and is used by the GC to find the
// "base" stack pointer in functions with localloc.
//
void
CodeGen::genLclHeap(GenTreePtr tree)
{
Expand Down Expand Up @@ -3106,7 +3119,9 @@ CodeGen::genLclHeap(GenTreePtr tree)
}
else
{
// If 0 bail out by returning null in targetReg
// The localloc requested memory size is non-constant.

// Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
genConsumeRegAndCopy(size, targetReg);
endLabel = genCreateTempLabel();
getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
Expand All @@ -3127,33 +3142,40 @@ CodeGen::genLclHeap(GenTreePtr tree)
tmpRegsMask &= ~regCntMask;
regCnt = genRegNumFromMask(regCntMask);
if (regCnt != targetReg)
{
// Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
}
}

// Align to STACK_ALIGN
// regCnt will be the total number of bytes to localloc
inst_RV_IV(INS_add, regCnt, (STACK_ALIGN - 1), emitActualTypeSize(type));
// Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
// by code like:
// add reg, 15
// and reg, -16
// However, in the initialized memory case, we need the count of STACK_ALIGN-sized
// elements, not a byte count, after the alignment. So instead of the "and", which
// becomes unnecessary, generate a shift, e.g.:
// add reg, 15
// shr reg, 4

inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));

#if defined(_TARGET_X86_)
// TODO-Cleanup: change amd64 to use the same code path as x86 to reduce #ifdefs
// and improve amd64 CQ (use a dec loop instead of sub rsp loop).
if (compiler->info.compInitMem)
{
// Convert the count from a count of bytes to a count of pointer-sized words.
// We don't need the 'and' because we'll shift off those bits anyway. That's
// asserted by the following.
C_ASSERT((STACK_ALIGN >> STACK_ALIGN_SHIFT) <= 1);
// Convert the count from a count of bytes to a loop count. We will loop once per
// stack alignment size, so each loop iteration will zero 4 bytes on x86 and 16 bytes on x64.
// Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
// words per iteration on x64. We will shift off all the stack alignment bits
// added above, so there is no need for an 'and' instruction.

// --- shr regCnt, 2 ---
inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
// --- shr regCnt, 2 (or 4) ---
inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL);
}
else
{
// Otherwise, mask off the low bits to align the byte count.
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
}
#else // !_TARGET_X86_
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
#endif // !_TARGET_X86_
}

#if FEATURE_FIXED_OUT_ARGS
Expand All @@ -3179,18 +3201,17 @@ CodeGen::genLclHeap(GenTreePtr tree)
{
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);
assert((amount % STACK_ALIGN) == 0);
assert((amount % REGSIZE_BYTES) == 0);

// For small allocations, we will generate up to six inline 'push 0' instructions.
size_t cntPtrSizedWords = (amount >> STACK_ALIGN_SHIFT);
if (cntPtrSizedWords <= 6)
size_t cntRegSizedWords = amount / REGSIZE_BYTES;
if (cntRegSizedWords <= 6)
{
while (cntPtrSizedWords != 0)
for (; cntRegSizedWords != 0; cntRegSizedWords--)
{
// push_hide means don't track the stack
inst_IV(INS_push_hide, 0);
cntPtrSizedWords--;
inst_IV(INS_push_hide, 0); // push_hide means don't track the stack

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An extra comment might be nice here - we're not tracking the stack because the stack tracking is updated separately for localloc, or ...?

}

goto ALLOC_DONE;
}

Expand Down Expand Up @@ -3246,51 +3267,42 @@ CodeGen::genLclHeap(GenTreePtr tree)

// else, "mov regCnt, amount"

#if defined(_TARGET_X86_)
if (compiler->info.compInitMem)
{
// For x86, when initializing memory with a constant count, we want 'amount' to be the
// count of pointer-sized words, not bytes.
amount = cntPtrSizedWords;
// When initializing memory, we want 'amount' to be the loop count.
assert((amount % STACK_ALIGN) == 0);
amount /= STACK_ALIGN;
}
#endif // _TARGET_X86_

genSetRegToIcon(regCnt, amount, ((int)amount == amount)? TYP_INT : TYP_LONG);
}

loop = genCreateTempLabel();
if (compiler->info.compInitMem)
{
// At this point 'regCnt' is set to the total number of bytes (or words, for constant counts) to locAlloc.
// At this point 'regCnt' is set to the number of loop iterations for this loop, if each
// iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
// Since we have to zero out the allocated memory AND ensure that RSP is always valid
// by tickling the pages, we will just push 0's on the stack.
//
// Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
// and localloc size is a multiple of STACK_ALIGN.

assert(genIsValidIntReg(regCnt));

// Loop:
genDefineTempLabel(loop);

#if defined(_TARGET_AMD64_)
// dec is a 2 byte instruction, but sub is 4 (could be 3 if
// we know size is TYP_INT instead of TYP_I_IMPL)
// Also we know that we can only push 8 bytes at a time, but
// alignment is 16 bytes, so we can push twice and do a sub
// for just a little bit of loop unrolling
// Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value.
static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize that this would be self-evident and redundant with what is above, but a small comment noting that this relies on the fact that STACK_ALIGN is 16 bytes would be useful.

inst_IV(INS_push_hide, 0); // --- push 8-byte 0
inst_IV(INS_push_hide, 0); // --- push 8-byte 0

// Note that regCnt is the number of bytes to stack allocate.
// Therefore we need to subtract 16 from regcnt here.
inst_RV_IV(INS_sub, regCnt, 16, emitActualTypeSize(type));
#elif defined(_TARGET_X86_)
// Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value.
static_assert_no_msg(STACK_ALIGN == REGSIZE_BYTES);
inst_IV(INS_push_hide, 0); // --- push 4-byte 0
inst_RV(INS_dec, regCnt, TYP_I_IMPL);
#endif // _TARGET_X86_

// If not done, loop
// Decrement the loop counter and loop if not done.
inst_RV(INS_dec, regCnt, TYP_I_IMPL);
inst_JMP(EJ_jne, loop);
}
else
Expand Down
9 changes: 5 additions & 4 deletions src/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1901,8 +1901,8 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
//
// Size? Init Memory? # temp regs
// 0 - 0
// const and <=6 ptr words - 0
// const and >6 ptr words Yes 0
// const and <=6 reg words - 0
// const and >6 reg words Yes 0
// const and <PageSize No 0 (amd64) 1 (x86)
// const and >=PageSize No 2
// Non-const Yes 0
Expand All @@ -1925,11 +1925,12 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
// This should also help in debugging as we can examine the original size specified with localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);

// For small allocations of up to 6 reg-sized words (i.e., 48 bytes of localloc on x64,
// 24 bytes on x86), we will generate 'push 0' instructions.
if (cntStackAlignedWidthItems <= 6)
assert((sizeVal % REGSIZE_BYTES) == 0);
size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
if (cntRegSizedWords <= 6)
{
info->internalIntCount = 0;
}
Expand Down
2 changes: 2 additions & 0 deletions src/jit/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define CODE_ALIGN 1 // code alignment requirement
#define STACK_ALIGN 4 // stack alignment requirement
#define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
#define STACK_ALIGN_SHIFT_ALL 2 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units

#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI)
#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX)
Expand Down Expand Up @@ -780,6 +781,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define CODE_ALIGN 1 // code alignment requirement
#define STACK_ALIGN 16 // stack alignment requirement
#define STACK_ALIGN_SHIFT 3 // Shift-right amount to convert stack size in bytes to size in pointer sized words
#define STACK_ALIGN_SHIFT_ALL 4 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is STACK_ALIGN_SHIFT still used after your changes?

I have always found it to be confusing and would like to eliminate it eventually.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's still used, by legacy JIT. It certainly confused me. I suppose we could delete it from the non-legacy archs in target.h.


#if ETW_EBP_FRAMED
#define RBM_ETW_FRAMED_EBP RBM_NONE
Expand Down