Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,10 @@ class CodeGen final : public CodeGenInterface

void genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed);

#if defined(TARGET_ARM64)
// Emits the prolog code that records the current SP in REG_UNKBASE and then
// lowers SP to reserve the UnknownSizeFrame. Invoked at the end of
// genPushCalleeSavedRegisters when compUsesUnknownSizeFrame is set.
void genUnknownSizeFrame();
#endif

#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
bool genInstrWithConstant(instruction ins,
emitAttr attr,
Expand Down
38 changes: 38 additions & 0 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4876,9 +4876,47 @@ void CodeGen::genPushCalleeSavedRegisters()
m_compiler->compFrameInfo.calleeSaveSpOffset = calleeSaveSpOffset;
m_compiler->compFrameInfo.calleeSaveSpDelta = calleeSaveSpDelta;
m_compiler->compFrameInfo.offsetSpToSavedFp = offsetSpToSavedFp;

if (m_compiler->compUsesUnknownSizeFrame)
{
genUnknownSizeFrame();
}
#endif // TARGET_ARM64
}

#if defined(TARGET_ARM64)
// See Compiler::UnknownSizeFrame for implementation details.
void CodeGen::genUnknownSizeFrame()
{
assert(m_compiler->compLocallocUsed && m_compiler->compUsesUnknownSizeFrame);
assert(m_compiler->unkSizeFrame.isFinalized);
unsigned totalVectorCount = m_compiler->unkSizeFrame.FrameSizeInVectors();

// We reserve REG_UNKBASE for addressing SVE locals. This will always point at the top of
// of the UnknownSizeFrame and we index into it.
// TODO-SVE: We may want this to point into the middle of the frame to reduce address
// computations (we have a signed 9-bit indexing immediate).
inst_Mov(TYP_I_IMPL, REG_UNKBASE, REG_SP, false);

if (0 < totalVectorCount && totalVectorCount <= 32)
{
GetEmitter()->emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, REG_SP, REG_SP, -(ssize_t)totalVectorCount);
}
else
{
// Generate `sp = sp - totalVectorCount * VL`
assert(totalVectorCount != 0);
regNumber rsvd = rsGetRsvdReg();
// mov rsvd, #totalVectorCount
// rdvl scratch, #1
// msub sp, rsvd, scratch, sp
instGen_Set_Reg_To_Imm(EA_8BYTE, rsvd, totalVectorCount);
GetEmitter()->emitIns_R_I(INS_sve_rdvl, EA_8BYTE, REG_SCRATCH, 1);
GetEmitter()->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_SP, rsvd, REG_SCRATCH, REG_SP);
}
}
#endif

/*****************************************************************************
*
* Generates code for a function epilog.
Expand Down
16 changes: 16 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3644,6 +3644,11 @@ void CodeGen::genCheckUseBlockInit()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

if (m_compiler->fgVarIsNeverZeroInitializedInProlog(varNum))
{
varDsc->lvMustInit = 0;
Expand Down Expand Up @@ -4001,6 +4006,12 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,

noway_assert(varDsc->lvOnFrame);

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
// This local will belong on the UnknownSizeFrame, which will handle zeroing instead.
continue;
}

// lvMustInit can only be set for GC types or TYP_STRUCT types
// or when compInitMem is true
// or when in debug code
Expand Down Expand Up @@ -5067,6 +5078,11 @@ void CodeGen::genFnProlog()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

signed int loOffs = varDsc->GetStackOffset();
signed int hiOffs = varDsc->GetStackOffset() + m_compiler->lvaLclStackHomeSize(varNum);

Expand Down
170 changes: 170 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -4124,6 +4124,176 @@ class Compiler

int lvaOSRLocalTier0FrameOffset(unsigned varNum);

//------------------------- UnknownSizeFrame ---------------------------------

// NOTE(review): defined outside this view; presumably prepares the
// UnknownSizeFrame bookkeeping for the method being compiled - confirm.
void lvaInitUnknownSizeFrame();
// NOTE(review): defined outside this view; presumably assigns local `varNum`
// a slot on the UnknownSizeFrame - confirm.
void lvaAllocUnknownSizeLocal(unsigned varNum);

// When set, the ARM64 prolog calls CodeGen::genUnknownSizeFrame() at the end
// of CodeGen::genPushCalleeSavedRegisters to reserve the UnknownSizeFrame.
bool compUsesUnknownSizeFrame;

#if defined(FEATURE_SIMD) && defined(TARGET_ARM64)
// For ARM64, the UnknownSizeFrame lives at the end of the statically
// allocated stack space. This means it belongs to the 'alloca' space on the
// frame, and it is essentially the first dynamically allocated stack
// variable.
//
// Currently, the only locals with unknown size are SIMD types supporting
// Vector<T>, TYP_SIMD and TYP_MASK. We do not know the size of these types
// at compile time, so we need to execute the rdvl/addvl instruction to
// learn this size and allocate the UnknownSizeFrame.
//
// We reserve the x19 register to point to the top of the UnknownSizeFrame
// and use this as the base address for local variables with unknown size.
// Reserving a register is simpler than using fp/sp, as fp may point
// to different locations depending on various properties of the frame, and
// the value of sp may change at runtime.
//
// Typically, a vector is loaded using a base address and some index which
// the instruction will scale by VL, for example: `ldr z0, [x19, #3 MUL VL]`.
// A mask is loaded with `ldr p0, [x19, #3 MUL VL]`, but in this case the
// `MUL VL` indicates we are scaling with the length of the predicate
// register rather than the vector. A predicate register is defined to have
// 1/8th the length of a vector register.
//
// We know that sizeof(TYP_SIMD) and sizeof(TYP_MASK) are invariant despite
// being unknown at compile time, so we allocate them in single homogeneous
// blocks per type. An individual local can be referenced from the start of
// its block by an index into the block.
//
// The difference in addressing-mode index scaling means we have to be
// careful where we place the mask locals block with respect to the vector
// locals block. If we place the mask locals after the vector locals, we'll
// need to offset the load index by (8 * nVector) to account for the vector
// locals.
//
// Instead, we choose to pad the mask locals block to VL and place it at the
// beginning of the frame (closest to fp). This way we'll need to offset
// vector load indices by `roundUp(nMask, 8) / 8`. This is less likely to
// put pressure on the immediate encoding range and result in requiring an
// address computation.
//
// The maximum wasted space from the padding is 7/8ths VL (224 bytes with
// the architectural maximum 256 byte vectors), which occurs when 1 mask
// local is spilled to the frame. Alternatively this is 28 bytes for 32 byte
// vectors, for an example closer to today's implementations.
//
// The padding also makes it simple to allocate the UnknownSizeFrame since
// the UnknownSizeFrame will be aligned to VL. The total number of vectors
// to allocate is `(roundUp(nMask, 8) / 8) + nVector`. Because the stack
// grows downwards, the stack pointer is lowered by that many vectors; when
// the count fits the addvl immediate this is the single instruction
// `addvl sp, sp, #-totalVectors`.
//
// See the diagram below for a visual representation of this scheme.
//
// ...
// | static space |
// | (totalFrameSize) |
// +----------------------------------+ x19, begin UnknownSizeFrame
// | mask locals block | ^
// | (nMask * VL/8) | |
// +----------------------------------+ |
// | padding to VL alignment | |
// +----------------------------------+ (roundUp(nMask, 8)/8 + nVector)*VL
// | | |
// | vector locals block | |
// | (nVector * VL) | |
// | | v
// +----------------------------------+ end UnknownSizeFrame
// | |
// | rest of alloca space |
// ... sp
struct UnknownSizeFrame
{
    // Bump-allocator cursors for the two homogeneous blocks. Each counter is
    // both the number of slots handed out so far and the next free index in
    // its block.
    unsigned nVector = 0;
    unsigned nMask   = 0;

#ifdef DEBUG
    // Set by Finalize(); guards against allocating after, or computing
    // offsets before, the layout is complete.
    bool isFinalized = false;
#endif

    // Size of the mask block, expressed in whole vector lengths. Eight masks
    // occupy one vector length, so the mask count is padded up to a multiple
    // of eight.
    unsigned MaskBlockSizeInVectors()
    {
        // An empty mask block must not contribute any padding.
        assert(roundUp(0U, 8U) == 0);
        return roundUp(nMask, 8) / 8;
    }

    // Size of the vector block, expressed in vector lengths.
    unsigned VectorBlockSize()
    {
        return nVector;
    }

    // Size of the whole UnknownSizeFrame (padded mask block followed by the
    // vector block), expressed in vector lengths.
    unsigned FrameSizeInVectors()
    {
        const unsigned maskVectors = MaskBlockSizeInVectors();
        return maskVectors + VectorBlockSize();
    }

    // Reserve the next mask slot and return its index within the mask block.
    unsigned AllocMask()
    {
        assert(!isFinalized);
        return nMask++;
    }

    // Reserve the next vector slot and return its index within the vector
    // block.
    unsigned AllocVector()
    {
        assert(!isFinalized);
        return nVector++;
    }

    // Translate a slot index into a negative offset from the base of the
    // UnknownSizeFrame.
    //
    //   index  - a value previously returned by AllocMask() (when `isMask` is
    //            true) or by AllocVector() (otherwise).
    //   isMask - selects the block the index belongs to.
    //
    // For masks the result counts units of VL/8; for vectors it counts units
    // of VL and skips over the padded mask block.
    int GetOffset(unsigned index, bool isMask = false)
    {
        // Offsets are only meaningful once every allocation has been made.
        assert(isFinalized);
        assert(index < (isMask ? nMask : nVector));

        const unsigned slot = isMask ? index : (MaskBlockSizeInVectors() + index);
        assert(slot != UINT32_MAX);

        // Slots are laid out downwards from the frame base, so slot 0 lives
        // one unit below it; hence the extra 1 before negating.
        return -(int)(slot + 1);
    }

    // Mark the layout as complete. After this point no further slots may be
    // allocated and offsets may be queried.
    void Finalize()
    {
#ifdef DEBUG
        isFinalized = true;
#endif
    }

} unkSizeFrame;
#endif

//------------------------ For splitting types ----------------------------

void lvaInitTypeRef();
Expand Down
14 changes: 12 additions & 2 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2744,7 +2744,7 @@ inline
#endif // !TARGET_AMD64
}

FPbased = varDsc->lvFramePointerBased;
FPbased = varDsc->lvFramePointerBased && !lvaIsUnknownSizeLocal(varNum);

#ifdef DEBUG
#if FEATURE_FIXED_OUT_ARGS
Expand All @@ -2765,7 +2765,17 @@ inline
}
#endif // DEBUG

varOffset = varDsc->GetStackOffset();
#ifdef TARGET_ARM64
if (lvaIsUnknownSizeLocal(varNum) && !varDsc->lvIsStructField)
{
assert(!FPbased);
varOffset = unkSizeFrame.GetOffset(varDsc->GetStackOffset(), varDsc->TypeIs(TYP_MASK));
}
else
#endif
{
varOffset = varDsc->GetStackOffset();
}
}
else // Its a spill-temp
{
Expand Down
Loading
Loading