Skip to content

Commit

Permalink
Fix Apple Silicon shuffle thunks (#53250)
Browse files Browse the repository at this point in the history
* Fix Apple Silicon shuffle thunks

Fixes #47294

* Set and use m_hfaFieldSize for stack arguments

m_hfaFieldSize is needed to calculate correct shuffle size.
  • Loading branch information
sdmaclea committed Jun 8, 2021
1 parent 84c11e4 commit d737521
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 54 deletions.
4 changes: 2 additions & 2 deletions src/coreclr/vm/arm64/cgencpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ class StubLinkerCPU : public StubLinker

private:
void EmitLoadStoreRegPairImm(DWORD flags, int regNum1, int regNum2, IntReg Xn, int offset, BOOL isVec);
void EmitLoadStoreRegImm(DWORD flags, int regNum, IntReg Xn, int offset, BOOL isVec);
void EmitLoadStoreRegImm(DWORD flags, int regNum, IntReg Xn, int offset, BOOL isVec, int log2Size = 3);
public:

// BitFlags for EmitLoadStoreReg(Pair)Imm methods
Expand Down Expand Up @@ -484,7 +484,7 @@ class StubLinkerCPU : public StubLinker
void EmitLoadStoreRegPairImm(DWORD flags, IntReg Xt1, IntReg Xt2, IntReg Xn, int offset=0);
void EmitLoadStoreRegPairImm(DWORD flags, VecReg Vt1, VecReg Vt2, IntReg Xn, int offset=0);

void EmitLoadStoreRegImm(DWORD flags, IntReg Xt, IntReg Xn, int offset=0);
void EmitLoadStoreRegImm(DWORD flags, IntReg Xt, IntReg Xn, int offset=0, int log2Size = 3);
void EmitLoadStoreRegImm(DWORD flags, VecReg Vt, IntReg Xn, int offset=0);

void EmitLoadRegReg(IntReg Xt, IntReg Xn, IntReg Xm, DWORD option);
Expand Down
54 changes: 42 additions & 12 deletions src/coreclr/vm/arm64/stubs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ AdjustContextForVirtualStub(
{
pExceptionRecord->ExceptionAddress = (PVOID)callsite;
}

SetIP(pContext, callsite);

return TRUE;
Expand Down Expand Up @@ -1567,29 +1567,32 @@ void StubLinkerCPU::EmitLoadStoreRegPairImm(DWORD flags, int regNum1, int regNum
}


void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, IntReg Xt, IntReg Xn, int offset)
void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, IntReg Xt, IntReg Xn, int offset, int log2Size)
{
EmitLoadStoreRegImm(flags, (int)Xt, Xn, offset, FALSE);
EmitLoadStoreRegImm(flags, (int)Xt, Xn, offset, FALSE, log2Size);
}
// Vector-register overload: forwards to the core encoder with isVec=TRUE.
// Uses the default log2Size of 3 (8-byte element), unchanged by this commit.
void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, VecReg Vt, IntReg Xn, int offset)
{
EmitLoadStoreRegImm(flags, (int)Vt, Xn, offset, TRUE);
}

void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, int regNum, IntReg Xn, int offset, BOOL isVec)
void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, int regNum, IntReg Xn, int offset, BOOL isVec, int log2Size)
{
// Encoding:
// wb=1 : [size(2)=11] | 1 | 1 | 1 | [IsVec(1)] | 0 | [!writeBack(1)] | 0 | [isLoad(1)] | 0 | [imm(7)] | [!postIndex(1)] | [Xn(5)] | [Xt(5)]
// wb=0 : [size(2)=11] | 1 | 1 | 1 | [IsVec(1)] | 0 | [!writeBack(1)] | 0 | [isLoad(1)] | [ imm(12) ] | [Xn(5)] | [Xt(5)]
// where IsVec=0 for IntReg, 1 for VecReg

_ASSERTE((log2Size & ~0x3ULL) == 0);

BOOL isLoad = flags & 1;
BOOL writeBack = flags & 2;
BOOL postIndex = flags & 4;
if (writeBack)
{
_ASSERTE(-256 <= offset && offset <= 255);
Emit32((DWORD) ( (0x1F<<27) |
Emit32((DWORD) ( (log2Size << 30) |
(0x7<<27) |
(!!isVec<<26) |
(!writeBack<<24) |
(!!isLoad<<22) |
Expand All @@ -1602,13 +1605,16 @@ void StubLinkerCPU::EmitLoadStoreRegImm(DWORD flags, int regNum, IntReg Xn, int
}
else
{
_ASSERTE((0 <= offset) && (offset <= 32760));
_ASSERTE((offset & 7) == 0);
Emit32((DWORD) ( (0x1F<<27) |
int scaledOffset = 0xFFF & (offset >> log2Size);

_ASSERTE(offset == (scaledOffset << log2Size));

Emit32((DWORD) ( (log2Size << 30) |
(0x7<<27) |
(!!isVec<<26) |
(!writeBack<<24) |
(!!isLoad<<22) |
((0xFFF & (offset >> 3)) << 10) |
(scaledOffset << 10) |
(Xn<<5) |
(regNum))
);
Expand Down Expand Up @@ -1715,15 +1721,25 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray)
{
// If source is present in register then destination must also be a register
_ASSERTE(pEntry->dstofs & ShuffleEntry::REGMASK);
_ASSERTE(!(pEntry->dstofs & ShuffleEntry::FPREGMASK));
_ASSERTE(!(pEntry->srcofs & ShuffleEntry::FPREGMASK));

EmitMovReg(IntReg(pEntry->dstofs & ShuffleEntry::OFSMASK), IntReg(pEntry->srcofs & ShuffleEntry::OFSMASK));
EmitMovReg(IntReg(pEntry->dstofs & ShuffleEntry::OFSREGMASK), IntReg(pEntry->srcofs & ShuffleEntry::OFSREGMASK));
}
else if (pEntry->dstofs & ShuffleEntry::REGMASK)
{
// source must be on the stack
_ASSERTE(!(pEntry->srcofs & ShuffleEntry::REGMASK));
_ASSERTE(!(pEntry->dstofs & ShuffleEntry::FPREGMASK));


EmitLoadStoreRegImm(eLOAD, IntReg(pEntry->dstofs & ShuffleEntry::OFSMASK), RegSp, pEntry->srcofs * sizeof(void*));
#if !defined(TARGET_OSX)
EmitLoadStoreRegImm(eLOAD, IntReg(pEntry->dstofs & ShuffleEntry::OFSREGMASK), RegSp, pEntry->srcofs * sizeof(void*));
#else
int log2Size = (pEntry->srcofs >> 12);
int srcOffset = int(pEntry->srcofs & 0xfff) << log2Size;
EmitLoadStoreRegImm(eLOAD, IntReg(pEntry->dstofs & ShuffleEntry::OFSREGMASK), RegSp, srcOffset, log2Size);
#endif
}
else
{
Expand All @@ -1733,8 +1749,22 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray)
// dest must be on the stack
_ASSERTE(!(pEntry->dstofs & ShuffleEntry::REGMASK));

#if !defined(TARGET_OSX)
EmitLoadStoreRegImm(eLOAD, IntReg(9), RegSp, pEntry->srcofs * sizeof(void*));
EmitLoadStoreRegImm(eSTORE, IntReg(9), RegSp, pEntry->dstofs * sizeof(void*));
#else
// Decode ShuffleIterator::GetNextOfs() encoded entries
// See comments in that function

// We expect src and dst stack size to be the same.
_ASSERTE((pEntry->srcofs >> 12) == (pEntry->dstofs >> 12));
int log2Size = (pEntry->srcofs >> 12);
int srcOffset = int(pEntry->srcofs & 0xfff) << log2Size;
int dstOffset = int(pEntry->dstofs & 0xfff) << log2Size;

EmitLoadStoreRegImm(eLOAD, IntReg(9), RegSp, srcOffset, log2Size);
EmitLoadStoreRegImm(eSTORE, IntReg(9), RegSp, dstOffset, log2Size);
#endif
}
}

Expand All @@ -1757,7 +1787,7 @@ VOID StubLinkerCPU::EmitComputedInstantiatingMethodStub(MethodDesc* pSharedMD, s
_ASSERTE(pEntry->dstofs != ShuffleEntry::HELPERREG);
_ASSERTE(pEntry->srcofs != ShuffleEntry::HELPERREG);

EmitMovReg(IntReg(pEntry->dstofs & ShuffleEntry::OFSMASK), IntReg(pEntry->srcofs & ShuffleEntry::OFSMASK));
EmitMovReg(IntReg(pEntry->dstofs & ShuffleEntry::OFSREGMASK), IntReg(pEntry->srcofs & ShuffleEntry::OFSREGMASK));
}

MetaSig msig(pSharedMD);
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/vm/callingconvention.h
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,11 @@ class ArgIteratorTemplate : public ARGITERATOR_BASE
pLoc->m_byteStackIndex = TransitionBlock::GetStackArgumentByteIndexFromOffset(argOffset);
const bool isValueType = (m_argType == ELEMENT_TYPE_VALUETYPE);
const bool isFloatHfa = (isValueType && !m_argTypeHandle.IsNull() && m_argTypeHandle.IsHFA());
if (isFloatHfa)
{
CorInfoHFAElemType type = m_argTypeHandle.GetHFAType();
pLoc->setHFAFieldSize(type);
}
pLoc->m_byteStackSize = StackElemSize(byteArgSize, isValueType, isFloatHfa);
}
}
Expand Down
101 changes: 72 additions & 29 deletions src/coreclr/vm/comdelegate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,11 @@ class ShuffleIterator
}

// Get next offset to shuffle. There has to be at least one offset left.
// For register arguments it returns regNum | ShuffleEntry::REGMASK | ShuffleEntry::FPREGMASK.
// For stack arguments it returns stack offset in bytes with negative sign.
// It returns an offset encoded properly for a ShuffleEntry offset.
// - For floating register arguments it returns regNum | ShuffleEntry::REGMASK | ShuffleEntry::FPREGMASK.
// - For register arguments it returns regNum | ShuffleEntry::REGMASK.
// - For stack arguments it returns stack offset index in stack slots for most architectures. For macOS-arm64,
// it returns an encoded stack offset, see below.
int GetNextOfs()
{
int index;
Expand Down Expand Up @@ -189,6 +192,8 @@ class ShuffleIterator
if (m_currentByteStackIndex < m_argLocDesc->m_byteStackSize)
{
const unsigned byteIndex = m_argLocDesc->m_byteStackIndex + m_currentByteStackIndex;

#if !defined(TARGET_OSX) || !defined(TARGET_ARM64)
index = byteIndex / TARGET_POINTER_SIZE;
m_currentByteStackIndex += TARGET_POINTER_SIZE;

Expand All @@ -198,7 +203,68 @@ class ShuffleIterator
COMPlusThrow(kNotSupportedException);
}

return -(int)byteIndex;
// Only Apple Silicon ABI currently supports unaligned stack argument shuffling
_ASSERTE(byteIndex == unsigned(index * TARGET_POINTER_SIZE));
return index;
#else
// The Apple Silicon ABI does not consume an entire stack slot for every argument
// Arguments smaller than TARGET_POINTER_SIZE are always aligned to their argument size,
// but may not begin at the start of a stack slot.
//
// The argument location description has been updated to describe the stack offset and
// size in bytes. We will use it as our source of truth.
//
// The ShuffleEntries will be implemented by the Arm64 StubLinkerCPU::EmitLoadStoreRegImm
// using the 12-bit scaled immediate stack offset. The load/stores can be implemented as 1/2/4/8
// bytes each (natural binary sizes).
//
// Each offset is encoded as a log2 size and a 12-bit unsigned scaled offset.
// We only emit offsets of these natural binary sizes
//
// We choose the offset based on the ABI stack alignment requirements
// - Small integers are shuffled based on their size
// - HFA are shuffled based on their element size
// - Others are shuffled in full 8 byte chunks.
int bytesRemaining = m_argLocDesc->m_byteStackSize - m_currentByteStackIndex;
int log2Size = 3;

// If isHFA, shuffle based on field size
// otherwise shuffle based on stack size
switch(m_argLocDesc->m_hfaFieldSize ? m_argLocDesc->m_hfaFieldSize : m_argLocDesc->m_byteStackSize)
{
case 1:
log2Size = 0;
break;
case 2:
log2Size = 1;
break;
case 4:
log2Size = 2;
break;
case 3: // Unsupported Size
case 5: // Unsupported Size
case 6: // Unsupported Size
case 7: // Unsupported Size
_ASSERTE(false);
break;
default: // Should be a multiple of 8 (TARGET_POINTER_SIZE)
_ASSERTE(bytesRemaining >= TARGET_POINTER_SIZE);
break;
}

m_currentByteStackIndex += (1 << log2Size);

// Delegates cannot handle overly large argument stacks due to shuffle entry encoding limitations.
// Arm64 current implementation only supports 12 bit unsigned scaled offset
if ((byteIndex >> log2Size) > 0xfff)
{
COMPlusThrow(kNotSupportedException);
}

_ASSERTE((byteIndex & ((1 << log2Size) - 1)) == 0);

return (byteIndex >> log2Size) | (log2Size << 12);
#endif
}

// There are no more offsets to get, the caller should not have called us
Expand Down Expand Up @@ -274,31 +340,8 @@ BOOL AddNextShuffleEntryToArray(ArgLocDesc sArgSrc, ArgLocDesc sArgDst, SArray<S
// different).
if (srcOffset != dstOffset)
{
if (srcOffset <= 0)
{
// It was a stack byte offset.
const unsigned srcStackByteOffset = -srcOffset;
_ASSERT(((srcStackByteOffset % TARGET_POINTER_SIZE) == 0) && "NYI: does not support shuffling of such args");
entry.srcofs = (UINT16)(srcStackByteOffset / TARGET_POINTER_SIZE);
}
else
{
_ASSERT((srcOffset & ShuffleEntry::REGMASK) != 0);
entry.srcofs = (UINT16)srcOffset;
}

if (dstOffset <= 0)
{
// It was a stack byte offset.
const unsigned dstStackByteOffset = -dstOffset;
_ASSERT((dstStackByteOffset % TARGET_POINTER_SIZE) == 0 && "NYI: does not support shuffling of such args");
entry.dstofs = (UINT16)(dstStackByteOffset / TARGET_POINTER_SIZE);
}
else
{
_ASSERT((dstOffset & ShuffleEntry::REGMASK) != 0);
entry.dstofs = (UINT16)dstOffset;
}
entry.srcofs = (UINT16)srcOffset;
entry.dstofs = (UINT16)dstOffset;

if (shuffleType == ShuffleComputationType::InstantiatingStub)
{
Expand Down Expand Up @@ -657,7 +700,7 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
}

entry.srcofs = ShuffleEntry::SENTINEL;
entry.dstofs = static_cast<UINT16>(stackSizeDelta);
entry.stacksizedelta = static_cast<UINT16>(stackSizeDelta);
pShuffleEntryArray->Append(entry);

#else
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/comdelegate.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ struct ShuffleEntry

union {
UINT16 dstofs; //if srcofs != SENTINEL
UINT16 stacksizedelta; //if dstofs == SENTINEL, difference in stack size between virtual and static sigs
UINT16 stacksizedelta; //if srcofs == SENTINEL, difference in stack size between virtual and static sigs
};
};

Expand Down
10 changes: 0 additions & 10 deletions src/tests/issues.targets
Original file line number Diff line number Diff line change
Expand Up @@ -734,16 +734,6 @@

<!-- MacOS arm64 specific -->
<ItemGroup Condition="'$(XunitTestBinBase)' != '' and '$(TargetOS)' == 'OSX' and '$(TargetArchitecture)' == 'arm64' and '$(RuntimeFlavor)' == 'coreclr'">
<!-- Hard failures -->
<ExcludeList Include="$(XunitTestBinBase)/Regressions/coreclr/GitHub_35000/test35000/*">
<Issue>https://github.com/dotnet/runtime/issues/47294</Issue>
</ExcludeList>
<ExcludeList Include="$(XunitTestBinBase)/JIT/Stress/ABI/pinvokes_d/*">
<Issue>https://github.com/dotnet/runtime/issues/47294</Issue>
</ExcludeList>
<ExcludeList Include="$(XunitTestBinBase)/JIT/Stress/ABI/pinvokes_do/*">
<Issue>https://github.com/dotnet/runtime/issues/47294</Issue>
</ExcludeList>
<!-- Tracing failures -->
<ExcludeList Include="$(XunitTestBinBase)/tracing/eventpipe/eventsourceerror/eventsourceerror/*">
<Issue>https://github.com/dotnet/runtime/issues/48786</Issue>
Expand Down

0 comments on commit d737521

Please sign in to comment.