Skip to content

Commit

Permalink
x86-64: Implement an enhancement for char array System.arraycopy
Browse files Browse the repository at this point in the history
The setup cost of `rep movsd` makes it inefficient for copying small sizes.
This enhancement inlines copies of 64 bytes or fewer without using
`rep movsd`.

Signed-off-by: Annabelle Huo <Annabelle.Huo@ibm.com>
  • Loading branch information
a7ehuo committed May 23, 2024
1 parent 49ff22b commit 5eb9e92
Showing 1 changed file with 243 additions and 1 deletion.
244 changes: 243 additions & 1 deletion compiler/x/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,239 @@ static void arrayCopy16BitPrimitive(TR::Node* node, TR::Register* dstReg, TR::Re
generateLabelInstruction(TR::InstOpCode::label, node, mainEndLabel, dependencies, cg);
}

static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node* node,
TR::Register* dstReg,
TR::Register* srcReg,
TR::Register* sizeReg,
TR::Register* tmpReg1,
TR::Register* tmpReg2,
TR::Register* tmpXmmYmmReg1,
TR::Register* tmpXmmYmmReg2,
TR::CodeGenerator* cg,
TR::LabelSymbol* mainEndLabel)
{
int32_t REP_MOVS_THRESHOLD_BYTES = 64;

if (cg->comp()->getOption(TR_TraceCG))
{
traceMsg(cg->comp(), "%s: node n%dn srcReg %s dstReg %s sizeReg %s REP_MOVS_THRESHOLD_BYTES %d\n", __FUNCTION__,
node->getGlobalIndex(), cg->comp()->getDebug()->getName(srcReg), cg->comp()->getDebug()->getName(dstReg),
cg->comp()->getDebug()->getName(sizeReg), REP_MOVS_THRESHOLD_BYTES);
}

/*
* The setup to run `rep movsd` is not efficient on copying smaller sizes.
* This method inlines copy size <= REP_MOVS_THRESHOLD_BYTES without using `rep movsd`.
*
* if copySize > 16
* jmp copy18ORMoreBytesLabel ------+
* if copySize > 2 |
* jmp copy4ORMoreBytesLabel ---+ |
* if copySize == 0 | |
* jmp mainEndLabel | |
* | |
* copy 2 bytes | |
* jmp mainEndLabel | |
* | |
* copy4ORMoreBytesLabel: <--------+ |
* if copySize > 8 |
* jmp copy10ORMoreBytesLabel --+ |
* | |
* copy 4-8 bytes | |
* jmp mainEndLabel | |
* | |
* copy10ORMoreBytesLabel: <-------+ |
* copy 10-16 bytes |
* jmp mainEndLabel |
* |
* copy18ORMoreBytesLabel: <-----------+
* if copySize > 32
* jmp copy34ORMoreBytesLabel ------+
* |
* copy 18-32 bytes |
* jmp mainEndLabel |
* |
* copy34ORMoreBytesLabel: <-----------+
* if copySize > 64 // REP_MOVS_THRESHOLD_BYTES
* jmp repMovsLabel ----------------+
* |
* copy 34-64 bytes |
* jmp mainEndLabel |
* |
* repMovsLabel: <---------------------+
* copy 66 or more bytes
*
* --------------------------------------------------
*
* Here is an example if we need to copy 48 bytes:
* - Load 32 bytes from [src + size - 32] = [src + 48 - 32] = [src + 16] into temp1
* - Load 32 bytes from [src] into temp2
* - Store temp1 into [dst + size - 32]
* - Store temp2 into [dst]
*
* 0 1 2 ......15 16............ 31 32 33.......47
* |----------- temp1 ------------|
* |----------- temp2 ------------|
*/

TR::LabelSymbol* repMovsLabel = generateLabelSymbol(cg);

TR::LabelSymbol* copy4ORMoreBytesLabel = generateLabelSymbol(cg);
TR::LabelSymbol* copy10ORMoreBytesLabel = generateLabelSymbol(cg);
TR::LabelSymbol* copy18ORMoreBytesLabel = generateLabelSymbol(cg);
TR::LabelSymbol* copy34ORMoreBytesLabel = generateLabelSymbol(cg);

/* ---------------------------------
* size <= REP_MOVS_THRESHOLD_BYTES
*/
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 16, cg);
generateLabelInstruction(TR::InstOpCode::JA4, node, copy18ORMoreBytesLabel, cg);

generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 2, cg);
generateLabelInstruction(TR::InstOpCode::JA4, node, copy4ORMoreBytesLabel, cg);

generateRegRegInstruction(TR::InstOpCode::TESTRegReg(), node, sizeReg, sizeReg, cg);
generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg);

// 2 Bytes
generateRegMemInstruction(TR::InstOpCode::L2RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::S2MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg1, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);

// ---------------------------------
generateLabelInstruction(TR::InstOpCode::label, node, copy4ORMoreBytesLabel, cg);
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 8, cg);
generateLabelInstruction(TR::InstOpCode::JA4, node, copy10ORMoreBytesLabel, cg);

// 4, 6, 8 Bytes
generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -4, cg), cg);
generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -4, cg), tmpReg1, cg);
generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);

// ---------------------------------
generateLabelInstruction(TR::InstOpCode::label, node, copy10ORMoreBytesLabel, cg);

// 10-16 Bytes
generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg);
generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg);
generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);

// ---------------------------------
generateLabelInstruction(TR::InstOpCode::label, node, copy18ORMoreBytesLabel, cg);
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 32, cg);
generateLabelInstruction(TR::InstOpCode::JA4, node, copy34ORMoreBytesLabel, cg);

// 18-32 Bytes
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg);
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg);
generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);

// ---------------------------------
generateLabelInstruction(TR::InstOpCode::label, node, copy34ORMoreBytesLabel, cg);
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg);
generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg);

// 34-64 Bytes
generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg);
generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg);
generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);

/* ---------------------------------
* size > REP_MOVS_THRESHOLD_BYTES
* rep movsd
* if remainder is 0 -> mainEndLabel
* process remainder
*
* mainEndLabel:
*/
generateLabelInstruction(TR::InstOpCode::label, node, repMovsLabel, cg);
if (node->isForwardArrayCopy())
{
generateRegImmInstruction(TR::InstOpCode::SHRRegImm1(), node, sizeReg, 2, cg);
generateInstruction(TR::InstOpCode::REPMOVSD, node, cg);
generateLabelInstruction(TR::InstOpCode::JAE1, node, mainEndLabel, cg);
generateRegMemInstruction(TR::InstOpCode::L2RegMem, node, sizeReg, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::S2MemReg, node, generateX86MemoryReference(dstReg, 0, cg), sizeReg, cg);
}
else // decide direction during runtime
{
TR::LabelSymbol* backwardLabel = generateLabelSymbol(cg);

generateRegRegInstruction(TR::InstOpCode::SUBRegReg(), node, dstReg, srcReg, cg); // dst = dst - src
generateRegRegInstruction(TR::InstOpCode::CMPRegReg(), node, dstReg, sizeReg, cg); // cmp dst, size
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, srcReg, 0, cg), cg); // dst = dst + src
generateLabelInstruction(TR::InstOpCode::JB4, node, backwardLabel, cg); // jb, skip backward copy setup

generateRegImmInstruction(TR::InstOpCode::SHRRegImm1(), node, sizeReg, 2, cg);
generateInstruction(TR::InstOpCode::REPMOVSD, node, cg);
generateLabelInstruction(TR::InstOpCode::JAE1, node, mainEndLabel, cg);
generateRegMemInstruction(TR::InstOpCode::L2RegMem, node, sizeReg, generateX86MemoryReference(srcReg, 0, cg), cg);
generateMemRegInstruction(TR::InstOpCode::S2MemReg, node, generateX86MemoryReference(dstReg, 0, cg), sizeReg, cg);

{
TR_OutlinedInstructionsGenerator og(backwardLabel, node, cg);
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, srcReg, generateX86MemoryReference(srcReg, sizeReg, 0, -2, cg), cg);
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, sizeReg, 0, -2, cg), cg);
generateInstruction(TR::InstOpCode::STD, node, cg);
generateRepMovsInstruction(TR::InstOpCode::REPMOVSW, node, sizeReg, NULL, cg);
generateInstruction(TR::InstOpCode::CLD, node, cg);
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg);
og.endOutlinedInstructionSequence();
}
}
}

/**
 * \brief
 *    Wraps arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16 with the scratch
 *    registers, register dependencies and internal-control-flow labels it needs.
 *
 * \details
 *    Two GPR and two vector scratch registers are allocated for the duration of the sequence.
 *    The source, destination and size registers are tied to esi, edi and ecx respectively;
 *    the scratch registers are left unconstrained (NoReg). The dependencies are attached to
 *    the closing label of the internal control flow region.
 *
 * \param node     The arraycopy node
 * \param dstReg   Register holding the destination address
 * \param srcReg   Register holding the source address
 * \param sizeReg  Register holding the copy size in bytes
 * \param cg       The code generator
 */
static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg)
{
    enum { NUM_SCRATCH_REGS = 4 };

    // [0], [1]: general purpose scratch; [2], [3]: vector scratch
    TR::Register* scratchRegs[NUM_SCRATCH_REGS];
    scratchRegs[0] = cg->allocateRegister(TR_GPR);
    scratchRegs[1] = cg->allocateRegister(TR_GPR);
    scratchRegs[2] = cg->allocateRegister(TR_VRF);
    scratchRegs[3] = cg->allocateRegister(TR_VRF);

    // 3 fixed registers + 4 scratch registers, as both pre- and post-conditions
    const uint8_t numConditions = 7;
    TR::RegisterDependencyConditions* deps = generateRegisterDependencyConditions(numConditions, numConditions, cg);

    deps->addPreCondition(srcReg, TR::RealRegister::esi, cg);
    deps->addPreCondition(dstReg, TR::RealRegister::edi, cg);
    deps->addPreCondition(sizeReg, TR::RealRegister::ecx, cg);
    for (int32_t i = 0; i < NUM_SCRATCH_REGS; ++i)
    {
        deps->addPreCondition(scratchRegs[i], TR::RealRegister::NoReg, cg);
    }

    deps->addPostCondition(srcReg, TR::RealRegister::esi, cg);
    deps->addPostCondition(dstReg, TR::RealRegister::edi, cg);
    deps->addPostCondition(sizeReg, TR::RealRegister::ecx, cg);
    for (int32_t i = 0; i < NUM_SCRATCH_REGS; ++i)
    {
        deps->addPostCondition(scratchRegs[i], TR::RealRegister::NoReg, cg);
    }

    // The emitted sequence branches internally, so bracket it with start/end labels.
    TR::LabelSymbol* regionStart = generateLabelSymbol(cg);
    regionStart->setStartInternalControlFlow();

    TR::LabelSymbol* regionEnd = generateLabelSymbol(cg);
    regionEnd->setEndInternalControlFlow();

    generateLabelInstruction(TR::InstOpCode::label, node, regionStart, cg);

    arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg,
                                                                   scratchRegs[0], scratchRegs[1],
                                                                   scratchRegs[2], scratchRegs[3],
                                                                   cg, regionEnd);

    generateLabelInstruction(TR::InstOpCode::label, node, regionEnd, deps, cg);

    for (int32_t i = 0; i < NUM_SCRATCH_REGS; ++i)
    {
        cg->stopUsingRegister(scratchRegs[i]);
    }
}

/** \brief
* Generate instructions to do array copy.
*
Expand Down Expand Up @@ -2250,7 +2483,16 @@ TR::Register *OMR::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Co
}
else if (elementSize == 2 && !useREPMOVSW)
{
arrayCopy16BitPrimitive(node, dstReg, srcReg, sizeReg, cg);
static bool disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL;

bool enable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = (!disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS &&
cg->comp()->target().cpu.supportsAVX() &&
cg->comp()->target().is64Bit()) ? true : false;

if (enable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS)
arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVS(node, dstReg, srcReg, sizeReg, cg);
else
arrayCopy16BitPrimitive(node, dstReg, srcReg, sizeReg, cg);
}
else
{
Expand Down

0 comments on commit 5eb9e92

Please sign in to comment.