From 2a2e13046b1a7c52179fd5f83380e633ccbec11f Mon Sep 17 00:00:00 2001 From: Annabelle Huo Date: Tue, 4 Jun 2024 11:51:54 -0400 Subject: [PATCH] x86-64: Inline small array copy size for int and long arrays (1) Inline small-size array copies without using `rep movs` for int and long arrays - If AVX-512 is supported and the copy size is <= 128 bytes, the inlined copy sequence is used. - If AVX is supported but AVX-512 is not, the threshold for the inlined sequence is 64 bytes instead. - The enhancement for 32-bit and 64-bit arrays will be reused in the downstream OpenJ9 project for reference arrays. (2) Refactor the char array enhancement implementation to support both `rep movsw` and `rep movsd` if the copy size is greater than the supported enhancement threshold. (3) Add the following options to disable the enhancement - `disableArrayCopyIntArrayInlineSmallSizeWithoutREPMOVS`: disable the enhancement for int arrays - `disableArrayCopyLongArrayInlineSmallSizeWithoutREPMOVS`: disable the enhancement for long arrays - `disableArrayCopyReferenceArrayInlineSmallSizeWithoutREPMOVS`: disable the enhancement for reference arrays, which will be implemented in the downstream OpenJ9 project. 
Signed-off-by: Annabelle Huo --- compiler/control/OMROptions.cpp | 3 + compiler/control/OMROptions.hpp | 6 +- compiler/x/codegen/OMRTreeEvaluator.cpp | 540 ++++++++++++++++++++---- compiler/x/codegen/OMRTreeEvaluator.hpp | 25 ++ 4 files changed, 493 insertions(+), 81 deletions(-) diff --git a/compiler/control/OMROptions.cpp b/compiler/control/OMROptions.cpp index c85c9e5727b..e237e0ff8c4 100644 --- a/compiler/control/OMROptions.cpp +++ b/compiler/control/OMROptions.cpp @@ -253,7 +253,10 @@ TR::OptionTable OMR::Options::_jitOptions[] = { {"disableAOTWarmRunThroughputImprovement", "O\tdisable change iprofiler entry choosing heuristic to improve aot warm run throughput", SET_OPTION_BIT(TR_DisableAOTWarmRunThroughputImprovement), "F"}, {"disableArrayCopyByteArrayInlineSmallSizeWithoutREPMOVS", "O\tdisable array copy optimizations enhancement for 8 bit primitive array", SET_OPTION_BIT(TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS), "F"}, {"disableArrayCopyCharArrayInlineSmallSizeWithoutREPMOVS", "O\tdisable array copy optimizations enhancement for 16 bit primitive array", SET_OPTION_BIT(TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS), "F"}, + {"disableArrayCopyIntArrayInlineSmallSizeWithoutREPMOVS", "O\tdisable array copy optimizations enhancement for 32 bit primitive array", SET_OPTION_BIT(TR_Disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS), "F"}, + {"disableArrayCopyLongArrayInlineSmallSizeWithoutREPMOVS", "O\tdisable array copy optimizations enhancement for 64 bit primitive array", SET_OPTION_BIT(TR_Disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS), "F"}, {"disableArrayCopyOpts", "O\tdisable array copy optimizations", SET_OPTION_BIT(TR_DisableArrayCopyOpts), "F"}, + {"disableArrayCopyReferenceArrayInlineSmallSizeWithoutREPMOVS", "O\tdisable array copy optimizations enhancement for reference array", SET_OPTION_BIT(TR_DisableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS), "F"}, {"disableArraySetOpts", "O\tdisable 
array set optimizations", SET_OPTION_BIT(TR_DisableArraySetOpts), "F"}, {"disableArraySetStoreElimination", "O\tdisable arrayset store elimination", SET_OPTION_BIT(TR_DisableArraysetStoreElimination), "F"}, {"disableArrayStoreCheckOpts", "O\tdisable array store check optimizations",SET_OPTION_BIT(TR_DisableArrayStoreCheckOpts), "F"}, diff --git a/compiler/control/OMROptions.hpp b/compiler/control/OMROptions.hpp index 629bdcf23de..9b82fa71462 100644 --- a/compiler/control/OMROptions.hpp +++ b/compiler/control/OMROptions.hpp @@ -344,20 +344,20 @@ enum TR_CompilationOptions TR_DisablePeephole = 0x00200000 + 8, TR_NoOptServer = 0x00400000 + 8, TR_DisableDLTrecompilationPrevention = 0x00800000 + 8, - // Available = 0x01000000 + 8, + TR_Disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = 0x01000000 + 8, TR_DisablePeekAOTResolutions = 0x02000000 + 8, TR_UseFlattenedArrayElementRuntimeHelpers = 0x04000000 + 8, TR_UseFlattenedFieldRuntimeHelpers = 0x08000000 + 8, TR_DisableLiveRangeSplitter = 0x10000000 + 8, TR_DisableHalfSlotSpills = 0x20000000 + 8, TR_DisableMHInlineWithoutPeeking = 0x40000000 + 8, - // Available = 0x80000000 + 8, + TR_Disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = 0x80000000 + 8, // Option word 9 // TR_SplitWarmAndColdBlocks = 0x00000020 + 9, - // Available = 0x00000040 + 9, + TR_DisableReferenceArrayCopyInlineSmallSizeWithoutREPMOVS = 0x00000040 + 9, TR_DisableTLHPrefetch = 0x00000080 + 9, TR_DisableJProfilerThread = 0x00000100 + 9, TR_DisableIProfilerThread = 0x00000200 + 9, diff --git a/compiler/x/codegen/OMRTreeEvaluator.cpp b/compiler/x/codegen/OMRTreeEvaluator.cpp index 62dffb4a17f..d87753047b0 100644 --- a/compiler/x/codegen/OMRTreeEvaluator.cpp +++ b/compiler/x/codegen/OMRTreeEvaluator.cpp @@ -1696,6 +1696,294 @@ static void arrayCopy64BitPrimitiveOnIA32(TR::Node* node, TR::Register* dstReg, cg->stopUsingRegister(scratch); } +void 
OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, + TR::Register *dstReg, + TR::Register *srcReg, + TR::Register *sizeReg, + TR::Register *tmpReg1, + TR::Register *tmpReg2, + TR::Register *tmpXmmYmmReg1, + TR::Register *tmpXmmYmmReg2, + TR::CodeGenerator *cg, + int repMovsThresholdBytes, + TR::LabelSymbol *repMovsLabel, + TR::LabelSymbol *mainEndLabel) + { + if (cg->comp()->getOption(TR_TraceCG)) + { + traceMsg(cg->comp(), "%s: node n%dn srcReg %s dstReg %s sizeReg %s repMovsThresholdBytes %d\n", __FUNCTION__, + node->getGlobalIndex(), cg->comp()->getDebug()->getName(srcReg), cg->comp()->getDebug()->getName(dstReg), + cg->comp()->getDebug()->getName(sizeReg), repMovsThresholdBytes); + } + + TR_ASSERT_FATAL((repMovsThresholdBytes == 64) || (repMovsThresholdBytes == 128), "repMovsThresholdBytes %d is not supported\n", repMovsThresholdBytes); + + /* + * This method is adapted from `arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16`. + * + * The setup to run `rep movsq` is not efficient on copying smaller sizes. + * This method inlines copy size <= repMovsThresholdBytes without using `rep movsq`. 
+ * + * Below is an example of repMovsThresholdBytes as 64 bytes + * + * if copySize > 16 + * jmp copy24ORMoreBytesLabel ------+ + * | + * if copySize == 0 | + * jmp mainEndLabel | + * | + * copy 8 or 16 bytes | + * jmp mainEndLabel | + * | + * copy24ORMoreBytesLabel: <--------+ + * if copySize > 32 + * jmp copy40ORMoreBytesLabel ---+ + * | + * copy 24 or 32 bytes | + * jmp mainEndLabel | + * | + * copy40ORMoreBytesLabel: <--------+ + * if copySize > 64 // repMovsThresholdBytes + * jmp repMovsLabel -------------+ + * | + * copy 40-64 bytes | + * jmp mainEndLabel | + * | + * repMovsLabel: <------------------+ + * copy 72 or more bytes + */ + + TR::LabelSymbol* copy16BytesLabel = generateLabelSymbol(cg); + TR::LabelSymbol* copy24ORMoreBytesLabel = generateLabelSymbol(cg); + TR::LabelSymbol* copy40ORMoreBytesLabel = generateLabelSymbol(cg); + + /* --------------------------------- + * size <= repMovsThresholdBytes + */ + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 16, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy24ORMoreBytesLabel, cg); + + generateRegRegInstruction(TR::InstOpCode::TESTRegReg(), node, sizeReg, sizeReg, cg); + generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg); + + // 8 or 16 Bytes + generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); + generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); + generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy24ORMoreBytesLabel, cg); + 
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 32, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy40ORMoreBytesLabel, cg); + + // 24 or 32 Bytes + generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); + generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + if (repMovsThresholdBytes == 64) + { + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy40ORMoreBytesLabel, cg); + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); + + // 40-64 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + } + else // repMovsThresholdBytes = 128 + { + TR::LabelSymbol* copy64ORMoreBytesLabel = generateLabelSymbol(cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy40ORMoreBytesLabel, cg); + 
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy64ORMoreBytesLabel, cg); + + // 40-64 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy64ORMoreBytesLabel, cg); + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 128, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); + + // 72-128 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -64, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -64, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + } + } + +void OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, + TR::Register *dstReg, + TR::Register *srcReg, + TR::Register *sizeReg, + TR::Register *tmpReg1, + TR::Register *tmpReg2, + TR::Register *tmpXmmYmmReg1, + 
TR::Register *tmpXmmYmmReg2, + TR::CodeGenerator *cg, + int repMovsThresholdBytes, + TR::LabelSymbol *repMovsLabel, + TR::LabelSymbol *mainEndLabel) + { + if (cg->comp()->getOption(TR_TraceCG)) + { + traceMsg(cg->comp(), "%s: node n%dn srcReg %s dstReg %s sizeReg %s repMovsThresholdBytes %d\n", __FUNCTION__, + node->getGlobalIndex(), cg->comp()->getDebug()->getName(srcReg), cg->comp()->getDebug()->getName(dstReg), + cg->comp()->getDebug()->getName(sizeReg), repMovsThresholdBytes); + } + + TR_ASSERT_FATAL((repMovsThresholdBytes == 64) || (repMovsThresholdBytes == 128), "repMovsThresholdBytes %d is not supported\n", repMovsThresholdBytes); + + /* + * This method is adapted from `arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16`. + * + * The setup to run `rep movsd` is not efficient on copying smaller sizes. + * This method inlines copy size <= repMovsThresholdBytes without using `rep movsd`. + * + * Below is an example of repMovsThresholdBytes as 64 bytes + * + * if copySize > 16 + * jmp copy20ORMoreBytesLabel --------+ + * | + * if copySize > 8 | + * jmp copy12RMoreBytesLabel ------+ | + * | | + * if copySize == 0 | | + * jmp mainEndLabel | | + * | | + * copy 4 or 8 bytes | | + * jmp mainEndLabel | | + * | | + * copy12RMoreBytesLabel: <--------+ | + * copy 12 or 16 bytes | + * jmp mainEndLabel | + * | + * copy20ORMoreBytesLabel: <----------+ + * if copySize > 32 + * jmp copy36ORMoreBytesLabel -----+ + * | + * copy 20-32 bytes | + * jmp mainEndLabel | + * | + * copy36ORMoreBytesLabel: <----------+ + * if copySize > 64 // repMovsThresholdBytes + * jmp repMovsLabel ---------------+ + * | + * copy 36-64 bytes | + * jmp mainEndLabel | + * | + * repMovsLabel: <--------------------+ + * copy 68 or more bytes + */ + + TR::LabelSymbol* copy12RMoreBytesLabel = generateLabelSymbol(cg); + TR::LabelSymbol* copy20ORMoreBytesLabel = generateLabelSymbol(cg); + TR::LabelSymbol* copy36ORMoreBytesLabel = generateLabelSymbol(cg); + + /* --------------------------------- + 
* size <= repMovsThresholdBytes + */ + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 16, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy20ORMoreBytesLabel, cg); + + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 8, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy12RMoreBytesLabel, cg); + + generateRegRegInstruction(TR::InstOpCode::TESTRegReg(), node, sizeReg, sizeReg, cg); + generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg); + + // 4 or 8 Bytes + generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -4, cg), cg); + generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -4, cg), tmpReg1, cg); + generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy12RMoreBytesLabel, cg); + + // 12 or 16 Bytes + generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); + generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); + generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy20ORMoreBytesLabel, cg); + 
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 32, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy36ORMoreBytesLabel, cg); + + // 20-32 Bytes + generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); + generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + if (repMovsThresholdBytes == 64) + { + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy36ORMoreBytesLabel, cg); + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); + + // 36-64 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + } + else // repMovsThresholdBytes = 128 + { + TR::LabelSymbol* copy68ORMoreBytesLabel = generateLabelSymbol(cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy36ORMoreBytesLabel, cg); + 
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, copy68ORMoreBytesLabel, cg); + + // 36-64 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + + // --------------------------------- + generateLabelInstruction(TR::InstOpCode::label, node, copy68ORMoreBytesLabel, cg); + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 128, cg); + generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); + + // 68-128 Bytes + generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -64, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -64, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + } + } + /** \brief * Generate instructions to do array copy for 16-bit primitives. 
* @@ -1767,6 +2055,26 @@ static void arrayCopy16BitPrimitive(TR::Node* node, TR::Register* dstReg, TR::Re generateLabelInstruction(TR::InstOpCode::label, node, mainEndLabel, dependencies, cg); } +/** \brief + * This method generates the inline sequence for small-size array copies of 16-bit (2-byte) primitive arrays + * + * arraycopyEvaluator : elementSize = 2 + * | + * !useREPMOVSW | + * +---------------------------------+---------------------------------+ + * | | + * | | + * V V + * arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSLargeSizeREPMOVSD arrayCopyDefault + * | | + * | V + * | arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS + * | | + * +-----------------------+ +---------------------+ + * | | + * V V + * arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16 + */ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node* node, TR::Register* dstReg, TR::Register* srcReg, @@ -1776,6 +2084,7 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N TR::Register* tmpXmmYmmReg1, TR::Register* tmpXmmYmmReg2, TR::CodeGenerator* cg, + TR::LabelSymbol *repMovsLabel, TR::LabelSymbol* mainEndLabel) { int32_t REP_MOVS_THRESHOLD_BYTES = 64; @@ -1788,8 +2097,8 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N } /* - * The setup to run `rep movsd` is not efficient on copying smaller sizes. - * This method inlines copy size <= REP_MOVS_THRESHOLD_BYTES without using `rep movsd`. + * The setup to run `rep movsd` or `rep movsw` is not efficient on copying smaller sizes. + * This method inlines copy size <= REP_MOVS_THRESHOLD_BYTES without using `rep movs[d|w]`. 
* * if copySize > 16 * jmp copy18ORMoreBytesLabel ------+ @@ -1842,7 +2151,6 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N * |----------- temp2 ------------| */ - TR::LabelSymbol* repMovsLabel = generateLabelSymbol(cg); TR::LabelSymbol* copy4ORMoreBytesLabel = generateLabelSymbol(cg); TR::LabelSymbol* copy10ORMoreBytesLabel = generateLabelSymbol(cg); @@ -1902,15 +2210,53 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N // --------------------------------- generateLabelInstruction(TR::InstOpCode::label, node, copy34ORMoreBytesLabel, cg); - generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, 64, cg); + generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, sizeReg, REP_MOVS_THRESHOLD_BYTES, cg); generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 34-64 Bytes - generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), 
tmpXmmYmmReg2, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + } + +static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSLargeSizeREPMOVSD(TR::Node* node, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) + { + TR::Register* tmpReg1 = cg->allocateRegister(TR_GPR); + TR::Register* tmpReg2 = cg->allocateRegister(TR_GPR); + TR::Register* tmpXmmYmmReg1 = cg->allocateRegister(TR_VRF); + TR::Register* tmpXmmYmmReg2 = cg->allocateRegister(TR_VRF); + + TR::RegisterDependencyConditions* dependencies = generateRegisterDependencyConditions((uint8_t)7, (uint8_t)7, cg); + + dependencies->addPreCondition(srcReg, TR::RealRegister::esi, cg); + dependencies->addPreCondition(dstReg, TR::RealRegister::edi, cg); + dependencies->addPreCondition(sizeReg, TR::RealRegister::ecx, cg); + dependencies->addPreCondition(tmpReg1, TR::RealRegister::NoReg, cg); + dependencies->addPreCondition(tmpReg2, TR::RealRegister::NoReg, cg); + dependencies->addPreCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg); + dependencies->addPreCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg); + + dependencies->addPostCondition(srcReg, TR::RealRegister::esi, cg); + dependencies->addPostCondition(dstReg, TR::RealRegister::edi, cg); + dependencies->addPostCondition(sizeReg, TR::RealRegister::ecx, cg); + dependencies->addPostCondition(tmpReg1, TR::RealRegister::NoReg, cg); + dependencies->addPostCondition(tmpReg2, TR::RealRegister::NoReg, cg); + dependencies->addPostCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg); + dependencies->addPostCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg); + + TR::LabelSymbol* mainBegLabel = generateLabelSymbol(cg); + mainBegLabel->setStartInternalControlFlow(); + + TR::LabelSymbol* mainEndLabel = generateLabelSymbol(cg); + mainEndLabel->setEndInternalControlFlow(); + + TR::LabelSymbol* repMovsLabel = generateLabelSymbol(cg); + + generateLabelInstruction(TR::InstOpCode::label, node, 
mainBegLabel, cg); + + arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsLabel, mainEndLabel); /* --------------------------------- * size > REP_MOVS_THRESHOLD_BYTES @@ -1921,6 +2267,7 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N * mainEndLabel: */ generateLabelInstruction(TR::InstOpCode::label, node, repMovsLabel, cg); + if (node->isForwardArrayCopy()) { generateRegImmInstruction(TR::InstOpCode::SHRRegImm1(), node, sizeReg, 2, cg); @@ -1955,42 +2302,6 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N og.endOutlinedInstructionSequence(); } } - } - -static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) - { - TR::Register* tmpReg1 = cg->allocateRegister(TR_GPR); - TR::Register* tmpReg2 = cg->allocateRegister(TR_GPR); - TR::Register* tmpXmmYmmReg1 = cg->allocateRegister(TR_VRF); - TR::Register* tmpXmmYmmReg2 = cg->allocateRegister(TR_VRF); - - TR::RegisterDependencyConditions* dependencies = generateRegisterDependencyConditions((uint8_t)7, (uint8_t)7, cg); - - dependencies->addPreCondition(srcReg, TR::RealRegister::esi, cg); - dependencies->addPreCondition(dstReg, TR::RealRegister::edi, cg); - dependencies->addPreCondition(sizeReg, TR::RealRegister::ecx, cg); - dependencies->addPreCondition(tmpReg1, TR::RealRegister::NoReg, cg); - dependencies->addPreCondition(tmpReg2, TR::RealRegister::NoReg, cg); - dependencies->addPreCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg); - dependencies->addPreCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg); - - dependencies->addPostCondition(srcReg, TR::RealRegister::esi, cg); - dependencies->addPostCondition(dstReg, TR::RealRegister::edi, cg); - dependencies->addPostCondition(sizeReg, TR::RealRegister::ecx, cg); - 
dependencies->addPostCondition(tmpReg1, TR::RealRegister::NoReg, cg); - dependencies->addPostCondition(tmpReg2, TR::RealRegister::NoReg, cg); - dependencies->addPostCondition(tmpXmmYmmReg1, TR::RealRegister::NoReg, cg); - dependencies->addPostCondition(tmpXmmYmmReg2, TR::RealRegister::NoReg, cg); - - TR::LabelSymbol* mainBegLabel = generateLabelSymbol(cg); - mainBegLabel->setStartInternalControlFlow(); - - TR::LabelSymbol* mainEndLabel = generateLabelSymbol(cg); - mainEndLabel->setEndInternalControlFlow(); - - generateLabelInstruction(TR::InstOpCode::label, node, mainBegLabel, cg); - - arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, tmpXmmYmmReg1, tmpXmmYmmReg2, cg, mainEndLabel); generateLabelInstruction(TR::InstOpCode::label, node, mainEndLabel, dependencies, cg); @@ -2150,14 +2461,42 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 33-64 Bytes - generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); + generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 
0, -32, cg), tmpXmmYmmReg1, cg); + generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); } -static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) +/** \brief +* Generate instructions to do primitive array copy without using rep movs for smaller copy size +* +* \param node +* The tree node +* +* \param elementSize +* The size of an element, in bytes +* +* \param dstReg +* The destination array address register +* +* \param srcReg +* The source array address register +* +* \param sizeReg +* The register holding the total size of elements to be copied, in bytes +* +* \param cg +* The code generator +*/ + +static void arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, + uint8_t elementSize, + int repMovsThresholdBytes, + TR::Register* dstReg, + TR::Register* srcReg, + TR::Register* sizeReg, + TR::CodeGenerator* cg) { TR::Register* tmpReg1 = cg->allocateRegister(TR_GPR); TR::Register* tmpReg2 = cg->allocateRegister(TR_GPR); @@ -2192,20 +2531,41 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, generateLabelInstruction(TR::InstOpCode::label, node, mainBegLabel, cg); - arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsLabel, mainEndLabel); + TR::InstOpCode::Mnemonic repmovs; + switch (elementSize) + { + case 8: + OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, + tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsThresholdBytes, repMovsLabel, mainEndLabel); + repmovs = TR::InstOpCode::REPMOVSQ; + break; + case 4: + 
OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, + tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsThresholdBytes, repMovsLabel, mainEndLabel); + repmovs = TR::InstOpCode::REPMOVSD; + break; + case 2: + arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsLabel, mainEndLabel); + repmovs = TR::InstOpCode::REPMOVSW; + break; + default: // 1-byte + arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, tmpXmmYmmReg1, tmpXmmYmmReg2, cg, repMovsLabel, mainEndLabel); + repmovs = TR::InstOpCode::REPMOVSB; + break; + } /* --------------------------------- - * size > REP_MOVS_THRESHOLD_BYTES - * rep movsb + * repMovsLabel: + * rep movs[b|w|d|q] * mainEndLabel: */ generateLabelInstruction(TR::InstOpCode::label, node, repMovsLabel, cg); if (node->isForwardArrayCopy()) { - generateRepMovsInstruction(TR::InstOpCode::REPMOVSB, node, sizeReg, dependencies, cg); + generateRepMovsInstruction(repmovs, node, sizeReg, dependencies, cg); } - else + else // decide direction during runtime { TR::LabelSymbol* backwardLabel = generateLabelSymbol(cg); @@ -2213,14 +2573,14 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, generateRegRegInstruction(TR::InstOpCode::CMPRegReg(), node, dstReg, sizeReg, cg); // cmp dst, size generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, srcReg, 0, cg), cg); // dst = dst + src generateLabelInstruction(TR::InstOpCode::JB4, node, backwardLabel, cg); // jb, skip backward copy setup - generateRepMovsInstruction(TR::InstOpCode::REPMOVSB, node, sizeReg, NULL, cg); + generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); { TR_OutlinedInstructionsGenerator og(backwardLabel, node, cg); - generateRegMemInstruction(TR::InstOpCode::LEARegMem(), 
node, srcReg, generateX86MemoryReference(srcReg, sizeReg, 0, -(intptr_t)1, cg), cg); - generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, sizeReg, 0, -(intptr_t)1, cg), cg); + generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, srcReg, generateX86MemoryReference(srcReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); + generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); generateInstruction(TR::InstOpCode::STD, node, cg); - generateRepMovsInstruction(TR::InstOpCode::REPMOVSB, node, sizeReg, NULL, cg); + generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); generateInstruction(TR::InstOpCode::CLD, node, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); og.endOutlinedInstructionSequence(); @@ -2235,6 +2595,44 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, cg->stopUsingRegister(tmpXmmYmmReg2); } +static bool enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(uint8_t elementSize, TR::CodeGenerator* cg, int& threshold) + { + static bool disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; + static bool disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; + static bool disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; + static bool disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; + + bool disableEnhancement = false; + bool result = false; + threshold = cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F) ? 
128 : 64; + + switch (elementSize) + { + case 8: + disableEnhancement = disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS + || cg->comp()->getOption(TR_Disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); + break; + case 4: + disableEnhancement = disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS + || cg->comp()->getOption(TR_Disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); + break; + case 2: + disableEnhancement = disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS + || cg->comp()->getOption(TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); + break; + default: // 1 byte + disableEnhancement = disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS + || cg->comp()->getOption(TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); + break; + } + + result = (!disableEnhancement && + cg->comp()->target().cpu.supportsAVX() && + cg->comp()->target().is64Bit()) ? true : false; + + return result; + } + /** \brief * Generate instructions to do array copy. * @@ -2258,17 +2656,10 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, */ static void arrayCopyDefault(TR::Node* node, uint8_t elementSize, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) { - static bool disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = (feGetEnv("TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL); - bool disableEnhancement = disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS || - cg->comp()->getOption(TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); - - bool enable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = ((elementSize == 1) && - !disableEnhancement && - cg->comp()->target().cpu.supportsAVX() && - cg->comp()->target().is64Bit()) ? 
true : false; - if (enable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS) + int repMovsThresholdBytes = 0; + if (enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(elementSize, cg, repMovsThresholdBytes)) { - arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVS(node, dstReg, srcReg, sizeReg, cg); + arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS(node, elementSize, repMovsThresholdBytes, dstReg, srcReg, sizeReg, cg); return; } @@ -2732,16 +3123,9 @@ TR::Register *OMR::X86::TreeEvaluator::arraycopyEvaluator(TR::Node *node, TR::Co } else if (elementSize == 2 && !useREPMOVSW) { - static bool disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; - bool disableEnhancement = disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS - || cg->comp()->getOption(TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS); - - bool enable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = (!disableEnhancement && - cg->comp()->target().cpu.supportsAVX() && - cg->comp()->target().is64Bit()) ? 
true : false; - - if (enable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS) - arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVS(node, dstReg, srcReg, sizeReg, cg); + int repMovsThresholdBytes = 0; + if (enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(2, cg, repMovsThresholdBytes)) + arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSLargeSizeREPMOVSD(node, dstReg, srcReg, sizeReg, cg); else arrayCopy16BitPrimitive(node, dstReg, srcReg, sizeReg, cg); } diff --git a/compiler/x/codegen/OMRTreeEvaluator.hpp b/compiler/x/codegen/OMRTreeEvaluator.hpp index e5eeb950889..5899f6c2a73 100644 --- a/compiler/x/codegen/OMRTreeEvaluator.hpp +++ b/compiler/x/codegen/OMRTreeEvaluator.hpp @@ -519,6 +519,31 @@ class OMR_EXTENSIBLE TreeEvaluator: public OMR::TreeEvaluator static bool setCarryBorrow(TR::Node *flagNode, bool invertValue, TR::CodeGenerator *cg); + static void arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, + TR::Register *dstReg, + TR::Register *srcReg, + TR::Register *sizeReg, + TR::Register *tmpReg1, + TR::Register *tmpReg2, + TR::Register *tmpXmmYmmReg1, + TR::Register *tmpXmmYmmReg2, + TR::CodeGenerator *cg, + int repMovsThresholdBytes, + TR::LabelSymbol *repMovsLabel, + TR::LabelSymbol *mainEndLabel); + + static void arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, + TR::Register *dstReg, + TR::Register *srcReg, + TR::Register *sizeReg, + TR::Register *tmpReg1, + TR::Register *tmpReg2, + TR::Register *tmpXmmYmmReg1, + TR::Register *tmpXmmYmmReg2, + TR::CodeGenerator *cg, + int repMovsThresholdBytes, + TR::LabelSymbol *repMovsLabel, + TR::LabelSymbol *mainEndLabel); protected: static TR::Register *performHelperCall(TR::Node *node, TR::SymbolReference *helperSymRef, TR::ILOpCodes helperCallOpCode, bool spillFPRegs, TR::CodeGenerator *cg);