From 0636989466db12aab4d9245fff681db7e0d7dd24 Mon Sep 17 00:00:00 2001 From: Spencer Comin Date: Tue, 19 Sep 2023 15:50:03 -0400 Subject: [PATCH] Accelerate vectorizedMismatch in IL This patch adds ArraysSupport.vectorizedMismatch as a recognized method, and adds a SupportsInlineVectorizedMismatch flag to the code generator. This flag is set in Z, Power, aarch64 and x86 code generator initialization if the arraycmp opcode is supported and the TR_disableInlineVectorizedMismatch environment variable is not set. If the flag is set, vectorizedMismatch call nodes are transformed to a functionally equivalent tree that uses arraycmp. Fixes: #15204 Signed-off-by: Spencer Comin --- .../aarch64/codegen/J9CodeGenerator.cpp | 6 ++ runtime/compiler/codegen/J9CodeGenerator.hpp | 11 +++ .../codegen/J9RecognizedMethodsEnum.hpp | 1 + runtime/compiler/env/j9method.cpp | 8 +- .../optimizer/J9RecognizedCallTransformer.cpp | 59 +++++++++++++ .../optimizer/J9RecognizedCallTransformer.hpp | 82 +++++++++++++++++++ .../compiler/p/codegen/J9CodeGenerator.cpp | 6 ++ .../compiler/x/codegen/J9CodeGenerator.cpp | 6 ++ .../compiler/z/codegen/J9CodeGenerator.cpp | 6 ++ 9 files changed, 184 insertions(+), 1 deletion(-) diff --git a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp index 14a39b11fbe..5314f868178 100644 --- a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp @@ -77,6 +77,12 @@ J9::ARM64::CodeGenerator::initialize() comp->setOption(TR_EnableMonitorCacheLookup); } + static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL; + if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch) + { + cg->setSupportsInlineVectorizedMismatch(); + } + if (comp->fej9()->hasFixedFrameC_CallingConvention()) cg->setHasFixedFrameC_CallingConvention(); } diff --git a/runtime/compiler/codegen/J9CodeGenerator.hpp 
b/runtime/compiler/codegen/J9CodeGenerator.hpp index 5e8bb5ceb77..a34d255f4d6 100644 --- a/runtime/compiler/codegen/J9CodeGenerator.hpp +++ b/runtime/compiler/codegen/J9CodeGenerator.hpp @@ -492,6 +492,16 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz); */ void setSupportsInlineEncodeASCII() { _j9Flags.set(SupportsInlineEncodeASCII); } + /** \brief + * Determines whether the code generator supports inlining of jdk/internal/util/ArraysSupport.vectorizedMismatch + */ + bool getSupportsInlineVectorizedMismatch() { return _j9Flags.testAny(SupportsInlineVectorizedMismatch); } + + /** \brief + * The code generator supports inlining of jdk/internal/util/ArraysSupport.vectorizedMismatch + */ + void setSupportsInlineVectorizedMismatch() { _j9Flags.set(SupportsInlineVectorizedMismatch); } + /** * \brief * The number of nodes between a monext and the next monent before @@ -649,6 +659,7 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz); SupportsIntegerToChars = 0x00000200, SupportsInlineEncodeASCII = 0x00000400, SavesNonVolatileGPRsForGC = 0x00000800, + SupportsInlineVectorizedMismatch = 0x00001000, }; flags32_t _j9Flags; diff --git a/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp b/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp index 724a48ad0db..df5703ea8e4 100644 --- a/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp +++ b/runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp @@ -433,6 +433,7 @@ jdk_internal_misc_Unsafe_copyMemory0, jdk_internal_loader_NativeLibraries_load, + jdk_internal_util_ArraysSupport_vectorizedMismatch, jdk_internal_util_Preconditions_checkIndex, FirstVectorMethod, diff --git a/runtime/compiler/env/j9method.cpp b/runtime/compiler/env/j9method.cpp index 0571e56604d..54bef1f5852 100644 --- a/runtime/compiler/env/j9method.cpp +++ b/runtime/compiler/env/j9method.cpp @@ -3970,6 +3970,12 @@ void TR_ResolvedJ9Method::construct() { TR::unknownMethod}, }; + static X ArraysSupportMethods [] = + { + 
{x(TR::jdk_internal_util_ArraysSupport_vectorizedMismatch, "vectorizedMismatch", "(Ljava/lang/Object;JLjava/lang/Object;JII)I")}, + { TR::unknownMethod} + }; + struct Y { const char * _class; X * _methods; }; /* classXX where XX is the number of characters in the class name */ @@ -4151,7 +4157,7 @@ void TR_ResolvedJ9Method::construct() { "com/ibm/jit/DecimalFormatHelper", DecimalFormatHelperMethods}, { "jdk/internal/reflect/Reflection", ReflectionMethods }, { "jdk/internal/util/Preconditions", PreconditionsMethods }, - + { "jdk/internal/util/ArraysSupport", ArraysSupportMethods }, { 0 } }; static Y class32[] = diff --git a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp index d3be6ef484e..f065b2ce0d5 100644 --- a/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp +++ b/runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp @@ -334,6 +334,60 @@ void J9::RecognizedCallTransformer::process_java_lang_StringUTF16_toBytes(TR::Tr } } +void J9::RecognizedCallTransformer::process_jdk_internal_util_ArraysSupport_vectorizedMismatch(TR::TreeTop* treetop, TR::Node* node) + { + TR::Node* a = node->getChild(0); + TR::Node* aOffset = node->getChild(1); + TR::Node* b = node->getChild(2); + TR::Node* bOffset = node->getChild(3); + TR::Node* length = node->getChild(4); + TR::Node* log2ArrayIndexScale = node->getChild(5); + TR::Node* log2ArrayIndexScale64Bits = TR::Node::create(node, TR::iu2l, 1, log2ArrayIndexScale); + + TR::Node* lengthInBytes = TR::Node::create(node, TR::lshl, 2, + TR::Node::create(node, TR::iu2l, 1, length), + log2ArrayIndexScale); + + TR::Node* mask = TR::Node::create(node, TR::lor, 2, + TR::Node::create(node, TR::lshl, 2, + log2ArrayIndexScale64Bits, + TR::Node::iconst(node, 1)), + TR::Node::lconst(node, 3)); + + TR::Node* lengthToCompare = TR::Node::create(node, TR::land, 2, + lengthInBytes, + TR::Node::create(node, TR::lxor, 2, mask, TR::Node::lconst(node, -1))); + + TR::Node* 
mismatchByteIndex = TR::Node::create(node, TR::arraycmplen, 3); + // TODO: replace the following aladd's with generateDataAddrLoadTrees when off-heap memory changes come in + // See OpenJ9 issue #16717 https://github.com/eclipse-openj9/openj9/issues/16717 + mismatchByteIndex->setAndIncChild(0, TR::Node::create(node, TR::aladd, 2, a, aOffset)); + mismatchByteIndex->setAndIncChild(1, TR::Node::create(node, TR::aladd, 2, b, bOffset)); + mismatchByteIndex->setAndIncChild(2, lengthToCompare); + mismatchByteIndex->setSymbolReference(getSymRefTab()->findOrCreateArrayCmpLenSymbol()); + + TR::Node* invertedRemainder = TR::Node::create(node, TR::ixor, 2, + TR::Node::create(node, TR::l2i, 1, + TR::Node::create(node, TR::lshr, 2, + TR::Node::create(node, TR::land, 2, lengthInBytes, mask), + log2ArrayIndexScale)), + TR::Node::iconst(node, -1)); + + TR::Node* mismatchElementIndex = TR::Node::create(node, TR::l2i, 1, TR::Node::create(node, TR::lshr, 2, mismatchByteIndex, log2ArrayIndexScale)); + TR::Node* noMismatchFound = TR::Node::create(node, TR::lcmpeq, 2, mismatchByteIndex, lengthToCompare); + + anchorAllChildren(node, treetop); + prepareToReplaceNode(node); + + TR::Node::recreate(node, TR::iselect); + node->setNumChildren(3); + node->setAndIncChild(0, noMismatchFound); + node->setAndIncChild(1, invertedRemainder); + node->setAndIncChild(2, mismatchElementIndex); + + TR::TransformUtil::removeTree(comp(), treetop); + } + void J9::RecognizedCallTransformer::process_java_lang_StrictMath_and_Math_sqrt(TR::TreeTop* treetop, TR::Node* node) { TR::Node* valueNode = node->getLastChild(); @@ -1138,6 +1192,8 @@ bool J9::RecognizedCallTransformer::isInlineable(TR::TreeTop* treetop) case TR::java_lang_StringCoding_encodeASCII: case TR::java_lang_String_encodeASCII: return comp()->cg()->getSupportsInlineEncodeASCII(); + case TR::jdk_internal_util_ArraysSupport_vectorizedMismatch: + return comp()->cg()->getSupportsInlineVectorizedMismatch(); default: return false; } @@ -1273,6 +1329,9 @@ 
void J9::RecognizedCallTransformer::transform(TR::TreeTop* treetop) case TR::java_lang_Long_reverseBytes: processIntrinsicFunction(treetop, node, TR::lbyteswap); break; + case TR::jdk_internal_util_ArraysSupport_vectorizedMismatch: + process_jdk_internal_util_ArraysSupport_vectorizedMismatch(treetop, node); + break; default: break; } diff --git a/runtime/compiler/optimizer/J9RecognizedCallTransformer.hpp b/runtime/compiler/optimizer/J9RecognizedCallTransformer.hpp index 6bd94a7fa42..e20cf97ae55 100644 --- a/runtime/compiler/optimizer/J9RecognizedCallTransformer.hpp +++ b/runtime/compiler/optimizer/J9RecognizedCallTransformer.hpp @@ -139,6 +139,88 @@ class RecognizedCallTransformer : public OMR::RecognizedCallTransformer * \endcode */ void process_java_lang_StringUTF16_toBytes(TR::TreeTop* treetop, TR::Node* node); + /** \brief + * Transforms jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I + * into an arraycmplen, bit manipulation and iselect sequence with equivalent semantics. + * + * \param treetop + * The treetop which anchors the call node. 
+ * + * \param node + * The call node representing a call to jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I + * which has the following shape: + * + * \code + * icall + * <a> + * <aOffset> + * <b> + * <bOffset> + * <length> + * <log2ArrayIndexScale> + * \endcode + * + * \details + * The call node is transformed to the following shape: + * + * \code + * iselect () + * lcmpeq + * arraycmplen + * aladd + * <a> + * <aOffset> + * aladd + * <b> + * <bOffset> + * land + * lshl + * iu2l + * <length> + * <log2ArrayIndexScale> + * lxor + * lor + * lshl + * iu2l (<log2ArrayIndexScale>) + * iconst 1 + * lconst 3 + * lconst -1 + * ==>land + * ixor + * l2i + * lshr + * land + * ==>lshl + * ==>lor + * <log2ArrayIndexScale> + * iconst -1 + * l2i + * lshr + * ==>arraycmplen + * <log2ArrayIndexScale> + * \endcode + * + * This transformation is valid because vectorizedMismatch is functionally equivalent to the following pseudocode + * + * \code + * vectorizedMismatch(a, aOffset, b, bOffset, length, log2ArrayIndexScale) { + * lengthInBytes = length << log2ArrayIndexScale + * + * // the following mask calculation is equivalent to 'mask = (log2ArrayIndexScale<2) ? 
3 : 7', assuming log2ArrayIndexScale <= 3 + * // the original java implementation checks multiple of 8B at a time, but for elements smaller than 4B it also checks another 4B at the end + * // this mask serves to round down to nearest multiple of 8B (or 4B if the element is smaller than 4B) and get remainder + * mask = (log2ArrayIndexScale<<1) | 3 + * + * lengthToCompare = lengthInBytes & ~(mask) // round down to nearest multiple of 8 (or 4) + * mismatchIndex = arrayCmpLen(a+aOffset, b+bOffset, lengthToCompare) + * if (mismatchIndex == lengthToCompare) // no mismatch found + * return ~((lengthInBytes & mask) >> log2ArrayIndexScale) // inverted remainder, converted from byte-wise index to element-wise index + * else // mismatch found + * return mismatchIndex >> log2ArrayIndexScale // convert byte-wise index to element-wise index + * } + * \endcode + */ + void process_jdk_internal_util_ArraysSupport_vectorizedMismatch(TR::TreeTop* treetop, TR::Node* node); /** \brief * Transforms java/lang/StrictMath.sqrt(D)D and java/lang/Math.sqrt(D)D into a CodeGen inlined function with equivalent semantics. 
* diff --git a/runtime/compiler/p/codegen/J9CodeGenerator.cpp b/runtime/compiler/p/codegen/J9CodeGenerator.cpp index 4887a261122..59fc61de4dd 100644 --- a/runtime/compiler/p/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/p/codegen/J9CodeGenerator.cpp @@ -77,6 +77,12 @@ J9::Power::CodeGenerator::initialize() cg->setSupportsInlineConcurrentLinkedQueue(); } + static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL; + if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch) + { + cg->setSupportsInlineVectorizedMismatch(); + } + cg->setSupportsNewInstanceImplOpt(); static char *disableMonitorCacheLookup = feGetEnv("TR_disableMonitorCacheLookup"); diff --git a/runtime/compiler/x/codegen/J9CodeGenerator.cpp b/runtime/compiler/x/codegen/J9CodeGenerator.cpp index e8d4520b92b..58c7cbbc101 100644 --- a/runtime/compiler/x/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/x/codegen/J9CodeGenerator.cpp @@ -135,6 +135,12 @@ J9::X86::CodeGenerator::initialize() cg->setSupportsBDLLHardwareOverflowCheck(); } + static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL; + if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch) + { + cg->setSupportsInlineVectorizedMismatch(); + } + // Disable fast gencon barriers for AOT compiles because relocations on // the inlined heap addresses are not available (yet). 
// diff --git a/runtime/compiler/z/codegen/J9CodeGenerator.cpp b/runtime/compiler/z/codegen/J9CodeGenerator.cpp index 7eec0fb2ed8..bb9b71eb403 100644 --- a/runtime/compiler/z/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/z/codegen/J9CodeGenerator.cpp @@ -124,6 +124,12 @@ J9::Z::CodeGenerator::initialize() cg->setSupportsInlineEncodeASCII(); } + static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL; + if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch) + { + cg->setSupportsInlineVectorizedMismatch(); + } + // Let's turn this on. There is more work needed in the opt // to catch the case where the BNDSCHK is inserted after //