Skip to content

Commit

Permalink
Merge pull request #16662 from Spencer-Comin/vectorizedMismatch-iselect
Browse files Browse the repository at this point in the history
Accelerate ArraysSupport.vectorizedMismatch in IL
  • Loading branch information
jdmpapin committed Oct 27, 2023
2 parents 4a6551c + 0636989 commit d62e757
Show file tree
Hide file tree
Showing 9 changed files with 184 additions and 1 deletion.
6 changes: 6 additions & 0 deletions runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp
Expand Up @@ -77,6 +77,12 @@ J9::ARM64::CodeGenerator::initialize()
comp->setOption(TR_EnableMonitorCacheLookup);
}

static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL;
if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch)
{
cg->setSupportsInlineVectorizedMismatch();
}

if (comp->fej9()->hasFixedFrameC_CallingConvention())
cg->setHasFixedFrameC_CallingConvention();
}
Expand Down
11 changes: 11 additions & 0 deletions runtime/compiler/codegen/J9CodeGenerator.hpp
Expand Up @@ -492,6 +492,16 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz);
*/
void setSupportsInlineEncodeASCII() { _j9Flags.set(SupportsInlineEncodeASCII); }

/** \brief
 *     Queries whether this code generator has been marked as supporting inlining of
 *     jdk/internal/util/ArraysSupport.vectorizedMismatch
 *
 *  \return
 *     true if the SupportsInlineVectorizedMismatch bit is set in the J9 code generator flags; false otherwise
 */
bool getSupportsInlineVectorizedMismatch()
   {
   return _j9Flags.testAny(SupportsInlineVectorizedMismatch);
   }

/** \brief
 *     Marks this code generator as supporting inlining of
 *     jdk/internal/util/ArraysSupport.vectorizedMismatch by setting the
 *     SupportsInlineVectorizedMismatch bit in the J9 code generator flags
 */
void setSupportsInlineVectorizedMismatch()
   {
   _j9Flags.set(SupportsInlineVectorizedMismatch);
   }

/**
* \brief
* The number of nodes between a monext and the next monent before
Expand Down Expand Up @@ -649,6 +659,7 @@ void addMonClass(TR::Node* monNode, TR_OpaqueClassBlock* clazz);
SupportsIntegerToChars = 0x00000200,
SupportsInlineEncodeASCII = 0x00000400,
SavesNonVolatileGPRsForGC = 0x00000800,
SupportsInlineVectorizedMismatch = 0x00001000,
};

flags32_t _j9Flags;
Expand Down
1 change: 1 addition & 0 deletions runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp
Expand Up @@ -433,6 +433,7 @@

jdk_internal_misc_Unsafe_copyMemory0,
jdk_internal_loader_NativeLibraries_load,
jdk_internal_util_ArraysSupport_vectorizedMismatch,
jdk_internal_util_Preconditions_checkIndex,

FirstVectorMethod,
Expand Down
8 changes: 7 additions & 1 deletion runtime/compiler/env/j9method.cpp
Expand Up @@ -3970,6 +3970,12 @@ void TR_ResolvedJ9Method::construct()
{ TR::unknownMethod},
};

// Recognized-method table for jdk/internal/util/ArraysSupport (it is registered
// under that class name in the class-name table further down). The single real
// entry associates TR::jdk_internal_util_ArraysSupport_vectorizedMismatch with
// the method name "vectorizedMismatch" and its JVM signature
// (Object, long, Object, long, int, int) -> int.
// NOTE(review): x(...) is defined outside this view; presumably it expands the
// name/signature literals into the fields X expects -- confirm before editing.
static X ArraysSupportMethods [] =
{
{x(TR::jdk_internal_util_ArraysSupport_vectorizedMismatch, "vectorizedMismatch", "(Ljava/lang/Object;JLjava/lang/Object;JII)I")},
// Closing entry; presumably the end-of-table sentinel, matching the table above.
{ TR::unknownMethod}
};

// Associates a class name with the table of recognized methods declared for it.
struct Y
   {
   const char * _class;   // class name, e.g. "jdk/internal/util/ArraysSupport"
   X * _methods;          // recognized-method table belonging to that class
   };

/* classXX where XX is the number of characters in the class name */
Expand Down Expand Up @@ -4151,7 +4157,7 @@ void TR_ResolvedJ9Method::construct()
{ "com/ibm/jit/DecimalFormatHelper", DecimalFormatHelperMethods},
{ "jdk/internal/reflect/Reflection", ReflectionMethods },
{ "jdk/internal/util/Preconditions", PreconditionsMethods },

{ "jdk/internal/util/ArraysSupport", ArraysSupportMethods },
{ 0 }
};
static Y class32[] =
Expand Down
59 changes: 59 additions & 0 deletions runtime/compiler/optimizer/J9RecognizedCallTransformer.cpp
Expand Up @@ -334,6 +334,60 @@ void J9::RecognizedCallTransformer::process_java_lang_StringUTF16_toBytes(TR::Tr
}
}

// Rewrites a call to jdk/internal/util/ArraysSupport.vectorizedMismatch in place as
//
//    iselect(mismatchByteIndex == lengthToCompare,
//            ~((lengthInBytes & mask) >> log2ArrayIndexScale),
//            mismatchByteIndex >> log2ArrayIndexScale)
//
// where mismatchByteIndex is produced by an arraycmplen over the first
// lengthToCompare bytes of both operands. The original treetop is removed
// once the call node has been recreated as the iselect.
void J9::RecognizedCallTransformer::process_jdk_internal_util_ArraysSupport_vectorizedMismatch(TR::TreeTop* treetop, TR::Node* node)
   {
   // Call arguments, in signature order: (a, aOffset, b, bOffset, length, log2ArrayIndexScale)
   TR::Node* lhsArray = node->getChild(0);
   TR::Node* lhsOffset = node->getChild(1);
   TR::Node* rhsArray = node->getChild(2);
   TR::Node* rhsOffset = node->getChild(3);
   TR::Node* elementCount = node->getChild(4);
   TR::Node* log2Scale = node->getChild(5);

   // 64-bit copy of the scale, used only to build the mask
   TR::Node* log2ScaleAsLong = TR::Node::create(node, TR::iu2l, 1, log2Scale);

   // byteLength = (long)length << log2ArrayIndexScale
   TR::Node* elementCountAsLong = TR::Node::create(node, TR::iu2l, 1, elementCount);
   TR::Node* byteLength = TR::Node::create(node, TR::lshl, 2, elementCountAsLong, log2Scale);

   // tailMask = (log2ArrayIndexScale << 1) | 3
   TR::Node* doubledLog2Scale = TR::Node::create(node, TR::lshl, 2, log2ScaleAsLong, TR::Node::iconst(node, 1));
   TR::Node* tailMask = TR::Node::create(node, TR::lor, 2, doubledLog2Scale, TR::Node::lconst(node, 3));

   // bytesToCompare = byteLength & ~tailMask (the complement is formed as an xor with -1)
   TR::Node* complementedTailMask = TR::Node::create(node, TR::lxor, 2, tailMask, TR::Node::lconst(node, -1));
   TR::Node* bytesToCompare = TR::Node::create(node, TR::land, 2, byteLength, complementedTailMask);

   TR::Node* firstMismatchByte = TR::Node::create(node, TR::arraycmplen, 3);
   // TODO: replace the following aladd's with generateDataAddrLoadTrees when off-heap memory changes come in
   // See OpenJ9 issue #16717 https://github.com/eclipse-openj9/openj9/issues/16717
   TR::Node* lhsAddress = TR::Node::create(node, TR::aladd, 2, lhsArray, lhsOffset);
   firstMismatchByte->setAndIncChild(0, lhsAddress);
   TR::Node* rhsAddress = TR::Node::create(node, TR::aladd, 2, rhsArray, rhsOffset);
   firstMismatchByte->setAndIncChild(1, rhsAddress);
   firstMismatchByte->setAndIncChild(2, bytesToCompare);
   firstMismatchByte->setSymbolReference(getSymRefTab()->findOrCreateArrayCmpLenSymbol());

   // Result when nothing differs: ~((byteLength & tailMask) >> log2ArrayIndexScale), narrowed to int
   TR::Node* tailBytes = TR::Node::create(node, TR::land, 2, byteLength, tailMask);
   TR::Node* tailElements = TR::Node::create(node, TR::lshr, 2, tailBytes, log2Scale);
   TR::Node* tailElementsAsInt = TR::Node::create(node, TR::l2i, 1, tailElements);
   TR::Node* complementedTail = TR::Node::create(node, TR::ixor, 2, tailElementsAsInt, TR::Node::iconst(node, -1));

   // Result when a difference exists: byte index scaled back down to an element index
   TR::Node* scaledMismatch = TR::Node::create(node, TR::lshr, 2, firstMismatchByte, log2Scale);
   TR::Node* mismatchElement = TR::Node::create(node, TR::l2i, 1, scaledMismatch);

   // Selector: the arraycmplen result equals the compared length
   TR::Node* comparedEverything = TR::Node::create(node, TR::lcmpeq, 2, firstMismatchByte, bytesToCompare);

   anchorAllChildren(node, treetop);
   prepareToReplaceNode(node);

   // Turn the call node itself into the iselect
   TR::Node::recreate(node, TR::iselect);
   node->setNumChildren(3);
   node->setAndIncChild(0, comparedEverything);
   node->setAndIncChild(1, complementedTail);
   node->setAndIncChild(2, mismatchElement);

   TR::TransformUtil::removeTree(comp(), treetop);
   }

void J9::RecognizedCallTransformer::process_java_lang_StrictMath_and_Math_sqrt(TR::TreeTop* treetop, TR::Node* node)
{
TR::Node* valueNode = node->getLastChild();
Expand Down Expand Up @@ -1138,6 +1192,8 @@ bool J9::RecognizedCallTransformer::isInlineable(TR::TreeTop* treetop)
case TR::java_lang_StringCoding_encodeASCII:
case TR::java_lang_String_encodeASCII:
return comp()->cg()->getSupportsInlineEncodeASCII();
case TR::jdk_internal_util_ArraysSupport_vectorizedMismatch:
return comp()->cg()->getSupportsInlineVectorizedMismatch();
default:
return false;
}
Expand Down Expand Up @@ -1273,6 +1329,9 @@ void J9::RecognizedCallTransformer::transform(TR::TreeTop* treetop)
case TR::java_lang_Long_reverseBytes:
processIntrinsicFunction(treetop, node, TR::lbyteswap);
break;
case TR::jdk_internal_util_ArraysSupport_vectorizedMismatch:
process_jdk_internal_util_ArraysSupport_vectorizedMismatch(treetop, node);
break;
default:
break;
}
Expand Down
82 changes: 82 additions & 0 deletions runtime/compiler/optimizer/J9RecognizedCallTransformer.hpp
Expand Up @@ -139,6 +139,88 @@ class RecognizedCallTransformer : public OMR::RecognizedCallTransformer
* \endcode
*/
void process_java_lang_StringUTF16_toBytes(TR::TreeTop* treetop, TR::Node* node);
/** \brief
 *     Transforms jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I
 *     into an arraycmplen, bit manipulation and iselect sequence with equivalent semantics.
 *
 *  \param treetop
 *     The treetop which anchors the call node.
 *
 *  \param node
 *     The call node representing a call to jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I
 *     which has the following shape:
 *
 *     \code
 *     icall <jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I>
 *        <a>
 *        <aOffset>
 *        <b>
 *        <bOffset>
 *        <length>
 *        <log2ArrayIndexScale>
 *     \endcode
 *
 *  \details
 *     The call node is transformed to the following shape:
 *
 *     \code
 *     iselect ()
 *        lcmpeq
 *           arraycmplen
 *              aladd
 *                 <a>
 *                 <aOffset>
 *              aladd
 *                 <b>
 *                 <bOffset>
 *              land
 *                 lshl
 *                    iu2l
 *                       <length>
 *                    <log2ArrayIndexScale>
 *                 lxor
 *                    lor
 *                       lshl
 *                          iu2l
 *                             <log2ArrayIndexScale>
 *                          iconst 1
 *                       lconst 3
 *                    lconst -1
 *           ==>land
 *        ixor
 *           l2i
 *              lshr
 *                 land
 *                    ==>lshl
 *                    ==>lor
 *                 <log2ArrayIndexScale>
 *           iconst -1
 *        l2i
 *           lshr
 *              ==>arraycmplen
 *              <log2ArrayIndexScale>
 *     \endcode
 *
 *     In the second child, ==>lshl refers to the length-in-bytes shift (the first lshl above).
 *
 *     This transformation is valid because vectorizedMismatch is functionally equivalent to the following pseudocode
 *
 *     \code
 *     vectorizedMismatch(a, aOffset, b, bOffset, length, log2ArrayIndexScale) {
 *        lengthInBytes = length << log2ArrayIndexScale
 *
 *        // the following mask calculation is equivalent to 'mask = (log2ArrayIndexScale<2) ? 3 : 7', assuming log2ArrayIndexScale <= 3
 *        // the original java implementation checks multiple of 8B at a time, but for elements smaller than 4B it also checks another 4B at the end
 *        // this mask serves to round down to nearest multiple of 8B (or 4B if the element is smaller than 4B) and get remainder
 *        mask = (log2ArrayIndexScale<<1) | 3
 *
 *        lengthToCompare = lengthInBytes & ~(mask) // round down to nearest multiple of 8 (or 4)
 *        mismatchIndex = arrayCmpLen(a+aOffset, b+bOffset, lengthToCompare)
 *        if (mismatchIndex == lengthToCompare) // no mismatch found
 *           return ~((lengthInBytes & mask) >> log2ArrayIndexScale) // inverted remainder, converted from byte-wise index to element-wise index
 *        else // mismatch found
 *           return mismatchIndex >> log2ArrayIndexScale // convert byte-wise index to element-wise index
 *     }
 *     \endcode
 */
void process_jdk_internal_util_ArraysSupport_vectorizedMismatch(TR::TreeTop* treetop, TR::Node* node);
/** \brief
* Transforms java/lang/StrictMath.sqrt(D)D and java/lang/Math.sqrt(D)D into a CodeGen inlined function with equivalent semantics.
*
Expand Down
6 changes: 6 additions & 0 deletions runtime/compiler/p/codegen/J9CodeGenerator.cpp
Expand Up @@ -77,6 +77,12 @@ J9::Power::CodeGenerator::initialize()
cg->setSupportsInlineConcurrentLinkedQueue();
}

static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL;
if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch)
{
cg->setSupportsInlineVectorizedMismatch();
}

cg->setSupportsNewInstanceImplOpt();

static char *disableMonitorCacheLookup = feGetEnv("TR_disableMonitorCacheLookup");
Expand Down
6 changes: 6 additions & 0 deletions runtime/compiler/x/codegen/J9CodeGenerator.cpp
Expand Up @@ -135,6 +135,12 @@ J9::X86::CodeGenerator::initialize()
cg->setSupportsBDLLHardwareOverflowCheck();
}

static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL;
if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch)
{
cg->setSupportsInlineVectorizedMismatch();
}

// Disable fast gencon barriers for AOT compiles because relocations on
// the inlined heap addresses are not available (yet).
//
Expand Down
6 changes: 6 additions & 0 deletions runtime/compiler/z/codegen/J9CodeGenerator.cpp
Expand Up @@ -124,6 +124,12 @@ J9::Z::CodeGenerator::initialize()
cg->setSupportsInlineEncodeASCII();
}

static bool disableInlineVectorizedMismatch = feGetEnv("TR_disableInlineVectorizedMismatch") != NULL;
if (cg->getSupportsArrayCmpLen() && !disableInlineVectorizedMismatch)
{
cg->setSupportsInlineVectorizedMismatch();
}

// Let's turn this on. There is more work needed in the opt
// to catch the case where the BNDSCHK is inserted after
//
Expand Down

0 comments on commit d62e757

Please sign in to comment.