Skip to content

Commit

Permalink
Merge pull request #16243 from bhavanisn/crc32c_merge
Browse files Browse the repository at this point in the history
Integrate CRC32C polynomial on Power
  • Loading branch information
ymanton committed Nov 24, 2022
2 parents 78038a1 + 5c4aa8b commit 8206ceb
Show file tree
Hide file tree
Showing 12 changed files with 2,303 additions and 34 deletions.
4 changes: 4 additions & 0 deletions runtime/compiler/codegen/J9RecognizedMethodsEnum.hpp
Expand Up @@ -343,7 +343,11 @@
java_util_HashMapHashIterator_init,
java_util_zip_CRC32_update,
java_util_zip_CRC32_updateBytes,
java_util_zip_CRC32_updateBytes0,
java_util_zip_CRC32_updateByteBuffer,
java_util_zip_CRC32_updateByteBuffer0,
java_util_zip_CRC32C_updateBytes,
java_util_zip_CRC32C_updateDirectByteBuffer,
sun_misc_Unsafe_compareAndSwapInt_jlObjectJII_Z,
sun_misc_Unsafe_compareAndSwapLong_jlObjectJJJ_Z,
sun_misc_Unsafe_compareAndSwapObject_jlObjectJjlObjectjlObject_Z,
Expand Down
10 changes: 10 additions & 0 deletions runtime/compiler/env/j9method.cpp
Expand Up @@ -3281,7 +3281,16 @@ void TR_ResolvedJ9Method::construct()
{
{x(TR::java_util_zip_CRC32_update, "update", "(II)I")},
{x(TR::java_util_zip_CRC32_updateBytes, "updateBytes", "(I[BII)I")},
{x(TR::java_util_zip_CRC32_updateBytes0, "updateBytes0", "(I[BII)I")},
{x(TR::java_util_zip_CRC32_updateByteBuffer, "updateByteBuffer", "(IJII)I")},
{x(TR::java_util_zip_CRC32_updateByteBuffer0, "updateByteBuffer0", "(IJII)I")},
{ TR::unknownMethod}
};

// Recognized-method table for the CRC32C methods; it is registered against the
// class name "java/util/zip/CRC32C" in the class-to-table list further down.
// Each entry pairs a TR:: recognized-method id with the Java method name and
// its JVM descriptor. The list is terminated by a TR::unknownMethod entry.
static X CRC32CMethods[] =
{
// (int, byte[], int, int) -> int
{x(TR::java_util_zip_CRC32C_updateBytes, "updateBytes", "(I[BII)I")},
// (int, long, int, int) -> int
{x(TR::java_util_zip_CRC32C_updateDirectByteBuffer, "updateDirectByteBuffer", "(IJII)I")},
// End-of-table sentinel.
{ TR::unknownMethod}
};

Expand Down Expand Up @@ -3981,6 +3990,7 @@ void TR_ResolvedJ9Method::construct()
{ "java/lang/StrictMath", StrictMathMethods },
{ "java/math/BigDecimal", BigDecimalMethods },
{ "java/math/BigInteger", BigIntegerMethods },
{ "java/util/zip/CRC32C", CRC32CMethods },
{ 0 }
};

Expand Down
5 changes: 5 additions & 0 deletions runtime/compiler/il/J9Node.cpp
Expand Up @@ -342,8 +342,13 @@ J9::Node::processJNICall(TR::TreeTop *callNodeTreeTop, TR::ResolvedMethodSymbol
// The addresses of the optimized helpers in the server process will not necessarily
// match the client-side addresses, so we can't take this shortcut in JITServer mode.
if (((methodSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_update) ||
#if JAVA_SPEC_VERSION <= 8
(methodSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateBytes) ||
(methodSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateByteBuffer)) &&
#else
(methodSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateBytes0) ||
(methodSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateByteBuffer0)) &&
#endif
!comp->requiresSpineChecks() &&
!comp->compileRelocatableCode()
#ifdef J9VM_OPT_JITSERVER
Expand Down
11 changes: 11 additions & 0 deletions runtime/compiler/p/codegen/J9CodeGenerator.cpp
Expand Up @@ -353,6 +353,17 @@ bool J9::Power::CodeGenerator::suppressInliningOfRecognizedMethod(TR::Recognized
}
}

/* CRC32C's update* methods must not be inlined.
* Unlike CRC32's update* methods, which are JNI methods, the CRC32C ones have
* Java implementations, so we need to prevent the inliner from eliminating the
* calls in order to redirect them to our optimized helpers.
*/
if (method == TR::java_util_zip_CRC32C_updateBytes ||
method == TR::java_util_zip_CRC32C_updateDirectByteBuffer)
{
return true;
}

return false;
}

Expand Down
24 changes: 22 additions & 2 deletions runtime/compiler/p/codegen/PPCJNILinkage.cpp
Expand Up @@ -115,8 +115,20 @@ TR::Register *J9::Power::JNILinkage::buildDirectDispatch(TR::Node *callNode)
uintptr_t targetAddress;

bool crc32m1 = (callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_update);
bool crc32m2 = (callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateBytes);
bool crc32m3 = (callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateByteBuffer);
bool crc32m2 =
#if JAVA_SPEC_VERSION >= 9
(callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateBytes0)
#else
(callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateBytes)
#endif
;
bool crc32m3 =
#if JAVA_SPEC_VERSION >= 9
(callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateByteBuffer0)
#else
(callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32_updateByteBuffer)
#endif
;

// TODO: How to handle discontiguous array?
// The specialCaseJNI shortcut will mangle register dependencies and use system/C dispatch.
Expand Down Expand Up @@ -304,6 +316,14 @@ TR::Register *J9::Power::JNILinkage::buildDirectDispatch(TR::Node *callNode)
}
generateTrg1Src2Instruction(cg(), TR::InstOpCode::add, callNode, addrArg, addrArg, posArg);

if (crc32m2 || crc32m3)
{
/* Passing zero for the castagnoli parameter of crc32_vpmsum helper. Here we are re-using
* posArg in gr6 after the buffer address has been calculated.
*/
generateTrg1ImmInstruction(cg(), TR::InstOpCode::li, callNode, posArg, 0);
}

deps->getPreConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr4), addrArg, TR::RealRegister::gr4, UsesDependentRegister);
deps->getPostConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr4), addrArg, TR::RealRegister::gr4, UsesDependentRegister);

Expand Down
6 changes: 3 additions & 3 deletions runtime/compiler/p/codegen/PPCJNILinkage.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (c) 2000, 2019 IBM Corp. and others
* Copyright (c) 2000, 2022 IBM Corp. and others
*
* This program and the accompanying materials are made available under
* the terms of the Eclipse Public License 2.0 which accompanies this
Expand Down Expand Up @@ -31,8 +31,8 @@
// C-linkage CRC helpers called directly from generated code.
//
// NOTE(review): both the three-argument and the four-argument forms of
// crc32_no_vpmsum / crc32_vpmsum appear below. C-linkage functions cannot be
// overloaded, so only one set can be live; presumably the four-argument form,
// which matches the declarations in PPCPrivateLinkage.hpp -- confirm.
//
// castagnoli: the CRC32 call sites pass 0 and the CRC32C call site passes 1.
extern "C"
{
unsigned int crc32_oneByte(unsigned int crc, unsigned int b);
unsigned int crc32_no_vpmsum(unsigned int crc, unsigned char *p, unsigned long len);
unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len);
unsigned int crc32_no_vpmsum(unsigned int crc, unsigned char *p, unsigned long len, unsigned int castagnoli);
unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len, unsigned int castagnoli);
}

class TR_BitVector;
Expand Down
107 changes: 106 additions & 1 deletion runtime/compiler/p/codegen/PPCPrivateLinkage.cpp
Expand Up @@ -2669,6 +2669,100 @@ void inlineCharacterIsMethod(TR::Node *node, TR::MethodSymbol* methodSymbol, TR:
cg->stopUsingRegister(tmpReg);
}

/**
 * Emits a direct call to the native CRC helper (crc32_vpmsum or
 * crc32_no_vpmsum) for a recognized CRC32C update method, rewriting the
 * outgoing argument registers to the helper's (crc, buffer, length, castagnoli)
 * layout.
 *
 * @param callNode     the call node the generated instructions are attached to
 * @param deps         register dependencies of the call; the gr4-gr6 (and, for
 *                     32-bit direct-buffer calls, gr7) entries are rebound here
 * @param methodSymbol currently unused
 * @param cg           the code generator
 * @param returnLabel  label emitted right after the call, carrying deps
 * @param crc32m2      true for CRC32C.updateBytes (byte[] argument)
 * @param crc32m3      true for CRC32C.updateDirectByteBuffer (long address argument)
 *
 * Exactly one of crc32m2 / crc32m3 is expected to be true; otherwise no
 * argument registers are located and the code below cannot work.
 * NOTE(review): the visible caller only reaches this on P8+ with VSX, so the
 * crc32_no_vpmsum selection looks unreachable from there -- confirm.
 */
void buildCRC32CCall(TR::Node *callNode,
      TR::RegisterDependencyConditions *deps,
      TR::MethodSymbol* methodSymbol,
      TR::CodeGenerator *cg,
      TR::LabelSymbol *&returnLabel,
      bool crc32m2, bool crc32m3)
   {
   TR::Compilation *comp = cg->comp();
   uintptr_t targetAddress;
   // Only looked up (and only used) when the target uses AIX-style linkage.
   TR::Register *gr2Reg = NULL;
   bool aix_style_linkage = comp->target().isAIX() || (comp->target().is64Bit() && comp->target().isLinux());

   if (aix_style_linkage)
      {
      gr2Reg = deps->searchPreConditionRegister(TR::RealRegister::gr2);
      }

   // Pick the vector (vpmsum) helper on P8-or-later with VSX, the scalar one otherwise.
   targetAddress = (uintptr_t)((comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8) && comp->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX))?crc32_vpmsum:crc32_no_vpmsum);

   // Assuming pre/postCondition have the same index, we use preCondition to map
   OMR::RegisterDependencyMap map(deps->getPreConditions()->getRegisterDependency(0), deps->getAddCursorForPre());
   for (int32_t cnt=0; cnt < deps->getAddCursorForPre(); cnt++)
      map.addDependency(deps->getPreConditions()->getRegisterDependency(cnt), cnt);

   // Initialized to NULL so that no path reads an indeterminate pointer.
   TR::Register *addrArg = NULL, *posArg = NULL, *lenArg = NULL, *wasteArg = NULL;
   if (crc32m2)
      {
      addrArg = map.getSourceWithTarget(TR::RealRegister::gr4);
      posArg = map.getSourceWithTarget(TR::RealRegister::gr5);
      lenArg = map.getSourceWithTarget(TR::RealRegister::gr6);

      // Step over the contiguous array header so addrArg points at the array data.
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi2, callNode, addrArg, addrArg, TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
      }

   if (crc32m3)
      {
      // On 32-bit targets the arguments sit one register higher and gr4 is not
      // needed by the helper (presumably the upper half of the long address).
      addrArg = map.getSourceWithTarget(comp->target().is64Bit()?(TR::RealRegister::gr4):(TR::RealRegister::gr5));
      posArg = map.getSourceWithTarget(comp->target().is64Bit()?(TR::RealRegister::gr5):(TR::RealRegister::gr6));
      lenArg = map.getSourceWithTarget(comp->target().is64Bit()?(TR::RealRegister::gr6):(TR::RealRegister::gr7));
      if (!comp->target().is64Bit())
         wasteArg = map.getSourceWithTarget(TR::RealRegister::gr4);
      }
   // Buffer start = base address + offset.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::add, callNode, addrArg, addrArg, posArg);

   /* For CRC32C, the value Java passes in lenArg is (off + len) rather than len.
    * The helper expects a byte count, so subtract the offset: lenArg = lenArg - posArg.
    */
   generateTrg1Src2Instruction(cg, TR::InstOpCode::subf, callNode, lenArg, posArg, lenArg);
   /* Passing one for the castagnoli parameter of crc32_vpmsum helper. The offset is no
    * longer needed once the buffer address and length have been calculated, so posArg
    * is re-used and bound to gr6 below.
    */
   generateTrg1ImmInstruction(cg, TR::InstOpCode::li, callNode, posArg, 1);

   // Rebind the helper arguments: gr4 = buffer, gr5 = length, gr6 = castagnoli flag.
   deps->getPreConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr4), addrArg, TR::RealRegister::gr4, UsesDependentRegister);
   deps->getPostConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr4), addrArg, TR::RealRegister::gr4, UsesDependentRegister);

   deps->getPreConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr5), lenArg, TR::RealRegister::gr5, UsesDependentRegister);
   deps->getPostConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr5), lenArg, TR::RealRegister::gr5, UsesDependentRegister);

   deps->getPreConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr6), posArg, TR::RealRegister::gr6, UsesDependentRegister);
   deps->getPostConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr6), posArg, TR::RealRegister::gr6, UsesDependentRegister);

   if (crc32m3 && !comp->target().is64Bit())
      {
      // The virtual register that was in gr4 still needs a home; park it in the freed gr7 slot.
      deps->getPreConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr7), wasteArg, TR::RealRegister::gr7, UsesDependentRegister);
      deps->getPostConditions()->setDependencyInfo(map.getTargetIndex(TR::RealRegister::gr7), wasteArg, TR::RealRegister::gr7, UsesDependentRegister);
      }

   TR::Register *gr0Reg = deps->searchPreConditionRegister(TR::RealRegister::gr0);
   TR::Register *gr11Reg = deps->searchPreConditionRegister(TR::RealRegister::gr11);
   TR::Register *gr12Reg = deps->searchPreConditionRegister(TR::RealRegister::gr12);

   loadConstant(cg, callNode, (int64_t)targetAddress, gr12Reg);
   if (aix_style_linkage &&
      !((comp)->target().is64Bit() && ((comp)->target().isLinux()) && (comp)->target().cpu.isLittleEndian()))
      {
      // Here gr12 is treated as pointing at a three-word descriptor: entry address, TOC, environment.
      // get the target address
      generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, callNode, gr0Reg, TR::MemoryReference::createWithDisplacement(cg, gr12Reg, 0, TR::Compiler->om.sizeofReferenceAddress()));
      // put the target address into the count register
      generateSrc1Instruction(cg, TR::InstOpCode::mtctr, callNode, gr0Reg);
      // load the toc register
      generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, callNode, gr2Reg, TR::MemoryReference::createWithDisplacement(cg, gr12Reg, TR::Compiler->om.sizeofReferenceAddress(), TR::Compiler->om.sizeofReferenceAddress()));
      // load the environment register
      generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, callNode, gr11Reg, TR::MemoryReference::createWithDisplacement(cg, gr12Reg, 2*TR::Compiler->om.sizeofReferenceAddress(), TR::Compiler->om.sizeofReferenceAddress()));
      }
   else
      {
      // put the target address into the count register
      generateSrc1Instruction(cg, TR::InstOpCode::mtctr, callNode, gr12Reg);
      }
   // Branch to the helper through the count register, then bind the return label with the dependencies.
   generateInstruction(cg, TR::InstOpCode::bctrl, callNode);
   generateDepLabelInstruction(cg, TR::InstOpCode::label, callNode, returnLabel, deps);
   }

void J9::Power::PrivateLinkage::buildDirectCall(TR::Node *callNode,
TR::SymbolReference *callSymRef,
TR::RegisterDependencyConditions *dependencies,
Expand Down Expand Up @@ -2759,7 +2853,18 @@ TR::Register *J9::Power::PrivateLinkage::buildDirectDispatch(TR::Node *callNode)
}
}

buildDirectCall(callNode, callSymRef, dependencies, pp, argSize);
if (comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8) &&
comp()->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX) &&
(callNode->getSymbol()->castToMethodSymbol()->getRecognizedMethod() == TR::java_util_zip_CRC32C_updateBytes ||
callNode->getSymbol()->castToMethodSymbol()->getRecognizedMethod() == TR::java_util_zip_CRC32C_updateDirectByteBuffer)) {

TR::MethodSymbol *callSymbol = callNode->getSymbolReference()->getSymbol()->castToMethodSymbol();
bool crc32m2 = (callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32C_updateBytes);
bool crc32m3 = (callSymbol->getRecognizedMethod() == TR::java_util_zip_CRC32C_updateDirectByteBuffer);
buildCRC32CCall(callNode, dependencies, callSymbol, cg(), doneLabel, crc32m2, crc32m3);
} else {
buildDirectCall(callNode, callSymRef, dependencies, pp, argSize);
}
// SG - end

cg()->machine()->setLinkRegisterKilled(true);
Expand Down
8 changes: 7 additions & 1 deletion runtime/compiler/p/codegen/PPCPrivateLinkage.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (c) 2000, 2019 IBM Corp. and others
* Copyright (c) 2000, 2022 IBM Corp. and others
*
* This program and the accompanying materials are made available under
* the terms of the Eclipse Public License 2.0 which accompanies this
Expand Down Expand Up @@ -27,6 +27,12 @@
#include "codegen/PrivateLinkage.hpp"
#include "infra/Assert.hpp"

// C-linkage CRC helpers whose addresses are loaded and called directly by
// buildCRC32CCall (which picks crc32_vpmsum on P8-or-later with VSX, and
// crc32_no_vpmsum otherwise).
//   crc        - first argument; presumably the running CRC value (TODO confirm)
//   p          - pointer to the first byte to process
//   len        - number of bytes to process
//   castagnoli - the CRC32C call site passes 1 and the CRC32 call sites pass 0
extern "C"
{
unsigned int crc32_no_vpmsum(unsigned int crc, unsigned char *p, unsigned long len, unsigned int castagnoli);
unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len, unsigned int castagnoli);
}

class TR_BitVector;
class TR_ResolvedMethod;
namespace TR { class AutomaticSymbol; }
Expand Down

0 comments on commit 8206ceb

Please sign in to comment.