Skip to content

Commit

Permalink
Improve code generation for inlineArrayCmp on Power10
Browse files Browse the repository at this point in the history
The current inlineArrayCmp has some drawbacks that affect its
performance.
The key deficiencies are: expensive comparison opcode usage, a
single-byte compare loop in the residue path, many branches, etc.

This commit uses wider loads by utilizing Power10 instructions and
reduces the number of branches in the generated code.
If merged, this patch enables a more performant inlineArrayCmp
on Power10+ systems.
  • Loading branch information
mnalam-p committed May 28, 2021
1 parent 9143684 commit a95aba5
Show file tree
Hide file tree
Showing 6 changed files with 263 additions and 87 deletions.
13 changes: 7 additions & 6 deletions compiler/p/codegen/OMRInstOpCode.enum
Expand Up @@ -579,8 +579,8 @@
// vabsduw, // vector absolute difference unsigned word
vand, // vector logical and
vandc, // vector logical and with complement
// vclzlsbb, // vector count leading zero least-significant bits byte
// vctzlsbb, // vector count trailing zero least-significant bits byte
vclzlsbb, // vector count leading zero least-significant bits byte
vctzlsbb, // vector count trailing zero least-significant bits byte
// vctzb, // vector count trailing zeros Byte
// vctzh, // vector count trailing zeros Hword
// vctzw, // vector count trailing zeros Word
Expand Down Expand Up @@ -632,6 +632,7 @@
// vpermr, // vector permute right-indexed
// vbpermd, // vector bit permute DWord
vbpermq, // Vector Bit Permute Qword
// vextractbm, // Vector Extract Byte Mask
// vrld, // Vector Rotate Left Dword
// vsbox, // Vector AES SubBytes
vsel, // vector conditional select
Expand Down Expand Up @@ -771,8 +772,8 @@
vcmpgtuh_r, // vector compare greater than unsigned halfword with record
vcmpgtuw, // vector compare greater than unsigned word
vcmpgtuw_r, // vector compare greater than unsigned word with record
// vcmpneb, // vector compare not equal Byte
// vcmpneb_r, // vector compare not equal Byte Rc=1
vcmpneb, // vector compare not equal Byte
vcmpneb_r, // vector compare not equal Byte Rc=1
// vcmpneh, // vector compare not equal Hword
// vcmpneh_r, // vector compare not equal Hword Rc=1
// vcmpnew, // vector compare not equal Word
Expand Down Expand Up @@ -816,9 +817,9 @@
// lxsibzx, // Load VSX scalar as integer byte & zero indexed
// lxsihzx, // Load VSX scalar as integer Hword & zero indexed
lxv, // Load VSX vector
// lxvb16x, // Load VSX vector byte*16 indexed
lxvb16x, // Load VSX vector byte*16 indexed
// lxvh8x, // Load VSX vector Hword*8 indexed
// lxvll, // Load VSX vector left-justified with length
lxvll, // Load VSX vector left-justified with length
// lxvwsx, // Load VSX vector word & splat indexed
// lxvx, // Load VSX vector indexed
// stxsd, // Store VSX Scalar Dword
Expand Down
142 changes: 77 additions & 65 deletions compiler/p/codegen/OMRInstOpCodeProperties.hpp
Expand Up @@ -6761,29 +6761,29 @@
PPCOpProp_SyncSideEffectFree,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vclzlsbb, */
/* .name = "vclzlsbb", */
{
/* .mnemonic = */ OMR::InstOpCode::vclzlsbb,
/* .name = */ "vclzlsbb",
/* .description = "vector count leading zero least-significant bits byte", */
/* .prefix = 0x00000000, */
/* .opcode = 0x10000602, */
/* .format = FORMAT_UNKNOWN, */
/* .minimumALS = OMR_PROCESSOR_PPC_P9, */
/* .properties = PPCOpProp_IsVMX | */
/* PPCOpProp_SyncSideEffectFree, */
/* }, */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x10000602,
/* .format = */ FORMAT_RT_VRB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
/* .properties = */ PPCOpProp_IsVMX |
PPCOpProp_SyncSideEffectFree,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vctzlsbb, */
/* .name = "vctzlsbb", */
{
/* .mnemonic = */ OMR::InstOpCode::vctzlsbb,
/* .name = */ "vctzlsbb",
/* .description = "vector count trailing zero least-significant bits byte", */
/* .prefix = 0x00000000, */
/* .opcode = 0x10010602, */
/* .format = FORMAT_UNKNOWN, */
/* .minimumALS = OMR_PROCESSOR_PPC_P9, */
/* .properties = PPCOpProp_IsVMX | */
/* PPCOpProp_SyncSideEffectFree, */
/* }, */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x10010602,
/* .format = */ FORMAT_RT_VRB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
/* .properties = */ PPCOpProp_IsVMX |
PPCOpProp_SyncSideEffectFree,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vctzb, */
Expand Down Expand Up @@ -7399,6 +7399,18 @@
PPCOpProp_SyncSideEffectFree,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vextractbm, */
/* .name = "vextractbm", */
/* .description = "Vector Extract Byte Mask", */
/* .prefix = 0x00000000, */
/* .opcode = 0x10080642, */
/* .format = FORMAT_RT_VRB, */
/* .minimumALS = OMR_PROCESSOR_PPC_P10, */
/* .properties = PPCOpProp_IsVMX | */
/* PPCOpProp_SyncSideEffectFree, */
/* }, */

/* { */
/* .mnemonic = OMR::InstOpCode::vrld, */
/* .name = "vrld", */
Expand Down Expand Up @@ -9086,30 +9098,30 @@
/* .properties = */ OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpgtuw].properties & ~PPCOpProp_HasRecordForm | PPCOpProp_IsRecordForm,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vcmpneb, */
/* .name = "vcmpneb", */
{
/* .mnemonic = */ OMR::InstOpCode::vcmpneb,
/* .name = */ "vcmpneb",
/* .description = "vector compare not equal Byte", */
/* .prefix = 0x00000000, */
/* .opcode = 0x10000007, */
/* .format = FORMAT_UNKNOWN, */
/* .minimumALS = OMR_PROCESSOR_PPC_P9, */
/* .properties = PPCOpProp_IsVMX | */
/* PPCOpProp_HasRecordForm | */
/* PPCOpProp_CompareOp | */
/* PPCOpProp_SyncSideEffectFree, */
/* }, */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x10000007,
/* .format = */ FORMAT_VRT_VRA_VRB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
/* .properties = */ PPCOpProp_IsVMX |
PPCOpProp_HasRecordForm |
PPCOpProp_CompareOp |
PPCOpProp_SyncSideEffectFree,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vcmpneb_r, */
/* .name = "vcmpneb.", */
{
/* .mnemonic = */ OMR::InstOpCode::vcmpneb_r,
/* .name = */ "vcmpneb.",
/* .description = "vector compare not equal Byte Rc=1", */
/* .prefix = OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].prefix, */
/* .opcode = OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].opcode + 1, */
/* .format = OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].format, */
/* .minimumALS = OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].minimumALS, */
/* .properties = OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].properties & ~PPCOpProp_HasRecordForm | PPCOpProp_IsRecordForm, */
/* }, */
/* .prefix = */ OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].prefix,
/* .opcode = */ 0x10000407,
/* .format = */ OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].format,
/* .minimumALS = */ OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].minimumALS,
/* .properties = */ OMR::Power::InstOpCode::metadata[OMR::InstOpCode::vcmpneb].properties & ~PPCOpProp_HasRecordForm | PPCOpProp_IsRecordForm,
},

/* { */
/* .mnemonic = OMR::InstOpCode::vcmpneh, */
Expand Down Expand Up @@ -9640,18 +9652,18 @@
PPCOpProp_IsVSX,
},

/* { */
/* .mnemonic = OMR::InstOpCode::lxvb16x, */
/* .name = "lxvb16x", */
{
/* .mnemonic = */ OMR::InstOpCode::lxvb16x,
/* .name = */ "lxvb16x",
/* .description = "Load VSX vector byte*16 indexed", */
/* .prefix = 0x00000000, */
/* .opcode = 0x7C0006D8, */
/* .format = FORMAT_UNKNOWN, */
/* .minimumALS = OMR_PROCESSOR_PPC_P9, */
/* .properties = PPCOpProp_IsLoad | */
/* PPCOpProp_ExcludeR0ForRA | */
/* PPCOpProp_IsVSX, */
/* }, */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x7C0006D8,
/* .format = */ FORMAT_XT_RA_RB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
/* .properties = */ PPCOpProp_IsLoad |
PPCOpProp_ExcludeR0ForRA |
PPCOpProp_IsVSX,
},

/* { */
/* .mnemonic = OMR::InstOpCode::lxvh8x, */
Expand All @@ -9666,18 +9678,18 @@
/* PPCOpProp_IsVSX, */
/* }, */

/* { */
/* .mnemonic = OMR::InstOpCode::lxvll, */
/* .name = "lxvll", */
{
/* .mnemonic = */ OMR::InstOpCode::lxvll,
/* .name = */ "lxvll",
/* .description = "Load VSX vector left-justified with length", */
/* .prefix = 0x00000000, */
/* .opcode = 0x7C00025A, */
/* .format = FORMAT_UNKNOWN, */
/* .minimumALS = OMR_PROCESSOR_PPC_P9, */
/* .properties = PPCOpProp_IsLoad | */
/* PPCOpProp_ExcludeR0ForRA | */
/* PPCOpProp_IsVSX, */
/* }, */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x7C00025A,
/* .format = */ FORMAT_XT_RA_RB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P9,
/* .properties = */ PPCOpProp_IsLoad |
PPCOpProp_ExcludeR0ForRA |
PPCOpProp_IsVSX,
},

/* { */
/* .mnemonic = OMR::InstOpCode::lxvwsx, */
Expand Down Expand Up @@ -13559,8 +13571,8 @@
/* .name = */ "vcmpequq",
/* .description = "Vector Compare Equal Quadword", */
/* .prefix = */ 0x00000000,
/* .opcode = */ 0x00000000,
/* .format = */ FORMAT_UNKNOWN,
/* .opcode = */ 0x100001C7,
/* .format = */ FORMAT_VRT_VRA_VRB,
/* .minimumALS = */ OMR_PROCESSOR_PPC_P10,
/* .properties = */ PPCOpProp_IsVMX |
PPCOpProp_CompareOp |
Expand Down
122 changes: 122 additions & 0 deletions compiler/p/codegen/OMRTreeEvaluator.cpp
Expand Up @@ -3134,6 +3134,125 @@ static inline void loadArrayCmpSources(TR::Node *node, TR::InstOpCode::Mnemonic
}
}

/**
 * Emits an inline, vector-based instruction sequence for an arraycmp node.
 *
 * The generated code compares the two sources 16 bytes at a time in a
 * CTR-controlled loop, handles the remaining (length & 0xF) bytes with a single
 * length-limited vector load per source, and then derives the result from the
 * position of the first differing byte.
 *
 * @param node arraycmp node; child 0 and child 1 are the two source addresses
 *             and child 2 is the length (used here as a byte count).
 * @param cg   code generator used to evaluate children, allocate registers and
 *             emit instructions.
 * @return the register set on the node as its result. When
 *         node->isArrayCmpLen() is true it holds indexReg + offset (per the
 *         in-line comment, the unmatched index); otherwise it holds the value
 *         produced by the setb/neg/rlwinm/add sequence at the end, which the
 *         in-line comment describes as 1,0,2.
 *
 * NOTE(review): the length is handled with 32-bit operations (cmpi4, rlwinm,
 * srawi). Confirm the length child can never exceed the 32-bit range.
 *
 * NOTE(review): for a non-arraycmplen node with length 0, returnReg becomes
 * 0 - 1 = -1 and indexReg is 0, so the lbzx pair below would read the byte
 * preceding each source address. Confirm zero-length compares cannot reach
 * this path.
 */
static TR::Register *inlineArrayCmpP10(TR::Node *node, TR::CodeGenerator *cg)
{
TR::Node *src1AddrNode = node->getChild(0);
TR::Node *src2AddrNode = node->getChild(1);
TR::Node *lengthNode = node->getChild(2);

// Source addresses are only read; the length is clobber-evaluated because
// tempReg is overwritten several times below.
TR::Register *src1AddrReg = cg->evaluate(src1AddrNode);
TR::Register *src2AddrReg = cg->evaluate(src2AddrNode);
TR::Register *indexReg = cg->allocateRegister(TR_GPR);
TR::Register *returnReg = cg->allocateRegister(TR_GPR);
TR::Register *tempReg = cg->gprClobberEvaluate(lengthNode);
TR::Register *temp2Reg = cg->allocateRegister(TR_GPR);

TR::Register *vec0Reg = cg->allocateRegister(TR_VRF);
TR::Register *vec1Reg = cg->allocateRegister(TR_VRF);
TR::Register *condReg = cg->allocateRegister(TR_CCR);

TR::LabelSymbol *startLabel = generateLabelSymbol(cg);
TR::LabelSymbol *loopStartLabel = generateLabelSymbol(cg);
TR::LabelSymbol *residueStartLabel = generateLabelSymbol(cg);
TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
TR::LabelSymbol *resultLabel = generateLabelSymbol(cg);

// Everything between startLabel and endLabel is one internal-control-flow
// region; the register dependencies are attached to endLabel.
generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
startLabel->setStartInternalControlFlow();

// indexReg is the running byte offset into both sources.
generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, indexReg, 0);
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, condReg, tempReg, 16);

// We don't need length anymore as we can calculate the appropriate index by using indexReg and the remainder
// returnReg = length & 0xF (the residue byte count).
generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwinm, node, returnReg, tempReg, 0, 0xF);
// Fewer than 16 bytes: skip the main loop entirely.
generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residueStartLabel, condReg);

// CTR = length / 16 = number of full 16-byte iterations.
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, tempReg, tempReg, 4);
generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tempReg);

generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel);

// main-loop
// Load 16 bytes from each source at the current offset and compare them
// byte-wise for inequality; vec0Reg receives the per-byte compare result.
generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvb16x, node, vec0Reg, indexReg, src1AddrReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvb16x, node, vec1Reg, indexReg, src2AddrReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmpneb_r, node, vec0Reg, vec0Reg, vec1Reg);

// The record form is relied on to update the condition field that condReg is
// pinned to (cr6, see the dependencies below); leave the loop on a mismatch.
generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, resultLabel, condReg);
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 16);

// Decrement CTR and loop while it is non-zero.
generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, condReg);
// main-loop end

// residue start
generateLabelInstruction(cg, TR::InstOpCode::label, node, residueStartLabel);

// temp2Reg = residue count shifted into the most-significant byte, which is
// presumably where lxvll reads its length from (confirm against the Power ISA).
generateShiftLeftImmediateLong(cg, node, temp2Reg, returnReg, 56);
// lxvll is given an already-computed address (base + offset) in tempReg.
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, tempReg, src1AddrReg, indexReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvll, node, vec0Reg, tempReg, temp2Reg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, tempReg, src2AddrReg, indexReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvll, node, vec1Reg, tempReg, temp2Reg);
// Non-record form: the condition field is recomputed by the cmpi4 below.
generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmpneb, node, vec0Reg, vec0Reg, vec1Reg);
// residue end

// result
generateLabelInstruction(cg, TR::InstOpCode::label, node, resultLabel);

// tempReg = count of leading bytes of the compare result whose
// least-significant bit is zero, i.e. the number of leading matching bytes.
generateTrg1Src1Instruction(cg, TR::InstOpCode::vclzlsbb, node, tempReg, vec0Reg);

if (!node->isArrayCmpLen())
{
// For the non-length form the fallback offset is residue - 1, so that the
// byte loads below address the last compared byte when everything matched.
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, returnReg, returnReg, -1);
}

// offset = matched-byte-count == 16 ? remainder : match-byte-count
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, condReg, tempReg, 16);
generateTrg1Src3Instruction(cg, TR::InstOpCode::isellt, node, returnReg, tempReg, returnReg, condReg);

// index = index + offset, if we need to return unmatched index, then we are done here
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, returnReg, indexReg, returnReg);

if (!node->isArrayCmpLen())
{
// Reload the single byte at the computed index from each source and compare
// the two values to obtain the ordering.
generateTrg1Src2Instruction(cg, TR::InstOpCode::lbzx, node, tempReg, returnReg, src1AddrReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::lbzx, node, indexReg, returnReg, src2AddrReg);
generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, condReg, tempReg, indexReg);
// result = -1,0,1
generateTrg1Src1Instruction(cg, TR::InstOpCode::setb, node, tempReg, condReg);
// convert -1,0,1 to 1,0,2 to match current arraycmp return value
// NOTE(review): assuming the two rlwinm immediates are (rotate amount, mask),
// this is neg -> (rotl(x, 2) & 3) + x, mapping 1 -> 1, 0 -> 0, -1 -> 2 -- confirm.
generateTrg1Src1Instruction(cg, TR::InstOpCode::neg, node, tempReg, tempReg);
generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwinm, node, returnReg, tempReg, 2, 3);
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, returnReg, returnReg, tempReg);
}

// One post-condition per register used in the sequence (9 in total). The
// getRegisterDependency(i) indices below refer to the insertion order of the
// addPostCondition calls, so the two must be kept in sync.
int32_t numRegs = 9;

TR::RegisterDependencyConditions *dependencies = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, numRegs, cg->trMemory());
dependencies->addPostCondition(src1AddrReg, TR::RealRegister::NoReg);
dependencies->addPostCondition(src2AddrReg, TR::RealRegister::NoReg);
// tempReg (index 2), returnReg (index 3) and indexReg (index 7) are kept out
// of gr0; each is used above as the first address operand of a load.
dependencies->addPostCondition(tempReg, TR::RealRegister::NoReg);
dependencies->getPostConditions()->getRegisterDependency(2)->setExcludeGPR0();
dependencies->addPostCondition(returnReg, TR::RealRegister::NoReg);
dependencies->getPostConditions()->getRegisterDependency(3)->setExcludeGPR0();
// condReg must be cr6 so that the branch after vcmpneb_r tests the right field.
dependencies->addPostCondition(condReg, TR::RealRegister::cr6);
dependencies->addPostCondition(vec0Reg, TR::RealRegister::NoReg);
dependencies->addPostCondition(vec1Reg, TR::RealRegister::NoReg);
dependencies->addPostCondition(indexReg, TR::RealRegister::NoReg);
dependencies->getPostConditions()->getRegisterDependency(7)->setExcludeGPR0();
dependencies->addPostCondition(temp2Reg, TR::RealRegister::NoReg);

generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, dependencies);
endLabel->setEndInternalControlFlow();

node->setRegister(returnReg);
cg->decReferenceCount(src1AddrNode);
cg->decReferenceCount(src2AddrNode);
cg->decReferenceCount(lengthNode);
// Release every dependency register except the ones listed in liveRegs: the
// two source address registers and the result.
TR::Register *liveRegs[3] = { src1AddrReg, src2AddrReg, returnReg };
dependencies->stopUsingDepRegs(cg, 3, liveRegs);

return returnReg;
}


static TR::Register *inlineArrayCmp(TR::Node *node, TR::CodeGenerator *cg)
{
Expand Down Expand Up @@ -3317,6 +3436,9 @@ static TR::Register *inlineArrayCmp(TR::Node *node, TR::CodeGenerator *cg)
/**
 * Evaluates an arraycmp node by selecting one of the two inline sequences.
 *
 * The Power10 sequence is chosen when the compilation target is at least
 * OMR_PROCESSOR_PPC_P10 and the TR_DisableP10ArrayCmp environment option is
 * not set; otherwise the generic inline sequence is used.
 *
 * @param node arraycmp node to evaluate.
 * @param cg   code generator for the current compilation.
 * @return the register returned by the selected inline sequence.
 */
TR::Register *OMR::Power::TreeEvaluator::arraycmpEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   TR::Compilation *comp = cg->comp();

   // Looked up once and cached for the lifetime of the process.
   static char *disableP10ArrayCmp = feGetEnv("TR_DisableP10ArrayCmp");

   const bool useP10Sequence =
      comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10) && !disableP10ArrayCmp;

   return useP10Sequence ? inlineArrayCmpP10(node, cg)
                         : inlineArrayCmp(node, cg);
   }

Expand Down
6 changes: 6 additions & 0 deletions compiler/p/codegen/PPCBinaryEncoding.cpp
Expand Up @@ -1569,6 +1569,11 @@ void TR::PPCTrg1Src1Instruction::fillBinaryEncodingFields(uint32_t *cursor)
fillFieldXB(self(), cursor, src);
break;

case FORMAT_RT_VRB:
fillFieldRT(self(), cursor, trg);
fillFieldVRB(self(), cursor, src);
break;

default:
TR_ASSERT_FATAL_WITH_INSTRUCTION(self(), false, "Format %d cannot be binary encoded by PPCTrg1Src1Instruction", getOpCode().getFormat());
}
Expand Down Expand Up @@ -2431,6 +2436,7 @@ void TR::PPCTrg1MemInstruction::fillBinaryEncodingFields(uint32_t *cursor)
fillMemoryReferenceRARB(self(), cursor, memRef);
break;

case FORMAT_XT_RA_RB:
case FORMAT_XT_RA_RB_MEM:
fillFieldXT(self(), cursor, trg);
fillMemoryReferenceRARB(self(), cursor, memRef);
Expand Down

0 comments on commit a95aba5

Please sign in to comment.