Skip to content

Commit

Permalink
Merge pull request #6874 from Akira1Saitoh/aarch64ImproveMask
Browse files Browse the repository at this point in the history
AArch64: Improve mFirstTrue and mLastTrue evaluators
  • Loading branch information
knn-k committed Feb 2, 2023
2 parents 4147c5e + a2107cf commit 9801b8a
Showing 1 changed file with 56 additions and 101 deletions.
157 changes: 56 additions & 101 deletions compiler/aarch64/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1335,81 +1335,58 @@ OMR::ARM64::TreeEvaluator::mFirstTrueEvaluator(TR::Node *node, TR::CodeGenerator
{
case TR::Int8:
/*
* shrn v1.8b, v0.8h, #7 ; Moves mask values to bit 0 and 1 of the lane 0 - 7.
* sli v1.16b, v1.16b, #6 ; Shifts left v1 by 6 bits and inserts into v1. bit 6 - 9 of the lane 0, 1, 2 and 3 have mask values.
* ushr v1.8h, v1.8h, #6 ; Moves mask values to bit 0 - 3 of each lane.
* sli v1.8h, v1.8h, #12 ; Shifts left v1 by 12 bits and inserts into v1. bit 12 - 19 of the lane 0 and 1 have mask values.
* umov x0, v1.2d[0] ; Mask values at bit 12 - 19 and bit 44 - 51.
* bfi x0, x0, #24, #20 ; Inserts bit 0 - 19 into bit 24 - 43.
* ubfx x0, x0, #36, #16 ; Moves bit 36 - 51 to bit 0 - 15.
* orr w0, w0, #0x10000 ; Sets the bit 16.
* rbit w0, w0 ; Reverses bits.
* clz w0, w0 ; Counts leading zeros.
* shrn v1.8b, v0.8h, #4 ; Moves mask values to bit 0-3 and 4-7 of the lane 0 - 7.
* umov x0, v1.2d[0]
* rbit x0, x0 ; Reverses bits.
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #2 ; Divides by 4.
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, tempReg, maskReg, 7);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli16b, node, tempReg, tempReg, 6);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vushr8h, node, tempReg, tempReg, 6);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli8h, node, tempReg, tempReg, 12);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, tempReg, maskReg, 4);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 24, 20, true);
generateUBFXInstruction(cg, node, resReg, resReg, 36, 16, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x400); // immr=16, imms=0 for 0x10000
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitx, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 2, true);
break;
case TR::Int16:
/*
* shrn v1.4h, v0.4s, #15 ; Moves mask values to bit 0 and 1 of the lane 0 - 3.
* sli v1.8h, v1.8h, #14 ; Shifts left v1 by 14 bits and inserts into v1. bit 14 - 17 of the lane 0 and 1 have mask values.
* shrn v1.4h, v0.4s, #8 ; Moves mask values to byte 0 - 7.
* umov x0, v1.2d[0]
* bfi x0, x0, #28, #18 ; Inserts bit 0 - 17 into bit 28 - 45.
* ubfx x0, x0, #42, #8 ; Moves bit 42 - 49 to bit 0 - 7.
* orr w0, w0, #0x100 ; Sets the bit 8.
* rbit w0, w0 ; Reverses bits.
* clz w0, w0 ; Counts leading zeros.
* rbit x0, x0 ; Reverses bits.
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #3 ; Divides by 8.
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_4h, node, tempReg, maskReg, 15);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli8h, node, tempReg, tempReg, 14);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_4h, node, tempReg, maskReg, 8);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 28, 18, true);
generateUBFXInstruction(cg, node, resReg, resReg, 42, 8, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x600); // immr=24, imms=0 for 0x100
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
break;
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitx, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 3, true);
case TR::Int32:
case TR::Float:
/*
* shrn v1.2s, v0.2d, #31
* shrn v1.2s, v0.2d, #16 ; Moves mask values to lane 0 - 3 (viewed as short vector)
* umov x0, v1.2d[0]
* bfi x0, x0, #30, #2
* ubfx x0, x0, #30, #4
* orr w0, w0, #0x10
* rbit w0, w0
* clz w0, w0
* rbit x0, x0 ; Reverses bits.
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #4 ; Divides by 16.
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_2s, node, tempReg, maskReg, 31);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_2s, node, tempReg, maskReg, 16);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 30, 2, true);
generateUBFXInstruction(cg, node, resReg, resReg, 30, 4, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x700); // immr=28, imms=0 for 0x10
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitx, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 4, true);
break;
case TR::Int64:
case TR::Double:
/*
* ext v1.16b, v0.16b, v0.16b, #1
* umov w0, v1.4s[3] ; Byte 3 has the mask value of the lane 0. Byte 2 has the value of the lane 1.
* orr w0, w0, #0x8000 ; Sets the bit 15.
* ext v1.16b, v0.16b, v0.16b, #2
* umov w0, v1.4s[3] ; Byte 2-3 has the mask value of the lane 0. Byte 0-1 has the value of the lane 1.
* clz w0, w0 ; Counts leading zeros.
* lsr w0, w0, #3 ; Divide by 8.
* lsr w0, w0, #4 ; Divides by 16.
*/
generateTrg1Src2ImmInstruction(cg, TR::InstOpCode::vext16b, node, tempReg, maskReg, maskReg, 1);
generateTrg1Src2ImmInstruction(cg, TR::InstOpCode::vext16b, node, tempReg, maskReg, maskReg, 2);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovws, node, resReg, tempReg, 3);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x440); // immr=17, imms=0 for 0x8000
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 3, false);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 4, false);
break;
default:
TR_ASSERT_FATAL_WITH_NODE(node, false, "Unexpected element type");
Expand Down Expand Up @@ -1444,90 +1421,68 @@ OMR::ARM64::TreeEvaluator::mLastTrueEvaluator(TR::Node *node, TR::CodeGenerator
{
case TR::Int8:
/*
* shrn v1.8b, v0.8h, #7 ; Moves mask values to bit 0 and 1 of the lane 0 - 7.
* sli v1.16b, v1.16b, #6 ; Shifts left v1 by 6 bits and inserts into v1. bit 6 - 9 of the lane 0, 1, 2 and 3 have mask values.
* ushr v1.8h, v1.8h, #6 ; Moves mask values to bit 0 - 3 of each lane.
* sli v1.8h, v1.8h, #12 ; Shifts left v1 by 12 bits and inserts into v1. bit 12 - 19 of the lane 0 and 1 have mask values.
* umov x0, v1.2d[0] ; Mask values at bit 12 - 19 and bit 44 - 51.
* bfi x0, x0, #24, #20 ; Inserts bit 0 - 19 into bit 24 - 43.
* lsr x0, x0, #20 ; Moves mask values to bit 16 - 31.
* orr w0, w0, #0x8000 ; Sets the bit 15.
* mov w1, #15
* clz w0, w0
* shrn v1.8b, v0.8h, #4 ; Moves mask values to bit 0-3 and 4-7 of the lane 0 - 7.
* umov x0, v1.2d[0]
* mov x1, #15
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #2 ; Divides by 4.
* sub x0, x1, x0
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, tempReg, maskReg, 7);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli16b, node, tempReg, tempReg, 6);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vushr8h, node, tempReg, tempReg, 6);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli8h, node, tempReg, tempReg, 12);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, tempReg, maskReg, 4);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 24, 20, true);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 20, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x440); // immr=17, imms=0 for 0x8000
loadConstant32(cg, node, 15, maxLaneReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 2, true);
generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, node, resReg, maxLaneReg, resReg);
break;
case TR::Int16:
/*
* shrn v1.4h, v0.4s, #15 ; Moves mask values to bit 0 and 1 of the lane 0 - 3.
* sli v1.8h, v1.8h, #14 ; Shifts left v1 by 14 bits and inserts into v1. bit 14 - 17 of the lane 0 and 1 have mask values.
* umov w0, v1.2d[0]
* bfi x0, x0, #28, #18 ; Inserts bit 0 - 17 into bit 28 - 45.
* lsr x0, x0, #18 ; Moves mask values to bit 24 - 31.
* orr w0, w0, #0x800000 ; Sets the bit 23.
* mov w1, #7
* clz w0, w0
* shrn v1.4h, v0.4s, #8 ; Moves mask values to byte 0 - 7.
* umov x0, v1.2d[0]
* mov x1, #7
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #3 ; Divides by 8.
* sub x0, x1, x0
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_4h, node, tempReg, maskReg, 15);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vsli8h, node, tempReg, tempReg, 14);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_4h, node, tempReg, maskReg, 8);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 28, 18, true);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 18, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x240); // immr=9, imms=0 for 0x800000
loadConstant32(cg, node, 7, maxLaneReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 3, true);
generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, node, resReg, maxLaneReg, resReg);
break;
case TR::Int32:
case TR::Float:
/*
* shrn v1.2s, v0.2d, #31
* shrn v1.2s, v0.2d, #16 ; Moves mask values to lane 0 - 3 (viewed as short vector)
* umov x0, v1.2d[0]
* bfi x0, x0, #30, #2
* lsr x0, x0, #2
* orr w0, w0, #0x8000000
* mov w1, #3
* clz w0, w0
* mov x1, #3
* clz x0, x0 ; Counts leading zeros.
* lsr x0, x0, #4 ; Divides by 16.
* sub x0, x1, x0
*/
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_2s, node, tempReg, maskReg, 31);
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_2s, node, tempReg, maskReg, 16);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resReg, tempReg, 0);
generateBFIInstruction(cg, node, resReg, resReg, 30, 2, true);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 2, true);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x140); // immr=5, imms=0 for 0x8000000
loadConstant32(cg, node, 3, maxLaneReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 4, true);
generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, node, resReg, maxLaneReg, resReg);
break;
case TR::Int64:
case TR::Double:
/*
* ext v1.16b, v0.16b, v0.16b, #9
* ext v1.16b, v0.16b, v0.16b, #2
* umov w0, v1.4s[3]
* orr w0, w0, #0x8000
* mov w1, #1
* clz w0, w0
* lsr w0, w0, #3
* lsr w0, w0, #4
* sub x0, x1, x0
*/
generateTrg1Src2ImmInstruction(cg, TR::InstOpCode::vext16b, node, tempReg, maskReg, maskReg, 9);
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovws, node, resReg, tempReg, 3);
generateLogicalImmInstruction(cg, TR::InstOpCode::orrimmw, node, resReg, resReg, false, 0x440); // immr=17, imms=0 for 0x8000
loadConstant32(cg, node, 1, maxLaneReg);
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzw, node, resReg, resReg);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 3, false);
generateLogicalShiftRightImmInstruction(cg, node, resReg, resReg, 4, false);
generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, node, resReg, maxLaneReg, resReg);
break;
default:
Expand Down

0 comments on commit 9801b8a

Please sign in to comment.