Skip to content

Commit

Permalink
Add VectorTableList and TableVectorExtension intrinsics (#35600)
Browse files Browse the repository at this point in the history
  • Loading branch information
TamarChristinaArm committed Apr 30, 2020
1 parent 0764d8c commit aa81328
Show file tree
Hide file tree
Showing 25 changed files with 4,776 additions and 20 deletions.
8 changes: 5 additions & 3 deletions src/coreclr/src/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,9 +1222,11 @@ class emitter

#define PERFSCORE_THROUGHPUT_ILLEGAL -1024.0f

#define PERFSCORE_THROUGHPUT_4X 0.25f // Fastest - Quad issue
#define PERFSCORE_THROUGHPUT_3X (1.0f / 3.0f) // Faster - Three issue
#define PERFSCORE_THROUGHPUT_2X 0.5f // Faster - Dual issue
#define PERFSCORE_THROUGHPUT_6X (1.0f / 6.0f) // Hextuple issue
#define PERFSCORE_THROUGHPUT_5X 0.20f // Pentuple issue
#define PERFSCORE_THROUGHPUT_4X 0.25f // Quad issue
#define PERFSCORE_THROUGHPUT_3X (1.0f / 3.0f) // Three issue
#define PERFSCORE_THROUGHPUT_2X 0.5f // Dual issue

#define PERFSCORE_THROUGHPUT_1C 1.0f // Single Issue

Expand Down
116 changes: 102 additions & 14 deletions src/coreclr/src/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,22 @@ void emitter::emitInsSanityCheck(instrDesc* id)
break;

case IF_DV_3C: // DV_3C .Q.........mmmmm ......nnnnnddddd Vd Vn Vm (vector)
switch (id->idIns())
{
case INS_tbl:
case INS_tbl_2regs:
case INS_tbl_3regs:
case INS_tbl_4regs:
case INS_tbx:
case INS_tbx_2regs:
case INS_tbx_3regs:
case INS_tbx_4regs:
elemsize = optGetElemsize(id->idInsOpt());
assert(elemsize == EA_1BYTE);
break;
default:
break;
}
assert(isValidVectorDatasize(id->idOpSize()));
assert(isValidArrangement(id->idOpSize(), id->idInsOpt()));
assert(isVectorRegister(id->idReg1()));
Expand Down Expand Up @@ -3213,15 +3229,16 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
}

//------------------------------------------------------------------------
// insGetLoadStoreRegisterListSize: Returns a size of the register list a given instruction operates on.
// insGetRegisterListSize: Returns the size of the register list a given instruction operates on.
//
// Arguments:
// ins - A Load/Store Vector instruction (e.g. ld1 (2 registers), ld1r, st1).
// ins - An instruction which uses a register list
// (e.g. ld1 (2 registers), ld1r, st1, tbl, tbx).
//
// Return value:
// The number of consecutive SIMD and floating-point registers in the register list the instruction operates on.
//
/*static*/ unsigned emitter::insGetLoadStoreRegisterListSize(instruction ins)
/*static*/ unsigned emitter::insGetRegisterListSize(instruction ins)
{
unsigned registerListSize = 0;

Expand All @@ -3230,6 +3247,8 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
case INS_ld1:
case INS_ld1r:
case INS_st1:
case INS_tbl:
case INS_tbx:
registerListSize = 1;
break;

Expand All @@ -3238,6 +3257,8 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
case INS_ld2r:
case INS_st1_2regs:
case INS_st2:
case INS_tbl_2regs:
case INS_tbx_2regs:
registerListSize = 2;
break;

Expand All @@ -3246,6 +3267,8 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
case INS_ld3r:
case INS_st1_3regs:
case INS_st3:
case INS_tbl_3regs:
case INS_tbx_3regs:
registerListSize = 3;
break;

Expand All @@ -3254,6 +3277,8 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
case INS_ld4r:
case INS_st1_4regs:
case INS_st4:
case INS_tbl_4regs:
case INS_tbx_4regs:
registerListSize = 4;
break;

Expand Down Expand Up @@ -5192,7 +5217,7 @@ void emitter::emitIns_R_R_I(

if (insOptsAnyArrangement(opt))
{
registerListSize = insGetLoadStoreRegisterListSize(ins);
registerListSize = insGetRegisterListSize(ins);
assert(isValidVectorDatasize(size));
assert(isValidArrangement(size, opt));
assert((size * registerListSize) == imm);
Expand Down Expand Up @@ -5226,7 +5251,7 @@ void emitter::emitIns_R_R_I(
assert(isValidArrangement(size, opt));

elemsize = optGetElemsize(opt);
registerListSize = insGetLoadStoreRegisterListSize(ins);
registerListSize = insGetRegisterListSize(ins);
assert((elemsize * registerListSize) == imm);

// Load single structure and replicate post-indexed by an immediate
Expand Down Expand Up @@ -5676,6 +5701,14 @@ void emitter::emitIns_R_R_R(
case INS_eor:
case INS_orr:
case INS_orn:
case INS_tbl:
case INS_tbl_2regs:
case INS_tbl_3regs:
case INS_tbl_4regs:
case INS_tbx:
case INS_tbx_2regs:
case INS_tbx_3regs:
case INS_tbx_4regs:
if (isVectorRegister(reg1))
{
assert(isValidVectorDatasize(size));
Expand Down Expand Up @@ -6612,7 +6645,7 @@ void emitter::emitIns_R_R_I_I(
assert(isValidVectorElemsize(elemsize));
assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1));

registerListSize = insGetLoadStoreRegisterListSize(ins);
registerListSize = insGetRegisterListSize(ins);
assert((elemsize * registerListSize) == (unsigned)imm2);
assert(insOptsPostIndex(opt));

Expand Down Expand Up @@ -11884,7 +11917,7 @@ void emitter::emitDispIns(

case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn
case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn
registerListSize = insGetLoadStoreRegisterListSize(id->idIns());
registerListSize = insGetRegisterListSize(id->idIns());
emitDispVectorRegList(id->idReg1(), registerListSize, id->idInsOpt(), true);

if (fmt == IF_LS_2D)
Expand All @@ -11903,7 +11936,7 @@ void emitter::emitDispIns(

case IF_LS_2F: // LS_2F .Q.............. xx.Sssnnnnnttttt Vt[] Rn
case IF_LS_2G: // LS_2G .Q.............. xx.Sssnnnnnttttt Vt[] Rn
registerListSize = insGetLoadStoreRegisterListSize(id->idIns());
registerListSize = insGetRegisterListSize(id->idIns());
elemsize = id->idOpSize();
emitDispVectorElemList(id->idReg1(), registerListSize, elemsize, id->idSmallCns(), true);

Expand Down Expand Up @@ -11967,7 +12000,7 @@ void emitter::emitDispIns(

case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm
case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm
registerListSize = insGetLoadStoreRegisterListSize(id->idIns());
registerListSize = insGetRegisterListSize(id->idIns());

if (fmt == IF_LS_3F)
{
Expand Down Expand Up @@ -12468,9 +12501,25 @@ void emitter::emitDispIns(

case IF_DV_3C: // DV_3C .Q.........mmmmm ......nnnnnddddd Vd Vn Vm (vector)
emitDispVectorReg(id->idReg1(), id->idInsOpt(), true);
if (ins != INS_mov)
switch (ins)
{
emitDispVectorReg(id->idReg2(), id->idInsOpt(), true);
case INS_tbl:
case INS_tbl_2regs:
case INS_tbl_3regs:
case INS_tbl_4regs:
case INS_tbx:
case INS_tbx_2regs:
case INS_tbx_3regs:
case INS_tbx_4regs:
registerListSize = insGetRegisterListSize(ins);
elemsize = id->idOpSize();
emitDispVectorRegList(id->idReg2(), registerListSize, id->idInsOpt(), true);
break;
case INS_mov:
break;
default:
emitDispVectorReg(id->idReg2(), id->idInsOpt(), true);
break;
}
emitDispVectorReg(id->idReg3(), id->idInsOpt(), false);
break;
Expand Down Expand Up @@ -14145,9 +14194,48 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
}
break;

case IF_DV_3C: // mov,and, bic, eor, mov,mvn, orn, bsl, bit, bif (vector)
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency = PERFSCORE_LATENCY_1C;
case IF_DV_3C: // mov,and, bic, eor, mov,mvn, orn, bsl, bit, bif,
// tbl, tbx (vector)
switch (ins)
{
case INS_tbl:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency = PERFSCORE_LATENCY_1C;
break;
case INS_tbl_2regs:
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
result.insLatency = PERFSCORE_LATENCY_2C;
break;
case INS_tbl_3regs:
result.insThroughput = PERFSCORE_THROUGHPUT_4X;
result.insLatency = PERFSCORE_LATENCY_3C;
break;
case INS_tbl_4regs:
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
result.insLatency = PERFSCORE_LATENCY_4C;
break;
case INS_tbx:
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
result.insLatency = PERFSCORE_LATENCY_2C;
break;
case INS_tbx_2regs:
result.insThroughput = PERFSCORE_THROUGHPUT_4X;
result.insLatency = PERFSCORE_LATENCY_3C;
break;
case INS_tbx_3regs:
result.insThroughput = PERFSCORE_THROUGHPUT_5X;
result.insLatency = PERFSCORE_LATENCY_4C;
break;
case INS_tbx_4regs:
result.insThroughput = PERFSCORE_THROUGHPUT_6X;
result.insLatency = PERFSCORE_LATENCY_5C;
break;
default:
// All other instructions
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency = PERFSCORE_LATENCY_1C;
break;
}
break;

case IF_DV_2E: // mov, dup (scalar)
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/src/jit/emitarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,9 +453,9 @@ static emitAttr optGetSrcsize(insOpts conversion);
// for an element of size 'elemsize' in a vector register of size 'datasize'
static bool isValidVectorIndex(emitAttr datasize, emitAttr elemsize, ssize_t index);

// For a given Load/Store Vector instruction 'ins' returns a number of consecutive SIMD registers
// the instruction loads to/store from.
static unsigned insGetLoadStoreRegisterListSize(instruction ins);
// For a given instruction 'ins' which uses a register list, returns the
// number of consecutive SIMD registers the instruction operates on.
static unsigned insGetRegisterListSize(instruction ins);

/************************************************************************/
/* Public inline informational methods */
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/src/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ HARDWARE_INTRINSIC(AdvSimd, SqrtScalar,
HARDWARE_INTRINSIC(AdvSimd, Store, -1, 2, {INS_st1, INS_st1, INS_st1, INS_st1, INS_st1, INS_st1, INS_st1, INS_st1, INS_st1, INS_st1}, HW_Category_MemoryStore, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AdvSimd, Subtract, -1, 2, {INS_sub, INS_sub, INS_sub, INS_sub, INS_sub, INS_sub, INS_sub, INS_sub, INS_fsub, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, SubtractScalar, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sub, INS_sub, INS_fsub, INS_fsub}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, VectorTableLookup, 8, 2, {INS_tbl, INS_tbl, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, VectorTableLookupExtension, 8, 3, {INS_tbx, INS_tbx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(AdvSimd, Xor, -1, 2, {INS_eor, INS_eor, INS_eor, INS_eor, INS_eor, INS_eor, INS_eor, INS_eor, INS_eor, INS_eor}, HW_Category_SimpleSIMD, HW_Flag_Commutative)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
Expand Down Expand Up @@ -195,6 +197,8 @@ HARDWARE_INTRINSIC(AdvSimd_Arm64, TransposeEven, -
HARDWARE_INTRINSIC(AdvSimd_Arm64, TransposeOdd, -1, 2, {INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2, INS_trn2}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, UnzipEven, -1, 2, {INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1, INS_uzp1}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, UnzipOdd, -1, 2, {INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2, INS_uzp2}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, VectorTableLookup, 16, 2, {INS_tbl, INS_tbl, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, VectorTableLookupExtension, 16, 3, {INS_tbx, INS_tbx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(AdvSimd_Arm64, ZipHigh, -1, 2, {INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2, INS_zip2}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, ZipLow, -1, 2, {INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1, INS_zip1}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)

Expand Down
25 changes: 25 additions & 0 deletions src/coreclr/src/jit/instrsarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1832,6 +1832,31 @@ INST1(uxtl, "uxtl", 0, 0, IF_DV_2O, 0x2F00A400)

INST1(uxtl2, "uxtl2", 0, 0, IF_DV_2O, 0x6F00A400)
// uxtl2 Vd,Vn DV_2O 011011110iiiiiii 101001nnnnnddddd 6F00 A400 Vd,Vn (shift - vector)

INST1(tbl, "tbl", 0, 0, IF_DV_3C, 0x0E000000)
// tbl Vd,{Vn},Vm DV_3C 0Q001110000mmmmm 000000nnnnnddddd 0E00 0000 Vd,Vn,Vm (vector)

INST1(tbl_2regs, "tbl", 0, 0, IF_DV_3C, 0x0E002000)
// tbl Vd,{Vn,Vn+1},Vm DV_3C 0Q001110000mmmmm 001000nnnnnddddd 0E00 2000 Vd,Vn,Vm (vector)

INST1(tbl_3regs, "tbl", 0, 0, IF_DV_3C, 0x0E004000)
// tbl Vd,{Vn,Vn+1,Vn+2},Vm DV_3C 0Q001110000mmmmm 010000nnnnnddddd 0E00 4000 Vd,Vn,Vm (vector)

INST1(tbl_4regs, "tbl", 0, 0, IF_DV_3C, 0x0E006000)
// tbl Vd,{Vn,Vn+1,Vn+2,Vn+3},Vm DV_3C 0Q001110000mmmmm 011000nnnnnddddd 0E00 6000 Vd,Vn,Vm (vector)

INST1(tbx, "tbx", 0, 0, IF_DV_3C, 0x0E001000)
// tbx Vd,{Vn},Vm DV_3C 0Q001110000mmmmm 000100nnnnnddddd 0E00 1000 Vd,Vn,Vm (vector)

INST1(tbx_2regs, "tbx", 0, 0, IF_DV_3C, 0x0E003000)
// tbx Vd,{Vn,Vn+1},Vm DV_3C 0Q001110000mmmmm 001100nnnnnddddd 0E00 3000 Vd,Vn,Vm (vector)

INST1(tbx_3regs, "tbx", 0, 0, IF_DV_3C, 0x0E005000)
// tbx Vd,{Vn,Vn+1,Vn+2},Vm DV_3C 0Q001110000mmmmm 010100nnnnnddddd 0E00 5000 Vd,Vn,Vm (vector)

INST1(tbx_4regs, "tbx", 0, 0, IF_DV_3C, 0x0E007000)
// tbx Vd,{Vn,Vn+1,Vn+2,Vn+3},Vm DV_3C 0Q001110000mmmmm 011100nnnnnddddd 0E00 7000 Vd,Vn,Vm (vector)

// clang-format on

/*****************************************************************************/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@
<Compile Include="TransposeOdd.Vector128.UInt16.cs" />
<Compile Include="TransposeOdd.Vector128.UInt32.cs" />
<Compile Include="TransposeOdd.Vector128.UInt64.cs" />
<Compile Include="VectorTableLookup.Vector128.Byte.cs" />
<Compile Include="VectorTableLookup.Vector128.SByte.cs" />
<Compile Include="VectorTableLookupExtension.Vector128.Byte.cs" />
<Compile Include="VectorTableLookupExtension.Vector128.SByte.cs" />
<Compile Include="UnzipEven.Vector64.Byte.cs" />
<Compile Include="UnzipEven.Vector64.Int16.cs" />
<Compile Include="UnzipEven.Vector64.Int32.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@
<Compile Include="TransposeOdd.Vector128.UInt16.cs" />
<Compile Include="TransposeOdd.Vector128.UInt32.cs" />
<Compile Include="TransposeOdd.Vector128.UInt64.cs" />
<Compile Include="VectorTableLookup.Vector128.Byte.cs" />
<Compile Include="VectorTableLookup.Vector128.SByte.cs" />
<Compile Include="VectorTableLookupExtension.Vector128.Byte.cs" />
<Compile Include="VectorTableLookupExtension.Vector128.SByte.cs" />
<Compile Include="UnzipEven.Vector64.Byte.cs" />
<Compile Include="UnzipEven.Vector64.Int16.cs" />
<Compile Include="UnzipEven.Vector64.Int32.cs" />
Expand Down
Loading

0 comments on commit aa81328

Please sign in to comment.