diff --git a/src/jit/codegen.h b/src/jit/codegen.h index be20865c0eec..78bb83b645f0 100644 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -95,6 +95,7 @@ class CodeGen : public CodeGenInterface static bool genShouldRoundFP(); GenTreeIndir indirForm(var_types type, GenTree* base); + GenTreeStoreInd storeIndirForm(var_types type, GenTree* base, GenTree* data); GenTreeIntCon intForm(var_types type, ssize_t value); @@ -1042,6 +1043,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genConsumeRegs(GenTree* tree); void genConsumeOperands(GenTreeOp* tree); +#ifdef FEATURE_HW_INTRINSICS + void genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* tree); +#endif // FEATURE_HW_INTRINSICS void genEmitGSCookieCheck(bool pushReg); void genSetRegToIcon(regNumber reg, ssize_t val, var_types type = TYP_INT, insFlags flags = INS_FLAGS_DONT_CARE); void genCodeForShift(GenTree* tree); @@ -1311,6 +1315,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(_TARGET_XARCH_) void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival); + void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival); #endif void inst_RV_RR(instruction ins, emitAttr size, regNumber reg1, regNumber reg2); diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index b90cf5548df1..bebcde5fd62d 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -11296,6 +11296,17 @@ GenTreeIndir CodeGen::indirForm(var_types type, GenTree* base) return i; } +//------------------------------------------------------------------------ +// indirForm: Make a temporary indir we can feed to pattern matching routines +// in cases where we don't want to instantiate all the indirs that happen. +// +GenTreeStoreInd CodeGen::storeIndirForm(var_types type, GenTree* base, GenTree* data) +{ + GenTreeStoreInd i(type, base, data); + i.gtRegNum = REG_NA; + return i; +} + //------------------------------------------------------------------------ // intForm: Make a temporary int we can feed to pattern matching routines // in cases where we don't want to instantiate. diff --git a/src/jit/codegenlinear.cpp b/src/jit/codegenlinear.cpp index 149bb3333f59..6579a19dd8d1 100644 --- a/src/jit/codegenlinear.cpp +++ b/src/jit/codegenlinear.cpp @@ -1333,12 +1333,27 @@ void CodeGen::genConsumeRegs(GenTree* tree) // Update the life of the lcl var. genUpdateLife(tree); } -#endif // _TARGET_XARCH_ - else if (tree->OperIsInitVal()) +#ifdef FEATURE_HW_INTRINSICS + else if (tree->OperIs(GT_HWIntrinsic)) { - genConsumeReg(tree->gtGetOp1()); + // Only load/store HW intrinsics can be contained (and the address may also be contained). + HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(tree->AsHWIntrinsic()->gtHWIntrinsicId); + assert((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore)); + int numArgs = HWIntrinsicInfo::lookupNumArgs(tree->AsHWIntrinsic()); + genConsumeAddress(tree->gtGetOp1()); + if (category == HW_Category_MemoryStore) + { + assert((numArgs == 2) && !tree->gtGetOp2()->isContained()); + genConsumeReg(tree->gtGetOp2()); + } + else + { + assert(numArgs == 1); + } } - else if (tree->OperIsHWIntrinsic()) +#endif // FEATURE_HW_INTRINSICS +#endif // _TARGET_XARCH_ + else if (tree->OperIsInitVal()) { genConsumeReg(tree->gtGetOp1()); } @@ -1368,11 +1383,6 @@ void CodeGen::genConsumeRegs(GenTree* tree) // Return Value: // None. // -// Notes: -// Note that this logic is localized here because we must do the liveness update in -// the correct execution order. This is important because we may have two operands -// that involve the same lclVar, and if one is marked "lastUse" we must handle it -// after the first. void CodeGen::genConsumeOperands(GenTreeOp* tree) { @@ -1389,6 +1399,55 @@ void CodeGen::genConsumeOperands(GenTreeOp* tree) } } +#ifdef FEATURE_HW_INTRINSICS +//------------------------------------------------------------------------ +// genConsumeHWIntrinsicOperands: Do liveness update for the operands of a GT_HWIntrinsic node +// +// Arguments: +// node - the GenTreeHWIntrinsic node whose operands will have their liveness updated. +// +// Return Value: +// None. +// + +void CodeGen::genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* node) +{ + int numArgs = HWIntrinsicInfo::lookupNumArgs(node); + GenTree* op1 = node->gtGetOp1(); + if (op1 == nullptr) + { + assert((numArgs == 0) && (node->gtGetOp2() == nullptr)); + return; + } + if (op1->OperIs(GT_LIST)) + { + int foundArgs = 0; + assert(node->gtGetOp2() == nullptr); + for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest()) + { + GenTree* operand = list->Current(); + genConsumeRegs(operand); + foundArgs++; + } + assert(foundArgs == numArgs); + } + else + { + genConsumeRegs(op1); + GenTree* op2 = node->gtGetOp2(); + if (op2 != nullptr) + { + genConsumeRegs(op2); + assert(numArgs == 2); + } + else + { + assert(numArgs == 1); + } + } +} +#endif // FEATURE_HW_INTRINSICS + #if FEATURE_PUT_STRUCT_ARG_STK //------------------------------------------------------------------------ // genConsumePutStructArgStk: Do liveness update for the operands of a PutArgStk node. diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index bbaeb81d0fe6..b4fb40833fcf 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -2867,6 +2867,12 @@ void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, G id->idReg1(dstReg); emitHandleMemOp(mem, id, IF_RWR_ARD, ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + if (Is4ByteSSEInstruction(ins)) + { + // The 4-Byte SSE instructions require an additional byte. + sz += 1; + } + id->idCodeSize(sz); dispIns(id); emitCurIGsize += sz; @@ -4055,6 +4061,12 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre emitHandleMemOp(indir, id, IF_RRW_ARD, ins); UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + if (Is4ByteSSEInstruction(ins)) + { + // The 4-Byte SSE instructions require an additional byte. + sz += 1; + } + id->idCodeSize(sz); dispIns(id); @@ -4106,8 +4118,8 @@ void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, reg if (Is4ByteSSEInstruction(ins)) { - // The 4-Byte SSE instructions require two additional bytes - sz += 2; + // The 4-Byte SSE instructions require an additional byte. + sz += 1; } id->idCodeSize(sz); @@ -5183,8 +5195,8 @@ void emitter::emitIns_R_AR(instruction ins, emitAttr attr, regNumber ireg, regNu if (Is4ByteSSEInstruction(ins)) { - // The 4-Byte SSE instructions require two additional bytes - sz += 2; + // The 4-Byte SSE instructions require an additional byte. + sz += 1; } id->idCodeSize(sz); @@ -5658,7 +5670,7 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu #ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ -// emitIns_SIMD_R_R_I: emits the code for a SIMD instruction that takes a register operand, an immediate operand +// emitIns_SIMD_R_R_I: emits the code for an instruction that takes a register operand, an immediate operand // and that returns a value in register // // Arguments: @@ -5668,6 +5680,13 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu // op1Reg -- The register of the first operand // ival -- The immediate value // +// Notes: +// This will handle the required register copy if 'op1Reg' and 'targetReg' are not the same, and +// the 3-operand format is not available. +// This is not really SIMD-specific, but is currently only used in that context, as that's +// where we frequently need to handle the case of generating 3-operand or 2-operand forms +// depending on what target ISA is supported. +// void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival) { if (UseVEXEncoding() || IsDstSrcImmAvxInstruction(ins)) @@ -5722,12 +5741,14 @@ void emitter::emitIns_SIMD_R_R_A( // targetReg -- The target register // op1Reg -- The register of the first operand // base -- The base register used for the memory address +// offset -- The memory offset // -void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base) +void emitter::emitIns_SIMD_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset) { if (UseVEXEncoding()) { - emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, 0); + emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset); } else { @@ -5735,7 +5756,7 @@ void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targ { emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); } - emitIns_R_AR(ins, attr, targetReg, base, 0); + emitIns_R_AR(ins, attr, targetReg, base, offset); } } diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 5b06838b0709..bad81b7cbbcd 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -449,7 +449,8 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base); +void emitIns_SIMD_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); void emitIns_SIMD_R_R_C( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index 623a98c22ca8..2b6520682caa 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp @@ -3510,6 +3510,26 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costSz = 2 * 2; break; +#if defined(FEATURE_HW_INTRINSICS) && defined(_TARGET_XARCH_) + case GT_HWIntrinsic: + { + if (tree->AsHWIntrinsic()->OperIsMemoryLoadOrStore()) + { + costEx = IND_COST_EX; + costSz = 2; + // See if we can form a complex addressing mode. + + GenTree* addr = op1->gtEffectiveVal(); + + if (addr->OperIs(GT_ADD) && gtMarkAddrMode(addr, &costEx, &costSz, tree->TypeGet())) + { + goto DONE; + } + } + } + break; +#endif // FEATURE_HW_INTRINSICS && _TARGET_XARCH_ + case GT_BLK: case GT_IND: diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 9660ae346ef1..9eada1ef9de6 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -109,32 +109,50 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { case 1: { - genConsumeOperands(node); - op1Reg = op1->gtRegNum; - if (node->OperIsMemoryLoad()) { - emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0); - } - else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) - { - emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); - } - else if ((ival != -1) && varTypeIsFloating(baseType)) - { - assert((ival >= 0) && (ival <= 127)); - genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival); + genConsumeAddress(op1); + // Until we improve the handling of addressing modes in the emitter, we'll create a + // temporary GT_IND to generate code with. + GenTreeIndir load = indirForm(node->TypeGet(), op1); + emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load); } else { - genHWIntrinsic_R_RM(node, ins, simdSize); + genConsumeRegs(op1); + op1Reg = op1->gtRegNum; + + if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) + { + emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); + } + else if ((ival != -1) && varTypeIsFloating(baseType)) + { + assert((ival >= 0) && (ival <= 127)); + genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival); + } + else + { + genHWIntrinsic_R_RM(node, ins, simdSize); + } } break; } case 2: { - genConsumeOperands(node); + if (category == HW_Category_MemoryStore) + { + genConsumeAddress(op1); + genConsumeReg(op2); + // Until we improve the handling of addressing modes in the emitter, we'll create a + // temporary GT_STORE_IND to generate code with. + GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2); + emit->emitInsStoreInd(ins, simdSize, &store); + break; + } + genConsumeRegs(op1); + genConsumeRegs(op2); op1Reg = op1->gtRegNum; op2Reg = op2->gtRegNum; @@ -153,25 +171,30 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) op1Reg = targetReg; } - if (category == HW_Category_MemoryStore) - { - emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0); - } - else if ((ival != -1) && varTypeIsFloating(baseType)) + if ((ival != -1) && varTypeIsFloating(baseType)) { assert((ival >= 0) && (ival <= 127)); genHWIntrinsic_R_R_RM_I(node, ins, ival); } else if (category == HW_Category_MemoryLoad) { + // Get the address and the 'other' register. + GenTree* addr; + regNumber otherReg; if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad) { - emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg); + addr = op1; + otherReg = op2Reg; } else { - emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg); + addr = op2; + otherReg = op1Reg; } + // Until we improve the handling of addressing modes in the emitter, we'll create a + // temporary GT_IND to generate code with. + GenTreeIndir load = indirForm(node->TypeGet(), addr); + genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load); } else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) { @@ -210,10 +233,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case 3: { - assert(op1->OperIsList()); - assert(op1->gtGetOp2()->OperIsList()); - assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); - GenTreeArgList* argList = op1->AsArgList(); op1 = argList->Current(); genConsumeRegs(op1); @@ -520,99 +539,8 @@ void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, i { assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId)); assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1); - - TempDsc* tmpDsc = nullptr; - unsigned varNum = BAD_VAR_NUM; - unsigned offset = (unsigned)-1; - - if (op1->isUsedFromSpillTemp()) - { - assert(op1->IsRegOptional()); - - tmpDsc = getSpillTempDsc(op1); - varNum = tmpDsc->tdTempNum(); - offset = 0; - - regSet.tmpRlsTemp(tmpDsc); - } - else if (op1->OperIsHWIntrinsic()) - { - emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival); - return; - } - else if (op1->isIndir()) - { - GenTreeIndir* memIndir = op1->AsIndir(); - GenTree* memBase = memIndir->gtOp1; - - switch (memBase->OperGet()) - { - case GT_LCL_VAR_ADDR: - { - varNum = memBase->AsLclVarCommon()->GetLclNum(); - offset = 0; - - // Ensure that all the GenTreeIndir values are set to their defaults. - assert(!memIndir->HasIndex()); - assert(memIndir->Scale() == 1); - assert(memIndir->Offset() == 0); - - break; - } - - case GT_CLS_VAR_ADDR: - { - emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival); - return; - } - - default: - { - emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival); - return; - } - } - } - else - { - switch (op1->OperGet()) - { - case GT_LCL_FLD: - { - GenTreeLclFld* lclField = op1->AsLclFld(); - - varNum = lclField->GetLclNum(); - offset = lclField->gtLclFld.gtLclOffs; - break; - } - - case GT_LCL_VAR: - { - assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate()); - varNum = op1->AsLclVar()->GetLclNum(); - offset = 0; - break; - } - - default: - unreached(); - break; - } - } - - // Ensure we got a good varNum and offset. - // We also need to check for `tmpDsc != nullptr` since spill temp numbers - // are negative and start with -1, which also happens to be BAD_VAR_NUM. - assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); - assert(offset != (unsigned)-1); - - emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival); - } - else - { - regNumber op1Reg = op1->gtRegNum; - emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival); } + inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival); } //------------------------------------------------------------------------ @@ -681,7 +609,11 @@ void CodeGen::genHWIntrinsic_R_R_RM( } else if (op2->OperIsHWIntrinsic()) { - emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum); + GenTree* addr = op2->gtGetOp1(); + // Until we improve the handling of addressing modes in the emitter, we'll create a + // temporary GT_IND to generate code with. + GenTreeIndir load = indirForm(node->TypeGet(), addr); + emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load); return; } else if (op2->isIndir()) @@ -1267,7 +1199,7 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi // node - The hardware intrinsic node // // Note: -// We currently assume that all base intrinsics only have a single operand. +// We currently assume that all base intrinsics have zero or one operand. // void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) { @@ -1279,15 +1211,10 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) assert(compiler->compSupports(InstructionSet_SSE)); assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE)); - GenTree* op1 = node->gtGetOp1(); - regNumber op1Reg = REG_NA; + GenTree* op1 = node->gtGetOp1(); - if (op1 != nullptr) - { - assert(!op1->OperIsList()); - op1Reg = op1->gtRegNum; - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); + regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum; assert(node->gtGetOp2() == nullptr); @@ -1418,11 +1345,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) regNumber op4Reg = REG_NA; emitter* emit = getEmitter(); - if ((op1 != nullptr) && !op1->OperIsList()) - { - op1Reg = op1->gtRegNum; - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); switch (intrinsicId) { @@ -1529,6 +1452,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(op2 == nullptr); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType); + op1Reg = op1->gtRegNum; emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0); break; } @@ -1568,11 +1492,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) regNumber op2Reg = REG_NA; emitter* emit = getEmitter(); - if ((op1 != nullptr) && !op1->OperIsList()) - { - op1Reg = op1->gtRegNum; - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); switch (intrinsicId) { @@ -1588,6 +1508,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) assert((ival >= 0) && (ival <= 127)); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); + op1Reg = op1->gtRegNum; op2Reg = op2->gtRegNum; emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival); @@ -1711,6 +1632,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) if (varTypeIsIntegral(baseType)) { assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG); + op1Reg = op1->gtRegNum; emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg); } else @@ -1748,6 +1670,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) op2Reg = op2->gtRegNum; instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); + op1Reg = op1->gtRegNum; emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0); break; } @@ -1783,16 +1706,13 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) regNumber op4Reg = REG_NA; emitter* emit = getEmitter(); - if ((op1 != nullptr) && !op1->OperIsList()) - { - op1Reg = op1->gtRegNum; - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); switch (intrinsicId) { case NI_SSE41_TestAllOnes: { + op1Reg = op1->gtRegNum; regNumber tmpReg = node->GetSingleTempReg(); assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest); emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg); @@ -1845,12 +1765,12 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) if (baseType == TYP_FLOAT) { // extract instructions return to GP-registers, so it needs int size as the emitsize - emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i); + inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i); emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg); } else { - emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i); + inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i); } }; @@ -1896,8 +1816,8 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node) var_types targetType = node->TypeGet(); emitter* emit = getEmitter(); + genConsumeHWIntrinsicOperands(node); regNumber op1Reg = op1->gtRegNum; - genConsumeOperands(node); assert(targetReg != REG_NA); assert(op1Reg != REG_NA); @@ -1966,18 +1886,15 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) regNumber targetReg = node->gtRegNum; emitter* emit = getEmitter(); - if ((op1 != nullptr) && !op1->OperIsList()) - { - op1Reg = op1->gtRegNum; - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); switch (intrinsicId) { case NI_AVX2_ConvertToInt32: case NI_AVX2_ConvertToUInt32: { - assert(op2 == nullptr); + op1Reg = op1->gtRegNum; + assert(numArgs == 1); assert((baseType == TYP_INT) || (baseType == TYP_UINT)); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg); @@ -1992,16 +1909,13 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) GenTreeArgList* list = op1->AsArgList(); op1 = list->Current(); op1Reg = op1->gtRegNum; - genConsumeRegs(op1); list = list->Rest(); op2 = list->Current(); op2Reg = op2->gtRegNum; - genConsumeRegs(op2); list = list->Rest(); GenTree* op3 = list->Current(); - genConsumeRegs(op3); list = list->Rest(); GenTree* op4 = nullptr; @@ -2017,12 +1931,11 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) if (numArgs == 5) { assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256); - op4 = list->Current(); - list = list->Rest(); - lastOp = list->Current(); - op3Reg = op3->gtRegNum; - op4Reg = op4->gtRegNum; - genConsumeRegs(op4); + op4 = list->Current(); + list = list->Rest(); + lastOp = list->Current(); + op3Reg = op3->gtRegNum; + op4Reg = op4->gtRegNum; addrBaseReg = op2Reg; addrIndexReg = op3Reg; indexOp = op3; @@ -2157,10 +2070,7 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) assert(targetReg != REG_NA); assert(op1 != nullptr); - if (!op1->OperIsList()) - { - genConsumeOperands(node); - } + genConsumeHWIntrinsicOperands(node); switch (intrinsicId) { @@ -2224,16 +2134,13 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) { GenTreeArgList* argList = op1->AsArgList(); op1 = argList->Current(); - genConsumeRegs(op1); - op1Reg = op1->gtRegNum; - argList = argList->Rest(); - op2 = argList->Current(); - genConsumeRegs(op2); - op2Reg = op2->gtRegNum; - argList = argList->Rest(); - GenTree* op3 = argList->Current(); - genConsumeRegs(op3); - op3Reg = op3->gtRegNum; + op1Reg = op1->gtRegNum; + argList = argList->Rest(); + op2 = argList->Current(); + op2Reg = op2->gtRegNum; + argList = argList->Rest(); + GenTree* op3 = argList->Current(); + op3Reg = op3->gtRegNum; assert(op3Reg != op1Reg); assert(op3Reg != targetReg); assert(op3Reg != REG_EDX); @@ -2288,22 +2195,16 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node) regNumber targetReg = node->gtRegNum; assert(HWIntrinsicInfo::lookupNumArgs(node) == 3); - assert(op1 != nullptr); - assert(op1->OperIsList()); - assert(op1->gtGetOp2()->OperIsList()); - assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); + genConsumeHWIntrinsicOperands(node); GenTreeArgList* argList = op1->AsArgList(); op1 = argList->Current(); - genConsumeRegs(op1); argList = argList->Rest(); GenTree* op2 = argList->Current(); - genConsumeRegs(op2); argList = argList->Rest(); GenTree* op3 = argList->Current(); - genConsumeRegs(op3); regNumber op1Reg; regNumber op2Reg; diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index 60e0f801e2f1..6434831fdfdf 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -114,11 +114,11 @@ HARDWARE_INTRINSIC(SSE_ConvertScalarToVector128Single, "ConvertScal HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_LoadScalarVector128, "LoadScalarVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_LoadScalarVector128, "LoadScalarVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -141,13 +141,13 @@ HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalS HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE_StoreFence, "StoreFence", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -227,12 +227,12 @@ HARDWARE_INTRINSIC(SSE2_Divide, "Divide", HARDWARE_INTRINSIC(SSE2_DivideScalar, "DivideScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2_Extract, "Extract", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_pextrw, INS_pextrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_Insert, "Insert", SSE2, -1, 16, 3, {INS_invalid, INS_invalid, INS_pinsrw, INS_pinsrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_LoadFence, "LoadFence", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_LoadHigh, "LoadHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_LoadLow, "LoadLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_LoadVector128, "LoadVector128", SSE2, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_LoadHigh, "LoadHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_LoadLow, "LoadLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_LoadVector128, "LoadVector128", SSE2, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_MaskMove, "MaskMove", SSE2, -1, 16, 3, {INS_maskmovdqu, INS_maskmovdqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_Max, "Max", SSE2, -1, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2_MemoryFence, "MemoryFence", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) @@ -260,13 +260,13 @@ HARDWARE_INTRINSIC(SSE2_ShuffleHigh, "ShuffleHigh HARDWARE_INTRINSIC(SSE2_ShuffleLow, "ShuffleLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE2_Sqrt, "Sqrt", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_SqrtScalar, "SqrtScalar", SSE2, -1, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_StoreAligned, "StoreAligned", SSE2, -1, 16, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE2, -1, 16, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_invalid, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_StoreHigh, "StoreHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_StoreLow, "StoreLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movlpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_StoreNonTemporal, "StoreNonTemporal", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(SSE2_StoreScalar, "StoreScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_StoreAligned, "StoreAligned", SSE2, -1, 16, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE2, -1, 16, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_invalid, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_StoreHigh, "StoreHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_StoreLow, "StoreLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movlpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_StoreNonTemporal, "StoreNonTemporal", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(SSE2_StoreScalar, "StoreScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_Subtract, "Subtract", SSE2, -1, 16, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2_SubtractSaturate, "SubtractSaturate", SSE2, -1, 16, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2_SubtractScalar, "SubtractScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -286,7 +286,7 @@ HARDWARE_INTRINSIC(SSE2_X64_ConvertToUInt64, "ConvertToUI HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Double, "ConvertScalarToVector128Double", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Int64, "ConvertScalarToVector128Int64", SSE2_X64, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128UInt64, "ConvertScalarToVector128UInt64", SSE2_X64, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal, "StoreNonTemporal", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal, "StoreNonTemporal", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags @@ -297,8 +297,8 @@ HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSuppo HARDWARE_INTRINSIC(SSE3_AddSubtract, "AddSubtract", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE3_HorizontalAdd, "HorizontalAdd", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE3_HorizontalSubtract, "HorizontalSubtract", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128, "LoadAndDuplicateToVector128", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE3_LoadDquVector128, "LoadDquVector128", SSE3, -1, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128, "LoadAndDuplicateToVector128", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE3_LoadDquVector128, "LoadDquVector128", SSE3, -1, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE3_MoveAndDuplicate, "MoveAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE3_MoveHighAndDuplicate, "MoveHighAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE3_MoveLowAndDuplicate, "MoveLowAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) @@ -339,7 +339,7 @@ HARDWARE_INTRINSIC(SSE41_Extract, "Extract", HARDWARE_INTRINSIC(SSE41_Floor, "Floor", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_FloorScalar, "FloorScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE41_Insert, "Insert", SSE41, -1, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_Max, "Max", SSE41, -1, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE41_Min, "Min", SSE41, -1, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE41_MinHorizontal, "MinHorizontal", SSE41, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) @@ -403,9 +403,9 @@ HARDWARE_INTRINSIC(AVX_AndNot, "AndNot", HARDWARE_INTRINSIC(AVX_Blend, "Blend", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Ceiling, "Ceiling", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Compare, "Compare", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_CompareScalar, "CompareScalar", AVX, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX_ConvertToVector128Int32, "ConvertToVector128Int32", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) @@ -424,12 +424,12 @@ HARDWARE_INTRINSIC(AVX_Floor, "Floor", HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Max, "Max", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX_Min, "Min", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX_MaskLoad, "MaskLoad", AVX, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize) +HARDWARE_INTRINSIC(AVX_MaskLoad, "MaskLoad", AVX, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_UnfixedSIMDSize) HARDWARE_INTRINSIC(AVX_MaskStore, "MaskStore", AVX, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX_MoveMask, "MoveMask", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -446,9 +446,9 @@ HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity, "RoundToPosi HARDWARE_INTRINSIC(AVX_RoundToZero, "RoundToZero", AVX, 11, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Shuffle, "Shuffle", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Subtract, "Subtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_TestC, "TestC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX_TestNotZAndNotC, "TestNotZAndNotC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg) @@ -472,9 +472,9 @@ HARDWARE_INTRINSIC(AVX2_AndNot, "AndNot", HARDWARE_INTRINSIC(AVX2_Average, "Average", AVX2, -1, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_Blend, "Blend", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_UnfixedSIMDSize|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX2, -1, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX2, -1, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX2, -1, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX2, -1, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_CompareEqual, "CompareEqual", AVX2, -1, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_CompareGreaterThan, "CompareGreaterThan", AVX2, -1, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_ExtractVector128, "ExtractVector128", AVX2, -1, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) @@ -495,9 +495,9 @@ HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalA HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate, "HorizontalSubtractSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_InsertVector128, "InsertVector128", AVX2, -1, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2_MaskLoad, "MaskLoad", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize) -HARDWARE_INTRINSIC(AVX2_MaskStore, "MaskStore", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2_MaskLoad, "MaskLoad", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_UnfixedSIMDSize) +HARDWARE_INTRINSIC(AVX2_MaskStore, "MaskStore", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX2_Max, "Max", AVX2, -1, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_Min, "Min", AVX2, -1, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_MoveMask, "MoveMask", AVX2, -1, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index fcc20e04c642..dc298ed2303c 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -847,6 +847,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case 1: argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); + if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST)) + { + // Although the API specifies a pointer, if what we have is a BYREF, that's what + // we really want, so throw away the cast. + if (op1->gtGetOp1()->TypeGet() == TYP_BYREF) + { + op1 = op1->gtGetOp1(); + } + } retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); break; case 2: diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp index cbab22be761a..72a3fb93bb0c 100644 --- a/src/jit/instr.cpp +++ b/src/jit/instr.cpp @@ -1039,7 +1039,126 @@ void CodeGen::inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regN getEmitter()->emitIns_R_R_I(ins, size, reg1, reg2, ival); } -#endif + +#ifdef FEATURE_HW_INTRINSICS +//------------------------------------------------------------------------ +// inst_RV_TT_IV: Generates an instruction that takes 3 operands: +// a register operand, an operand that may be memory or register and an immediate +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// reg1 -- The first operand, a register +// rmOp -- The second operand, which may be a memory node or a node producing a register +// ival -- The immediate operand +// +// Notes: +// This isn't really specific to HW intrinsics, but depends on other methods that are +// only defined for FEATURE_HW_INTRINSICS, and is currently only used in that context. +// +void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival) +{ + noway_assert(getEmitter()->emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + + if (rmOp->isContained() || rmOp->isUsedFromSpillTemp()) + { + TempDsc* tmpDsc = nullptr; + unsigned varNum = BAD_VAR_NUM; + unsigned offset = (unsigned)-1; + + if (rmOp->isUsedFromSpillTemp()) + { + assert(rmOp->IsRegOptional()); + + tmpDsc = getSpillTempDsc(rmOp); + varNum = tmpDsc->tdTempNum(); + offset = 0; + + regSet.tmpRlsTemp(tmpDsc); + } + else if (rmOp->OperIsHWIntrinsic()) + { + getEmitter()->emitIns_R_AR_I(ins, attr, reg1, rmOp->gtGetOp1()->gtRegNum, 0, ival); + return; + } + else if (rmOp->isIndir()) + { + GenTreeIndir* memIndir = rmOp->AsIndir(); + GenTree* memBase = memIndir->gtOp1; + + switch (memBase->OperGet()) + { + case GT_LCL_VAR_ADDR: + { + varNum = memBase->AsLclVarCommon()->GetLclNum(); + offset = 0; + + // Ensure that all the GenTreeIndir values are set to their defaults. + assert(!memIndir->HasIndex()); + assert(memIndir->Scale() == 1); + assert(memIndir->Offset() == 0); + + break; + } + + case GT_CLS_VAR_ADDR: + { + getEmitter()->emitIns_R_C_I(ins, attr, reg1, memBase->gtClsVar.gtClsVarHnd, 0, ival); + return; + } + + default: + { + getEmitter()->emitIns_R_A_I(ins, attr, reg1, memIndir, ival); + return; + } + } + } + else + { + switch (rmOp->OperGet()) + { + case GT_LCL_FLD: + { + GenTreeLclFld* lclField = rmOp->AsLclFld(); + + varNum = lclField->GetLclNum(); + offset = lclField->gtLclFld.gtLclOffs; + break; + } + + case GT_LCL_VAR: + { + assert(rmOp->IsRegOptional() || !compiler->lvaGetDesc(rmOp->gtLclVar.gtLclNum)->lvIsRegCandidate()); + varNum = rmOp->AsLclVar()->GetLclNum(); + offset = 0; + break; + } + + default: + unreached(); + break; + } + } + + // Ensure we got a good varNum and offset. + // We also need to check for `tmpDsc != nullptr` since spill temp numbers + // are negative and start with -1, which also happens to be BAD_VAR_NUM. + assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); + assert(offset != (unsigned)-1); + + getEmitter()->emitIns_R_S_I(ins, attr, reg1, varNum, offset, ival); + } + else + { + regNumber rmOpReg = rmOp->gtRegNum; + getEmitter()->emitIns_SIMD_R_R_I(ins, attr, reg1, rmOpReg, ival); + } +} +#endif // FEATURE_HW_INTRINSICS + +#endif // _TARGET_XARCH_ /***************************************************************************** * diff --git a/src/jit/lower.h b/src/jit/lower.h index e29bb9c4d645..e0a7c64d2f97 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -104,6 +104,7 @@ class Lowering : public Phase void ContainCheckSIMD(GenTreeSIMD* simdNode); #endif // FEATURE_SIMD #ifdef FEATURE_HW_INTRINSICS + void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 292fb93c537d..373f881a8a2f 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2488,6 +2488,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge switch (category) { + case HW_Category_MemoryLoad: + supportsGeneralLoads = (!node->OperIsHWIntrinsic()); + break; + case HW_Category_SimpleSIMD: { // These intrinsics only expect 16 or 32-byte nodes for containment @@ -2664,6 +2668,15 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge break; } + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + { + // The memory form of this already takes a pointer, and cannot be further contained. + // The containable form is the one that takes a SIMD value, that may be in memory. + supportsGeneralLoads = (node->TypeGet() == TYP_SIMD16); + break; + } + case NI_SSE_ConvertScalarToVector128Single: case NI_SSE2_ConvertScalarToVector128Double: case NI_SSE2_ConvertScalarToVector128Int32: @@ -2781,6 +2794,28 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge } } +//---------------------------------------------------------------------------------------------- +// ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware +// intrinsic node. +// +// Arguments: +// node - The hardware intrinsic node +// pAddr - The "parent" pointer to the address operand, so that we can update the operand +// of the parent as needed. +// +void Lowering::ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr) +{ + assert(((*pAddr)->TypeGet() == TYP_I_IMPL) || ((*pAddr)->TypeGet() == TYP_BYREF)); + TryCreateAddrMode(LIR::Use(BlockRange(), pAddr, node), true); + GenTree* addr = *pAddr; + if ((addr->OperIs(GT_CLS_VAR_ADDR, GT_LCL_VAR_ADDR) || + (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) || (addr->OperGet() == GT_LEA)) && + IsSafeToContainMem(node, addr)) + { + MakeSrcContained(node, addr); + } +} + //---------------------------------------------------------------------------------------------- // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node. // @@ -2800,7 +2835,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (!HWIntrinsicInfo::SupportsContainment(intrinsicId)) { - // AVX2 gather are not contaibable and always have constant IMM argument + // AVX2 gather are not containable and always have constant IMM argument if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId)) { GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); @@ -2825,6 +2860,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) switch (category) { + case HW_Category_MemoryLoad: + { + GenTree** pAddr = &node->gtOp1; + ContainCheckHWIntrinsicAddr(node, pAddr); + break; + } case HW_Category_SimpleSIMD: case HW_Category_SIMDScalar: case HW_Category_Scalar: @@ -2905,6 +2946,26 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) switch (category) { + case HW_Category_MemoryLoad: + { + GenTree** pAddr = nullptr; + if ((intrinsicId == NI_AVX_MaskLoad) || (intrinsicId == NI_AVX2_MaskLoad)) + { + pAddr = &node->gtOp.gtOp1; + } + else + { + pAddr = &node->gtOp.gtOp2; + } + ContainCheckHWIntrinsicAddr(node, pAddr); + break; + } + case HW_Category_MemoryStore: + { + GenTree** pAddr = &node->gtOp1; + ContainCheckHWIntrinsicAddr(node, pAddr); + break; + } case HW_Category_SimpleSIMD: case HW_Category_SIMDScalar: case HW_Category_Scalar: @@ -3113,6 +3174,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) switch (category) { + case HW_Category_MemoryStore: + { + GenTree** pAddr = &node->gtOp.gtOp1->gtOp.gtOp1; + ContainCheckHWIntrinsicAddr(node, pAddr); + break; + } case HW_Category_SimpleSIMD: case HW_Category_SIMDScalar: case HW_Category_Scalar: diff --git a/src/jit/lsra.h b/src/jit/lsra.h index c0ce4fbdc356..8494699a0de9 100644 --- a/src/jit/lsra.h +++ b/src/jit/lsra.h @@ -1535,6 +1535,7 @@ class LinearScan : public LinearScanInterface int BuildOperandUses(GenTree* node, regMaskTP candidates = RBM_NONE); int BuildDelayFreeUses(GenTree* node, regMaskTP candidates = RBM_NONE); int BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates = RBM_NONE); + int BuildAddrUses(GenTree* addr, regMaskTP candidates = RBM_NONE); void HandleFloatVarArgs(GenTreeCall* call, GenTree* argNode, bool* callHasFloatRegArgs); RefPosition* BuildDef(GenTree* tree, regMaskTP dstCandidates = RBM_NONE, int multiRegIdx = 0); void BuildDefs(GenTree* tree, int dstCount, regMaskTP dstCandidates = RBM_NONE); diff --git a/src/jit/lsrabuild.cpp b/src/jit/lsrabuild.cpp index 38dfca69859c..aada9b2f75ff 100644 --- a/src/jit/lsrabuild.cpp +++ b/src/jit/lsrabuild.cpp @@ -2672,6 +2672,11 @@ RefPosition* LinearScan::BuildUse(GenTree* operand, regMaskTP candidates, int mu int LinearScan::BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates) { GenTree* const addr = indirTree->gtOp1; + return BuildAddrUses(addr, candidates); +} + +int LinearScan::BuildAddrUses(GenTree* addr, regMaskTP candidates) +{ if (!addr->isContained()) { BuildUse(addr, candidates); @@ -2725,11 +2730,17 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates) { return BuildIndirUses(node->AsIndir(), candidates); } +#ifdef FEATURE_HW_INTRINSICS if (node->OperIsHWIntrinsic()) { + if (node->AsHWIntrinsic()->OperIsMemoryLoad()) + { + return BuildAddrUses(node->gtGetOp1()); + } BuildUse(node->gtGetOp1(), candidates); return 1; } +#endif // FEATURE_HW_INTRINSICS return 0; } diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 364a4b7103a2..c430db049d60 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2699,11 +2699,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) { assert((numArgs > 0) && (numArgs < 4)); - srcCount += BuildOperandUses(op1); + if (intrinsicTree->OperIsMemoryLoadOrStore()) + { + srcCount += BuildAddrUses(op1); + } + else + { + srcCount += BuildOperandUses(op1); + } if (op2 != nullptr) { - srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2); + if (op2->OperIs(GT_HWIntrinsic) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) + { + srcCount += BuildAddrUses(op2->gtGetOp1()); + } + else if (isRMW) + { + srcCount += BuildDelayFreeUses(op2); + } + else + { + srcCount += BuildOperandUses(op2); + } if (op3 != nullptr) { diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs new file mode 100644 index 000000000000..0e8f03c977c9 --- /dev/null +++ b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs @@ -0,0 +1,143 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Threading; + +// Test folding of addressing expressions + +public class Program +{ + struct S + { + public float f0; + public float f1; + public float f2; + public float f3; + public float f4; + public float f5; + public float f6; + public float f7; + public float f8; + public float f9; + public float f10; + public float f11; + public float f12; + public float f13; + public float f14; + public float f15; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe int Test(ref S s, Vector128 v, int offset) + { + int returnVal = 100; + + if (Sse2.IsSupported) + { + fixed (float* p = &s.f0) + { + // We need an address aligned on 16 bytes, so we need to add a *float* offset to get there. + int alignmentOffset = (0x10 - ((int)p & 0xc)) >> 2; + try + { + // This is the aligned case. + // We're going to store a scalar at an offset of 2 from the aligned location. + // As it happens, we know that the struct has been initialized to all zeros, + // and the vector passed in was all ones, so now we have a one at offset 2. + Sse2.StoreScalar(p + alignmentOffset + 2, Sse2.Subtract(v, Sse2.LoadAlignedVector128(p + offset + alignmentOffset + 4))); + + // Now do a load from the aligned location. + // That should give us {0, 0, 1, 0}. + Vector128 v2; + if (Sse41.IsSupported) + { + v2 = Sse41.LoadAlignedVector128NonTemporal((byte*)(p + alignmentOffset)).AsSingle(); + } + else + { + v2 = Sse2.LoadVector128((byte*)(p + alignmentOffset)).AsSingle(); + } + if (!v2.Equals(Vector128.Create(0.0F, 0.0F, 1.0F, 0.0F))) + { + Console.WriteLine("Aligned case FAILED: v2 = " + v2); + returnVal = -1; + } + + // This is the unaligned case. The value we're loading to subtract is one element earlier than what we just stored. + // So we're doing { 1, 1, 1, 1 } - { 0, 1, 0, 0 } = { 1, 0, 1, 1 } + Sse2.Store(p + alignmentOffset + 1, Sse2.Subtract(v, Sse2.LoadVector128(p + offset + alignmentOffset + 1))); + // Now do an unaligned load from that location. + v2 = Sse2.LoadVector128(p + alignmentOffset + 1); + if (!v2.Equals(Vector128.Create(1.0F, 0.0F, 1.0F, 1.0F))) + { + Console.WriteLine("Unaligned case FAILED: v2 = " + v2); + returnVal = -1; + } + + } + catch (Exception e) + { + Console.WriteLine("Unexpected exception: " + e.Message); + returnVal = -1; + } + } + } + return returnVal; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe int Test256(ref S s, Vector256 v, int offset) + { + int returnVal = 100; + if (Avx.IsSupported) + { + // offset must be a multiple of the vector size in floats. + offset &= ~3; + fixed (float* p = &s.f0) + { + try + { + Avx.Store(p + 1, Avx.Subtract(v, Avx.LoadVector256(p + offset + 1))); + Vector256 v2 = Avx.LoadVector256(p + 1); + if (!v2.Equals(v)) + { + Console.WriteLine("Vector256 case FAILED: v = " + v + ", v2 = " + v2); + returnVal = -1; + } + } + catch (Exception e) + { + Console.WriteLine("Unexpected exception: " + e.Message); + returnVal = -1; + } + } + } + return returnVal; + } + + static int Main() + { + S s = new S(); + Vector128 v = Vector128.Create(1.0F); + int returnVal = Test(ref s, v, 0); + if (returnVal != 100) + { + Console.WriteLine("Vector128 test failed."); + } + + // Get a new vector initialized to zeros. + S s2 = new S(); + Vector256 v2 = Vector256.Create(1.0F); + if (Test256(ref s2, v2, 4) != 100) + { + Console.WriteLine("Vector256 test failed."); + returnVal = -1; + } + return returnVal; + } +} diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj new file mode 100644 index 000000000000..1ef998940bf2 --- /dev/null +++ b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj @@ -0,0 +1,16 @@ + + + + + $(MSBuildProjectName) + Exe + None + True + True + + + + + + +