Skip to content

Commit

Permalink
Accelerate additional cross platform hardware intrinsics (#61649)
Browse files Browse the repository at this point in the history
* Updating Vector64/128/256.IsHardwareAccelerated to be treated as a constant and return true where supported

* Accelerate the CmpOpAll intrinsics

* Accelerate the CmpOpAny intrinsics

* Accelerate the ConvertToDouble/Int32/Int64/Single/UInt32/UInt64 intrinsics

* Applying formatting patch

* Fixing ConvertToInt32 and ConvertToSingle to use the right intrinsic

* Fixing some issues and assert types are correct

* Updating ConvertToDouble and ConvertToSingle to have correct vectorized versions on x86/x64

* Ensure Vector<T>.ConvertToDouble/Single are accelerated

* Swap operands and invert immediate so the constant can be contained on blend

* Restrict ConvertToDouble(Vector128<UInt64>) tests to inputs no more than long.MaxValue

* Ensure that we create a long/ulong rather than a uint
  • Loading branch information
tannergooding committed Jan 4, 2022
1 parent 56ffec2 commit 7172c68
Show file tree
Hide file tree
Showing 50 changed files with 3,188 additions and 866 deletions.
18 changes: 1 addition & 17 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3963,10 +3963,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
break;

case SIMDIntrinsicCast:
case SIMDIntrinsicConvertToSingle:
case SIMDIntrinsicConvertToInt32:
case SIMDIntrinsicConvertToDouble:
case SIMDIntrinsicConvertToInt64:
genSIMDIntrinsicUnOp(simdNode);
break;

Expand Down Expand Up @@ -4051,10 +4047,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
case SIMDIntrinsicCast:
result = INS_mov;
break;
case SIMDIntrinsicConvertToInt32:
case SIMDIntrinsicConvertToInt64:
result = INS_fcvtzs;
break;
case SIMDIntrinsicEqual:
result = INS_fcmeq;
break;
Expand All @@ -4081,10 +4073,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
case SIMDIntrinsicCast:
result = INS_mov;
break;
case SIMDIntrinsicConvertToDouble:
case SIMDIntrinsicConvertToSingle:
result = isUnsigned ? INS_ucvtf : INS_scvtf;
break;
case SIMDIntrinsicEqual:
result = INS_cmeq;
break;
Expand Down Expand Up @@ -4232,11 +4220,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
//
void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
{
assert((simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicCast) ||
(simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicConvertToSingle) ||
(simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicConvertToInt32) ||
(simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicConvertToDouble) ||
(simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicConvertToInt64));
assert(simdNode->GetSIMDIntrinsicId() == SIMDIntrinsicCast);

GenTree* op1 = simdNode->Op(1);
var_types baseType = simdNode->GetSimdBaseType();
Expand Down
156 changes: 145 additions & 11 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19273,21 +19273,59 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op,

NamedIntrinsic intrinsic = NI_Illegal;

#if defined(TARGET_XARCH)
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
}
#endif // TARGET_XARCH

switch (op)
{
#if defined(TARGET_XARCH)
case GT_EQ:
{
intrinsic = (simdSize == 32) ? NI_Vector256_op_Equality : NI_Vector128_op_Equality;
break;
}

case GT_GE:
case GT_GT:
case GT_LE:
case GT_LT:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet

NamedIntrinsic getAllBitsSet = NI_Illegal;

if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Equality;
intrinsic = NI_Vector256_op_Equality;
getAllBitsSet = NI_Vector256_get_AllBitsSet;
}
else
{
intrinsic = NI_Vector128_op_Equality;
intrinsic = NI_Vector128_op_Equality;
getAllBitsSet = NI_Vector128_get_AllBitsSet;
}

op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize,
/* isSimdAsHWIntrinsic */ false);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}

op2 = gtNewSimdHWIntrinsicNode(simdBaseType, getAllBitsSet, simdBaseJitType, simdSize);
break;
}
#elif defined(TARGET_ARM64)
Expand All @@ -19296,6 +19334,45 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op,
intrinsic = (simdSize == 8) ? NI_Vector64_op_Equality : NI_Vector128_op_Equality;
break;
}

case GT_GE:
case GT_GT:
case GT_LE:
case GT_LT:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet

NamedIntrinsic getAllBitsSet = NI_Illegal;

if (simdSize == 8)
{
intrinsic = NI_Vector64_op_Equality;
getAllBitsSet = NI_Vector64_get_AllBitsSet;
}
else
{
intrinsic = NI_Vector128_op_Equality;
getAllBitsSet = NI_Vector128_get_AllBitsSet;
}

op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize,
/* isSimdAsHWIntrinsic */ false);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}

op2 = gtNewSimdHWIntrinsicNode(simdBaseType, getAllBitsSet, simdBaseJitType, simdSize);
break;
}
#else
#error Unsupported platform
#endif // !TARGET_XARCH && !TARGET_ARM64
Expand Down Expand Up @@ -19335,24 +19412,81 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op,

NamedIntrinsic intrinsic = NI_Illegal;

#if defined(TARGET_XARCH)
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
}
#endif // TARGET_XARCH

switch (op)
{
#if defined(TARGET_XARCH)
case GT_NE:
case GT_EQ:
case GT_GE:
case GT_GT:
case GT_LE:
case GT_LT:
{
if (simdSize == 32)
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() != Vector128<TInteger>.Zero

intrinsic = (simdSize == 32) ? NI_Vector256_op_Inequality : NI_Vector128_op_Inequality;

op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize,
/* isSimdAsHWIntrinsic */ false);

if (simdBaseType == TYP_FLOAT)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Inequality;
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else
else if (simdBaseType == TYP_DOUBLE)
{
intrinsic = NI_Vector128_op_Inequality;
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}

op2 = gtNewSimdZeroNode(simdBaseType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false);
break;
}

case GT_NE:
{
intrinsic = (simdSize == 32) ? NI_Vector256_op_Inequality : NI_Vector128_op_Inequality;
break;
}
#elif defined(TARGET_ARM64)
case GT_EQ:
case GT_GE:
case GT_GT:
case GT_LE:
case GT_LT:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() != Vector128<TInteger>.Zero

intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality;

op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize,
/* isSimdAsHWIntrinsic */ false);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}

op2 = gtNewSimdZeroNode(simdBaseType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false);
break;
}

case GT_NE:
{
intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality;
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,

bool isIsaSupported = comp->compHWIntrinsicDependsOn(isa) && comp->compSupportsHWIntrinsic(isa);

if (strcmp(methodName, "get_IsSupported") == 0)
if ((strcmp(methodName, "get_IsSupported") == 0) || (strcmp(methodName, "get_IsHardwareAccelerated") == 0))
{
return isIsaSupported ? (comp->compExactlyDependsOn(isa) ? NI_IsSupported_True : NI_IsSupported_Dynamic)
: NI_IsSupported_False;
Expand Down
Loading

0 comments on commit 7172c68

Please sign in to comment.