New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix target register false dependency of lzcnt, tzcnt, and popcnt #19772

Merged
merged 1 commit into from Aug 31, 2018
Merged
Changes from all commits
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.
+32 −3
Diff settings

Always

Just for now

Fix target register false dependency of lzcnt, tzcnt, and popcnt

  • Loading branch information...
fiigii committed Aug 30, 2018
commit 6957b4f44f0917209df89499b7c4071bb0bc1941
@@ -2179,10 +2179,21 @@ void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
case NI_BMI1_ExtractLowestSetBit:
case NI_BMI1_GetMaskUpToLowestSetBit:
case NI_BMI1_ResetLowestSetBit:
{
assert(op2 == nullptr);
assert((targetType == TYP_INT) || (targetType == TYP_LONG));
genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
break;
}

case NI_BMI1_TrailingZeroCount:
{
assert(op2 == nullptr);
assert((targetType == TYP_INT) || (targetType == TYP_LONG));
// tzcnt has false dependency on the target register on Intel Sandy Bridge and Haswell processors,
This conversation was marked as resolved by SimonCropp

This comment has been minimized.

@tannergooding

tannergooding Aug 30, 2018

Member

Do we want to do this unambiguously? Not all micro-architectures have this false dependency: https://reviews.llvm.org/D40334

This comment has been minimized.

@fiigii

fiigii Aug 30, 2018

Author Collaborator

Adding micro-architecture checks may make troubles for CoreCLR test matrix that we have talked several times. And @damageboy's profiling shows the performance gain is really good (1.8x faster) and I use xor r32, r32 here that is the shortest code size trade-off to fix the issue so far.

// so insert a `XOR target, target` to break the dependency via XOR triggering register renaming.
regNumber targetReg = node->gtRegNum;
getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
break;
}
@@ -2353,6 +2364,10 @@ void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount);

genConsumeOperands(node);
// lzcnt has false dependency on the target register on Intel Sandy Bridge and Haswell processors,
// so insert a `XOR target, target` to break the dependency via XOR triggering register renaming.
regNumber targetReg = node->gtRegNum;
getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
genHWIntrinsic_R_RM(node, INS_lzcnt, emitTypeSize(node->TypeGet()));
genProduceReg(node);
}
@@ -2379,6 +2394,10 @@ void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount);

genConsumeOperands(node);
// popcnt has false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake processors,
// so insert a `XOR target, target` to break the dependency via XOR triggering register renaming.
regNumber targetReg = node->gtRegNum;
getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
genHWIntrinsic_R_RM(node, INS_popcnt, emitTypeSize(node->TypeGet()));
genProduceReg(node);
}
@@ -480,7 +480,7 @@ HARDWARE_INTRINSIC(BMI1_AndNot, "AndNot",
HARDWARE_INTRINSIC(BMI1_ExtractLowestSetBit, "ExtractLowestSetBit", BMI1, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_blsi, INS_blsi, INS_blsi, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI1_GetMaskUpToLowestSetBit, "GetMaskUpToLowestSetBit", BMI1, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_blsmsk, INS_blsmsk, INS_blsmsk, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI1_ResetLowestSetBit, "ResetLowestSetBit", BMI1, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_blsr, INS_blsr, INS_blsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI1_TrailingZeroCount, "TrailingZeroCount", BMI1, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_tzcnt, INS_tzcnt, INS_tzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI1_TrailingZeroCount, "TrailingZeroCount", BMI1, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_tzcnt, INS_tzcnt, INS_tzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
@@ -514,7 +514,7 @@ HARDWARE_INTRINSIC(FMA_MultiplySubtractNegatedScalar, "MultiplySub
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// LZCNT Intrinsics
HARDWARE_INTRINSIC(LZCNT_IsSupported, "get_IsSupported", LZCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lzcnt, INS_invalid, INS_lzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lzcnt, INS_invalid, INS_lzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
@@ -529,7 +529,7 @@ HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported, "get_IsSuppo
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// POPCNT Intrinsics
HARDWARE_INTRINSIC(POPCNT_IsSupported, "get_IsSupported", POPCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns)
#endif // FEATURE_HW_INTRINSIC

#undef HARDWARE_INTRINSIC
Copy path View file
@@ -2584,6 +2584,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
break;
}

case NI_BMI1_TrailingZeroCount:
case NI_LZCNT_LeadingZeroCount:
case NI_POPCNT_PopCount:
{
assert(numArgs == 1);
srcCount += BuildDelayFreeUses(op1);
buildUses = false;
break;
}

default:
{
assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
ProTip! Use n and p to navigate between commits in a pull request.