
Improve Math(F).FusedMultiplyAdd codegen #27060

Merged 23 commits on Nov 6, 2019
Changes from 5 commits
50 changes: 42 additions & 8 deletions src/jit/importer.cpp
@@ -4029,14 +4029,48 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
// Vector128.CreateScalar(z)
// ).ToScalar();

-GenTree* op3 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, impPopStack().val,
-                                        NI_Vector128_CreateScalarUnsafe, callType, 16);
-GenTree* op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, impPopStack().val,
-                                        NI_Vector128_CreateScalarUnsafe, callType, 16);
-GenTree* op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, impPopStack().val,
-                                        NI_Vector128_CreateScalarUnsafe, callType, 16);
-GenTree* res =
-    gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, NI_FMA_MultiplyAddScalar, callType, 16);
+// Depending on additional GT_NEG nodes it can be:
+//
+//  x *  y + z -> NI_FMA_MultiplyAddScalar
+//  x * -y + z -> NI_FMA_MultiplyAddNegatedScalar
+// -x *  y + z -> NI_FMA_MultiplyAddNegatedScalar
+// -x * -y + z -> NI_FMA_MultiplyAddScalar
+//  x *  y - z -> NI_FMA_MultiplySubtractScalar
+//  x * -y - z -> NI_FMA_MultiplySubtractNegatedScalar
+// -x *  y - z -> NI_FMA_MultiplySubtractNegatedScalar
+// -x * -y - z -> NI_FMA_MultiplySubtractScalar
+
+GenTree* z = impPopStack().val;
+GenTree* y = impPopStack().val;
+GenTree* x = impPopStack().val;
+
+bool           negMul = x->OperIs(GT_NEG) ^ y->OperIs(GT_NEG);
+NamedIntrinsic fma;
+if (x->OperIs(GT_NEG))
+{
+    x = x->gtGetOp1();
+}
+if (y->OperIs(GT_NEG))
+{
+    y = y->gtGetOp1();
+}
+if (z->OperIs(GT_NEG))
+{
+    z = z->gtGetOp1();
+    fma = negMul ? NI_FMA_MultiplySubtractNegatedScalar : NI_FMA_MultiplySubtractScalar;
+}
+else
+{
+    fma = negMul ? NI_FMA_MultiplyAddNegatedScalar : NI_FMA_MultiplyAddScalar;
+}
+
+GenTree* op3 =
+    gtNewSimdHWIntrinsicNode(TYP_SIMD16, z, NI_Vector128_CreateScalarUnsafe, callType, 16);
+GenTree* op2 =
+    gtNewSimdHWIntrinsicNode(TYP_SIMD16, y, NI_Vector128_CreateScalarUnsafe, callType, 16);
+GenTree* op1 =
+    gtNewSimdHWIntrinsicNode(TYP_SIMD16, x, NI_Vector128_CreateScalarUnsafe, callType, 16);
+GenTree* res = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, fma, callType, 16);

retNode = gtNewSimdHWIntrinsicNode(callType, res, NI_Vector128_ToScalar, callType, 16);
}
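For reference, the selection above is a pure function of which operands carry a GT_NEG: negations on x and y cancel pairwise, so only their parity (negMul) matters, while a negation on z turns the add into a subtract. Below is a minimal standalone C++ sketch of that decision table; FmaVariant and pickFma are illustrative names for this note, not JIT APIs.

#include <cassert>

enum class FmaVariant
{
    MultiplyAddScalar,            //   x * y  + z -> vfmadd...ss/sd
    MultiplyAddNegatedScalar,     // -(x * y) + z -> vfnmadd...ss/sd
    MultiplySubtractScalar,       //   x * y  - z -> vfmsub...ss/sd
    MultiplySubtractNegatedScalar // -(x * y) - z -> vfnmsub...ss/sd
};

// xNeg/yNeg/zNeg: whether the corresponding operand was wrapped in GT_NEG.
FmaVariant pickFma(bool xNeg, bool yNeg, bool zNeg)
{
    bool negMul = xNeg ^ yNeg; // negations on x and y cancel pairwise
    if (zNeg)
    {
        return negMul ? FmaVariant::MultiplySubtractNegatedScalar : FmaVariant::MultiplySubtractScalar;
    }
    return negMul ? FmaVariant::MultiplyAddNegatedScalar : FmaVariant::MultiplyAddScalar;
}

int main()
{
    // One assert per row of the comment table above.
    assert(pickFma(false, false, false) == FmaVariant::MultiplyAddScalar);
    assert(pickFma(false, true,  false) == FmaVariant::MultiplyAddNegatedScalar);
    assert(pickFma(true,  false, false) == FmaVariant::MultiplyAddNegatedScalar);
    assert(pickFma(true,  true,  false) == FmaVariant::MultiplyAddScalar);
    assert(pickFma(false, false, true)  == FmaVariant::MultiplySubtractScalar);
    assert(pickFma(false, true,  true)  == FmaVariant::MultiplySubtractNegatedScalar);
    assert(pickFma(true,  false, true)  == FmaVariant::MultiplySubtractNegatedScalar);
    assert(pickFma(true,  true,  true)  == FmaVariant::MultiplySubtractScalar);
    return 0;
}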
48 changes: 24 additions & 24 deletions src/jit/instrsxarch.h
@@ -557,30 +557,30 @@ INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE,
INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values
INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) //
INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd132pd, "fmnadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
-INST3(vfnmadd213pd, "fmnadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd231pd, "fmnadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd132ps, "fmnadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
-INST3(vfnmadd213ps, "fmnadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd231ps, "fmnadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd132sd, "fmnadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
-INST3(vfnmadd213sd, "fmnadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd231sd, "fmnadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd132ss, "fmnadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
-INST3(vfnmadd213ss, "fmnadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmadd231ss, "fmnadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub132pd, "fmnsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
-INST3(vfnmsub213pd, "fmnsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub231pd, "fmnsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub132ps, "fmnsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
-INST3(vfnmsub213ps, "fmnsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub231ps, "fmnsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub132sd, "fmnsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
-INST3(vfnmsub213sd, "fmnsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub231sd, "fmnsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub132ss, "fmnsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
-INST3(vfnmsub213ss, "fmnsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) //
-INST3(vfnmsub231ss, "fmnsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) //
+INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) //
INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)

// BMI1
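The instrsxarch.h change is purely a mnemonic spelling fix: the opcodes (SSE38(0x9C) through SSE38(0xBF)) are unchanged on every line, so the encodings were already correct and only the printed disassembly said fmnadd/fmnsub instead of the real AVX mnemonics fnmadd/fnmsub. As a reminder of what the negated forms compute, vfnmadd is -(x * y) + z and vfnmsub is -(x * y) - z; the sketch below (not part of the PR) checks the importer's eight-row table against std::fma.

#include <cassert>
#include <cmath>

// Reference semantics of the four scalar FMA forms:
//   fmadd  =   x * y  + z     fmsub  =   x * y  - z
//   fnmadd = -(x * y) + z     fnmsub = -(x * y) - z
static double fmadd(double x, double y, double z)  { return std::fma(x, y, z); }
static double fmsub(double x, double y, double z)  { return std::fma(x, y, -z); }
static double fnmadd(double x, double y, double z) { return std::fma(-x, y, z); }
static double fnmsub(double x, double y, double z) { return std::fma(-x, y, -z); }

int main()
{
    const double x = 1.5, y = 2.0, z = 0.25;

    // One assert per row of the importer's table.
    assert(fmadd(x, y, z)  == std::fma( x,  y,  z)); //  x *  y + z
    assert(fnmadd(x, y, z) == std::fma( x, -y,  z)); //  x * -y + z
    assert(fnmadd(x, y, z) == std::fma(-x,  y,  z)); // -x *  y + z
    assert(fmadd(x, y, z)  == std::fma(-x, -y,  z)); // -x * -y + z
    assert(fmsub(x, y, z)  == std::fma( x,  y, -z)); //  x *  y - z
    assert(fnmsub(x, y, z) == std::fma( x, -y, -z)); //  x * -y - z
    assert(fnmsub(x, y, z) == std::fma(-x,  y, -z)); // -x *  y - z
    assert(fmsub(x, y, z)  == std::fma(-x, -y, -z)); // -x * -y - z
    return 0;
}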