Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono] Implement Rdm and Dp #49737

Merged
merged 10 commits into from
Jun 24, 2021
6 changes: 5 additions & 1 deletion src/mono/mono/mini/aot-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -8236,7 +8236,7 @@ parse_cpu_features (const gchar *attr)
// if we disable a feature from the SSE-AVX tree we also need to disable all dependencies
if (!enabled && (feature & MONO_CPU_X86_FULL_SSEAVX_COMBINED))
feature = (MonoCPUFeatures) (MONO_CPU_X86_FULL_SSEAVX_COMBINED & ~feature);

#elif defined(TARGET_ARM64)
// MONO_CPU_ARM64_BASE is unconditionally set in mini_get_cpu_features.
if (!strcmp (attr + prefix, "crc"))
Expand All @@ -8245,6 +8245,10 @@ parse_cpu_features (const gchar *attr)
feature = MONO_CPU_ARM64_CRYPTO;
else if (!strcmp (attr + prefix, "neon"))
feature = MONO_CPU_ARM64_NEON;
else if (!strcmp (attr + prefix, "rdm"))
feature = MONO_CPU_ARM64_RDM;
else if (!strcmp (attr + prefix, "dotprod"))
feature = MONO_CPU_ARM64_DP;
#elif defined(TARGET_WASM)
if (!strcmp (attr + prefix, "simd"))
feature = MONO_CPU_WASM_SIMD;
Expand Down
6 changes: 6 additions & 0 deletions src/mono/mono/mini/llvm-intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#define Widen INTRIN_kind_widen
#define WidenAcross INTRIN_kind_widen_across
#define Across INTRIN_kind_across
#define Arm64DotProd INTRIN_kind_arm64_dot_prod
#if !defined(Generic)
#define Generic
#endif
Expand Down Expand Up @@ -466,6 +467,10 @@ INTRINS_OVR_TAG(AARCH64_ADV_SIMD_SRI, aarch64_neon_vsri, Arm64, V64 | V128 | I1

INTRINS_OVR_TAG(AARCH64_ADV_SIMD_TBX1, aarch64_neon_tbx1, Arm64, V64 | V128 | I1)
INTRINS_OVR_TAG(AARCH64_ADV_SIMD_TBL1, aarch64_neon_tbl1, Arm64, V64 | V128 | I1)

INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_SDOT, aarch64_neon_sdot, Arm64, Arm64DotProd, V64 | V128 | I4)
INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_UDOT, aarch64_neon_udot, Arm64, Arm64DotProd, V64 | V128 | I4)

#endif

#undef INTRINS
Expand All @@ -486,6 +491,7 @@ INTRINS_OVR_TAG(AARCH64_ADV_SIMD_TBL1, aarch64_neon_tbl1, Arm64, V64 | V128 | I1
#undef Ftoi
#undef WidenAcross
#undef Across
#undef Arm64DotProd
#undef Generic
#undef X86
#undef Arm64
Expand Down
68 changes: 65 additions & 3 deletions src/mono/mono/mini/mini-llvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ enum {
INTRIN_kind_widen,
INTRIN_kind_widen_across,
INTRIN_kind_across,
INTRIN_kind_arm64_dot_prod,
};

static const uint8_t intrin_kind [] = {
Expand Down Expand Up @@ -9660,6 +9661,21 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
values [ins->dreg] = result;
break;
}
/*
 * OP_ARM64_SELECT_QUAD: pick one lane out of lhs, viewed as a vector of i4_t
 * elements, and replicate it across the whole result vector.
 *
 * lhs is the source vector and rhs is the lane index passed to the
 * extract-element instruction.
 */
case OP_ARM64_SELECT_QUAD: {
// The source vector class is read from data.op [1].klass; ins->klass is the result class.
LLVMTypeRef src_type = simd_class_to_llvm_type (ctx, ins->data.op [1].klass);
LLVMTypeRef ret_type = simd_class_to_llvm_type (ctx, ins->klass);
unsigned int src_type_bits = mono_llvm_get_prim_size_bits (src_type);
unsigned int ret_type_bits = mono_llvm_get_prim_size_bits (ret_type);
// Lane counts when each vector is split into 32-bit pieces; source and result may differ in width.
unsigned int src_intermediate_elems = src_type_bits / 32;
unsigned int ret_intermediate_elems = ret_type_bits / 32;
LLVMTypeRef intermediate_type = LLVMVectorType (i4_t, src_intermediate_elems);
// Reinterpret the source, extract the selected lane, broadcast it to the result lane count,
// then reinterpret as the result type.
LLVMValueRef result = LLVMBuildBitCast (builder, lhs, intermediate_type, "arm64_select_quad");
result = LLVMBuildExtractElement (builder, result, rhs, "arm64_select_quad");
result = broadcast_element (ctx, result, ret_intermediate_elems);
result = LLVMBuildBitCast (builder, result, ret_type, "arm64_select_quad");
values [ins->dreg] = result;
break;
}
case OP_LSCNT32:
case OP_LSCNT64: {
// %shr = ashr i32 %x, 31
Expand All @@ -9683,6 +9699,43 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LSCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, "");
break;
}
/*
 * OP_ARM64_SQRDMLAH / OP_ARM64_SQRDMLSH and their _BYSCALAR / _SCALAR variants.
 *
 * These are emitted as two overloaded intrinsic calls rather than one:
 *   1. SQRDMULH on (rhs, arg3)
 *   2. SQADD (for *MLAH) or SQSUB (for *MLSH) of lhs with the result of step 1
 *
 * _BYSCALAR: the third operand is first replaced by its scalar_from_vector ()
 *            value broadcast to every lane of the result type.
 * _SCALAR:   the operands and the result are routed through the
 *            scalar_op_from_vector_op helpers and the scalar overload tag is used.
 */
case OP_ARM64_SQRDMLAH:
case OP_ARM64_SQRDMLAH_BYSCALAR:
case OP_ARM64_SQRDMLAH_SCALAR:
case OP_ARM64_SQRDMLSH:
case OP_ARM64_SQRDMLSH_BYSCALAR:
case OP_ARM64_SQRDMLSH_SCALAR: {
gboolean byscalar = FALSE;
gboolean scalar = FALSE;
gboolean subtract = FALSE;
// Decode the opcode into flags; plain OP_ARM64_SQRDMLAH leaves all three FALSE.
switch (ins->opcode) {
case OP_ARM64_SQRDMLAH_BYSCALAR: byscalar = TRUE; break;
case OP_ARM64_SQRDMLAH_SCALAR: scalar = TRUE; break;
case OP_ARM64_SQRDMLSH: subtract = TRUE; break;
case OP_ARM64_SQRDMLSH_BYSCALAR: subtract = TRUE; byscalar = TRUE; break;
case OP_ARM64_SQRDMLSH_SCALAR: subtract = TRUE; scalar = TRUE; break;
}
// Intrinsic used for the accumulate step.
int acc_iid = subtract ? INTRINS_AARCH64_ADV_SIMD_SQSUB : INTRINS_AARCH64_ADV_SIMD_SQADD;
LLVMTypeRef ret_t = simd_class_to_llvm_type (ctx, ins->klass);
llvm_ovr_tag_t ovr_tag = ovr_tag_from_llvm_type (ret_t);
ScalarOpFromVectorOpCtx sctx = scalar_op_from_vector_op (ctx, ret_t, ins);
// args [0] = accumulator, args [1] and args [2] = the two multiplicands.
LLVMValueRef args [] = { lhs, rhs, arg3 };
if (byscalar) {
unsigned int elems = LLVMGetVectorSize (ret_t);
args [2] = broadcast_element (ctx, scalar_from_vector (ctx, args [2]), elems);
}
if (scalar) {
ovr_tag = sctx.ovr_tag;
scalar_op_from_vector_op_process_args (&sctx, args, 3);
}
// Step 1: multiply; &args [1] passes (args [1], args [2]) as the operand pair.
LLVMValueRef result = call_overloaded_intrins (ctx, INTRINS_AARCH64_ADV_SIMD_SQRDMULH, ovr_tag, &args [1], "arm64_sqrdmlxh");
// Step 2: overwrite args [1] with the product so &args [0] passes (accumulator, product).
args [1] = result;
result = call_overloaded_intrins (ctx, acc_iid, ovr_tag, &args [0], "arm64_sqrdmlxh");
if (scalar)
result = scalar_op_from_vector_op_process_result (&sctx, result);
values [ins->dreg] = result;
break;
}
case OP_ARM64_SMULH:
case OP_ARM64_UMULH: {
LLVMValueRef op1, op2;
Expand Down Expand Up @@ -12136,6 +12189,13 @@ add_intrinsic (LLVMModuleRef module, int id)
int associated_prim = MAX(ew, 2);
LLVMTypeRef associated_scalar_type = intrin_types [0][associated_prim];
intrins = add_intrins2 (module, id, associated_scalar_type, distinguishing_type);
} else if (kind == INTRIN_kind_arm64_dot_prod) {
/*
* @llvm.aarch64.neon.sdot.v2i32.v8i8
* @llvm.aarch64.neon.sdot.v4i32.v16i8
*/
LLVMTypeRef associated_type = intrin_types [vw][0];
intrins = add_intrins2 (module, id, distinguishing_type, associated_type);
} else
intrins = add_intrins1 (module, id, distinguishing_type);
int key = key_from_id_and_tag (id, test);
Expand Down Expand Up @@ -13530,9 +13590,11 @@ MonoCPUFeatures mono_llvm_get_cpu_features (void)
{ "bmi2", MONO_CPU_X86_BMI2 },
#endif
#if defined(TARGET_ARM64)
{ "crc", MONO_CPU_ARM64_CRC },
{ "crypto", MONO_CPU_ARM64_CRYPTO },
{ "neon", MONO_CPU_ARM64_NEON }
{ "crc", MONO_CPU_ARM64_CRC },
{ "crypto", MONO_CPU_ARM64_CRYPTO },
{ "neon", MONO_CPU_ARM64_NEON },
{ "rdm", MONO_CPU_ARM64_RDM },
{ "dotprod", MONO_CPU_ARM64_DP },
#endif
#if defined(TARGET_WASM)
{ "simd", MONO_CPU_WASM_SIMD },
Expand Down
8 changes: 8 additions & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1740,6 +1740,7 @@ MINI_OP(OP_ARM64_UQXTN2, "arm64_uqxtn2", XREG, XREG, XREG)
MINI_OP(OP_ARM64_SQXTUN2, "arm64_sqxtun2", XREG, XREG, XREG)

MINI_OP(OP_ARM64_SELECT_SCALAR, "arm64_select_scalar", XREG, XREG, IREG)
MINI_OP(OP_ARM64_SELECT_QUAD, "arm64_select_quad", XREG, XREG, IREG)

MINI_OP(OP_ARM64_FCVTZU, "arm64_fcvtzu", XREG, XREG, NONE)
MINI_OP(OP_ARM64_FCVTZS, "arm64_fcvtzs", XREG, XREG, NONE)
Expand Down Expand Up @@ -1807,4 +1808,11 @@ MINI_OP(OP_ARM64_XNARROW_SCALAR, "arm64_xnarrow_scalar", XREG, XREG, NONE)

MINI_OP3(OP_ARM64_EXT, "arm64_ext", XREG, XREG, XREG, IREG)

MINI_OP3(OP_ARM64_SQRDMLAH, "arm64_sqrdmlah", XREG, XREG, XREG, XREG)
MINI_OP3(OP_ARM64_SQRDMLAH_BYSCALAR, "arm64_sqrdmlah_byscalar", XREG, XREG, XREG, XREG)
MINI_OP3(OP_ARM64_SQRDMLAH_SCALAR, "arm64_sqrdmlah_scalar", XREG, XREG, XREG, XREG)
MINI_OP3(OP_ARM64_SQRDMLSH, "arm64_sqrdmlsh", XREG, XREG, XREG, XREG)
MINI_OP3(OP_ARM64_SQRDMLSH_BYSCALAR, "arm64_sqrdmlsh_byscalar", XREG, XREG, XREG, XREG)
MINI_OP3(OP_ARM64_SQRDMLSH_SCALAR, "arm64_sqrdmlsh_scalar", XREG, XREG, XREG, XREG)

#endif // TARGET_ARM64
67 changes: 65 additions & 2 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1428,13 +1428,31 @@ static SimdIntrinsic advsimd_methods [] = {
{SN_get_IsSupported},
};

/*
 * Method table for the "Rdm" intrinsic group.
 *
 * The first four entries pair a method name with the opcode that implements it.
 * The four *BySelectedScalar* entries carry no opcode in this table; they are
 * expanded by hand in the MONO_CPU_ARM64_RDM branch of emit_arm64_intrinsics.
 * NOTE(review): how SN_get_IsSupported is resolved is not visible in this hunk.
 */
static const SimdIntrinsic rdm_methods [] = {
{SN_MultiplyRoundedDoublingAndAddSaturateHigh, OP_ARM64_SQRDMLAH},
{SN_MultiplyRoundedDoublingAndAddSaturateHighScalar, OP_ARM64_SQRDMLAH_SCALAR},
{SN_MultiplyRoundedDoublingAndSubtractSaturateHigh, OP_ARM64_SQRDMLSH},
{SN_MultiplyRoundedDoublingAndSubtractSaturateHighScalar, OP_ARM64_SQRDMLSH_SCALAR},
{SN_MultiplyRoundedDoublingBySelectedScalarAndAddSaturateHigh},
{SN_MultiplyRoundedDoublingBySelectedScalarAndSubtractSaturateHigh},
{SN_MultiplyRoundedDoublingScalarBySelectedScalarAndAddSaturateHigh},
{SN_MultiplyRoundedDoublingScalarBySelectedScalarAndSubtractSaturateHigh},
{SN_get_IsSupported},
};

/*
 * Method table for the "Dp" (dot product) intrinsic group.
 *
 * SN_DotProduct lists two opcode/intrinsic pairs, one using SDOT and one using UDOT.
 * NOTE(review): presumably the first pair is the default (signed) form and the second
 * the unsigned form -- confirm against the SimdIntrinsic struct definition.
 * SN_DotProductBySelectedQuadruplet carries no opcode here; it is expanded by hand in
 * the MONO_CPU_ARM64_DP branch of emit_arm64_intrinsics.
 */
static const SimdIntrinsic dp_methods [] = {
{SN_DotProduct, OP_XOP_OVR_X_X_X_X, INTRINS_AARCH64_ADV_SIMD_SDOT, OP_XOP_OVR_X_X_X_X, INTRINS_AARCH64_ADV_SIMD_UDOT},
{SN_DotProductBySelectedQuadruplet},
{SN_get_IsSupported},
};

static const IntrinGroup supported_arm_intrinsics [] = {
{ "AdvSimd", MONO_CPU_ARM64_NEON, advsimd_methods, sizeof (advsimd_methods) },
{ "Aes", MONO_CPU_ARM64_CRYPTO, crypto_aes_methods, sizeof (crypto_aes_methods) },
{ "ArmBase", MONO_CPU_ARM64_BASE, armbase_methods, sizeof (armbase_methods) },
{ "Crc32", MONO_CPU_ARM64_CRC, crc32_methods, sizeof (crc32_methods) },
{ "Dp", MONO_CPU_ARM64_DP, unsupported, sizeof (unsupported) },
{ "Rdm", MONO_CPU_ARM64_RDM, unsupported, sizeof (unsupported) },
{ "Dp", MONO_CPU_ARM64_DP, dp_methods, sizeof (dp_methods) },
{ "Rdm", MONO_CPU_ARM64_RDM, rdm_methods, sizeof (rdm_methods) },
{ "Sha1", MONO_CPU_ARM64_CRYPTO, sha1_methods, sizeof (sha1_methods) },
{ "Sha256", MONO_CPU_ARM64_CRYPTO, sha256_methods, sizeof (sha256_methods) },
};
Expand Down Expand Up @@ -1740,6 +1758,51 @@ emit_arm64_intrinsics (
}
}

/*
 * Rdm methods that take a (vector, index) pair as their last operand.
 *
 * Two instructions are emitted: OP_ARM64_SELECT_SCALAR on (args [2], args [3]),
 * then the SQRDMLAH/SQRDMLSH variant on (args [0], args [1]) with the selected
 * scalar attached as the third source register.
 */
if (feature == MONO_CPU_ARM64_RDM) {
switch (id) {
case SN_MultiplyRoundedDoublingBySelectedScalarAndAddSaturateHigh:
case SN_MultiplyRoundedDoublingBySelectedScalarAndSubtractSaturateHigh:
case SN_MultiplyRoundedDoublingScalarBySelectedScalarAndAddSaturateHigh:
case SN_MultiplyRoundedDoublingScalarBySelectedScalarAndSubtractSaturateHigh: {
MonoClass *ret_klass = mono_class_from_mono_type_internal (fsig->ret);
int opcode = 0;
// "BySelectedScalar" maps to the _BYSCALAR opcodes, "ScalarBySelectedScalar" to the _SCALAR opcodes.
switch (id) {
case SN_MultiplyRoundedDoublingBySelectedScalarAndAddSaturateHigh: opcode = OP_ARM64_SQRDMLAH_BYSCALAR; break;
case SN_MultiplyRoundedDoublingBySelectedScalarAndSubtractSaturateHigh: opcode = OP_ARM64_SQRDMLSH_BYSCALAR; break;
case SN_MultiplyRoundedDoublingScalarBySelectedScalarAndAddSaturateHigh: opcode = OP_ARM64_SQRDMLAH_SCALAR; break;
case SN_MultiplyRoundedDoublingScalarBySelectedScalarAndSubtractSaturateHigh: opcode = OP_ARM64_SQRDMLSH_SCALAR; break;
}
MonoInst *scalar = emit_simd_ins (cfg, ret_klass, OP_ARM64_SELECT_SCALAR, args [2]->dreg, args [3]->dreg);
MonoInst *ret = emit_simd_ins (cfg, ret_klass, opcode, args [0]->dreg, args [1]->dreg);
// arg0_type is defined outside this hunk.
ret->inst_c1 = arg0_type;
ret->sreg3 = scalar->dreg;
return ret;
}
default:
// Any other id reaching this branch is treated as a bug.
g_assert_not_reached ();
}
}

/*
 * Dp.DotProductBySelectedQuadruplet.
 *
 * Two instructions are emitted: OP_ARM64_SELECT_QUAD on (args [2], args [3]),
 * then OP_XOP_OVR_X_X_X_X on (args [0], args [1]) with the selected quad as the
 * third source register. The intrinsic is UDOT when the return type is unsigned
 * and SDOT otherwise.
 */
if (feature == MONO_CPU_ARM64_DP) {
switch (id) {
case SN_DotProductBySelectedQuadruplet: {
MonoClass *ret_klass = mono_class_from_mono_type_internal (fsig->ret);
MonoClass *arg_klass = mono_class_from_mono_type_internal (fsig->params [1]);
MonoClass *quad_klass = mono_class_from_mono_type_internal (fsig->params [2]);
gboolean is_unsigned = type_is_unsigned (fsig->ret);
int iid = is_unsigned ? INTRINS_AARCH64_ADV_SIMD_UDOT : INTRINS_AARCH64_ADV_SIMD_SDOT;
// The select result is typed like params [1]; the class of the vector being selected from
// (params [2]) is stored in data.op [1].klass.
MonoInst *quad = emit_simd_ins (cfg, arg_klass, OP_ARM64_SELECT_QUAD, args [2]->dreg, args [3]->dreg);
quad->data.op [1].klass = quad_klass;
MonoInst *ret = emit_simd_ins (cfg, ret_klass, OP_XOP_OVR_X_X_X_X, args [0]->dreg, args [1]->dreg);
ret->sreg3 = quad->dreg;
ret->inst_c0 = iid;
return ret;
}
default:
// Any other id reaching this branch is treated as a bug.
g_assert_not_reached ();
}
}

return NULL;
}
#endif // TARGET_ARM64
Expand Down
14 changes: 13 additions & 1 deletion src/mono/mono/mini/simd-methods.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ METHOD(HashUpdate2)
METHOD(ScheduleUpdate0)
METHOD(ScheduleUpdate1)
METHOD(MixColumns)
//AdvSimd
// AdvSimd
METHOD(AbsSaturate)
METHOD(AbsSaturateScalar)
METHOD(AbsScalar)
Expand Down Expand Up @@ -559,3 +559,15 @@ METHOD(ZeroExtendWideningLower)
METHOD(ZeroExtendWideningUpper)
METHOD(ZipHigh)
METHOD(ZipLow)
// Arm.Rdm
METHOD(MultiplyRoundedDoublingAndAddSaturateHigh)
METHOD(MultiplyRoundedDoublingAndSubtractSaturateHigh)
METHOD(MultiplyRoundedDoublingBySelectedScalarAndAddSaturateHigh)
METHOD(MultiplyRoundedDoublingBySelectedScalarAndSubtractSaturateHigh)
// Arm.Rdm.Arm64
METHOD(MultiplyRoundedDoublingAndAddSaturateHighScalar)
METHOD(MultiplyRoundedDoublingAndSubtractSaturateHighScalar)
METHOD(MultiplyRoundedDoublingScalarBySelectedScalarAndAddSaturateHigh)
METHOD(MultiplyRoundedDoublingScalarBySelectedScalarAndSubtractSaturateHigh)
// Arm.Dp
METHOD(DotProductBySelectedQuadruplet)