[mono] Reenable some amd64 intrinsic tests, enable amd64 ISA extensions when AOTing, several intrinsics fixes (#53752)

Changes:

* Consolidate SSE shuffle constant unrolling

  Remove `OP_SSE2_SHUFFLE`, which is unused.

  Rename `OP_SSE_SHUFFLE` to `OP_SSE_SHUFPS`, to make this more consistent with
  the naming convention used for other SSE shuffles.

  Use `immediate_unroll_*` instead of hand-writing branch emission. These
  branch tables are huge (in the simplest case, with 256 different constant
  values, we can spend over 1 KB of code on nothing but `shufps` and `jmp`s,
  and the cost gets worse if any tail duplication happens), and are currently
  emitted inline. Future work ought to:

  1. use a sequence of extractelement/insertelement instructions, which can be
  optimized into a constant shuffle when the shuffle control parameter is
  constant, and otherwise generates a high-latency but low-code-size fallback
  (note that this only works for shuffles; see the sketch after this list); or

  2. emit the fallback branch tables out of line and use `llvm.is.constant` to
  generate either a constant shuffle or a call to a fallback shuffle branch
  table function (the cost isn't too bad: a direct-call/ret pair would add ~4-5
  cycles and eat an RSB slot on top of the cost of the branch table).

  Fixes `JIT/HardwareIntrinsics/X86/Regression/GitHub_21855/GitHub_21855_r`.
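
  As a rough sketch of option 1 (hypothetical code, not part of this change;
  `emit_variable_shufps` and its shape are illustrative), lowering a
  variable-control `shufps` through the LLVM C API might look like this. When
  `control` is a constant, LLVM folds the loop body into a single constant
  `shufflevector`; otherwise the scalarized form stands, trading latency for
  code size:

```c
#include <llvm-c/Core.h>

// Hypothetical sketch: scalarized lowering of a variable-control shufps.
// src1/src2 are <4 x float> values; control is an i32 holding the imm8.
static LLVMValueRef
emit_variable_shufps (LLVMBuilderRef b, LLVMValueRef src1, LLVMValueRef src2, LLVMValueRef control)
{
	LLVMTypeRef i32 = LLVMInt32Type ();
	LLVMValueRef result = LLVMGetUndef (LLVMTypeOf (src1));
	for (int lane = 0; lane < 4; ++lane) {
		// Each 2-bit field of the control byte selects a source lane;
		// shufps fills lanes 0-1 from src1 and lanes 2-3 from src2.
		LLVMValueRef shift = LLVMConstInt (i32, lane * 2, 0);
		LLVMValueRef field = LLVMBuildLShr (b, control, shift, "");
		LLVMValueRef idx = LLVMBuildAnd (b, field, LLVMConstInt (i32, 3, 0), "");
		LLVMValueRef elt = LLVMBuildExtractElement (b, lane < 2 ? src1 : src2, idx, "");
		result = LLVMBuildInsertElement (b, result, elt, LLVMConstInt (i32, lane, 0), "");
	}
	return result;
}
```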


* Fix intrinsification for MathF.Round

  `OP_SSE41_ROUNDS` takes two source registers, not one.

  TODO: Investigate what happens with `llvm.round` and
  `llvm.experimental.constrained.round`.

  Fixes `JIT/Intrinsics/MathRoundSingle_r`,
  `JIT/Math/Functions/Functions_r`, and
  `JIT/Performance/CodeQuality/Math/Functions/Functions`.
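
  For reference, the intended scalar semantics, written with the C compiler
  intrinsics (illustrative only, not runtime code): `roundss` takes two
  sources, copying the upper lanes from the first and rounding the scalar in
  the second, and immediate `0x4` (`_MM_FROUND_CUR_DIRECTION`) rounds using
  the current MXCSR mode, which is round-to-nearest-even by default:

```c
#include <smmintrin.h> // SSE4.1

static float
round_scalar_sse41 (float x)
{
	__m128 v = _mm_set_ss (x);
	// Two source operands: lanes 1-3 come from the first, the rounded
	// scalar from the second. 0x4 = use the current MXCSR rounding mode.
	return _mm_cvtss_f32 (_mm_round_ss (v, v, _MM_FROUND_CUR_DIRECTION));
}
```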


* Clean up intrinsic group lookup

  Use a dummy never-supported intrinsic group as a default fallback, instead of
  adding a special-case "intrinsic group not present" branch.

  Correctly intrinsify `get_IsSupported` even when not using LLVM.

  Fixes spurious `System.PlatformNotSupportedException`s when calling
  `get_IsSupported` when the LLVM backend isn't being used.
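
  A minimal sketch of the sentinel approach (names here are illustrative, not
  mono's): the lookup returns a group whose feature mask can never be
  satisfied, so callers need no "not found" special case and
  `get_IsSupported` uniformly folds to false:

```c
#include <stddef.h>
#include <string.h>

typedef struct {
	const char *name;
	unsigned feature_mask; // 0 == can never be supported
} IntrinGroup;

static const IntrinGroup groups [] = {
	{ "Sse41", 1u << 0 },
	{ "Popcnt", 1u << 1 },
};

static const IntrinGroup unsupported_group = { "", 0 }; // dummy fallback

static const IntrinGroup *
lookup_intrin_group (const char *klass_name)
{
	for (size_t i = 0; i < sizeof (groups) / sizeof (groups [0]); ++i)
		if (!strcmp (groups [i].name, klass_name))
			return &groups [i];
	return &unsupported_group; // no special-case branch needed
}
```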


* The "not" SSE comparions are unordered, so use the appropriate unordered LLVM
  IR comparisons

  Add labeled constants for the immediate parameter we pass to CMPSS/CMPSD.

  Fixes `Regressions.coreclr/GitHub_34094/Test34094`.
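
  The distinction matters exactly when NaN is involved; a standalone C
  illustration (not runtime code) of why "not less than" must be the
  unordered `fcmp uge` rather than the ordered `fcmp oge`:

```c
#include <math.h>
#include <stdio.h>

int
main (void)
{
	float nan = nanf ("");
	// CMPNLTSS-style "not (a < b)" is TRUE when an operand is NaN...
	printf ("!(nan < 1)  = %d\n", !(nan < 1.0f)); // prints 1
	// ...but an ordered ">=" is FALSE for NaN, so the two are not the same.
	printf (" (nan >= 1) = %d\n", nan >= 1.0f);   // prints 0
	return 0;
}
```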


* Fix `LoadAndDuplicateToVector128`

  `LoadAndDuplicateToVector128` should load exactly one 8-byte value from memory
  before broadcasting it into both lanes of a 128-bit result vector.

  Fixes `JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_r`.
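
  The intended semantics, written with the C compiler intrinsic for `movddup`
  (illustrative only; the runtime goes through its own opcode machinery):

```c
#include <pmmintrin.h> // SSE3

// Must read exactly p[0] (8 bytes) and broadcast it; it must not
// dereference p[1].
static __m128d
load_and_duplicate (const double *p)
{
	return _mm_loaddup_pd (p); // result = { p[0], p[0] }
}
```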


* Implement constant unrolling for `Sse41.DotProduct`

  As with shuffles, the fallback jump table should probably be kept out of line
  someday; `vdpps` uses 6 bytes of space, so a full 256-entry fallback jump
  table for the selection control mask will be at least 1.5 KB (256 × 6 bytes
  of `vdpps` alone, before counting the `jmp`s).

  Fixes `JIT/HardwareIntrinsics/X86/Sse41/DotProduct_r`.
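
  For a sense of what that jump table has to cover when the control byte is
  not constant (illustrative C, abbreviated to the first 8 of 256 cases):

```c
#include <smmintrin.h> // SSE4.1

static __m128
dpps_dynamic (__m128 a, __m128 b, int imm)
{
#define CASE(i) case (i): return _mm_dp_ps (a, b, (i));
	switch (imm & 0xff) {
	CASE (0) CASE (1) CASE (2) CASE (3)
	CASE (4) CASE (5) CASE (6) CASE (7)
	// ... one 6-byte dpps per immediate, up through case 255 ...
	default: return _mm_setzero_ps (); // unreachable once all 256 exist
	}
#undef CASE
}
```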


* Implement constant unrolling for `Sse41.Blend`

  The usual caveats apply: the big jump table should eventually be emitted out
  of line, and an extractelement/insertelement sequence may be possible here
  too.


* Zero is part of the domain of `lzcnt` and shouldn't yield an undef.

  Use fully-defined `llvm.ctlz` when implementing `OP_LZCNT32/64`.

  Fixes `JIT/HardwareIntrinsics/X86/Regression/GitHub_21666/GitHub_21666_r`.
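
  A standalone equivalent of the defined-at-zero behavior (illustrative;
  `__builtin_clz` alone is undefined at 0, just like `llvm.ctlz` with its
  `is_zero_undef` flag set):

```c
#include <stdint.h>

static uint32_t
lzcnt32 (uint32_t x)
{
	// lzcnt is defined on the full domain: lzcnt32 (0) == 32.
	return x ? (uint32_t) __builtin_clz (x) : 32;
}
```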


* Unify amd64/arm64 vector extraction handling

  Removes `OP_EXTRACT_U1` and `OP_EXTRACT_U2`. Instead, sign/zero extension is
  determined via `inst_c1` for `OP_EXTRACT_*` and `OP_XEXTRACT_*` (and
  `OP_EXTRACTX_U2`, which doesn't seem to be generated as part of intrinsic
  translation), which must be set to a `MonoTypeEnum` value.

  Replaces `OP_EXTRACT_VAR_*` with `OP_XEXTRACT_*`.

  Fixes `JIT/Regression/JitBlue/GitHub_23159/GitHub_23159` and
  `JIT/Regression/JitBlue/GitHub_13568/GitHub_13568`.
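
  The widening step the back end now keys off `inst_c1` boils down to this
  choice (standalone illustration for the 16-bit case):

```c
#include <stdint.h>

// After pextrw leaves the raw lane in the low 16 bits, the element type
// (MONO_TYPE_I2 vs. MONO_TYPE_U2) decides between sign- and zero-extension.
static int32_t
widen_extracted_16 (uint32_t raw, int is_signed)
{
	return is_signed ? (int32_t) (int16_t) raw : (int32_t) (uint16_t) raw;
}
```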

* Remove `OP_DPPS`; it is unused

* Disable `JIT/Regression/CLR-x86-JIT/V1.1-M1-Beta1/b143840` when running with mono LLVM AOT

* Disable finalizearray when running with mono LLVM AOT

* Disable Vector256_1/Vector128_1 tests on wasm

* Enable sse4.2, popcnt, lzcnt, bmi, and bmi2 when AOT compiling the runtime
  tests.

* Pass the runtime variant to `helixpublishwitharcade.proj`, and forward this
  runtime variant to `testenvironment.proj`.

  This is used to selectively enable LLVM JIT on the LLVM AOT lanes. Removes
  the hack added to CLRTest.Execute.Bash.targets that did this for arm64 (which
  happens to only have an LLVM AOT lane for runtime tests right now).

* Enable `JIT/HardwareIntrinsics/General/Vector128_1/**`,
  `JIT/HardwareIntrinsics/General/Vector256/**`,
  `JIT/HardwareIntrinsics/General/Vector256_1/**`, and
  `JIT/HardwareIntrinsics/X86/General/IsSupported*/**` for LLVM AOT on amd64.
imhameed committed Jun 13, 2021
1 parent 86cebb3 commit b8b3ef1
Showing 15 changed files with 357 additions and 457 deletions.
1 change: 1 addition & 0 deletions eng/pipelines/common/templates/runtimes/run-test-job.yml
@@ -326,6 +326,7 @@ jobs:
       osSubgroup: ${{ parameters.osSubgroup}}
       runtimeFlavorDisplayName: ${{ parameters.runtimeFlavorDisplayName }}
       shouldContinueOnError: ${{ parameters.shouldContinueOnError }}
+      runtimeVariant: ${{ parameters.runtimeVariant }}
 
       ${{ if eq(variables['System.TeamProject'], 'public') }}:
         creator: $(Build.DefinitionName)
2 changes: 0 additions & 2 deletions src/mono/mono/mini/cpu-amd64.md
@@ -779,9 +779,7 @@ extract_i4: dest:i src1:x len:5
 extract_i8: dest:i src1:x len:9
 
 extract_i2: dest:i src1:x len:13
-extract_u2: dest:i src1:x len:13
 extract_i1: dest:i src1:x len:13
-extract_u1: dest:i src1:x len:13
 extract_r8: dest:f src1:x len:5
 
 iconv_to_r4_raw: dest:f src1:i len:10
2 changes: 0 additions & 2 deletions src/mono/mono/mini/cpu-x86.md
@@ -631,9 +631,7 @@ iconv_to_x: dest:x src1:i len:4
 extract_i4: dest:i src1:x len:4
 
 extract_i2: dest:i src1:x len:10
-extract_u2: dest:i src1:x len:10
 extract_i1: dest:i src1:x len:10
-extract_u1: dest:i src1:x len:10
 extract_r8: dest:f src1:x len:8
 
 insert_i2: dest:x src1:x src2:i len:5 clob:1
9 changes: 6 additions & 3 deletions src/mono/mono/mini/intrinsics.c
@@ -140,15 +140,18 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 	}
 #if defined(TARGET_X86) || defined(TARGET_AMD64)
 	else if (!strcmp (cmethod->name, "Round") && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0) {
-		// special case: emit vroundps for MathF.Round directly instead of what llvm.round.f32 emits
+		// special case: emit vroundss for MathF.Round directly instead of what llvm.round.f32 emits
 		// to align with CoreCLR behavior
 		int xreg = alloc_xreg (cfg);
 		EMIT_NEW_UNALU (cfg, ins, OP_FCONV_TO_R4_X, xreg, args [0]->dreg);
-		EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDS, xreg, xreg);
+		int xround = alloc_xreg (cfg);
+		EMIT_NEW_BIALU (cfg, ins, OP_SSE41_ROUNDS, xround, xreg, xreg);
 		ins->inst_c0 = 0x4; // vroundss xmm0, xmm0, xmm0, 0x4 (mode for rounding)
 		ins->inst_c1 = MONO_TYPE_R4;
 		int dreg = alloc_freg (cfg);
-		EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R4, dreg, xreg);
+		EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R4, dreg, xround);
+		ins->inst_c0 = 0;
+		ins->inst_c1 = MONO_TYPE_R4;
 		return ins;
 	}
 #endif
12 changes: 6 additions & 6 deletions src/mono/mono/mini/mini-amd64.c
@@ -4076,8 +4076,8 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 
 	case OP_XEXTRACT_R4:
 	case OP_XEXTRACT_R8:
-	case OP_XEXTRACT_I32:
-	case OP_XEXTRACT_I64: {
+	case OP_XEXTRACT_I4:
+	case OP_XEXTRACT_I8: {
 		// TODO
 		g_assert_not_reached();
 		break;
@@ -7235,19 +7235,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		}
 		break;
 	case OP_EXTRACT_I1:
-	case OP_EXTRACT_U1:
 		amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
 		if (ins->inst_c0)
 			amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
-		amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+		amd64_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE);
 		break;
 	case OP_EXTRACT_I2:
-	case OP_EXTRACT_U2:
 		/*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
 		if (ins->inst_c0)
 			amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/
 		amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
-		amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4);
+		amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I2, TRUE, 4);
 		break;
 	case OP_EXTRACT_R8:
 		if (ins->inst_c0)
@@ -8965,6 +8963,8 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
 			ins->inst_c1 = MONO_TYPE_R8;
 			int dreg = alloc_freg (cfg);
 			EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R8, dreg, xreg);
+			ins->inst_c0 = 0;
+			ins->inst_c1 = MONO_TYPE_R8;
 			return ins;
 		}
 	}
