- 
                Notifications
    
You must be signed in to change notification settings  - Fork 5.2k
 
Description
Currently, the V?PMOV[SZ]X[BWD][WDQ] HWIntrinsics methods define only Vector128<T> arguments, but most of these instructions consume less than 128 bits.
These instructions also support both [xy]mm,xmm or [xy]mm,mem encoding.  In order to use the mem-encoded version of the instruction, we must currently write something like
    Avx2.ConvertToVector256UInt32(Sse2.LoadVector128(bytePtr))
and the JIT folds the load, emitting the correct
    vpmovzxbd ymm1,mmword ptr [rsi]
However, this gives Sse2.LoadVector128 non-deterministic behavior: it nominally loads 128 bits, but in this case only 64 bits are read.
Additionally, the SSE4.1 and AVX2 versions of these intrinsics are not symmetrical.  For example, _mm_cvtepu8_epi32 is mapped to Sse41.ConvertToVector128Int32 while _mm256_cvtepu8_epi32 is mapped to Avx2.ConvertToVector256UInt32.  The distinction there being that in Sse41 we return a signed result and in Avx2, we return unsigned.
I propose we do the following:
Change the existing Avx2 methods to return signed types for all widening operations to match the semantics of the Intel intrinsics (e.g. epu8->epi32) and the current Sse41 HWIntrinsics.  These will continue to map to the zero-extend versions of the instructions.
namespace System.Runtime.Intrinsics.X86
{
    public abstract class Avx2
    {
-       public static Vector256<ushort> ConvertToVector256UInt16(Vector128<byte> value) => ConvertToVector256UInt16(value);
+       public static Vector256<short> ConvertToVector256Int16(Vector128<byte> value) => ConvertToVector256Int16(value);
-       public static Vector256<uint> ConvertToVector256UInt32(Vector128<byte> value) => ConvertToVector256UInt32(value);
+       public static Vector256<int> ConvertToVector256Int32(Vector128<byte> value) => ConvertToVector256Int32(value);
-       public static Vector256<uint> ConvertToVector256UInt32(Vector128<ushort> value) => ConvertToVector256UInt32(value);
+       public static Vector256<int> ConvertToVector256Int32(Vector128<ushort> value) => ConvertToVector256Int32(value);
-       public static Vector256<ulong> ConvertToVector256UInt64(Vector128<byte> value) => ConvertToVector256UInt64(value);
+       public static Vector256<long> ConvertToVector256Int64(Vector128<byte> value) => ConvertToVector256Int64(value);
-       public static Vector256<ulong> ConvertToVector256UInt64(Vector128<ushort> value) => ConvertToVector256UInt64(value);
+       public static Vector256<long> ConvertToVector256Int64(Vector128<ushort> value) => ConvertToVector256Int64(value);
-       public static Vector256<ulong> ConvertToVector256UInt64(Vector128<uint> value) => ConvertToVector256UInt64(value);
+       public static Vector256<long> ConvertToVector256Int64(Vector128<uint> value) => ConvertToVector256Int64(value);
    }
}
Add pointer overloads to both the SSE4.1 and AVX2 versions of the methods.
namespace System.Runtime.Intrinsics.X86
{
    // Proposed pointer overloads for the SSE4.1 widening conversions
    // (PMOVSX* sign-extends, PMOVZX* zero-extends). Each overload corresponds to
    // the memory-operand form of the instruction named in its summary, so the
    // width read from `address` is the mNN operand shown there rather than a
    // full 128 bits.
    // NOTE(review): the bodies are self-referential placeholders, matching the
    // other HWIntrinsics declarations in this proposal; presumably the JIT
    // substitutes the actual instruction - confirm.
    public abstract class Sse41
    {
        /// <summary>
        /// __m128i _mm_cvtepi8_epi16 (__m128i a)
        ///   PMOVSXBW xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of eight signed 8-bit values (the m64 operand), each sign-extended to 16 bits.</param>
        public static unsafe Vector128<short> ConvertToVector128Int16(sbyte* address) => ConvertToVector128Int16(address);
        /// <summary>
        /// __m128i _mm_cvtepu8_epi16 (__m128i a)
        ///   PMOVZXBW xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of eight unsigned 8-bit values (the m64 operand), each zero-extended to 16 bits.</param>
        public static unsafe Vector128<short> ConvertToVector128Int16(byte* address) => ConvertToVector128Int16(address);
        /// <summary>
        /// __m128i _mm_cvtepi8_epi32 (__m128i a)
        ///   PMOVSXBD xmm, xmm/m32
        /// </summary>
        /// <param name="address">Source of four signed 8-bit values (the m32 operand), each sign-extended to 32 bits.</param>
        public static unsafe Vector128<int> ConvertToVector128Int32(sbyte* address) => ConvertToVector128Int32(address);
        /// <summary>
        /// __m128i _mm_cvtepu8_epi32 (__m128i a)
        ///   PMOVZXBD xmm, xmm/m32
        /// </summary>
        /// <param name="address">Source of four unsigned 8-bit values (the m32 operand), each zero-extended to 32 bits.</param>
        public static unsafe Vector128<int> ConvertToVector128Int32(byte* address) => ConvertToVector128Int32(address);
        /// <summary>
        /// __m128i _mm_cvtepi16_epi32 (__m128i a)
        ///   PMOVSXWD xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of four signed 16-bit values (the m64 operand), each sign-extended to 32 bits.</param>
        public static unsafe Vector128<int> ConvertToVector128Int32(short* address) => ConvertToVector128Int32(address);
        /// <summary>
        /// __m128i _mm_cvtepu16_epi32 (__m128i a)
        ///   PMOVZXWD xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of four unsigned 16-bit values (the m64 operand), each zero-extended to 32 bits.</param>
        public static unsafe Vector128<int> ConvertToVector128Int32(ushort* address) => ConvertToVector128Int32(address);
        /// <summary>
        /// __m128i _mm_cvtepi8_epi64 (__m128i a)
        ///   PMOVSXBQ xmm, xmm/m16
        /// </summary>
        /// <param name="address">Source of two signed 8-bit values (the m16 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(sbyte* address) => ConvertToVector128Int64(address);
        /// <summary>
        /// __m128i _mm_cvtepu8_epi64 (__m128i a)
        ///   PMOVZXBQ xmm, xmm/m16
        /// </summary>
        /// <param name="address">Source of two unsigned 8-bit values (the m16 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(byte* address) => ConvertToVector128Int64(address);
        /// <summary>
        /// __m128i _mm_cvtepi16_epi64 (__m128i a)
        ///   PMOVSXWQ xmm, xmm/m32
        /// </summary>
        /// <param name="address">Source of two signed 16-bit values (the m32 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(short* address) => ConvertToVector128Int64(address);
        /// <summary>
        /// __m128i _mm_cvtepu16_epi64 (__m128i a)
        ///   PMOVZXWQ xmm, xmm/m32
        /// </summary>
        /// <param name="address">Source of two unsigned 16-bit values (the m32 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(ushort* address) => ConvertToVector128Int64(address);
        /// <summary>
        /// __m128i _mm_cvtepi32_epi64 (__m128i a)
        ///   PMOVSXDQ xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of two signed 32-bit values (the m64 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(int* address) => ConvertToVector128Int64(address);
        /// <summary>
        /// __m128i _mm_cvtepu32_epi64 (__m128i a)
        ///   PMOVZXDQ xmm, xmm/m64
        /// </summary>
        /// <param name="address">Source of two unsigned 32-bit values (the m64 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector128<long> ConvertToVector128Int64(uint* address) => ConvertToVector128Int64(address);
    }
    // Proposed pointer overloads for the AVX2 widening conversions
    // (VPMOVSX* sign-extends, VPMOVZX* zero-extends). Each overload corresponds
    // to the memory-operand form of the instruction named in its summary, so the
    // width read from `address` is the mNN operand shown there. All overloads
    // return signed element types, including the zero-extending ones.
    // NOTE(review): the bodies are self-referential placeholders, matching the
    // other HWIntrinsics declarations in this proposal; presumably the JIT
    // substitutes the actual instruction - confirm.
    public abstract class Avx2
    {
        /// <summary>
        /// __m256i _mm256_cvtepi8_epi16 (__m128i a)
        ///   VPMOVSXBW ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of sixteen signed 8-bit values (the m128 operand), each sign-extended to 16 bits.</param>
        public static unsafe Vector256<short> ConvertToVector256Int16(sbyte* address) => ConvertToVector256Int16(address);
        /// <summary>
        /// __m256i _mm256_cvtepu8_epi16 (__m128i a)
        ///   VPMOVZXBW ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of sixteen unsigned 8-bit values (the m128 operand), each zero-extended to 16 bits.</param>
        public static unsafe Vector256<short> ConvertToVector256Int16(byte* address) => ConvertToVector256Int16(address);
        /// <summary>
        /// __m256i _mm256_cvtepi8_epi32 (__m128i a)
        ///   VPMOVSXBD ymm, xmm/m64
        /// </summary>
        /// <param name="address">Source of eight signed 8-bit values (the m64 operand), each sign-extended to 32 bits.</param>
        public static unsafe Vector256<int> ConvertToVector256Int32(sbyte* address) => ConvertToVector256Int32(address);
        /// <summary>
        /// __m256i _mm256_cvtepu8_epi32 (__m128i a)
        ///   VPMOVZXBD ymm, xmm/m64
        /// </summary>
        /// <param name="address">Source of eight unsigned 8-bit values (the m64 operand), each zero-extended to 32 bits.</param>
        public static unsafe Vector256<int> ConvertToVector256Int32(byte* address) => ConvertToVector256Int32(address);
        /// <summary>
        /// __m256i _mm256_cvtepi16_epi32 (__m128i a)
        ///   VPMOVSXWD ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of eight signed 16-bit values (the m128 operand), each sign-extended to 32 bits.</param>
        public static unsafe Vector256<int> ConvertToVector256Int32(short* address) => ConvertToVector256Int32(address);
        /// <summary>
        /// __m256i _mm256_cvtepu16_epi32 (__m128i a)
        ///   VPMOVZXWD ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of eight unsigned 16-bit values (the m128 operand), each zero-extended to 32 bits.</param>
        public static unsafe Vector256<int> ConvertToVector256Int32(ushort* address) => ConvertToVector256Int32(address);
        /// <summary>
        /// __m256i _mm256_cvtepi8_epi64 (__m128i a)
        ///   VPMOVSXBQ ymm, xmm/m32
        /// </summary>
        /// <param name="address">Source of four signed 8-bit values (the m32 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(sbyte* address) => ConvertToVector256Int64(address);
        /// <summary>
        /// __m256i _mm256_cvtepu8_epi64 (__m128i a)
        ///   VPMOVZXBQ ymm, xmm/m32
        /// </summary>
        /// <param name="address">Source of four unsigned 8-bit values (the m32 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(byte* address) => ConvertToVector256Int64(address);
        /// <summary>
        /// __m256i _mm256_cvtepi16_epi64 (__m128i a)
        ///   VPMOVSXWQ ymm, xmm/m64
        /// </summary>
        /// <param name="address">Source of four signed 16-bit values (the m64 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(short* address) => ConvertToVector256Int64(address);
        /// <summary>
        /// __m256i _mm256_cvtepu16_epi64 (__m128i a)
        ///   VPMOVZXWQ ymm, xmm/m64
        /// </summary>
        /// <param name="address">Source of four unsigned 16-bit values (the m64 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(ushort* address) => ConvertToVector256Int64(address);
        /// <summary>
        /// __m256i _mm256_cvtepi32_epi64 (__m128i a)
        ///   VPMOVSXDQ ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of four signed 32-bit values (the m128 operand), each sign-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(int* address) => ConvertToVector256Int64(address);
        /// <summary>
        /// __m256i _mm256_cvtepu32_epi64 (__m128i a)
        ///   VPMOVZXDQ ymm, xmm/m128
        /// </summary>
        /// <param name="address">Source of four unsigned 32-bit values (the m128 operand), each zero-extended to 64 bits.</param>
        public static unsafe Vector256<long> ConvertToVector256Int64(uint* address) => ConvertToVector256Int64(address);
    }
}