[Arm64] Store Pair of SIMD&FP registers #33532
@TamarChristinaArm Is it correct to say that functionality of |
I think we could have better names than ... I wonder if ... |
How about
It is an option, but I can't tell which one puts more emphasis on the fact that we store a pair. |
It seems to me that it's useful to have all of these start the same, i.e. |
Yeah that's correct |
A question about these store intrinsics in general: how are you planning to deal with the different addressing modes? Or are you only interested in the register addressing modes? |
I had a discussion with @BruceForstall where we briefly discussed how we could benefit from post-index addressing modes if we had something like this:

```csharp
// ST1 { <Vt>.<T> }, [<Xn|SP>], #16
void Store(ref T* address, Vector128<T> value);
```

For example, if we have a loop

```csharp
Vector128<double> val;

for (int i = 0; i < count; i++)
{
    /*
       compute new value of val
    */
    Store(baseAddr + i * 16, val);
}
```

a user might want to do some sort of strength reduction manually and write this

```csharp
Vector128<double> val;
T* ptr = baseAddr;

for (int i = 0; i < count; i++)
{
    /*
       compute new value of val
    */
    Store(ptr, val);
    ptr += 16;
}
```

and that, as a result, becomes

```csharp
Vector128<double> val;
T* ptr = baseAddr;

for (int i = 0; i < count; i++)
{
    /*
       compute new value of val
    */
    Store(ref ptr, val);
}
```
|
Why can't we just detect and emit the right encoding for |
We already support optimizing things like:

```csharp
Sse.LoadVector128(addr + index * 4);
```

into:

```asm
vmovups xmm0, [r8+rax*4]
```
|
I did not know that we were doing this on x86/x64. Then, yes, we can. |
I will open an issue to track this work |
That would make it easier to do e.g. |
Actually, I stand corrected when I said we can do this (well, we can, but it's not that easy): detecting a post-index addressing mode is harder than what you described on x86/x64, since during the writeback stage the instruction modifies the value of the base register. I don't think we use post-index modes anywhere on arm64 other than in hand-written prolog/epilog or cpObj codegen. |
It seems like we would want intrinsics that allow directly specifying pre-indexed or post-indexed addressing, due to writeback; I don't think the JIT will be able to optimize that in all cases. I was wondering whether the APIs should allow specifying a "memcpy" using LD1/ST1 with post-indexing, for example:
instead of:
In simple cases the JIT might be able to optimize this, but we shouldn't necessarily depend on that. |
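As a hedged sketch of what such an API could enable (the `StoreAndAdvance` name and its `ref`-pointer signature are hypothetical, not part of this proposal), a copy loop built on post-index-style stores might look like:

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static unsafe class PostIndexSketch
{
    // Hypothetical post-index store: writes the vector, then advances the base
    // pointer, mirroring "ST1 { Vt.16B }, [Xn], #16" with writeback.
    static void StoreAndAdvance(ref byte* address, Vector128<byte> value)
    {
        AdvSimd.Store(address, value);
        address += 16; // the JIT would fold this into the post-index writeback
    }

    public static void Copy(byte* src, byte* dst, int vectorCount)
    {
        for (int i = 0; i < vectorCount; i++)
        {
            Vector128<byte> v = AdvSimd.LoadVector128(src); // ideally: ld1 {v0.16b}, [x0], #16
            src += 16;
            StoreAndAdvance(ref dst, v);                    // ideally: st1 {v0.16b}, [x1], #16
        }
    }
}
```

The point of the sketch is that the pointer increments sit right next to the memory operations, which is the pattern writeback addressing encodes in a single instruction.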
It looks like writeback in this context is the modification of (depending on the post-index overload) |
That is, those inputs are effectively RMW? |
It's modification of |
Ah, yes, I see. The operation description in the manual has the following, and I misread the first if statement:
|
Given the vector instructions look to force |
Oh, nevermind. It's only forced to |
Hmmm, but the native intrinsics don't seem to have variants that take anything other than |
I would suspect that's the case.
I don't know why we'd need to have specific overloads for array[index] for this to be useful. In any event the memory hw intrinsics require a pointer, and it would seem that supporting post-indexing of that pointer would be desirable. |
I meant this more as: if we were to expose the |
I think it remains to be seen whether it would be better to expose this directly or to rely on the JIT to optimize it. But I suspect that the difficulties of determining that |
For one or two loads/stores I suspect it shouldn't matter all that much. When you have a lot of them reading from the same sources or using the same offsets, it becomes more of an issue: if you pick the wrong addressing mode you end up with more instructions and higher register pressure. For instance, if you fail to recognize that you can use a register offset or an immediate offset, you can end up using lots of adds to generate the address for a simpler addressing mode.
That said, C compilers routinely don't use the most efficient addressing modes and it hasn't hurt us terribly so far. In the grand scheme of things there are higher-priority optimization tasks, but recognizing the simple cases would be a good start, I think. |
```csharp
namespace System.Runtime.Intrinsics.Arm
{
    partial class AdvSimd.Arm64
    {
        public static unsafe void StorePair(byte* address, Vector64<byte> value1, Vector64<byte> value2);
        public static unsafe void StorePair(double* address, Vector64<double> value1, Vector64<double> value2);
        public static unsafe void StorePair(short* address, Vector64<short> value1, Vector64<short> value2);
        public static unsafe void StorePair(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePair(long* address, Vector64<long> value1, Vector64<long> value2);
        public static unsafe void StorePair(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
        public static unsafe void StorePair(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePair(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
        public static unsafe void StorePair(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePair(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
        public static unsafe void StorePair(byte* address, Vector128<byte> value1, Vector128<byte> value2);
        public static unsafe void StorePair(double* address, Vector128<double> value1, Vector128<double> value2);
        public static unsafe void StorePair(short* address, Vector128<short> value1, Vector128<short> value2);
        public static unsafe void StorePair(int* address, Vector128<int> value1, Vector128<int> value2);
        public static unsafe void StorePair(long* address, Vector128<long> value1, Vector128<long> value2);
        public static unsafe void StorePair(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
        public static unsafe void StorePair(float* address, Vector128<float> value1, Vector128<float> value2);
        public static unsafe void StorePair(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
        public static unsafe void StorePair(uint* address, Vector128<uint> value1, Vector128<uint> value2);
        public static unsafe void StorePair(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
        public static unsafe void StorePairScalar(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairScalar(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairScalar(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePairNonTemporal(byte* address, Vector64<byte> value1, Vector64<byte> value2);
        public static unsafe void StorePairNonTemporal(double* address, Vector64<double> value1, Vector64<double> value2);
        public static unsafe void StorePairNonTemporal(short* address, Vector64<short> value1, Vector64<short> value2);
        public static unsafe void StorePairNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairNonTemporal(long* address, Vector64<long> value1, Vector64<long> value2);
        public static unsafe void StorePairNonTemporal(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
        public static unsafe void StorePairNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairNonTemporal(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
        public static unsafe void StorePairNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePairNonTemporal(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
        public static unsafe void StorePairNonTemporal(byte* address, Vector128<byte> value1, Vector128<byte> value2);
        public static unsafe void StorePairNonTemporal(double* address, Vector128<double> value1, Vector128<double> value2);
        public static unsafe void StorePairNonTemporal(short* address, Vector128<short> value1, Vector128<short> value2);
        public static unsafe void StorePairNonTemporal(int* address, Vector128<int> value1, Vector128<int> value2);
        public static unsafe void StorePairNonTemporal(long* address, Vector128<long> value1, Vector128<long> value2);
        public static unsafe void StorePairNonTemporal(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
        public static unsafe void StorePairNonTemporal(float* address, Vector128<float> value1, Vector128<float> value2);
        public static unsafe void StorePairNonTemporal(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
        public static unsafe void StorePairNonTemporal(uint* address, Vector128<uint> value1, Vector128<uint> value2);
        public static unsafe void StorePairNonTemporal(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
        public static unsafe void StorePairScalarNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairScalarNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairScalarNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
    }
}
```
|
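A minimal usage sketch of the proposed `StorePair` surface (the fallback path is illustrative only and assumes `dst` is suitably aligned):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static unsafe class StorePairExample
{
    // Store two 128-bit vectors; with the pair intrinsic this becomes a
    // single instruction: stp q0, q1, [x0]
    public static void WriteTwo(double* dst, Vector128<double> a, Vector128<double> b)
    {
        if (AdvSimd.Arm64.IsSupported)
        {
            AdvSimd.Arm64.StorePair(dst, a, b);
        }
        else
        {
            // Illustrative fallback: two separate 16-byte stores.
            *(Vector128<double>*)dst = a;
            *((Vector128<double>*)dst + 1) = b;
        }
    }
}
```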
@TamarChristinaArm I started implementing StorePair and realized that my original statement above was wrong: VSTM can only store a list of consecutively numbered D-registers, while STP can store an arbitrary pair of registers, so they are not equivalent. I believe the intrinsics in this PR should be Arm64-only. Do you agree? Out of curiosity, why are there no C/C++ intrinsics that store a pair of SIMD/FP registers? |
@echesakovMSFT
Well, sort of. So the ... But yes, in the context of not having convenience intrinsics in CoreCLR, I agree that these need to be different intrinsics. (In case you're wondering: in C, on AArch32, we would have done this by putting the values in a struct in the definition of the intrinsic before expanding to STM. This usually wouldn't produce any extra moves, as the register allocator will, when possible, arrange the values in the right registers immediately, and the struct is optimized away.)
The belief is that you don't need them, and that the compiler should always be able to form pairs when it's possible. To do this, both LLVM and GCC have special passes that aid in pair formation.
In GCC, for instance, we have a scheduler fusion pass that allows the instruction scheduler to move consecutive loads and stores next to each other when the pipeline description says it makes sense based on the data dependencies etc.; i.e. we won't move them if you can't form pairs, so that you don't overload your pipelines with a long chain of loads/stores. After this we peephole them into pairs. After this we have a late scheduling pass that is able to schedule the formed pairs better so that, again, you don't end up with a long chain of them in your pipeline.
Another way it deals with this is that we have modes that are larger than a machine integer register, e.g. |
@TamarChristinaArm Thank you for your thorough reply! |
RyuJIT doesn't have a scheduling pass, nor do we have any peephole-like phases that, for example, use a sliding window of instructions to analyze for optimizations such as this. Not to mention that we have only a very limited capability for dependence analysis to identify interfering memory operations. So the only near-term feasible optimization would be for immediately adjacent instructions. |
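As an illustration of that near-term case, the candidate pattern would be two textually adjacent stores to provably consecutive addresses (a sketch only; the assembly in the comments is what such a merge could produce):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static unsafe class PeepholeCandidate
{
    public static void StoreTwo(double* address, Vector128<double> value1, Vector128<double> value2)
    {
        AdvSimd.Store(address, value1);      // str q0, [x0]
        AdvSimd.Store(address + 2, value2);  // str q1, [x0, #16]
        // An adjacent-instruction peephole could merge these into:
        //   stp q0, q1, [x0]
    }
}
```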