Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Allow strength reducing to GCD of IVs #110222

Merged
merged 4 commits into dotnet:main from strength-reduce-gcd
Dec 4, 2024

Conversation

jakobbotsch
Copy link
Member

@jakobbotsch jakobbotsch commented Nov 27, 2024

This adds support for strength reduction to create a new primary IV that is the GCD of several IVs found in the loop. When the same index is used to access arrays of different sizes we will often see the IV being multiplied by different values; however, it is usually still profitable to strength reduce to the GCD of the step values and then "recover" the final IV by scaling.

Example:

public static void Foo()
{
    string puzzle = "003020600900305001001806400008102900700000008006708200002609500800203009005010300";
    int[] board = new int[81];

    for (int i = 0; i < puzzle.Length; i++)
    {
        board[i] = puzzle[i] - '0';
    }
}

Codegen diff for loop:

        xor      ecx, ecx
+       mov      edx, 81

 G_M24659_IG03:
-       mov      edx, ecx
-       movzx    r8, word  ptr [rbx+2*rdx+0x10]
+       movzx    r8, word  ptr [rbx+rcx+0x10]
        add      r8d, -48
-       mov      dword ptr [rax+4*rdx+0x10], r8d
-       inc      ecx
-       cmp      ecx, 81
-       jl       SHORT G_M24659_IG03
-						;; size=24 bbWeight=3.96 PerfScore 19.80
+       mov      dword ptr [rax+2*rcx+0x10], r8d
+       add      rcx, 2
+       dec      edx
+       jne      SHORT G_M24659_IG03
+						;; size=23 bbWeight=3.96 PerfScore 18.81

A similar diff in System.Linq.Enumerable+EnumerableSorter`2[System.__Canon,System.Decimal]:ComputeKeys(System.__Canon[],int):

+       xor      edx, edx
 G_M57524_IG05:
-       mov      edx, r15d
-       mov      r8, gword ptr [rbx+8*rdx+0x10]
+       mov      r8, gword ptr [rbx+rdx+0x10]
        vmovups  xmm0, xmmword ptr [r8+0x20]
        vmovups  xmmword ptr [rsp+0x28], xmm0
-       shl      rdx, 4
        vmovups  xmm0, xmmword ptr [rsp+0x28]
-       vmovups  xmmword ptr [r14+rdx+0x10], xmm0
-       inc      r15d
-       cmp      r13d, r15d
-       jg       SHORT G_M57524_IG05
-						;; size=45 bbWeight=75.73 PerfScore 1079.10
+       vmovups  xmmword ptr [r14+2*rdx+0x10], xmm0
+       add      rdx, 8
+       dec      r13d
+       jne      SHORT G_M57524_IG05
+						;; size=39 bbWeight=75.73 PerfScore 1022.31

Fix #102068
Fix #105241

This adds support for strength reduction to create a new primary IV that
is the GCD of several IVs found in the loop. When the same index is used
to access arrays of different sizes we will often see the IV being
multiplied by different values; however, it is usually still profitable
to strength reduce to the GCD of the step values and then "recover" the
final IV by scaling.

Example:
```csharp
public static void Foo()
{
    string puzzle = "003020600900305001001806400008102900700000008006708200002609500800203009005010300";
    int[] board = new int[81];

    for (int i = 0; i < puzzle.Length; i++)
    {
        board[i] = puzzle[i] - '0';
    }
}
```

Codegen diff for loop:
```diff
        xor      ecx, ecx
+       mov      edx, 81

 G_M24659_IG03:
-       mov      edx, ecx
-       movzx    r8, word  ptr [rbx+2*rdx+0x10]
+       movzx    r8, word  ptr [rbx+rcx+0x10]
        add      r8d, -48
-       mov      dword ptr [rax+4*rdx+0x10], r8d
-       inc      ecx
-       cmp      ecx, 81
-       jl       SHORT G_M24659_IG03
-						;; size=24 bbWeight=3.96 PerfScore 19.80
+       mov      dword ptr [rax+2*rcx+0x10], r8d
+       add      rcx, 2
+       dec      edx
+       jne      SHORT G_M24659_IG03
+						;; size=23 bbWeight=3.96 PerfScore 18.81
```

A similar diff in ``System.Linq.Enumerable+EnumerableSorter`2[System.__Canon,System.Decimal]:ComputeKeys(System.__Canon[],int)``:
```diff
+       xor      edx, edx
 G_M57524_IG05:
-       mov      edx, r15d
-       mov      r8, gword ptr [rbx+8*rdx+0x10]
+       mov      r8, gword ptr [rbx+rdx+0x10]
        vmovups  xmm0, xmmword ptr [r8+0x20]
        vmovups  xmmword ptr [rsp+0x28], xmm0
-       shl      rdx, 4
        vmovups  xmm0, xmmword ptr [rsp+0x28]
-       vmovups  xmmword ptr [r14+rdx+0x10], xmm0
-       inc      r15d
-       cmp      r13d, r15d
-       jg       SHORT G_M57524_IG05
-						;; size=45 bbWeight=75.73 PerfScore 1079.10
+       vmovups  xmmword ptr [r14+2*rdx+0x10], xmm0
+       add      rdx, 8
+       dec      r13d
+       jne      SHORT G_M57524_IG05
+						;; size=39 bbWeight=75.73 PerfScore 1022.31
```

Fix dotnet#102068
Fix dotnet#105241
@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Nov 27, 2024
@jakobbotsch
Copy link
Member Author

This needs some heuristics for arm64. There, strength reduction in this case can break address modes. For example, before:

G_M51654_IG04:  ;; offset=0x0048
            mov     w4, w1
            ldr     x5, [x2, x4, LSL #3]
            ldr     w5, [x5, #0x28]
            str     w5, [x3, x4, LSL #2]
            add     w1, w1, #1
            cmp     w20, w1
            bgt     G_M51654_IG04
						;; size=28 bbWeight=4.95 PerfScore 47.03

After:

G_M51654_IG04:  ;; offset=0x004C
            lsl     x4, x1, #1
            ldr     x4, [x2, x4]
            ldr     w4, [x4, #0x28]
            str     w4, [x3, x1]
            add     x1, x1, #4
            sub     w20, w20, #1
            cbnz    w20, G_M51654_IG04

Before, we have two address modes. However, ARM64 cannot represent an address mode with a scale that doesn't correspond to the element size, so in the "after" case it is no longer possible to represent any address modes.

@jakobbotsch jakobbotsch marked this pull request as ready for review December 3, 2024 14:21
@jakobbotsch
Copy link
Member Author

cc @dotnet/jit-contrib PTAL @AndyAyersMS

Diffs: this gets around a hundred cases in libraries_tests.run and a few dozen in aspnet/benchmarks. It also fixes the two motivating cases.

I investigated why we don't see more cases and opened #110315 for one of the reasons, but I think I'll save further investigation for another time.

allowRephrasingByScalingIV2);
}

if (iv2->Type == TYP_LONG)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this check iv2->Type when the previous check was against iv1->Type?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a typo, switched it to iv1.

@jakobbotsch jakobbotsch merged commit 949c3ec into dotnet:main Dec 4, 2024
103 of 108 checks passed
@jakobbotsch jakobbotsch deleted the strength-reduce-gcd branch December 4, 2024 10:17
eduardo-vp pushed a commit to eduardo-vp/runtime that referenced this pull request Dec 5, 2024
This adds support for strength reduction to create a new primary IV that
is the GCD of several IVs found in the loop. When the same index is used
to access arrays of different sizes we will often see the IV being
multiplied by different values; however, it is usually still profitable
to strength reduce to the GCD of the step values and then "recover" the
final IV by scaling.

Example:
```csharp
public static void Foo()
{
    string puzzle = "003020600900305001001806400008102900700000008006708200002609500800203009005010300";
    int[] board = new int[81];

    for (int i = 0; i < puzzle.Length; i++)
    {
        board[i] = puzzle[i] - '0';
    }
}
```

Codegen diff for loop:
```diff
        xor      ecx, ecx
+       mov      edx, 81

 G_M24659_IG03:
-       mov      edx, ecx
-       movzx    r8, word  ptr [rbx+2*rdx+0x10]
+       movzx    r8, word  ptr [rbx+rcx+0x10]
        add      r8d, -48
-       mov      dword ptr [rax+4*rdx+0x10], r8d
-       inc      ecx
-       cmp      ecx, 81
-       jl       SHORT G_M24659_IG03
-						;; size=24 bbWeight=3.96 PerfScore 19.80
+       mov      dword ptr [rax+2*rcx+0x10], r8d
+       add      rcx, 2
+       dec      edx
+       jne      SHORT G_M24659_IG03
+						;; size=23 bbWeight=3.96 PerfScore 18.81
```

A similar diff in ``System.Linq.Enumerable+EnumerableSorter`2[System.__Canon,System.Decimal]:ComputeKeys(System.__Canon[],int)``:
```diff
+       xor      edx, edx
 G_M57524_IG05:
-       mov      edx, r15d
-       mov      r8, gword ptr [rbx+8*rdx+0x10]
+       mov      r8, gword ptr [rbx+rdx+0x10]
        vmovups  xmm0, xmmword ptr [r8+0x20]
        vmovups  xmmword ptr [rsp+0x28], xmm0
-       shl      rdx, 4
        vmovups  xmm0, xmmword ptr [rsp+0x28]
-       vmovups  xmmword ptr [r14+rdx+0x10], xmm0
-       inc      r15d
-       cmp      r13d, r15d
-       jg       SHORT G_M57524_IG05
-						;; size=45 bbWeight=75.73 PerfScore 1079.10
+       vmovups  xmmword ptr [r14+2*rdx+0x10], xmm0
+       add      rdx, 8
+       dec      r13d
+       jne      SHORT G_M57524_IG05
+						;; size=39 bbWeight=75.73 PerfScore 1022.31
```

Fix dotnet#102068
Fix dotnet#105241
mikelle-rogers pushed a commit to mikelle-rogers/runtime that referenced this pull request Dec 10, 2024
This adds support for strength reduction to create a new primary IV that
is the GCD of several IVs found in the loop. When the same index is used
to access arrays of different sizes we will often see the IV being
multiplied by different values; however, it is usually still profitable
to strength reduce to the GCD of the step values and then "recover" the
final IV by scaling.

Example:
```csharp
public static void Foo()
{
    string puzzle = "003020600900305001001806400008102900700000008006708200002609500800203009005010300";
    int[] board = new int[81];

    for (int i = 0; i < puzzle.Length; i++)
    {
        board[i] = puzzle[i] - '0';
    }
}
```

Codegen diff for loop:
```diff
        xor      ecx, ecx
+       mov      edx, 81

 G_M24659_IG03:
-       mov      edx, ecx
-       movzx    r8, word  ptr [rbx+2*rdx+0x10]
+       movzx    r8, word  ptr [rbx+rcx+0x10]
        add      r8d, -48
-       mov      dword ptr [rax+4*rdx+0x10], r8d
-       inc      ecx
-       cmp      ecx, 81
-       jl       SHORT G_M24659_IG03
-						;; size=24 bbWeight=3.96 PerfScore 19.80
+       mov      dword ptr [rax+2*rcx+0x10], r8d
+       add      rcx, 2
+       dec      edx
+       jne      SHORT G_M24659_IG03
+						;; size=23 bbWeight=3.96 PerfScore 18.81
```

A similar diff in ``System.Linq.Enumerable+EnumerableSorter`2[System.__Canon,System.Decimal]:ComputeKeys(System.__Canon[],int)``:
```diff
+       xor      edx, edx
 G_M57524_IG05:
-       mov      edx, r15d
-       mov      r8, gword ptr [rbx+8*rdx+0x10]
+       mov      r8, gword ptr [rbx+rdx+0x10]
        vmovups  xmm0, xmmword ptr [r8+0x20]
        vmovups  xmmword ptr [rsp+0x28], xmm0
-       shl      rdx, 4
        vmovups  xmm0, xmmword ptr [rsp+0x28]
-       vmovups  xmmword ptr [r14+rdx+0x10], xmm0
-       inc      r15d
-       cmp      r13d, r15d
-       jg       SHORT G_M57524_IG05
-						;; size=45 bbWeight=75.73 PerfScore 1079.10
+       vmovups  xmmword ptr [r14+2*rdx+0x10], xmm0
+       add      rdx, 8
+       dec      r13d
+       jne      SHORT G_M57524_IG05
+						;; size=39 bbWeight=75.73 PerfScore 1022.31
```

Fix dotnet#102068
Fix dotnet#105241
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
2 participants