Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add to std.complex some optional high level SIMD code #9994

Open
dlangBugzillaToGithub opened this issue Jul 23, 2013 · 0 comments
Open

Add to std.complex some optional high level SIMD code #9994

dlangBugzillaToGithub opened this issue Jul 23, 2013 · 0 comments

Comments

@dlangBugzillaToGithub
Copy link

bearophile_hugs reported this on 2013-07-23T13:41:33Z

Transfered from https://issues.dlang.org/show_bug.cgi?id=10707

CC List

Description

This simple D benchmark shows a complex number multiplication done with std.complex, using double2 and SIMD instructions, and the same wrapped in a struct:


import std.stdio, core.simd, ldc.gccbuiltins_x86, std.complex;

alias ComplexD = Complex!double;

ComplexD mult1(in ComplexD x, in ComplexD y) {
    return x * y;
}

double2 mult2(double2 a, double2 b) {
    double2 b_flip = [b.array[1], b.array[0]]; // Swap b.re and b.im
    double2 a_im   = [a.array[1], a.array[1]]; // Imag. part of a in both
    double2 a_re   = [a.array[0], a.array[0]]; // Real part of a in both
    double2 aib    = a_im * b_flip;            // (a.im*b.im, a.im*b.re)
    double2 arb    = a_re * b;                 // (a.re*b.re, a.re*b.im)
    return __builtin_ia32_addsubpd(arb, aib);  // subtract/add
}

struct ComplexS {
    union {
        double2 x;
        struct {
            double re, im;
        }
    }
    alias x this;
}

ComplexS mult3(ComplexS a, ComplexS b) {
    double2 b_flip = [b.im, b.re];
    double2 a_im   = [a.im, a.im];
    double2 a_re   = [a.re, a.re];
    double2 aib    = a_im * b_flip;
    double2 arb    = a_re * b;
    return ComplexS(__builtin_ia32_addsubpd(arb, aib));
}

void main() {
    const c1 = ComplexD(1.0, 30.0);
    const c2 = ComplexD(500.0, 7000.0);
    mult1(c1, c2).writeln;

    double2 x1 = [1.0, 30.0];
    double2 x2 = [500.0, 7000.0];
    mult2(x1, x2).array.writeln;

    auto x1s = ComplexS([1.0, 30.0]);
    auto x2s = ComplexS([500.0, 7000.0]);
    mult3(x1s, x2s).array.writeln;
}



If I compile the code with a compiler designed for floating point performance (ldc2 v. 0.11.0 based on DMD v2.062 and LLVM 3.3svn) with:

ldmd2 -O -release -inline -noboundscheck -mattr=sse3 -output-s test_complex.d


I get the 32bit x86 with SS3 asm:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    movsd   20(%esp), %xmm0
    movsd   28(%esp), %xmm3
    movsd   4(%esp),  %xmm1
    movsd   12(%esp), %xmm2
    movaps  %xmm2,    %xmm4
    mulsd   %xmm3,    %xmm4
    movaps  %xmm1,    %xmm5
    mulsd   %xmm0,    %xmm5
    subsd   %xmm4,    %xmm5
    movsd   %xmm5,    (%eax)
    mulsd   %xmm3,    %xmm1
    mulsd   %xmm0,    %xmm2
    addsd   %xmm1,    %xmm2
    movsd   %xmm2,    8(%eax)
    ret $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    pshufd   $238,  %xmm1, %xmm3
    pshufd   $78,   %xmm0, %xmm2
    mulpd    %xmm3, %xmm2
    pshufd   $68,   %xmm1, %xmm1
    mulpd    %xmm0, %xmm1
    addsubpd %xmm2, %xmm1
    movapd   %xmm1, %xmm0
    ret

__D12test_complex5mult3FS12test_complex8ComplexSS12test_complex8ComplexSZS12test_complex8ComplexS:
    pushl    %ebp
    movl     %esp,     %ebp
    andl     $-16,     %esp
    subl     $16,      %esp
    movsd    16(%ebp), %xmm1
    movhpd   8(%ebp),  %xmm1
    movddup  32(%ebp), %xmm0
    mulpd    %xmm1,    %xmm0
    movddup  24(%ebp), %xmm1
    mulpd    8(%ebp),  %xmm1
    addsubpd %xmm0,    %xmm1
    movupd   %xmm1,    (%eax)
    movl     %ebp,      %esp
    popl     %ebp
    ret $32






Notes:
- mult2 is quite more efficient than mult1.
- Maybe there is a way to remove one asm instruction from mult2, the last movapd, if the precedent instruction addsubpd is done on %xmm2 and %xmm0 and other instruction registers are swapped.
- Maybe the struct ComplexS is badly designed.


For that code I have used the SS3 SIMD extension, that today is very common for all kind of Intel compatible CPUs. Using the AVX2 ldc2 produces:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd  20(%esp), %xmm0
    vmovsd  28(%esp), %xmm1
    vmovsd  4(%esp),  %xmm2
    vmovsd  12(%esp), %xmm3
    vmulsd  %xmm1,    %xmm3,  %xmm4
    vmulsd  %xmm0,    %xmm2,  %xmm5
    vsubsd  %xmm4,    %xmm5,  %xmm4
    vmovsd  %xmm4,    (%eax) 
    vmulsd  %xmm1,    %xmm2,  %xmm1
    vmulsd  %xmm0,    %xmm3,  %xmm0
    vaddsd  %xmm1,    %xmm0,  %xmm0
    vmovsd  %xmm0,    8(%eax)
    ret $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    vpermilpd       $3, %xmm1, %xmm2
    vpermilpd       $1, %xmm0, %xmm3
    vmulpd       %xmm2, %xmm3, %xmm2
    vpbroadcastq %xmm1, %xmm1
    vmulpd       %xmm0, %xmm1, %xmm0
    vaddsubpd    %xmm2, %xmm0, %xmm0
    ret


If I add -vectorize-slp -vectorize-slp-aggressive to the compilation switches the asm of mult1 gets a bit better:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd    20(%esp), %xmm0
    vmovsd    28(%esp), %xmm1
    vmovsd     4(%esp), %xmm2
    vmovsd    12(%esp), %xmm3
    vmulsd       %xmm1, %xmm2, %xmm4
    vmulsd       %xmm0, %xmm3, %xmm5
    vaddsd       %xmm4, %xmm5, %xmm4
    vmulsd       %xmm1, %xmm3, %xmm1
    vmulsd       %xmm0, %xmm2, %xmm0
    vsubsd       %xmm1, %xmm0, %xmm0
    vunpcklpd    %xmm4, %xmm0, %xmm0
    vmovupd      %xmm0, (%eax)
    ret $32


So I suggest to add to std.complex some high level SIMD code like mult2, that gets compiled if the target CPU supports SS3 instructions.
@LightBender LightBender removed the P4 label Dec 6, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants