bearophile_hugs reported this on 2013-07-23T13:41:33Z
Transferred from https://issues.dlang.org/show_bug.cgi?id=10707
Description
This simple D benchmark shows a complex number multiplication done three ways: with std.complex, with double2 and SIMD intrinsics, and with the SIMD vector wrapped in a struct:

```d
import std.stdio, core.simd, ldc.gccbuiltins_x86, std.complex;

alias ComplexD = Complex!double;

ComplexD mult1(in ComplexD x, in ComplexD y) {
    return x * y;
}

double2 mult2(double2 a, double2 b) {
    double2 b_flip = [b.array[1], b.array[0]]; // Swap b.re and b.im
    double2 a_im = [a.array[1], a.array[1]];   // Imag. part of a in both
    double2 a_re = [a.array[0], a.array[0]];   // Real part of a in both
    double2 aib = a_im * b_flip;               // (a.im*b.im, a.im*b.re)
    double2 arb = a_re * b;                    // (a.re*b.re, a.re*b.im)
    return __builtin_ia32_addsubpd(arb, aib);  // subtract/add
}

struct ComplexS {
    union {
        double2 x;
        struct { double re, im; }
    }
    alias x this;
}

ComplexS mult3(ComplexS a, ComplexS b) {
    double2 b_flip = [b.im, b.re];
    double2 a_im = [a.im, a.im];
    double2 a_re = [a.re, a.re];
    double2 aib = a_im * b_flip;
    double2 arb = a_re * b;
    return ComplexS(__builtin_ia32_addsubpd(arb, aib));
}

void main() {
    const c1 = ComplexD(1.0, 30.0);
    const c2 = ComplexD(500.0, 7000.0);
    mult1(c1, c2).writeln;

    double2 x1 = [1.0, 30.0];
    double2 x2 = [500.0, 7000.0];
    mult2(x1, x2).array.writeln;

    auto x1s = ComplexS([1.0, 30.0]);
    auto x2s = ComplexS([500.0, 7000.0]);
    mult3(x1s, x2s).array.writeln;
}
```

If I compile the code with a compiler designed for floating-point performance (ldc2 v0.11.0, based on DMD v2.062 and LLVM 3.3svn) with:

```
ldmd2 -O -release -inline -noboundscheck -mattr=sse3 -output-s test_complex.d
```

I get this 32-bit x86 SSE3 asm:

```asm
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    movsd   20(%esp), %xmm0
    movsd   28(%esp), %xmm3
    movsd   4(%esp), %xmm1
    movsd   12(%esp), %xmm2
    movaps  %xmm2, %xmm4
    mulsd   %xmm3, %xmm4
    movaps  %xmm1, %xmm5
    mulsd   %xmm0, %xmm5
    subsd   %xmm4, %xmm5
    movsd   %xmm5, (%eax)
    mulsd   %xmm3, %xmm1
    mulsd   %xmm0, %xmm2
    addsd   %xmm1, %xmm2
    movsd   %xmm2, 8(%eax)
    ret     $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    pshufd  $238, %xmm1, %xmm3
    pshufd  $78, %xmm0, %xmm2
    mulpd   %xmm3, %xmm2
    pshufd  $68, %xmm1, %xmm1
    mulpd   %xmm0, %xmm1
    addsubpd %xmm2, %xmm1
    movapd  %xmm1, %xmm0
    ret

__D12test_complex5mult3FS12test_complex8ComplexSS12test_complex8ComplexSZS12test_complex8ComplexS:
    pushl   %ebp
    movl    %esp, %ebp
    andl    $-16, %esp
    subl    $16, %esp
    movsd   16(%ebp), %xmm1
    movhpd  8(%ebp), %xmm1
    movddup 32(%ebp), %xmm0
    mulpd   %xmm1, %xmm0
    movddup 24(%ebp), %xmm1
    mulpd   8(%ebp), %xmm1
    addsubpd %xmm0, %xmm1
    movupd  %xmm1, (%eax)
    movl    %ebp, %esp
    popl    %ebp
    ret     $32
```

Notes:
- mult2 is considerably more efficient than mult1.
- There may be a way to remove one instruction from mult2, the final movapd, if the preceding addsubpd is performed on %xmm2 and %xmm0 and the registers of the other instructions are swapped accordingly.
- Maybe the struct ComplexS is badly designed.

For this code I have used the SSE3 SIMD extension, which today is very common across all kinds of Intel-compatible CPUs.
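For reference, `addsubpd` subtracts in the low lane and adds in the high lane, so mult2 computes (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re), the textbook complex product. Here is a minimal sanity check of that lane layout, assuming it is appended to the benchmark module above; this unittest is illustrative and not part of the original report:

```d
// Illustrative check (not in the original benchmark): mult2's addsubpd
// lane layout must match std.complex scalar multiplication.
unittest {
    double2 a = [1.0, 30.0];     // (re, im) of the first operand
    double2 b = [500.0, 7000.0]; // (re, im) of the second operand
    double[2] r = mult2(a, b).array;

    // Low lane: re*re - im*im; high lane: re*im + im*re.
    assert(r[] == [1.0 * 500.0 - 30.0 * 7000.0,
                   1.0 * 7000.0 + 30.0 * 500.0]);

    // Agrees with std.complex on the same operands.
    const z = ComplexD(1.0, 30.0) * ComplexD(500.0, 7000.0);
    assert(r[0] == z.re && r[1] == z.im);
}
```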
Targeting AVX2, ldc2 produces:

```asm
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd  20(%esp), %xmm0
    vmovsd  28(%esp), %xmm1
    vmovsd  4(%esp), %xmm2
    vmovsd  12(%esp), %xmm3
    vmulsd  %xmm1, %xmm3, %xmm4
    vmulsd  %xmm0, %xmm2, %xmm5
    vsubsd  %xmm4, %xmm5, %xmm4
    vmovsd  %xmm4, (%eax)
    vmulsd  %xmm1, %xmm2, %xmm1
    vmulsd  %xmm0, %xmm3, %xmm0
    vaddsd  %xmm1, %xmm0, %xmm0
    vmovsd  %xmm0, 8(%eax)
    ret     $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    vpermilpd $3, %xmm1, %xmm2
    vpermilpd $1, %xmm0, %xmm3
    vmulpd  %xmm2, %xmm3, %xmm2
    vpbroadcastq %xmm1, %xmm1
    vmulpd  %xmm0, %xmm1, %xmm0
    vaddsubpd %xmm2, %xmm0, %xmm0
    ret
```

If I add `-vectorize-slp -vectorize-slp-aggressive` to the compilation switches, the asm of mult1 gets a bit better:

```asm
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd  20(%esp), %xmm0
    vmovsd  28(%esp), %xmm1
    vmovsd  4(%esp), %xmm2
    vmovsd  12(%esp), %xmm3
    vmulsd  %xmm1, %xmm2, %xmm4
    vmulsd  %xmm0, %xmm3, %xmm5
    vaddsd  %xmm4, %xmm5, %xmm4
    vmulsd  %xmm1, %xmm3, %xmm1
    vmulsd  %xmm0, %xmm2, %xmm0
    vsubsd  %xmm1, %xmm0, %xmm0
    vunpcklpd %xmm4, %xmm0, %xmm0
    vmovupd %xmm0, (%eax)
    ret     $32
```

So I suggest adding to std.complex some high-level SIMD code like mult2, compiled in when the target CPU supports SSE3 instructions.
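A minimal sketch of what such a gated path could look like, assuming LDC as the compiler. The helper name `mulSSE3`, the module name, and the `version (LDC)` guard are illustrative, not an actual Phobos patch; a real change would also have to confirm that the build targets SSE3-capable x86, which the guard alone does not:

```d
module complex_sse3_sketch;

import std.complex : Complex;

version (LDC)
{
    import core.simd : double2;
    import ldc.gccbuiltins_x86 : __builtin_ia32_addsubpd;

    // SSE3 path: the same shuffle/multiply/addsub sequence as mult2 above,
    // packaged behind the std.complex type.
    Complex!double mulSSE3(in Complex!double x, in Complex!double y)
    {
        double2 a = [x.re, x.im];
        double2 b = [y.re, y.im];
        double2 b_flip = [b.array[1], b.array[0]]; // (y.im, y.re)
        double2 a_im = [a.array[1], a.array[1]];   // x.im in both lanes
        double2 a_re = [a.array[0], a.array[0]];   // x.re in both lanes
        double2 r = __builtin_ia32_addsubpd(a_re * b, a_im * b_flip);
        return Complex!double(r.array[0], r.array[1]);
    }
}
else
{
    // Portable fallback: the existing scalar multiplication.
    Complex!double mulSSE3(in Complex!double x, in Complex!double y)
    {
        return x * y;
    }
}
```

Under the same guard, Complex's opBinary!"*" could forward to such a helper, so user code keeps the ordinary `x * y` syntax while getting the SSE3 sequence where it is available.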