From 57ca31d159e97dc3bdeee2bf9d3c0d7943f7858f Mon Sep 17 00:00:00 2001 From: Chad Kunde Date: Tue, 22 Aug 2017 11:51:43 -0700 Subject: [PATCH] asm/c128: Asm dot product functions. --- internal/asm/c128/dotcinc_amd64.s | 153 ++++++++++++++++++++++++++ internal/asm/c128/dotcunitary_amd64.s | 143 ++++++++++++++++++++++++ internal/asm/c128/dotuinc_amd64.s | 141 ++++++++++++++++++++++++ internal/asm/c128/dotuunitary_amd64.s | 130 ++++++++++++++++++++++ 4 files changed, 567 insertions(+) create mode 100644 internal/asm/c128/dotcinc_amd64.s create mode 100644 internal/asm/c128/dotcunitary_amd64.s create mode 100644 internal/asm/c128/dotuinc_amd64.s create mode 100644 internal/asm/c128/dotuunitary_amd64.s diff --git a/internal/asm/c128/dotcinc_amd64.s b/internal/asm/c128/dotcinc_amd64.s new file mode 100644 index 0000000000..9146f4ec5b --- /dev/null +++ b/internal/asm/c128/dotcinc_amd64.s @@ -0,0 +1,153 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build !noasm,!appengine + +#include "textflag.h" + +#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3 +#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5 +#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7 +#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9 + +#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2 +#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4 +#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6 +#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) +TEXT ·DotcInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ n+48(FP), LEN // LEN = n + PXOR SUM, SUM // SUM = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128) + SHLQ $4, INC_X + MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128) + SHLQ $4, INC_Y + LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // INC_X = incX + SHLQ $4, INC_X // INC_X *= sizeof(complex128) + MOVQ incY+64(FP), INC_Y // INC_Y = incY + SHLQ $4, INC_Y // INC_Y *= sizeof(complex128) + MOVSD $(-1.0), NEG1 + SHUFPD $0, NEG1, NEG1 // { -1, -1 } + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ dot_tail // if n <= 4 { goto dot_tail } + MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128) + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128) + +dot_loop: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_XPTR_INCX__X5 + MOVDDUP_XPTR_INCX_2__X7 + MOVDDUP_XPTR_INCx3X__X9 + + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_8_XPTR_INCX__X4 + MOVDDUP_8_XPTR_INCX_2__X6 + MOVDDUP_8_XPTR_INCx3X__X8 + + // X_i = { -imag(x[i]), -imag(x[i]) } + MULPD NEG1, X2 + MULPD P_NEG1, X4 + MULPD NEG1, X6 + MULPD P_NEG1, X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR), X10 + MOVUPS (Y_PTR)(INC_Y*1), X11 + MOVUPS (Y_PTR)(INC_Y*2), X12 + MOVUPS (Y_PTR)(INCx3_Y*1), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + ADDPD P_SUM, SUM // sum += psum + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // sum += result[i] + ADDQ INC_X, X_PTR // X_PTR += incX + ADDQ INC_Y, Y_PTR // Y_PTR += incY + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVUPS SUM, sum+88(FP) + RET diff --git a/internal/asm/c128/dotcunitary_amd64.s b/internal/asm/c128/dotcunitary_amd64.s new file mode 100644 index 0000000000..f16b20102b --- /dev/null +++ b/internal/asm/c128/dotcunitary_amd64.s @@ -0,0 +1,143 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build !noasm,!appengine + +#include "textflag.h" + +#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3 +#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5 +#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7 +#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9 + +#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2 +#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4 +#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6 +#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define IDX AX +#define I_IDX DX +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotcUnitary(x, y []complex128) (sum complex128) +TEXT ·DotcUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + PXOR SUM, SUM // sum = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + XORPS P_SUM, P_SUM // psum = 0 + MOVSD $(-1.0), NEG1 + SHUFPD $0, NEG1, NEG1 // { -1, -1 } + XORQ IDX, IDX // i := 0 + MOVQ $1, I_IDX // j := 1 + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = floor( TAIL / 4 ) + SHRQ $2, LEN // LEN = TAIL % 4 + JZ dot_tail // if LEN == 0 { goto dot_tail } + + MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining + +dot_loop: // do { + MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_16_XPTR_IDX_8__X5 + MOVDDUP_32_XPTR_IDX_8__X7 + MOVDDUP_48_XPTR_IDX_8__X9 + + MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_16_XPTR_IIDX_8__X4 + MOVDDUP_32_XPTR_IIDX_8__X6 + MOVDDUP_48_XPTR_IIDX_8__X8 + + // X_i = { -imag(x[i]), -imag(x[i]) } + MULPD NEG1, X2 + MULPD P_NEG1, X4 + MULPD NEG1, X6 + MULPD P_NEG1, X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR)(IDX*8), X10 + MOVUPS 16(Y_PTR)(IDX*8), X11 + MOVUPS 32(Y_PTR)(IDX*8), X12 + MOVUPS 48(Y_PTR)(IDX*8), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + ADDQ $8, IDX // IDX += 8 + ADDQ $8, I_IDX // I_IDX += 8 + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + ADDPD P_SUM, SUM // sum += psum + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) } + MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) } + MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) } + MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // SUM += result[i] + ADDQ $2, IDX // IDX += 2 + ADDQ $2, I_IDX // I_IDX += 2 + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVUPS SUM, sum+48(FP) + RET diff --git a/internal/asm/c128/dotuinc_amd64.s b/internal/asm/c128/dotuinc_amd64.s new file mode 100644 index 0000000000..814569460b --- /dev/null +++ b/internal/asm/c128/dotuinc_amd64.s @@ -0,0 +1,141 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build !noasm,!appengine + +#include "textflag.h" + +#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3 +#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5 +#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7 +#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9 + +#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2 +#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4 +#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6 +#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 + +// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) +TEXT ·DotuInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ n+48(FP), LEN // LEN = n + PXOR SUM, SUM // sum = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128) + SHLQ $4, INC_X + MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128) + SHLQ $4, INC_Y + LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // INC_X = incX + SHLQ $4, INC_X // INC_X *= sizeof(complex128) + MOVQ incY+64(FP), INC_Y // INC_Y = incY + SHLQ $4, INC_Y // INC_Y *= sizeof(complex128) + MOVQ LEN, TAIL + ANDQ $3, TAIL // LEN = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dot_tail // if LEN <= 4 { goto dot_tail } + PXOR P_SUM, P_SUM // psum = 0 + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128) + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128) + +dot_loop: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_XPTR_INCX__X5 + MOVDDUP_XPTR_INCX_2__X7 + MOVDDUP_XPTR_INCx3X__X9 + + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_8_XPTR_INCX__X4 + MOVDDUP_8_XPTR_INCX_2__X6 + MOVDDUP_8_XPTR_INCx3X__X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR), X10 + MOVUPS (Y_PTR)(INC_Y*1), X11 + MOVUPS (Y_PTR)(INC_Y*2), X12 + MOVUPS (Y_PTR)(INCx3_Y*1), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + + DECQ LEN + JNZ dot_loop // } while --BX > 0 + ADDPD P_SUM, SUM // sum += psum + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // sum += result[i] + ADDQ INC_X, X_PTR // X_PTR += incX + ADDQ INC_Y, Y_PTR // Y_PTR += incY + DECQ TAIL // --TAIL + JNZ dot_tail // } while TAIL > 0 + +dot_end: + MOVUPS SUM, sum+88(FP) + RET diff --git a/internal/asm/c128/dotuunitary_amd64.s b/internal/asm/c128/dotuunitary_amd64.s new file mode 100644 index 0000000000..cbeacdf6af --- /dev/null +++ b/internal/asm/c128/dotuunitary_amd64.s @@ -0,0 +1,130 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build !noasm,!appengine + +#include "textflag.h" + +#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3 +#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5 +#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7 +#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9 + +#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2 +#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4 +#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6 +#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define IDX AX +#define I_IDX DX + +// func DotuUnitary(x, y []complex128) (sum complex128) +TEXT ·DotuUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + PXOR SUM, SUM // SUM = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + PXOR P_SUM, P_SUM // P_SUM = 0 + XORQ IDX, IDX // IDX = 0 + MOVQ $1, DX // j = 1 + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = floor( LEN / 4 ) + SHRQ $2, LEN // LEN = LEN % 4 + JZ dot_tail // if LEN == 0 { goto dot_tail } + +dot_loop: // do { + MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_16_XPTR_IDX_8__X5 + MOVDDUP_32_XPTR_IDX_8__X7 + MOVDDUP_48_XPTR_IDX_8__X9 + + MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_16_XPTR_IIDX_8__X4 + MOVDDUP_32_XPTR_IIDX_8__X6 + MOVDDUP_48_XPTR_IIDX_8__X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR)(IDX*8), X10 + MOVUPS 16(Y_PTR)(IDX*8), X11 + MOVUPS 32(Y_PTR)(IDX*8), X12 + MOVUPS 48(Y_PTR)(IDX*8), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + ADDQ $8, IDX // IDX += 8 + ADDQ $8, I_IDX // I_IDX += 8 + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + ADDPD P_SUM, SUM // SUM += P_SUM + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i] , real(x[i]) } + MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) } + MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // psum += result[i] + ADDQ $2, IDX // IDX += 2 + ADDQ $2, I_IDX // I_IDX += 2 + DECQ TAIL // --TAIL + JNZ dot_tail // } while TAIL > 0 + +dot_end: + MOVUPS SUM, sum+48(FP) + RET