diff --git a/internal/asm/c128/dscalinc_amd64.s b/internal/asm/c128/dscalinc_amd64.s
new file mode 100644
index 000000000..778e2fabb
--- /dev/null
+++ b/internal/asm/c128/dscalinc_amd64.s
@@ -0,0 +1,69 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+TEXT ·DscalInc(SB), NOSPLIT, $0
+	MOVQ x_base+8(FP), SRC // SRC = &x
+	MOVQ n+32(FP), LEN     // LEN = n
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA           // ALPHA = alpha
+	MOVQ inc+40(FP), INC    // INC = inc
+	SHLQ $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
+	MOVUPS ALPHA, ALPHA_2   // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ LEN, TAIL          // TAIL = LEN
+	SHRQ $2, LEN            // LEN = floor( n / 4 )
+	JZ   dscal_tail         // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do {
+	MOVUPS (SRC), X2 // X_i = x[i]
+	MOVUPS (SRC)(INC*1), X3
+	MOVUPS (SRC)(INC*2), X4
+	MOVUPS (SRC)(INC3*1), X5
+
+	MULPD ALPHA, X2 // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST) // x[i] = X_i
+	MOVUPS X3, (DST)(INC*1)
+	MOVUPS X4, (DST)(INC*2)
+	MOVUPS X5, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC += INC*4
+	DECQ LEN
+	JNZ  dscal_loop // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = TAIL % 4
+	JE   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do {
+	MOVUPS (SRC), X2 // X_i = x[i]
+	MULPD  ALPHA, X2 // X_i *= ALPHA
+	MOVUPS X2, (DST) // x[i] = X_i
+	ADDQ   INC, SRC  // SRC += INC
+	DECQ   TAIL
+	JNZ    dscal_tail_loop // } while --TAIL > 0
+
+dscal_end:
+	RET
diff --git a/internal/asm/c128/dscalunitary_amd64.s b/internal/asm/c128/dscalunitary_amd64.s
new file mode 100644
index 000000000..32ef15461
--- /dev/null
+++ b/internal/asm/c128/dscalunitary_amd64.s
@@ -0,0 +1,66 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalUnitary(alpha float64, x []complex128)
+TEXT ·DscalUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+8(FP), SRC // SRC = &x
+	MOVQ x_len+16(FP), LEN // LEN = len(x)
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA         // ALPHA = alpha
+	XORQ IDX, IDX         // IDX = 0
+	MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ LEN, TAIL        // TAIL = LEN
+	SHRQ $2, LEN          // LEN = floor( n / 4 )
+	JZ   dscal_tail       // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do {
+	MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
+	MOVUPS 16(SRC)(IDX*8), X3
+	MOVUPS 32(SRC)(IDX*8), X4
+	MOVUPS 48(SRC)(IDX*8), X5
+
+	MULPD ALPHA, X2 // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
+	MOVUPS X3, 16(DST)(IDX*8)
+	MOVUPS X4, 32(DST)(IDX*8)
+	MOVUPS X5, 48(DST)(IDX*8)
+
+	ADDQ $8, IDX // IDX += 8
+	DECQ LEN
+	JNZ  dscal_loop // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = TAIL % 4
+	JZ   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do {
+	MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
+	MULPD  ALPHA, X2        // X_i *= ALPHA
+	MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
+	ADDQ   $2, IDX          // IDX += 2
+	DECQ   TAIL
+	JNZ    dscal_tail_loop // } while --TAIL > 0
+
+dscal_end:
+	RET
diff --git a/internal/asm/c128/scal.go b/internal/asm/c128/scal.go
index 9d26973df..580b4d8da 100644
--- a/internal/asm/c128/scal.go
+++ b/internal/asm/c128/scal.go
@@ -4,16 +4,6 @@
 
 package c128
 
-// ScalUnitary is
-//	for i := range x {
-//		x[i] *= alpha
-//	}
-func ScalUnitary(alpha complex128, x []complex128) {
-	for i := range x {
-		x[i] *= alpha
-	}
-}
-
 // ScalUnitaryTo is
 //	for i, v := range x {
 //		dst[i] = alpha * v
@@ -24,20 +14,6 @@ func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
 	}
 }
 
-// ScalInc is
-//	var ix uintptr
-//	for i := 0; i < int(n); i++ {
-//		x[ix] *= alpha
-//		ix += incX
-//	}
-func ScalInc(alpha complex128, x []complex128, n, incX uintptr) {
-	var ix uintptr
-	for i := 0; i < int(n); i++ {
-		x[ix] *= alpha
-		ix += incX
-	}
-}
-
 // ScalIncTo is
 //	var idst, ix uintptr
 //	for i := 0; i < int(n); i++ {
diff --git a/internal/asm/c128/scalUnitary_amd64.s b/internal/asm/c128/scalUnitary_amd64.s
new file mode 100644
index 000000000..22152a0ed
--- /dev/null
+++ b/internal/asm/c128/scalUnitary_amd64.s
@@ -0,0 +1,116 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalUnitary(alpha complex128, x []complex128)
+TEXT ·ScalUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+16(FP), SRC // SRC = &x
+	MOVQ x_len+24(FP), LEN  // LEN = len(x)
+	CMPQ LEN, $0            // if LEN == 0 { return }
+	JE   scal_end
+
+	MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	XORQ IDX, IDX        // IDX = 0
+	MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ LEN, TAIL
+	SHRQ $2, LEN   // LEN = floor( n / 4 )
+	JZ   scal_tail // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do {
+	MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SRC)(IDX*8), X4
+	MOVUPS 32(SRC)(IDX*8), X6
+	MOVUPS 48(SRC)(IDX*8), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
+	MOVUPS X5, 16(DST)(IDX*8)
+	MOVUPS X7, 32(DST)(IDX*8)
+	MOVUPS X9, 48(DST)(IDX*8)
+	ADDQ $8, IDX // IDX += 8
+	DECQ LEN
+	JNZ  scal_loop // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JZ   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do {
+	MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3           // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2     // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD ALPHA_C, X2       // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
+	MULPD ALPHA, X3         // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
+	ADDQ $2, IDX            // IDX += 2
+	DECQ TAIL
+	JNZ  scal_tail_loop // } while --TAIL > 0
+
+scal_end:
+	RET
diff --git a/internal/asm/c128/scalinc_amd64.s b/internal/asm/c128/scalinc_amd64.s
new file mode 100644
index 000000000..ec0f15285
--- /dev/null
+++ b/internal/asm/c128/scalinc_amd64.s
@@ -0,0 +1,121 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
+TEXT ·ScalInc(SB), NOSPLIT, $0
+	MOVQ x_base+16(FP), SRC // SRC = &x
+	MOVQ n+40(FP), LEN      // LEN = n
+	CMPQ LEN, $0
+	JE   scal_end // if LEN == 0 { return }
+
+	MOVQ inc+48(FP), INC    // INC = inc
+	SHLQ $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
+
+	MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ LEN, TAIL
+	SHRQ $2, LEN   // LEN = floor( n / 4 )
+	JZ   scal_tail // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do {
+	MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SRC)(INC*1), X4
+	MOVUPS (SRC)(INC*2), X6
+	MOVUPS (SRC)(INC3*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST) // x[i] = X_(i+1)
+	MOVUPS X5, (DST)(INC*1)
+	MOVUPS X7, (DST)(INC*2)
+	MOVUPS X9, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4])
+	DECQ LEN
+	JNZ  scal_loop // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JE   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do {
+	MOVUPS (SRC), X2    // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD ALPHA_C, X2   // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
+	MULPD ALPHA, X3     // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST) // x[i] = X_(i+1)
+	ADDQ INC, SRC    // SRC = &(SRC[inc])
+	DECQ TAIL
+	JNZ  scal_tail_loop // } while --TAIL > 0
+
+scal_end:
+	RET
diff --git a/internal/asm/c128/stubs_amd64.go b/internal/asm/c128/stubs_amd64.go
index fe190299e..f3494a78c 100644
--- a/internal/asm/c128/stubs_amd64.go
+++ b/internal/asm/c128/stubs_amd64.go
@@ -34,3 +34,31 @@ func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
 //	idst += incDst
 //	}
 func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+
+// DscalUnitary is
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func DscalUnitary(alpha float64, x []complex128)
+
+// DscalInc is
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+
+// ScalInc is
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += inc
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
+
+// ScalUnitary is
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex128, x []complex128)
diff --git a/internal/asm/c128/stubs_noasm.go b/internal/asm/c128/stubs_noasm.go
index bf373612e..008a1e6a9 100644
--- a/internal/asm/c128/stubs_noasm.go
+++ b/internal/asm/c128/stubs_noasm.go
@@ -55,3 +55,51 @@ func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []
 		idst += incDst
 	}
 }
+
+// DscalUnitary is
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func DscalUnitary(alpha float64, x []complex128) {
+	for i, v := range x {
+		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+	}
+}
+
+// DscalInc is
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+		ix += inc
+	}
+}
+
+// ScalInc is
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += inc
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += inc
+	}
+}
+
+// ScalUnitary is
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex128, x []complex128) {
+	for i := range x {
+		x[i] *= alpha
+	}
+}
diff --git a/internal/asm/c128/stubs_test.go b/internal/asm/c128/stubs_test.go
index 2c7390456..6af21e118 100644
--- a/internal/asm/c128/stubs_test.go
+++ b/internal/asm/c128/stubs_test.go
@@ -4,7 +4,10 @@
 
 package c128
 
-import "testing"
+import (
+	"fmt"
+	"testing"
+)
 
 var tests = []struct {
 	incX, incY, incDst int
@@ -82,150 +85,251 @@
 		ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}},
 }
 
-func guardVector(vec []complex128, guard_val complex128, guard_len int) (guarded []complex128) {
-	guarded = make([]complex128, len(vec)+guard_len*2)
-	copy(guarded[guard_len:], vec)
-	for i := 0; i < guard_len; i++ {
-		guarded[i] = guard_val
-		guarded[len(guarded)-1-i] = guard_val
-	}
-	return guarded
-}
-
-func isValidGuard(vec []complex128, guard_val complex128, guard_len int) bool {
-	for i := 0; i < guard_len; i++ {
-		if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val {
-			return false
-		}
-	}
-	return true
-}
-
 func TestAxpyUnitary(t *testing.T) {
-	var x_gd, y_gd complex128 = 1, 1
+	const xGdVal, yGdVal = 1, 1
 	for cas, test := range tests {
-		xg_ln, yg_ln := 4+cas%2, 4+cas%3
-		test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln)
-		x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln]
+		xgLn, ygLn := 4+cas%2, 4+cas%3
+		test.x, test.y = guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn)
+		x, y := test.x[xgLn:len(test.x)-xgLn], test.y[ygLn:len(test.y)-ygLn]
 		AxpyUnitary(test.a, x, y)
 		for i := range test.ex {
 			if y[i] != test.ex[i] {
 				t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i])
 			}
 		}
-		if !isValidGuard(test.x, x_gd, xg_ln) {
-			t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:])
+		if !isValidGuard(test.x, xGdVal, xgLn) {
+			t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xgLn], test.x[len(test.x)-xgLn:])
 		}
-		if !isValidGuard(test.y, y_gd, yg_ln) {
-			t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:])
+		if !isValidGuard(test.y, yGdVal, ygLn) {
+			t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:ygLn], test.y[len(test.y)-ygLn:])
 		}
 	}
 }
 
 func TestAxpyUnitaryTo(t *testing.T) {
-	var x_gd, y_gd, dst_gd complex128 = 1, 1, 0
+	const xGdVal, yGdVal, dstGdVal = 1, 1, 0
 	for cas, test := range tests {
-		xg_ln, yg_ln := 4+cas%2, 4+cas%3
-		test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln)
-		test.dst = guardVector(test.dst, dst_gd, xg_ln)
-		x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln]
-		dst := test.dst[xg_ln : len(test.dst)-xg_ln]
+		xgLn, ygLn := 4+cas%2, 4+cas%3
+		test.x, test.y = guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn)
+		test.dst = guardVector(test.dst, dstGdVal, xgLn)
+		x, y := test.x[xgLn:len(test.x)-xgLn], test.y[ygLn:len(test.y)-ygLn]
+		dst := test.dst[xgLn : len(test.dst)-xgLn]
 		AxpyUnitaryTo(dst, test.a, x, y)
 		for i := range test.ex {
 			if dst[i] != test.ex[i] {
 				t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i])
 			}
 		}
-		if !isValidGuard(test.x, x_gd, xg_ln) {
-			t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:])
+		if !isValidGuard(test.x, xGdVal, xgLn) {
+			t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xgLn], test.x[len(test.x)-xgLn:])
 		}
-		if !isValidGuard(test.y, y_gd, yg_ln) {
-			t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:])
+		if !isValidGuard(test.y, yGdVal, ygLn) {
+			t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:ygLn], test.y[len(test.y)-ygLn:])
 		}
-		if !isValidGuard(test.dst, dst_gd, xg_ln) {
-			t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:])
+		if !isValidGuard(test.dst, dstGdVal, xgLn) {
+			t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xgLn], test.dst[len(test.dst)-xgLn:])
 		}
 	}
 }
 
-func guardIncVector(vec []complex128, guard_val complex128, incV uintptr, guard_len int) (guarded []complex128) {
-	inc := int(incV)
-	s_ln := len(vec) * inc
-	if inc < 0 {
-		s_ln = len(vec) * -inc
-	}
-	guarded = make([]complex128, s_ln+guard_len*2)
-	for i, cas := 0, 0; i < len(guarded); i++ {
-		switch {
-		case i < guard_len, i > guard_len+s_ln:
-			guarded[i] = guard_val
-		case (i-guard_len)%(inc) == 0 && cas < len(vec):
-			guarded[i] = vec[cas]
-			cas++
-		default:
-			guarded[i] = guard_val
-		}
-	}
-	return guarded
-}
-
-func checkValidIncGuard(t *testing.T, vec []complex128, guard_val complex128, incV uintptr, guard_len int) {
-	inc := int(incV)
-	s_ln := len(vec) - 2*guard_len
-	if inc < 0 {
-		s_ln = len(vec) * -inc
-	}
-
-	for i := range vec {
-		switch {
-		case vec[i] == guard_val:
-			// Correct value
-		case i < guard_len:
-			t.Errorf("Front guard violated at %d %v", i, vec[:guard_len])
-		case i > guard_len+s_ln:
-			t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:])
-		case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec):
-			// Ignore input values
-		default:
-			t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln])
-		}
-	}
-}
-
 func TestAxpyInc(t *testing.T) {
-	var x_gd, y_gd complex128 = 1, 1
+	const xGdVal, yGdVal = 1, 1
 	for cas, test := range tests {
-		xg_ln, yg_ln := 4+cas%2, 4+cas%3
-		test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln)
-		x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln]
+		xgLn, ygLn := 4+cas%2, 4+cas%3
+		test.x, test.y = guardIncVector(test.x, xGdVal, test.incX, xgLn), guardIncVector(test.y, yGdVal, test.incY, ygLn)
+		x, y := test.x[xgLn:len(test.x)-xgLn], test.y[ygLn:len(test.y)-ygLn]
 		AxpyInc(test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy)
 		for i := range test.ex {
 			if y[int(test.iy)+i*int(test.incY)] != test.ex[i] {
 				t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i*int(test.incY)], test.ex[i])
 			}
 		}
-		checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln)
-		checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln)
+		checkValidIncGuard(t, test.x, xGdVal, test.incX, xgLn)
+		checkValidIncGuard(t, test.y, yGdVal, test.incY, ygLn)
 	}
 }
 
 func TestAxpyIncTo(t *testing.T) {
-	var x_gd, y_gd, dst_gd complex128 = 1, 1, 0
+	const xGdVal, yGdVal, dstGdVal = 1, 1, 0
 	for cas, test := range tests {
-		xg_ln, yg_ln := 4+cas%2, 4+cas%3
-		test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln)
-		test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln)
-		x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln]
-		dst := test.dst[xg_ln : len(test.dst)-xg_ln]
+		xgLn, ygLn := 4+cas%2, 4+cas%3
+		test.x, test.y = guardIncVector(test.x, xGdVal, test.incX, xgLn), guardIncVector(test.y, yGdVal, test.incY, ygLn)
+		test.dst = guardIncVector(test.dst, dstGdVal, test.incDst, xgLn)
+		x, y := test.x[xgLn:len(test.x)-xgLn], test.y[ygLn:len(test.y)-ygLn]
+		dst := test.dst[xgLn : len(test.dst)-xgLn]
 		AxpyIncTo(dst, uintptr(test.incDst), test.idst, test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy)
 		for i := range test.ex {
 			if dst[int(test.idst)+i*int(test.incDst)] != test.ex[i] {
 				t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i*int(test.incDst)], test.ex[i])
 			}
 		}
-		checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln)
-		checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln)
-		checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln)
+		checkValidIncGuard(t, test.x, xGdVal, test.incX, xgLn)
+		checkValidIncGuard(t, test.y, yGdVal, test.incY, ygLn)
+		checkValidIncGuard(t, test.dst, dstGdVal, test.incDst, xgLn)
 	}
 }
+
+var dscalTests = []struct {
+	alpha float64
+	x     []complex128
+	want  []complex128
+}{
+	{
+		alpha: 0,
+		x:     []complex128{},
+		want:  []complex128{},
+	},
+	{
+		alpha: 1,
+		x:     []complex128{1 + 2i},
+		want:  []complex128{1 + 2i},
+	},
+	{
+		alpha: 2,
+		x:     []complex128{1 + 2i},
+		want:  []complex128{2 + 4i},
+	},
+	{
+		alpha: 2,
+		x:     []complex128{1 + 2i, 3 + 5i, 6 + 11i, 12 - 23i},
+		want:  []complex128{2 + 4i, 6 + 10i, 12 + 22i, 24 - 46i},
+	},
+	{
+		alpha: 3,
+		x:     []complex128{1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i},
+		want:  []complex128{3 + 6i, 15 + 12i, 9 + 18i, 24 + 36i, -9 - 6i, -15 + 15i},
+	},
+	{
+		alpha: 5,
+		x:     []complex128{1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i, 1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i},
+		want:  []complex128{5 + 10i, 25 + 20i, 15 + 30i, 40 + 60i, -15 - 10i, -25 + 25i, 5 + 10i, 25 + 20i, 15 + 30i, 40 + 60i, -15 - 10i, -25 + 25i},
+	},
+}
+
+func TestDscalUnitary(t *testing.T) {
+	const xGdVal = -0.5
+	for i, test := range dscalTests {
+		for _, align := range align1 {
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, align)
+			xgLn := 4 + align
+			xg := guardVector(test.x, xGdVal, xgLn)
+			x := xg[xgLn : len(xg)-xgLn]
+
+			DscalUnitary(test.alpha, x)
+
+			for i := range test.want {
+				if !same(x[i], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i], test.want[i])
+				}
+			}
+			if !isValidGuard(xg, xGdVal, xgLn) {
+				t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
+			}
+		}
+	}
+}
+
+func TestDscalInc(t *testing.T) {
+	const xGdVal = -0.5
+	gdLn := 4
+	for i, test := range dscalTests {
+		n := len(test.x)
+		for _, incX := range []int{1, 2, 3, 4, 7, 10} {
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, incX)
+			xg := guardIncVector(test.x, xGdVal, incX, gdLn)
+			x := xg[gdLn : len(xg)-gdLn]
+
+			DscalInc(test.alpha, x, uintptr(n), uintptr(incX))
+
+			for i := range test.want {
+				if !same(x[i*incX], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i])
+				}
+			}
+			checkValidIncGuard(t, xg, xGdVal, incX, gdLn)
+		}
+	}
+}
+
+var scalTests = []struct {
+	alpha complex128
+	x     []complex128
+	want  []complex128
+}{
+	{
+		alpha: 0,
+		x:     []complex128{},
+		want:  []complex128{},
+	},
+	{
+		alpha: 1 + 1i,
+		x:     []complex128{1 + 2i},
+		want:  []complex128{-1 + 3i},
+	},
+	{
+		alpha: 2 + 3i,
+		x:     []complex128{1 + 2i},
+		want:  []complex128{-4 + 7i},
+	},
+	{
+		alpha: 2 - 4i,
+		x:     []complex128{1 + 2i},
+		want:  []complex128{10},
+	},
+	{
+		alpha: 2 + 8i,
+		x:     []complex128{1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i},
+		want:  []complex128{-14 + 12i, -22 + 48i, -42 + 36i, -80 + 88i, 10 - 28i, -50 - 30i},
+	},
+	{
+		alpha: 5 - 10i,
+		x:     []complex128{1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i, 1 + 2i, 5 + 4i, 3 + 6i, 8 + 12i, -3 - 2i, -5 + 5i},
+		want:  []complex128{25, 65 - 30i, 75, 160 - 20i, -35 + 20i, 25 + 75i, 25, 65 - 30i, 75, 160 - 20i, -35 + 20i, 25 + 75i},
+	},
+}
+
+func TestScalUnitary(t *testing.T) {
+	const xGdVal = -0.5
+	for i, test := range scalTests {
+		for _, align := range align1 {
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, align)
+			xgLn := 4 + align
+			xg := guardVector(test.x, xGdVal, xgLn)
+			x := xg[xgLn : len(xg)-xgLn]
+
+			ScalUnitary(test.alpha, x)
+
+			for i := range test.want {
+				if !same(x[i], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i], test.want[i])
+				}
+			}
+			if !isValidGuard(xg, xGdVal, xgLn) {
+				t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:])
+			}
+		}
+	}
+}
+
+func TestScalInc(t *testing.T) {
+	const xGdVal = -0.5
+	gdLn := 4
+	for i, test := range scalTests {
+		n := len(test.x)
+		for _, inc := range []int{1, 2, 3, 4, 7, 10} {
+			prefix := fmt.Sprintf("Test %v (x:%v)", i, inc)
+			xg := guardIncVector(test.x, xGdVal, inc, gdLn)
+			x := xg[gdLn : len(xg)-gdLn]
+
+			ScalInc(test.alpha, x, uintptr(n), uintptr(inc))
+
+			for i := range test.want {
+				if !same(x[i*inc], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, x[i*inc], test.want[i])
+				}
+			}
+			checkValidIncGuard(t, xg, xGdVal, inc, gdLn)
+		}
+	}
+}
diff --git a/internal/asm/c128/util_test.go b/internal/asm/c128/util_test.go
new file mode 100644
index 000000000..e07470d97
--- /dev/null
+++ b/internal/asm/c128/util_test.go
@@ -0,0 +1,123 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c128
+
+import (
+	"math"
+	"testing"
+)
+
+const (
+	msgVal   = "%v: unexpected value at %v Got: %v Expected: %v"
+	msgGuard = "%v: Guard violated in %s vector %v %v"
+)
+
+func same(x, y complex128) bool {
+	return (x == y ||
+		math.IsNaN(real(x)) && math.IsNaN(real(y)) && imag(x) == imag(y) ||
+		math.IsNaN(imag(y)) && math.IsNaN(imag(x)) && real(y) == real(x) ||
+		math.IsNaN(real(x)) && math.IsNaN(real(y)) && math.IsNaN(imag(y)) && math.IsNaN(imag(x)))
+}
+
+func guardVector(vec []complex128, guard_val complex128, guard_len int) (guarded []complex128) {
+	guarded = make([]complex128, len(vec)+guard_len*2)
+	copy(guarded[guard_len:], vec)
+	for i := 0; i < guard_len; i++ {
+		guarded[i] = guard_val
+		guarded[len(guarded)-1-i] = guard_val
+	}
+	return guarded
+}
+
+func isValidGuard(vec []complex128, guard_val complex128, guard_len int) bool {
+	for i := 0; i < guard_len; i++ {
+		if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val {
+			return false
+		}
+	}
+	return true
+}
+
+func guardIncVector(vec []complex128, guard_val complex128, inc, guard_len int) (guarded []complex128) {
+	s_ln := len(vec) * inc
+	if inc < 0 {
+		s_ln = len(vec) * -inc
+	}
+	guarded = make([]complex128, s_ln+guard_len*2)
+	for i, cas := 0, 0; i < len(guarded); i++ {
+		switch {
+		case i < guard_len, i > guard_len+s_ln:
+			guarded[i] = guard_val
+		case (i-guard_len)%(inc) == 0 && cas < len(vec):
+			guarded[i] = vec[cas]
+			cas++
+		default:
+			guarded[i] = guard_val
+		}
+	}
+	return guarded
+}
+
+func checkValidIncGuard(t *testing.T, vec []complex128, guard_val complex128, inc, guard_len int) {
+	s_ln := len(vec) - 2*guard_len
+	if inc < 0 {
+		s_ln = len(vec) * -inc
+	}
+
+	for i := range vec {
+		switch {
+		case vec[i] == guard_val:
+			// Correct value
+		case i < guard_len:
+			t.Errorf("Front guard violated at %d %v", i, vec[:guard_len])
+		case i > guard_len+s_ln:
+			t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:])
+		case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec):
+			// Ignore input values
+		default:
+			t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln])
+		}
+	}
+}
+
+var ( // Offset sets for testing alignment handling in Unitary assembly functions.
+	align1 = []int{0, 1}
+	align2 = newIncSet(0, 1)
+	align3 = newIncToSet(0, 1)
+)
+
+type incSet struct {
+	x, y int
+}
+
+// newIncSet will generate all (x,y) combinations of the input increment set.
+func newIncSet(inc ...int) []incSet {
+	n := len(inc)
+	is := make([]incSet, n*n)
+	for x := range inc {
+		for y := range inc {
+			is[x*n+y] = incSet{inc[x], inc[y]}
+		}
+	}
+	return is
+}
+
+type incToSet struct {
+	dst, x, y int
+}
+
+// newIncToSet will generate all (dst,x,y) combinations of the input increment set.
+func newIncToSet(inc ...int) []incToSet {
+	n := len(inc)
+	is := make([]incToSet, n*n*n)
+	for i, dst := range inc {
+		for x := range inc {
+			for y := range inc {
+				is[i*n*n+x*n+y] = incToSet{dst, inc[x], inc[y]}
+			}
+		}
+	}
+	return is
+}
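
Note on the complex kernels (scalUnitary_amd64.s, scalinc_amd64.s): the MOVDDUP/SHUFPD/MULPD/ADDSUBPD sequence computes alpha*x[i] one element at a time. A minimal Go sketch of the same lane arithmetic follows; the helper name complexScale is ours for illustration and is not part of this patch:

	// complexScale mirrors the SSE3 kernel: MOVDDUP duplicates real(x),
	// SHUFPD $0x3 duplicates imag(x), the two MULPDs form the partial
	// products, and ADDSUBPD subtracts in the low (real) lane while
	// adding in the high (imag) lane.
	func complexScale(alpha, x complex128) complex128 {
		ar, ai := real(alpha), imag(alpha)
		xr, xi := real(x), imag(x)
		// X_(i+1) = { ai*xr, ar*xr }, X_i = { ar*xi, ai*xi }
		// ADDSUBPD -> { ai*xr + ar*xi, ar*xr - ai*xi }
		return complex(ar*xr-ai*xi, ai*xr+ar*xi)
	}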
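Note on the Inc kernels' loop structure: SHLQ $4, INC converts the element increment into a byte stride (sizeof(complex128) is 16), LEAQ (INC)(INC*2), INC3 forms 3*INC so the unrolled body can address four strided elements without extra multiplies, and the remainder is handled by a scalar tail. A Go sketch of the equivalent control flow, under the assumption that the hypothetical name dscalIncRef stands in for the kernel (the shipped pure-Go reference is DscalInc in stubs_noasm.go):

	// dscalIncRef mirrors the assembly's dscal_loop/dscal_tail split:
	// floor(n/4) groups of four strided elements, then n%4 leftovers.
	func dscalIncRef(alpha float64, x []complex128, n, inc uintptr) {
		var ix uintptr
		for i := n / 4; i > 0; i-- { // dscal_loop
			for j := uintptr(0); j < 4; j++ {
				v := x[ix+j*inc]
				x[ix+j*inc] = complex(real(v)*alpha, imag(v)*alpha)
			}
			ix += 4 * inc // LEAQ (SRC)(INC*4), SRC
		}
		for i := n % 4; i > 0; i-- { // dscal_tail_loop
			v := x[ix]
			x[ix] = complex(real(v)*alpha, imag(v)*alpha)
			ix += inc
		}
	}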
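Note on the test helpers: guardVector and checkValidIncGuard bracket the data with sentinel values so that any out-of-bounds store by a kernel corrupts a sentinel and fails the test. A sketch of the pattern using only the helpers defined in util_test.go above; the wrapper name checkKernel is hypothetical:

	// checkKernel pads x with sentinels, runs the kernel on the interior
	// slice only, then verifies the sentinels survived untouched.
	func checkKernel(t *testing.T, kernel func([]complex128), x []complex128) {
		const gdVal, gdLn = -0.5, 4
		xg := guardVector(x, gdVal, gdLn) // [guard | data | guard]
		kernel(xg[gdLn : len(xg)-gdLn])   // kernel sees only the data
		if !isValidGuard(xg, gdVal, gdLn) {
			t.Errorf("guard violated: %v %v", xg[:gdLn], xg[len(xg)-gdLn:])
		}
	}

For example, checkKernel(t, func(x []complex128) { DscalUnitary(2, x) }, data) exercises the unitary kernel exactly the way TestDscalUnitary does.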