Skip to content

Commit

Permalink
Merge pull request #194 from gonum/c128/dscal
Browse files Browse the repository at this point in the history
asm/c128: Added [d]scalinc and [d]scalunitary
  • Loading branch information
Kunde21 committed Aug 22, 2017
2 parents 267e698 + 5e3ead2 commit 563dd33
Show file tree
Hide file tree
Showing 9 changed files with 774 additions and 123 deletions.
69 changes: 69 additions & 0 deletions internal/asm/c128/dscalinc_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright ©2017 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//+build !noasm,!appengine

#include "textflag.h"

#define SRC SI
#define DST SI
#define LEN CX
#define TAIL BX
#define INC R9
#define INC3 R10
#define ALPHA X0
#define ALPHA_2 X1

#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0

// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
TEXT ·DscalInc(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ n+32(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end

MOVDDUP_ALPHA // ALPHA = alpha
MOVQ inc+40(FP), INC // INC = inc
SHLQ $4, INC // INC = INC * sizeof(complex128)
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA and ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }

dscal_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MOVUPS (SRC)(INC*1), X3
MOVUPS (SRC)(INC*2), X4
MOVUPS (SRC)(INC3*1), X5

MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5

MOVUPS X2, (DST) // x[i] = X_i
MOVUPS X3, (DST)(INC*1)
MOVUPS X4, (DST)(INC*2)
MOVUPS X5, (DST)(INC3*1)

LEAQ (SRC)(INC*4), SRC // SRC += INC*4
DECQ LEN
JNZ dscal_loop // } while --LEN > 0

dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JE dscal_end // if TAIL == 0 { return }

dscal_tail_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST) // x[i] = X_i
ADDQ INC, SRC // SRC += INC
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0

dscal_end:
RET
66 changes: 66 additions & 0 deletions internal/asm/c128/dscalunitary_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright ©2017 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//+build !noasm,!appengine

#include "textflag.h"

#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1

#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0

// func DscalUnitary(alpha float64, x []complex128)
TEXT ·DscalUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ x_len+16(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end

MOVDDUP_ALPHA // ALPHA = alpha
XORQ IDX, IDX // IDX = 0
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }

dscal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(SRC)(IDX*8), X3
MOVUPS 32(SRC)(IDX*8), X4
MOVUPS 48(SRC)(IDX*8), X5

MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5

MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
MOVUPS X3, 16(DST)(IDX*8)
MOVUPS X4, 32(DST)(IDX*8)
MOVUPS X5, 48(DST)(IDX*8)

ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ dscal_loop // } while --LEN > 0

dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ dscal_end // if TAIL == 0 { return }

dscal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
ADDQ $2, IDX // IDX += 2
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0

dscal_end:
RET
24 changes: 0 additions & 24 deletions internal/asm/c128/scal.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,6 @@

package c128

// ScalUnitary is
// for i := range x {
// x[i] *= alpha
// }
func ScalUnitary(alpha complex128, x []complex128) {
for i := range x {
x[i] *= alpha
}
}

// ScalUnitaryTo is
// for i, v := range x {
// dst[i] = alpha * v
Expand All @@ -24,20 +14,6 @@ func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
}
}

// ScalInc is
// var ix uintptr
// for i := 0; i < int(n); i++ {
// x[ix] *= alpha
// ix += incX
// }
func ScalInc(alpha complex128, x []complex128, n, incX uintptr) {
var ix uintptr
for i := 0; i < int(n); i++ {
x[ix] *= alpha
ix += incX
}
}

// ScalIncTo is
// var idst, ix uintptr
// for i := 0; i < int(n); i++ {
Expand Down
116 changes: 116 additions & 0 deletions internal/asm/c128/scalUnitary_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright ©2017 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//+build !noasm,!appengine

#include "textflag.h"

#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
#define ALPHA X0
#define ALPHA_C X1
#define ALPHA2 X10
#define ALPHA_C2 X11

#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9

#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9

// func ScalUnitary(alpha complex128, x []complex128)
TEXT ·ScalUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SRC // SRC = &x
MOVQ x_len+24(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE scal_end

MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
MOVAPS ALPHA, ALPHA_C
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }

XORQ IDX, IDX // IDX = 0
MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
MOVAPS ALPHA_C, ALPHA_C2
MOVQ LEN, TAIL
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ scal_tail // if BX == 0 { goto scal_tail }

scal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SRC)(IDX*8), X4
MOVUPS 32(SRC)(IDX*8), X6
MOVUPS 48(SRC)(IDX*8), X8

// X_(i+1) = { real(x[i], real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9

// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8

// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
MULPD ALPHA_C, X2
MULPD ALPHA, X3
MULPD ALPHA_C2, X4
MULPD ALPHA2, X5
MULPD ALPHA_C, X6
MULPD ALPHA, X7
MULPD ALPHA_C2, X8
MULPD ALPHA2, X9

// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9

MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
MOVUPS X5, 16(DST)(IDX*8)
MOVUPS X7, 32(DST)(IDX*8)
MOVUPS X9, 48(DST)(IDX*8)
ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ scal_loop // } while --LEN > 0

scal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ scal_end // if TAIL == 0 { return }

scal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }

// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3

MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
ADDQ $2, IDX // IDX += 2
DECQ TAIL
JNZ scal_tail_loop // } while --LEN > 0

scal_end:
RET

0 comments on commit 563dd33

Please sign in to comment.