-
Notifications
You must be signed in to change notification settings - Fork 528
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #194 from gonum/c128/dscal
asm/c128: Added [d]scalinc and [d]scalunitary
- Loading branch information
Showing
9 changed files
with
774 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// Copyright ©2017 The gonum Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
//+build !noasm,!appengine | ||
|
||
#include "textflag.h" | ||
|
||
// Register roles for DscalInc. | ||
// SRC and DST name the same register (SI), so each element is stored back | ||
// to the address it was loaded from. | ||
// LEN  - number of 4-element iterations of the unrolled loop. | ||
// TAIL - copy of n, later reduced to n % 4 for the remainder loop. | ||
// INC  - distance between consecutive elements in bytes (inc * 16). | ||
// INC3 - 3 * INC, used to address the fourth element of each group. | ||
// ALPHA, ALPHA_2 - two copies of { alpha, alpha }. | ||
#define SRC SI | ||
#define DST SI | ||
#define LEN CX | ||
#define TAIL BX | ||
#define INC R9 | ||
#define INC3 R10 | ||
#define ALPHA X0 | ||
#define ALPHA_2 X1 | ||
|
||
// Hand-encoded instruction; the emitted bytes are F2 0F 12 44 24 08. | ||
// The destination register X0 is fixed by the encoding, so ALPHA must stay | ||
// defined as X0. | ||
// NOTE(review): 8(SP) is presumably the alpha argument slot, which relies on | ||
// the function being declared with a $0 frame - confirm before changing the | ||
// frame size. | ||
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0 | ||
|
||
// func DscalInc(alpha float64, x []complex128, n, inc uintptr) | ||
// | ||
// DscalInc multiplies the real and imaginary parts of n elements of x by | ||
// alpha, in place. Elements are visited starting at x[0] and stepping by | ||
// inc elements (inc * 16 bytes). The main loop handles four elements per | ||
// iteration; the remaining n % 4 elements are handled one at a time. | ||
// | ||
// Frame slots read: x_base+8(FP), n+32(FP), inc+40(FP); alpha is loaded by | ||
// MOVDDUP_ALPHA. The slice length and capacity are never read, so no bounds | ||
// check is made here: n and inc must already be valid for x on entry. | ||
// Registers written: SI, CX, BX, R9, R10, X0-X5. | ||
TEXT ·DscalInc(SB), NOSPLIT, $0 | ||
MOVQ x_base+8(FP), SRC // SRC = &x[0] | ||
MOVQ n+32(FP), LEN // LEN = n | ||
CMPQ LEN, $0 // if LEN == 0 { return } | ||
JE dscal_end | ||
|
||
MOVDDUP_ALPHA // ALPHA = { alpha, alpha } | ||
MOVQ inc+40(FP), INC // INC = inc | ||
SHLQ $4, INC // INC = INC * sizeof(complex128) | ||
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC | ||
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2; the MULPDs below alternate between the two | ||
MOVQ LEN, TAIL // TAIL = LEN (n is kept for the remainder count) | ||
SHRQ $2, LEN // LEN = floor( n / 4 ); sets ZF when the result is 0 | ||
JZ dscal_tail // if LEN == 0 { goto dscal_tail } | ||
|
||
dscal_loop: // do { | ||
MOVUPS (SRC), X2 // X2..X5 = x[i], x[i+inc], x[i+2*inc], x[i+3*inc] | ||
MOVUPS (SRC)(INC*1), X3 | ||
MOVUPS (SRC)(INC*2), X4 | ||
MOVUPS (SRC)(INC3*1), X5 | ||
|
||
MULPD ALPHA, X2 // X_i *= ALPHA (both lanes, i.e. real and imaginary parts) | ||
MULPD ALPHA_2, X3 | ||
MULPD ALPHA, X4 | ||
MULPD ALPHA_2, X5 | ||
|
||
MOVUPS X2, (DST) // x[i] = X_i (DST is the same register as SRC) | ||
MOVUPS X3, (DST)(INC*1) | ||
MOVUPS X4, (DST)(INC*2) | ||
MOVUPS X5, (DST)(INC3*1) | ||
|
||
LEAQ (SRC)(INC*4), SRC // SRC += INC*4 (advance past the four elements just written) | ||
DECQ LEN | ||
JNZ dscal_loop // } while --LEN > 0 | ||
|
||
dscal_tail: | ||
ANDQ $3, TAIL // TAIL = TAIL % 4; sets ZF when no elements remain | ||
JE dscal_end // if TAIL == 0 { return } | ||
|
||
dscal_tail_loop: // do { | ||
MOVUPS (SRC), X2 // X_i = x[i] | ||
MULPD ALPHA, X2 // X_i *= ALPHA | ||
MOVUPS X2, (DST) // x[i] = X_i | ||
ADDQ INC, SRC // SRC += INC (one element) | ||
DECQ TAIL | ||
JNZ dscal_tail_loop // } while --TAIL > 0 | ||
|
||
dscal_end: | ||
RET |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright ©2017 The gonum Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
//+build !noasm,!appengine | ||
|
||
#include "textflag.h" | ||
|
||
// Register roles for DscalUnitary. | ||
// SRC and DST name the same register (SI), so each element is stored back | ||
// to the address it was loaded from. | ||
// LEN  - number of 4-element iterations of the unrolled loop. | ||
// IDX  - running offset into x in 8-byte units (it is always used as IDX*8), | ||
//        so one complex128 element is 2 units. | ||
// TAIL - copy of len(x), later reduced to len(x) % 4 for the remainder loop. | ||
// ALPHA, ALPHA_2 - two copies of { alpha, alpha }. | ||
#define SRC SI | ||
#define DST SI | ||
#define LEN CX | ||
#define IDX AX | ||
#define TAIL BX | ||
#define ALPHA X0 | ||
#define ALPHA_2 X1 | ||
|
||
// Hand-encoded instruction; the emitted bytes are F2 0F 12 44 24 08. | ||
// The destination register X0 is fixed by the encoding, so ALPHA must stay | ||
// defined as X0. | ||
// NOTE(review): 8(SP) is presumably the alpha argument slot, which relies on | ||
// the function being declared with a $0 frame - confirm before changing the | ||
// frame size. | ||
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0 | ||
|
||
// func DscalUnitary(alpha float64, x []complex128) | ||
// | ||
// DscalUnitary multiplies the real and imaginary parts of every element of | ||
// x by alpha, in place. The main loop handles four contiguous elements | ||
// (64 bytes) per iteration; the remaining len(x) % 4 elements are handled | ||
// one at a time. | ||
// | ||
// Frame slots read: x_base+8(FP), x_len+16(FP); alpha is loaded by | ||
// MOVDDUP_ALPHA. | ||
// Registers written: SI, CX, AX, BX, X0-X5. | ||
TEXT ·DscalUnitary(SB), NOSPLIT, $0 | ||
MOVQ x_base+8(FP), SRC // SRC = &x[0] | ||
MOVQ x_len+16(FP), LEN // LEN = len(x) | ||
CMPQ LEN, $0 // if LEN == 0 { return } | ||
JE dscal_end | ||
|
||
MOVDDUP_ALPHA // ALPHA = { alpha, alpha } | ||
XORQ IDX, IDX // IDX = 0 | ||
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2; the MULPDs below alternate between the two | ||
MOVQ LEN, TAIL // TAIL = LEN (len(x) is kept for the remainder count) | ||
SHRQ $2, LEN // LEN = floor( len(x) / 4 ); sets ZF when the result is 0 | ||
JZ dscal_tail // if LEN == 0 { goto dscal_tail } | ||
|
||
dscal_loop: // do { | ||
MOVUPS (SRC)(IDX*8), X2 // X2..X5 = four consecutive elements starting at byte offset IDX*8 | ||
MOVUPS 16(SRC)(IDX*8), X3 | ||
MOVUPS 32(SRC)(IDX*8), X4 | ||
MOVUPS 48(SRC)(IDX*8), X5 | ||
|
||
MULPD ALPHA, X2 // X_i *= ALPHA (both lanes, i.e. real and imaginary parts) | ||
MULPD ALPHA_2, X3 | ||
MULPD ALPHA, X4 | ||
MULPD ALPHA_2, X5 | ||
|
||
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i (DST is the same register as SRC) | ||
MOVUPS X3, 16(DST)(IDX*8) | ||
MOVUPS X4, 32(DST)(IDX*8) | ||
MOVUPS X5, 48(DST)(IDX*8) | ||
|
||
ADDQ $8, IDX // IDX += 8 (8 units * 8 bytes = 64 bytes = 4 elements) | ||
DECQ LEN | ||
JNZ dscal_loop // } while --LEN > 0 | ||
|
||
dscal_tail: | ||
ANDQ $3, TAIL // TAIL = TAIL % 4; sets ZF when no elements remain | ||
JZ dscal_end // if TAIL == 0 { return } | ||
|
||
dscal_tail_loop: // do { | ||
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i] | ||
MULPD ALPHA, X2 // X_i *= ALPHA | ||
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i | ||
ADDQ $2, IDX // IDX += 2 (2 units * 8 bytes = 16 bytes = 1 element) | ||
DECQ TAIL | ||
JNZ dscal_tail_loop // } while --TAIL > 0 | ||
|
||
dscal_end: | ||
RET |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright ©2017 The gonum Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
//+build !noasm,!appengine | ||
|
||
#include "textflag.h" | ||
|
||
// Register roles for ScalUnitary. | ||
// SRC and DST name the same register (SI), so each element is stored back | ||
// to the address it was loaded from. | ||
// LEN  - number of 4-element iterations of the unrolled loop. | ||
// IDX  - running offset into x in 8-byte units (it is always used as IDX*8). | ||
// TAIL - copy of len(x), later reduced to len(x) % 4 for the remainder loop. | ||
// ALPHA, ALPHA2     - alpha as loaded from the frame. | ||
// ALPHA_C, ALPHA_C2 - alpha with its two 64-bit lanes swapped. | ||
#define SRC SI | ||
#define DST SI | ||
#define LEN CX | ||
#define IDX AX | ||
#define TAIL BX | ||
#define ALPHA X0 | ||
#define ALPHA_C X1 | ||
#define ALPHA2 X10 | ||
#define ALPHA_C2 X11 | ||
|
||
// Hand-encoded MOVDDUP src, dst: both lanes of dst = low 64 bits of src. | ||
// The register pair is fixed by the last encoded byte; the X8/X9 form | ||
// carries an extra prefix byte (0x45) and so needs a trailing BYTE. | ||
#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3 | ||
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5 | ||
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7 | ||
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9 | ||
|
||
// Hand-encoded ADDSUBPD src, dst: dst.low -= src.low; dst.high += src.high. | ||
// Register pairs mirror the MOVDDUP macros above. | ||
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 | ||
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 | ||
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 | ||
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 | ||
|
||
// func ScalUnitary(alpha complex128, x []complex128) | ||
// | ||
// ScalUnitary replaces every element of x with alpha * x[i] (complex | ||
// multiplication), in place. The main loop handles four contiguous | ||
// elements per iteration; the remaining len(x) % 4 elements are handled | ||
// one at a time. | ||
// | ||
// Lane notation used in the comments below: { high 64 bits, low 64 bits }. | ||
// The comments assume a complex128 is stored real part first, so a 16-byte | ||
// load yields { imag, real }. | ||
// | ||
// Frame slots read: alpha+0(FP), x_base+16(FP), x_len+24(FP). | ||
// Registers written: SI, CX, AX, BX, X0-X11. | ||
TEXT ·ScalUnitary(SB), NOSPLIT, $0 | ||
MOVQ x_base+16(FP), SRC // SRC = &x[0] | ||
MOVQ x_len+24(FP), LEN // LEN = len(x) | ||
CMPQ LEN, $0 // if LEN == 0 { return } | ||
JE scal_end | ||
|
||
MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) } | ||
// ALPHA_C starts as a copy of ALPHA; the shuffle below swaps its lanes. | ||
MOVAPS ALPHA, ALPHA_C | ||
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) } | ||
|
||
XORQ IDX, IDX // IDX = 0 | ||
MOVAPS ALPHA, ALPHA2 // Copy ALPHA to ALPHA2 and ALPHA_C to ALPHA_C2; the MULPDs below alternate between the copies | ||
MOVAPS ALPHA_C, ALPHA_C2 | ||
// TAIL = LEN (len(x) is kept for the remainder count) | ||
MOVQ LEN, TAIL | ||
SHRQ $2, LEN // LEN = floor( len(x) / 4 ); sets ZF when the result is 0 | ||
JZ scal_tail // if LEN == 0 { goto scal_tail } | ||
|
||
scal_loop: // do { | ||
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) } | ||
MOVUPS 16(SRC)(IDX*8), X4 | ||
MOVUPS 32(SRC)(IDX*8), X6 | ||
MOVUPS 48(SRC)(IDX*8), X8 | ||
|
||
// X_(i+1) = { real(x[i]), real(x[i]) } | ||
MOVDDUP_X2_X3 | ||
MOVDDUP_X4_X5 | ||
MOVDDUP_X6_X7 | ||
MOVDDUP_X8_X9 | ||
|
||
// X_i = { imag(x[i]), imag(x[i]) } | ||
SHUFPD $0x3, X2, X2 | ||
SHUFPD $0x3, X4, X4 | ||
SHUFPD $0x3, X6, X6 | ||
SHUFPD $0x3, X8, X8 | ||
|
||
// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } | ||
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } | ||
MULPD ALPHA_C, X2 | ||
MULPD ALPHA, X3 | ||
MULPD ALPHA_C2, X4 | ||
MULPD ALPHA2, X5 | ||
MULPD ALPHA_C, X6 | ||
MULPD ALPHA, X7 | ||
MULPD ALPHA_C2, X8 | ||
MULPD ALPHA2, X9 | ||
|
||
// ADDSUBPD adds in the high lane and subtracts in the low lane: | ||
// X_(i+1) = { | ||
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), | ||
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) | ||
// } | ||
ADDSUBPD_X2_X3 | ||
ADDSUBPD_X4_X5 | ||
ADDSUBPD_X6_X7 | ||
ADDSUBPD_X8_X9 | ||
|
||
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1) (DST is the same register as SRC) | ||
MOVUPS X5, 16(DST)(IDX*8) | ||
MOVUPS X7, 32(DST)(IDX*8) | ||
MOVUPS X9, 48(DST)(IDX*8) | ||
ADDQ $8, IDX // IDX += 8 (8 units * 8 bytes = 64 bytes = 4 elements) | ||
DECQ LEN | ||
JNZ scal_loop // } while --LEN > 0 | ||
|
||
scal_tail: | ||
ANDQ $3, TAIL // TAIL = TAIL % 4; sets ZF when no elements remain | ||
JZ scal_end // if TAIL == 0 { return } | ||
|
||
scal_tail_loop: // do { | ||
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) } | ||
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) } | ||
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } | ||
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } | ||
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } | ||
|
||
// X_(i+1) = { | ||
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), | ||
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) | ||
// } | ||
ADDSUBPD_X2_X3 | ||
|
||
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1) | ||
ADDQ $2, IDX // IDX += 2 (2 units * 8 bytes = 16 bytes = 1 element) | ||
DECQ TAIL | ||
JNZ scal_tail_loop // } while --TAIL > 0 | ||
|
||
scal_end: | ||
RET |
Oops, something went wrong.