Permalink
Browse files

math/big: improve performance for addVV/subVV for ppc64x

This change adds a better asm implementation of addVV for ppc64x, with speedups
up to nearly 3x in the best cases.

benchmark                   old ns/op     new ns/op     delta
BenchmarkAddVV/1-8          7.33          5.81          -20.74%
BenchmarkAddVV/2-8          8.72          6.49          -25.57%
BenchmarkAddVV/3-8          10.5          7.08          -32.57%
BenchmarkAddVV/4-8          12.7          7.57          -40.39%
BenchmarkAddVV/5-8          14.3          8.06          -43.64%
BenchmarkAddVV/10-8         27.6          11.1          -59.78%
BenchmarkAddVV/100-8        218           82.4          -62.20%
BenchmarkAddVV/1000-8       2064          718           -65.21%
BenchmarkAddVV/10000-8      20536         7153          -65.17%
BenchmarkAddVV/100000-8     211004        72403         -65.69%

benchmark                   old MB/s     new MB/s     speedup
BenchmarkAddVV/1-8          8729.74      11006.26     1.26x
BenchmarkAddVV/2-8          14683.65     19707.55     1.34x
BenchmarkAddVV/3-8          18226.96     27103.63     1.49x
BenchmarkAddVV/4-8          20204.50     33805.81     1.67x
BenchmarkAddVV/5-8          22348.64     39694.06     1.78x
BenchmarkAddVV/10-8         23212.74     57631.08     2.48x
BenchmarkAddVV/100-8        29300.07     77629.53     2.65x
BenchmarkAddVV/1000-8       31000.56     89094.54     2.87x
BenchmarkAddVV/10000-8      31163.61     89469.16     2.87x
BenchmarkAddVV/100000-8     30331.16     88393.73     2.91x

It also adds the use of CTR for the loop counter in subVV, instead of
manually updating the loop counter. This is slightly faster.

Change-Id: Ic4b05cad384fd057972d46a5618ed5c3039d7460
Reviewed-on: https://go-review.googlesource.com/41010
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
  • Loading branch information...
ceseo authored and laboger committed Apr 18, 2017
1 parent a041806 commit 9459c03b29937d236a8b61e452cb02d01c7b8559
Showing with 37 additions and 12 deletions.
  1. +37 −12 src/math/big/arith_ppc64x.s
@@ -19,8 +19,35 @@ TEXT ·mulWW(SB), NOSPLIT, $0
MOVD R7, z0+24(FP)
RET
// func addVV(z, y, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
TEXT ·addVV(SB), NOSPLIT, $0
BR ·addVV_g(SB)
MOVD z_len+8(FP), R7
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R10
MOVD R0, R4
MOVD R0, R6 // R6 will be the address index
ADDC R4, R4 // clear CA
MOVD R7, CTR
CMP R0, R7
BEQ done
loop:
MOVD (R8)(R6), R11 // x[i]
MOVD (R9)(R6), R12 // y[i]
ADDE R12, R11, R15 // x[i] + y[i] + CA
MOVD R15, (R10)(R6) // z[i]
ADD $8, R6
BC 16, 0, loop // bdnz
done:
ADDZE R4
MOVD R4, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
@@ -30,32 +57,30 @@ TEXT ·subVV(SB), NOSPLIT, $0
MOVD y+48(FP), R9
MOVD z+0(FP), R10
MOVD $0, R4 // c = 0
MOVD $0, R5 // i = 0
MOVD $1, R29 // work around lack of ADDI
MOVD $8, R28 // work around lack of scaled addressing
MOVD R0, R4 // c = 0
MOVD R0, R6
SUBC R0, R0 // clear CA
JMP sublend
MOVD R7, CTR
CMP R0, R7
BEQ sublend
// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
subloop:
MULLD R5, R28, R6
MOVD (R8)(R6), R11 // x[i]
MOVD (R9)(R6), R12 // y[i]
SUBE R12, R11, R15
MOVD R15, (R10)(R6)
ADD R29, R5 // i++
ADD $8, R6
BC 16, 0, subloop // bdnz
sublend:
CMP R5, R7
BLT subloop
ADDZE R4
XOR R29, R4
XOR $1, R4
MOVD R4, c+72(FP)
RET

0 comments on commit 9459c03

Please sign in to comment.