Permalink
Browse files

math/big: improve performance for AddMulVVW and mulAddVWW for ppc64x

This change adds a better implementation in asm for AddMulVVW and
mulAddVWW for ppc64x, with speedups up to 1.54x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkAddMulVVW/1-8          6.58          6.29          -4.41%
BenchmarkAddMulVVW/2-8          7.43          7.25          -2.42%
BenchmarkAddMulVVW/3-8          8.95          8.15          -8.94%
BenchmarkAddMulVVW/4-8          10.1          9.37          -7.23%
BenchmarkAddMulVVW/5-8          12.0          10.7          -10.83%
BenchmarkAddMulVVW/10-8         22.1          20.1          -9.05%
BenchmarkAddMulVVW/100-8        211           154           -27.01%
BenchmarkAddMulVVW/1000-8       2046          1450          -29.13%
BenchmarkAddMulVVW/10000-8      20407         14793         -27.51%
BenchmarkAddMulVVW/100000-8     223857        145548        -34.98%

benchmark                       old MB/s     new MB/s     speedup
BenchmarkAddMulVVW/1-8          9719.88      10175.79     1.05x
BenchmarkAddMulVVW/2-8          17233.97     17657.54     1.02x
BenchmarkAddMulVVW/3-8          21446.05     23550.49     1.10x
BenchmarkAddMulVVW/4-8          25375.70     27334.33     1.08x
BenchmarkAddMulVVW/5-8          26650.52     30029.34     1.13x
BenchmarkAddMulVVW/10-8         28984.29     31833.68     1.10x
BenchmarkAddMulVVW/100-8        30249.41     41531.69     1.37x
BenchmarkAddMulVVW/1000-8       31273.35     44108.54     1.41x
BenchmarkAddMulVVW/10000-8      31360.47     43263.54     1.38x
BenchmarkAddMulVVW/100000-8     28589.58     43971.66     1.54x

Change-Id: I8a8105d4da3592afdef3125757a99f378a0254bb
Reviewed-on: https://go-review.googlesource.com/53931
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
  • Loading branch information...
ceseo authored and laboger committed Jul 14, 2017
1 parent 92cfd07 commit 3cb41be817cadd2dd19a390cf5e21740cb801967
Showing with 42 additions and 86 deletions.
  1. +42 −86 src/math/big/arith_ppc64x.s
View
@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD r+56(FP), R4 // c = r
MOVD z_len+8(FP), R11
MOVD $0, R3 // i = 0
MOVD $8, R18
MOVD $1, R19
JMP e5
l5:
MULLD R18, R3, R5
MOVD (R8)(R5), R20
MULLD R9, R20, R6
MULHDU R9, R20, R7
ADDC R4, R6
ADDZE R7
MOVD R6, (R10)(R5)
MOVD R7, R4
ADD R19, R3
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y
MOVD r+56(FP), R4 // R4 = r = c
MOVD z_len+8(FP), R11 // R11 = z_len
e5:
CMP R3, R11
BLT l5
MOVD R0, R3 // R3 will be the index register
CMP R0, R11
MOVD R11, CTR // Initialize loop counter
BEQ done
loop:
MOVD (R8)(R3), R20 // x[i]
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
ADDC R4, R6 // Compute sum for z1 and z0
ADDZE R7
MOVD R6, (R10)(R3) // z[i]
MOVD R7, R4 // c
ADD $8, R3
BC 16, 0, loop // bdnz
done:
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z_len+8(FP), R22
MOVD $0, R5 // i = 0
MOVD $0, R4 // c = 0
MOVD $8, R28
MOVD $-2, R23
AND R22, R23 // mask the last bit of z.len
MOVD $2, R24
CMP R23, R24
BGE unrolled
JMP end
unrolled:
MOVD $8, R19 // no (RA)(RB*8) on power
MULLD R5, R19
MOVD (R10)(R19), R11 // R11 = z[i]
MOVD (R8)(R19), R16 // R16 = x[i]
ADD R28, R19, R25
MOVD (R10)(R25), R17
MOVD (R8)(R25), R18
MULLD R9, R16, R12
MULHDU R9, R16, R14
MULLD R9, R18, R6
MULHDU R9, R18, R7
ADDC R4, R12
ADDZE R14
ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
ADDZE R14 // carry = high order bits + add carry
MOVD R12, (R10)(R19)
ADDC R14, R6
ADDZE R7
ADDC R17, R6
ADDZE R7
MOVD R6, (R10)(R25)
MOVD R7, R4
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y
MOVD z_len+8(FP), R22 // R22 = z_len
ADD R24, R5
CMP R5, R23
BLT unrolled
JMP end
MOVD R0, R3 // R3 will be the index register
CMP R0, R22
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
loop:
MOVD $8, R19
MULLD R5, R19
MOVD (R10)(R19), R11
MOVD (R8)(R19), R16
MULLD R9, R16, R12
MULHDU R9, R16, R14
ADDC R4, R12
ADDZE R14
ADDC R11, R12
ADDZE R14
MOVD R12, (R10)(R19)
MOVD R14, R4
MOVD $1, R15
ADD R15, R5
end:
CMP R5, R22
BLT loop
MOVD (R8)(R3), R20 // Load x[i]
MOVD (R10)(R3), R21 // Load z[i]
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
ADDC R21, R6 // R6 = z0
ADDZE R7 // R7 = z1
ADDC R4, R6 // R6 = z0 + c + 0
ADDZE R7, R4 // c += z1
MOVD R6, (R10)(R3) // Store z[i]
ADD $8, R3
BC 16, 0, loop // bdnz
done:
MOVD R4, c+56(FP)
RET

0 comments on commit 3cb41be

Please sign in to comment.