Skip to content

Commit 6e4a0d8

Browse files
kmvijay-IBM authored and gopherbot committed
crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x
The addMulVVWx assembly routine is used to multiply a bignum multiplicand by a 64-bit multiplier. The new implementation for the s390x architecture uses an algorithm based on vector instructions, which gives a significant performance improvement. Note: z13, which already has VX support, is the minimum architecture level for Go. The performance improvement is shown below: goos: linux goarch: s390x pkg: crypto/internal/fips140/bigmod Orig.txt Vector_Patch.txt sec/op sec/op vs base ModAdd 164.1n ± 0% 159.7n ± 0% -2.7% (p=0.000 n=10) ModSub 152.3n ± 1% 147.3n ± 0% -3.25% (p=0.000 n=10) MontgomeryRepr 4.806µ ± 3% 1.829µ ± 0% -61.94% (p=0.000 n=10) MontgomeryMul 4.812µ ± 5% 1.834µ ± 0% -61.90% (p=0.000 n=10) ModMul 9.646µ ± 3% 3.698µ ± 0% -61.67% (p=0.000 n=10) ExpBig 11.28m ± 0% 11.28m ± 0% +0.04% (p=0.035 n=10) Exp 12.284m ± 5% 5.004m ± 1% -59.26% (p=0.000 n=10) geomean 18.61µ 10.74µ -42.2% Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x Reviewed-on: https://go-review.googlesource.com/c/go/+/716480 Auto-Submit: Filippo Valsorda <filippo@golang.org> Reviewed-by: Vishwanatha HD <vishwanatha.hd@ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Roland Shoemaker <roland@golang.org> Reviewed-by: Filippo Valsorda <filippo@golang.org> Reviewed-by: Srinivas Pokala <Pokala.Srinivas@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 657b331 commit 6e4a0d8

File tree

1 file changed

+140
-65
lines changed

1 file changed

+140
-65
lines changed

src/crypto/internal/fips140/bigmod/nat_s390x.s

Lines changed: 140 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -4,82 +4,157 @@
44

55
//go:build !purego
66

7+
// Register usage (z13 convention):
8+
// R2 = rp (result pointer)
9+
// R3 = ap (source pointer)
10+
// R4 = an / idx (limb count on entry, then byte index; R10 holds the loop counter)
11+
// R5 = b0 (multiplier limb)
12+
// R6 = cy (carry)
13+
714
#include "textflag.h"
815

916
// func addMulVVW1024(z, x *uint, y uint) (c uint)
1017
TEXT ·addMulVVW1024(SB), $0-32
11-
MOVD $16, R5
12-
JMP addMulVVWx(SB)
18+
MOVD $16, R4 // limb count handed to addMulVVWx: 16 limbs of 8 bytes = 1024 bits
19+
JMP addMulVVWx(SB) // jump, not call: the RET in addMulVVWx returns to this function's caller
1320

1421
// func addMulVVW1536(z, x *uint, y uint) (c uint)
1522
TEXT ·addMulVVW1536(SB), $0-32
16-
MOVD $24, R5
17-
JMP addMulVVWx(SB)
23+
MOVD $24, R4 // limb count handed to addMulVVWx: 24 limbs of 8 bytes = 1536 bits
24+
JMP addMulVVWx(SB) // jump, not call: the RET in addMulVVWx returns to this function's caller
1825

1926
// func addMulVVW2048(z, x *uint, y uint) (c uint)
2027
TEXT ·addMulVVW2048(SB), $0-32
21-
MOVD $32, R5
22-
JMP addMulVVWx(SB)
28+
MOVD $32, R4 // limb count handed to addMulVVWx: 32 limbs of 8 bytes = 2048 bits
29+
JMP addMulVVWx(SB) // jump, not call: the RET in addMulVVWx returns to this function's caller
2330

2431
// addMulVVWx computes z += x * y over a run of 64-bit limbs and stores the
// final carry limb in c. It is entered by JMP from the addMulVVW1024/1536/2048
// wrappers; z, x, y and c are accessed at FP offsets 0, 8, 16 and 24.
//
// Lines marked "-" are the removed scalar version (limb count in R5).
// Lines marked "+" are the vector version (limb count in R4):
//   - an odd limb count is handled by one leading scalar-multiply step
//     (L_b01 / L_b11);
//   - the remaining limbs are processed two at a time with 128-bit vector
//     adds; the loop body is two such steps (L_top, L_mid) and the entry
//     point is chosen from the limb count mod 4;
//   - each step multiplies the next pair of limbs, then adds and stores the
//     pair prepared by the previous step; L_end finishes the last pair.
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
2532
MOVD z+0(FP), R2
26-
MOVD x+8(FP), R8
27-
MOVD y+16(FP), R9
28-
29-
MOVD $0, R1 // i*8 = 0
30-
MOVD $0, R7 // i = 0
31-
MOVD $0, R0 // make sure it's zero
32-
MOVD $0, R4 // c = 0
33-
34-
MOVD R5, R12
35-
AND $-2, R12
36-
CMPBGE R5, $2, A6
37-
BR E6
38-
39-
A6:
40-
MOVD (R8)(R1*1), R6
41-
MULHDU R9, R6
42-
MOVD (R2)(R1*1), R10
43-
ADDC R10, R11 // add to low order bits
44-
ADDE R0, R6
45-
ADDC R4, R11
46-
ADDE R0, R6
47-
MOVD R6, R4
48-
MOVD R11, (R2)(R1*1)
49-
50-
MOVD (8)(R8)(R1*1), R6
51-
MULHDU R9, R6
52-
MOVD (8)(R2)(R1*1), R10
53-
ADDC R10, R11 // add to low order bits
54-
ADDE R0, R6
55-
ADDC R4, R11
56-
ADDE R0, R6
57-
MOVD R6, R4
58-
MOVD R11, (8)(R2)(R1*1)
59-
60-
ADD $16, R1 // i*8 + 8
61-
ADD $2, R7 // i++
62-
63-
CMPBLT R7, R12, A6
64-
BR E6
65-
66-
L6:
67-
// TODO: drop unused single-step loop.
68-
MOVD (R8)(R1*1), R6
69-
MULHDU R9, R6
70-
MOVD (R2)(R1*1), R10
71-
ADDC R10, R11 // add to low order bits
72-
ADDE R0, R6
73-
ADDC R4, R11
74-
ADDE R0, R6
75-
MOVD R6, R4
76-
MOVD R11, (R2)(R1*1)
77-
78-
ADD $8, R1 // i*8 + 8
79-
ADD $1, R7 // i++
80-
81-
E6:
82-
CMPBLT R7, R5, L6 // i < n
83-
84-
MOVD R4, c+24(FP)
33+
MOVD x+8(FP), R3 // R3 = x (source limbs)
34+
MOVD y+16(FP), R5 // R5 = y (multiplier)
35+
36+
MOVD $0, R6 // R6 = pending carry limb, initially 0
37+
38+
L_ent:
39+
VZERO V0 // V0 = carry of the first 128-bit add chain
40+
VZERO V2 // V2 = carry of the second 128-bit add chain
41+
SRD $2, R4, R10 // R10 = limb count >> 2, the loop trip count used by BRCTG
42+
TMLL R4, $1 // test bit 0 of the limb count
43+
BRC $8, L_bx0 // bit clear (even count): no leading single limb
44+
45+
L_bx1: // odd limb count
46+
VLEG $1, 0(R2), V2 // V2 = (0, z[0])
47+
VZERO V4
48+
TMLL R4, $2 // test bit 1 of the limb count
49+
BRC $7, L_b11 // bit set: count mod 4 == 3
50+
51+
L_b01: // count mod 4 == 1
52+
MOVD $-24, R4 // R4 becomes a byte index, biased so 32(R4) in L_cj0 addresses limb 1
53+
MOVD R6, R0 // save the incoming carry; MLGR below overwrites R6
54+
MOVD 0(R3), R7 // R7 = x[0]
55+
MLGR R5, R6 // R6:R7 = R7 * R5 (high:low)
56+
ADDC R0, R7 // low += incoming carry
57+
MOVD $0, R0
58+
ADDE R0, R6 // propagate that carry into the high limb
59+
VLVGG $1, R7, V4 // V4 = (0, low)
60+
VAQ V2, V4, V2 // 128-bit add: z[0] + low, carry-out lands in the high doubleword
61+
VSTEG $1, V2, 0(R2) // z[0] = low doubleword of the sum
62+
VMRHG V2, V2, V2 // copy the carry-out into both doublewords of V2
63+
CMPBEQ R10, $0, L_1 // a single limb in total: go straight to the result
64+
BR L_cj0
65+
66+
L_b11: // count mod 4 == 3
67+
MOVD $-8, R4 // byte index biased so 16(R4) in L_cj1 addresses limb 1
68+
MOVD 0(R3), R9 // R9 = x[0]
69+
MLGR R5, R8 // R8:R9 = R9 * R5 (high:low)
70+
ADDC R6, R9 // low += incoming carry
71+
MOVD $0, R6
72+
ADDE R6, R8 // propagate that carry into the high limb
73+
VLVGG $1, R9, V4 // V4 = (0, low)
74+
VAQ V2, V4, V2 // 128-bit add: z[0] + low, carry-out lands in the high doubleword
75+
VSTEG $1, V2, 0(R2) // z[0] = low doubleword of the sum
76+
VMRHG V2, V2, V2 // copy the carry-out into both doublewords of V2
77+
BR L_cj1
78+
79+
L_bx0: // even limb count
80+
TMLL R4, $2 // test bit 1 of the limb count
81+
BRC $7, L_b10 // bit set: count mod 4 == 2
82+
83+
L_b00: // count mod 4 == 0
84+
MOVD $-32, R4 // byte index biased so 32(R4) in L_cj0 addresses limb 0
85+
86+
L_cj0: // prepare the first pair, pending carry limb in R6, then enter at L_mid
87+
MOVD 32(R3)(R4), R1 // first x limb of the pair
88+
MOVD 40(R3)(R4), R9 // second x limb of the pair
89+
MLGR R5, R0 // R0:R1 = R1 * R5
90+
MLGR R5, R8 // R8:R9 = R9 * R5
91+
VL 32(R4)(R2), V1 // load the two matching z limbs
92+
VPDI $4, V1, V1, V1 // swap doublewords so the lower-address limb is the low half
93+
VLVGP R0, R1, V6 // V6 = (high of first product, low of first product)
94+
VLVGP R9, R6, V7 // V7 = (low of second product, pending carry limb R6)
95+
BR L_mid
96+
97+
L_b10: // count mod 4 == 2
98+
MOVD $-16, R4 // byte index biased so 16(R4) in L_cj1 addresses limb 0
99+
MOVD R6, R8 // L_cj1 takes the pending carry limb from R8
100+
101+
L_cj1: // prepare the first pair, pending carry limb in R8, then enter at L_top
102+
MOVD 16(R4)(R3), R1 // first x limb of the pair
103+
MOVD 24(R4)(R3), R7 // second x limb of the pair
104+
MLGR R5, R0 // R0:R1 = R1 * R5
105+
MLGR R5, R6 // R6:R7 = R7 * R5
106+
VL 16(R4)(R2), V1 // load the two matching z limbs
107+
VPDI $4, V1, V1, V1 // swap doublewords so the lower-address limb is the low half
108+
VLVGP R0, R1, V6 // V6 = (high of first product, low of first product)
109+
VLVGP R7, R8, V7 // V7 = (low of second product, pending carry limb R8)
110+
CMPBEQ R10, $0, L_end // fewer than four limbs: no loop iterations
111+
112+
L_top: // first half of the loop body; previous high limb is in R6
113+
MOVD 32(R4)(R3), R1 // next pair of x limbs
114+
MOVD 40(R4)(R3), R9
115+
MLGR R5, R0 // R0:R1 = R1 * R5
116+
MLGR R5, R8 // R8:R9 = R9 * R5
117+
VACQ V6, V1, V0, V5 // V5 = V6 + V1 + carry-in from V0
118+
VACCCQ V6, V1, V0, V0 // V0 = carry-out of that add
119+
VACQ V5, V7, V2, V3 // V3 = V5 + V7 + carry-in from V2
120+
VACCCQ V5, V7, V2, V2 // V2 = carry-out of that add
121+
VPDI $4, V3, V3, V3 // swap back to memory limb order
122+
VL 32(R4)(R2), V1 // z limbs for the pair just multiplied
123+
VPDI $4, V1, V1, V1
124+
VST V3, 16(R4)(R2) // store the finished pair
125+
VLVGP R0, R1, V6
126+
VLVGP R9, R6, V7 // R6 = high limb left by the previous step's second MLGR
127+
128+
L_mid: // second half of the loop body; previous high limb is in R8
129+
MOVD 48(R4)(R3), R1 // next pair of x limbs
130+
MOVD 56(R4)(R3), R7
131+
MLGR R5, R0 // R0:R1 = R1 * R5
132+
MLGR R5, R6 // R6:R7 = R7 * R5
133+
VACQ V6, V1, V0, V5 // V5 = V6 + V1 + carry-in from V0
134+
VACCCQ V6, V1, V0, V0 // V0 = carry-out of that add
135+
VACQ V5, V7, V2, V3 // V3 = V5 + V7 + carry-in from V2
136+
VACCCQ V5, V7, V2, V2 // V2 = carry-out of that add
137+
VPDI $4, V3, V3, V3 // swap back to memory limb order
138+
VL 48(R4)(R2), V1 // z limbs for the pair just multiplied
139+
VPDI $4, V1, V1, V1
140+
VST V3, 32(R4)(R2) // store the finished pair
141+
VLVGP R0, R1, V6
142+
VLVGP R7, R8, V7 // R8 = high limb left by the previous step's second MLGR
143+
MOVD $32(R4), R4 // advance the byte index by four limbs
144+
BRCTG R10, L_top // decrement R10 and loop while it is non-zero
145+
146+
L_end: // finish the last prepared pair
147+
VACQ V6, V1, V0, V5
148+
VACCCQ V6, V1, V0, V0
149+
VACQ V5, V7, V2, V3
150+
VACCCQ V5, V7, V2, V2
151+
VPDI $4, V3, V3, V3 // swap back to memory limb order
152+
VST V3, 16(R2)(R4) // store the last pair
153+
VAG V0, V2, V2 // add the carries of the two chains together
154+
155+
L_1:
156+
VLGVG $1, V2, R2 // R2 = vector carry (doubleword 1 of V2)
157+
ADDC R6, R2 // plus the high limb of the last product
158+
MOVD R2, c+24(FP) // c = final carry limb
85159
RET
160+

0 commit comments

Comments
 (0)