Skip to content

Commit

Permalink
sm9/bn256: use neon inst. for conditional move
Browse files: browse the repository at this point in the history
  • Loading branch information
emmansun committed Nov 7, 2023
1 parent 06a310d commit ef55df3
Showing 1 changed file with 105 additions and 266 deletions.
371 changes: 105 additions & 266 deletions sm9/bn256/select_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -95,137 +95,57 @@ TEXT ·gfP12MovCond(SB),NOSPLIT,$0
// Hunk from gfP12MovCond; the routine's first instructions are above this excerpt.
// The page lists the old CSEL body followed by the NEON body that replaces it
// (the +/- diff markers were lost). Either body writes 384 bytes to res_ptr:
// the bytes of b when cond == 0, otherwise the bytes of a.
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3

// --- Old body: 64-bit general-purpose registers and CSEL ---
// Compare cond with zero once; every CSEL EQ below reuses these flags.
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
// Bytes 0..47: R4-R9 hold a, R16-R17 and R19-R22 hold b.
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
// Each CSEL keeps the word from a unless cond == 0, in which case it takes the word from b.
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)

// Bytes 48..95.
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)

// Bytes 96..143.
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 8*16(a_ptr), (R8, R9)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
LDP 8*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)

// Bytes 144..191.
LDP 9*16(a_ptr), (R4, R5)
LDP 10*16(a_ptr), (R6, R7)
LDP 11*16(a_ptr), (R8, R9)
LDP 9*16(b_ptr), (R16, R17)
LDP 10*16(b_ptr), (R19, R20)
LDP 11*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)

// Bytes 192..239.
LDP 12*16(a_ptr), (R4, R5)
LDP 13*16(a_ptr), (R6, R7)
LDP 14*16(a_ptr), (R8, R9)
LDP 12*16(b_ptr), (R16, R17)
LDP 13*16(b_ptr), (R19, R20)
LDP 14*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)

// Bytes 240..287.
LDP 15*16(a_ptr), (R4, R5)
LDP 16*16(a_ptr), (R6, R7)
LDP 17*16(a_ptr), (R8, R9)
LDP 15*16(b_ptr), (R16, R17)
LDP 16*16(b_ptr), (R19, R20)
LDP 17*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 15*16(res_ptr)
STP (R6, R7), 16*16(res_ptr)
STP (R8, R9), 17*16(res_ptr)

// Bytes 288..335.
LDP 18*16(a_ptr), (R4, R5)
LDP 19*16(a_ptr), (R6, R7)
LDP 20*16(a_ptr), (R8, R9)
LDP 18*16(b_ptr), (R16, R17)
LDP 19*16(b_ptr), (R19, R20)
LDP 20*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 18*16(res_ptr)
STP (R6, R7), 19*16(res_ptr)
STP (R8, R9), 20*16(res_ptr)

// Bytes 336..383 (last chunk of the old body).
LDP 21*16(a_ptr), (R4, R5)
LDP 22*16(a_ptr), (R6, R7)
LDP 23*16(a_ptr), (R8, R9)
LDP 21*16(b_ptr), (R16, R17)
LDP 22*16(b_ptr), (R19, R20)
LDP 23*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 21*16(res_ptr)
STP (R6, R7), 22*16(res_ptr)
STP (R8, R9), 23*16(res_ptr)
// --- New body: NEON bitwise select, 64 bytes per step ---
// V0 = 0.
VEOR V0.B16, V0.B16, V0.B16
// Replicate R3 into the four 32-bit lanes of V1.
// NOTE(review): the .S4 arrangement presumably carries only the low 32 bits of
// cond, whereas CMP $0, R3 above tests all 64 - confirm callers only pass 0 or 1.
VMOV R3, V1.S4
// V2 = per-lane (V1 == 0): all ones when cond is zero, all zeros otherwise.
VCMEQ V0.S4, V1.S4, V2.S4

// Bytes 0..63: V3-V6 = a, V7-V10 = b; the .P forms advance the pointer by 64.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
// VBIT copies the bits of b into the register holding a wherever the V2 mask is set.
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 64..127.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 128..191.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 192..255.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 256..319.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 320..383: final chunk, so the plain VLD1/VST1 forms leave the pointers unchanged.
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)

RET

Expand All @@ -238,52 +158,25 @@ TEXT ·curvePointMovCond(SB),NOSPLIT,$0
// Hunk from curvePointMovCond; the routine's first instructions are above this excerpt.
// The page lists the old CSEL body followed by the NEON body that replaces it
// (the +/- diff markers were lost). Either body writes 128 bytes to res_ptr:
// the bytes of b when cond == 0, otherwise the bytes of a.
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3

// --- Old body: 64-bit general-purpose registers and CSEL ---
// Compare cond with zero once; every CSEL EQ below reuses these flags.
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
// Bytes 0..47: R4-R9 hold a, R16-R17 and R19-R22 hold b.
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
// Each CSEL keeps the word from a unless cond == 0, in which case it takes the word from b.
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)

// Bytes 48..95.
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)

// Bytes 96..127 (last chunk of the old body, only two register pairs).
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
// --- New body: NEON bitwise select, 64 bytes per step ---
// V0 = 0.
VEOR V0.B16, V0.B16, V0.B16
// Replicate R3 into the four 32-bit lanes of V1.
// NOTE(review): the .S4 arrangement presumably carries only the low 32 bits of
// cond, whereas CMP $0, R3 above tests all 64 - confirm callers only pass 0 or 1.
VMOV R3, V1.S4
// V2 = per-lane (V1 == 0): all ones when cond is zero, all zeros otherwise.
VCMEQ V0.S4, V1.S4, V2.S4

// Bytes 0..63: V3-V6 = a, V7-V10 = b; the .P forms advance the pointer by 64.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
// VBIT copies the bits of b into the register holding a wherever the V2 mask is set.
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 64..127: final chunk, so the plain VLD1/VST1 forms leave the pointers unchanged.
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)

RET

Expand All @@ -296,94 +189,40 @@ TEXT ·twistPointMovCond(SB),NOSPLIT,$0
// Hunk from twistPointMovCond; the routine's first instructions are above this excerpt.
// The page lists the old CSEL body followed by the NEON body that replaces it
// (the +/- diff markers were lost). Either body writes 256 bytes to res_ptr:
// the bytes of b when cond == 0, otherwise the bytes of a.
MOVD b+16(FP), b_ptr
MOVD cond+24(FP), R3

// --- Old body: 64-bit general-purpose registers and CSEL ---
// Compare cond with zero once; every CSEL EQ below reuses these flags.
CMP $0, R3
// Two remarks:
// 1) Will want to revisit NEON, when support is better
// 2) CSEL might not be constant time on all ARM processors
// Bytes 0..47: R4-R9 hold a, R16-R17 and R19-R22 hold b.
LDP 0*16(a_ptr), (R4, R5)
LDP 1*16(a_ptr), (R6, R7)
LDP 2*16(a_ptr), (R8, R9)
LDP 0*16(b_ptr), (R16, R17)
LDP 1*16(b_ptr), (R19, R20)
LDP 2*16(b_ptr), (R21, R22)
// Each CSEL keeps the word from a unless cond == 0, in which case it takes the word from b.
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 0*16(res_ptr)
STP (R6, R7), 1*16(res_ptr)
STP (R8, R9), 2*16(res_ptr)

// Bytes 48..95.
LDP 3*16(a_ptr), (R4, R5)
LDP 4*16(a_ptr), (R6, R7)
LDP 5*16(a_ptr), (R8, R9)
LDP 3*16(b_ptr), (R16, R17)
LDP 4*16(b_ptr), (R19, R20)
LDP 5*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 3*16(res_ptr)
STP (R6, R7), 4*16(res_ptr)
STP (R8, R9), 5*16(res_ptr)

// Bytes 96..143.
LDP 6*16(a_ptr), (R4, R5)
LDP 7*16(a_ptr), (R6, R7)
LDP 8*16(a_ptr), (R8, R9)
LDP 6*16(b_ptr), (R16, R17)
LDP 7*16(b_ptr), (R19, R20)
LDP 8*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 6*16(res_ptr)
STP (R6, R7), 7*16(res_ptr)
STP (R8, R9), 8*16(res_ptr)

// Bytes 144..191.
LDP 9*16(a_ptr), (R4, R5)
LDP 10*16(a_ptr), (R6, R7)
LDP 11*16(a_ptr), (R8, R9)
LDP 9*16(b_ptr), (R16, R17)
LDP 10*16(b_ptr), (R19, R20)
LDP 11*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 9*16(res_ptr)
STP (R6, R7), 10*16(res_ptr)
STP (R8, R9), 11*16(res_ptr)

// Bytes 192..239.
LDP 12*16(a_ptr), (R4, R5)
LDP 13*16(a_ptr), (R6, R7)
LDP 14*16(a_ptr), (R8, R9)
LDP 12*16(b_ptr), (R16, R17)
LDP 13*16(b_ptr), (R19, R20)
LDP 14*16(b_ptr), (R21, R22)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
CSEL EQ, R19, R6, R6
CSEL EQ, R20, R7, R7
CSEL EQ, R21, R8, R8
CSEL EQ, R22, R9, R9
STP (R4, R5), 12*16(res_ptr)
STP (R6, R7), 13*16(res_ptr)
STP (R8, R9), 14*16(res_ptr)

// Bytes 240..255 (last chunk of the old body, a single register pair).
LDP 15*16(a_ptr), (R4, R5)
LDP 15*16(b_ptr), (R16, R17)
CSEL EQ, R16, R4, R4
CSEL EQ, R17, R5, R5
STP (R4, R5), 15*16(res_ptr)
// --- New body: NEON bitwise select, 64 bytes per step ---
// V0 = 0.
VEOR V0.B16, V0.B16, V0.B16
// Replicate R3 into the four 32-bit lanes of V1.
// NOTE(review): the .S4 arrangement presumably carries only the low 32 bits of
// cond, whereas CMP $0, R3 above tests all 64 - confirm callers only pass 0 or 1.
VMOV R3, V1.S4
// V2 = per-lane (V1 == 0): all ones when cond is zero, all zeros otherwise.
VCMEQ V0.S4, V1.S4, V2.S4

// Bytes 0..63: V3-V6 = a, V7-V10 = b; the .P forms advance the pointer by 64.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
// VBIT copies the bits of b into the register holding a wherever the V2 mask is set.
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 64..127.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 128..191.
VLD1.P (64)(a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1.P (64)(b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1.P [V3.B16, V4.B16, V5.B16, V6.B16], (64)(res_ptr)

// Bytes 192..255: final chunk, so the plain VLD1/VST1 forms leave the pointers unchanged.
VLD1 (a_ptr), [V3.B16, V4.B16, V5.B16, V6.B16]
VLD1 (b_ptr), [V7.B16, V8.B16, V9.B16, V10.B16]
VBIT V2.B16, V7.B16, V3.B16
VBIT V2.B16, V8.B16, V4.B16
VBIT V2.B16, V9.B16, V5.B16
VBIT V2.B16, V10.B16, V6.B16
VST1 [V3.B16, V4.B16, V5.B16, V6.B16], (res_ptr)

RET

0 comments on commit ef55df3

Please sign in to comment.