/
gfp.h
148 lines (143 loc) · 2.64 KB
/
gfp.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#define storeBlock(a1,a2,a3,a4, r) \
MOVQ a1, 0+r \
MOVQ a2, 8+r \
MOVQ a3, 16+r \
MOVQ a4, 24+r
#define loadBlock(r, a1,a2,a3,a4) \
MOVQ 0+r, a1 \
MOVQ 8+r, a2 \
MOVQ 16+r, a3 \
MOVQ 24+r, a4
#define gfpCarry(a1,a2,a3,a4,a5, b1,b2,b3,b4,b5) \
\ // b = a-p
MOVQ a1, b1 \
MOVQ a2, b2 \
MOVQ a3, b3 \
MOVQ a4, b4 \
MOVQ a5, b5 \
\
SUBQ ·p2+0(SB), b1 \
SBBQ ·p2+8(SB), b2 \
SBBQ ·p2+16(SB), b3 \
SBBQ ·p2+24(SB), b4 \
SBBQ $0, b5 \
\
\ // if b is negative then return a
\ // else return a
CMOVQCC b1, a1 \
CMOVQCC b2, a2 \
CMOVQCC b3, a3 \
CMOVQCC b4, a4
#define mul(ra, rb) \
mulArb(0+ra,8+ra,16+ra,24+ra, rb)
#define mulArb(a1,a2,a3,a4, rb) \
MOVQ a1, DX \
MOVQ $0, R13 \
MULXQ 0+rb, R8, R9 \
MULXQ 8+rb, AX, R10 \
ADDQ AX, R9 \
MULXQ 16+rb, AX, R11 \
ADCQ AX, R10 \
MULXQ 24+rb, AX, R12 \
ADCQ AX, R11 \
ADCQ $0, R12 \
ADCQ $0, R13 \
\
MOVQ a2, DX \
MOVQ $0, R14 \
MULXQ 0+rb, AX, BX \
ADDQ AX, R9 \
ADCQ BX, R10 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R11 \
ADCQ BX, R12 \
ADCQ $0, R13 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R12 \
ADCQ BX, R13 \
ADCQ $0, R14 \
\
MOVQ a3, DX \
MOVQ $0, R15 \
MULXQ 0+rb, AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R12 \
ADCQ BX, R13 \
ADCQ $0, R14 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R11 \
ADCQ BX, R12 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R13 \
ADCQ BX, R14 \
ADCQ $0, R15 \
\
MOVQ a4, DX \
MULXQ 0+rb, AX, BX \
ADDQ AX, R11 \
ADCQ BX, R12 \
MULXQ 16+rb, AX, BX \
ADCQ AX, R13 \
ADCQ BX, R14 \
ADCQ $0, R15 \
MULXQ 8+rb, AX, BX \
ADDQ AX, R12 \
ADCQ BX, R13 \
MULXQ 24+rb, AX, BX \
ADCQ AX, R14 \
ADCQ BX, R15
#define gfpReduce() \
\ // m = (T * N') mod R, store m in R8:R9:R10:R11
MOVQ ·np+0(SB), DX \
MULXQ 0(SP), R8, R9 \
MULXQ 8(SP), AX, R10 \
ADDQ AX, R9 \
MULXQ 16(SP), AX, R11 \
ADCQ AX, R10 \
MULXQ 24(SP), AX, BX \
ADCQ AX, R11 \
\
MOVQ ·np+8(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R9 \
ADCQ BX, R10 \
MULXQ 16(SP), AX, BX \
ADCQ AX, R11 \
MULXQ 8(SP), AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
\
MOVQ ·np+16(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R10 \
ADCQ BX, R11 \
MULXQ 8(SP), AX, BX \
ADDQ AX, R11 \
\
MOVQ ·np+24(SB), DX \
MULXQ 0(SP), AX, BX \
ADDQ AX, R11 \
\
storeBlock(R8,R9,R10,R11, 64(SP)) \
\
\ // m * N
mulArb(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \
\
\ // Add the 512-bit intermediate to m*N
MOVQ $0, AX \
ADDQ 0(SP), R8 \
ADCQ 8(SP), R9 \
ADCQ 16(SP), R10 \
ADCQ 24(SP), R11 \
ADCQ 32(SP), R12 \
ADCQ 40(SP), R13 \
ADCQ 48(SP), R14 \
ADCQ 56(SP), R15 \
ADCQ $0, AX \
\
gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)