forked from m-pays/m-cpuminer-v2
-
Notifications
You must be signed in to change notification settings - Fork 3
/
sha256-armv8-aarch64.S
179 lines (167 loc) · 5.61 KB
/
sha256-armv8-aarch64.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*
1-June-2018 Rauli Kumpulainen
Taken from https://github.com/jocover/sha256-armv8 all credit to which.
__AARCH64EL__ (almost always used) is assumed at compile time.
*/
.text
.arch armv8-a+crypto
# SHA256 assembly implementation for ARMv8 AArch64
.global sha256_block_data_order
.type sha256_block_data_order,%function
.align 2
sha256_block_data_order:
.Lsha256prolog:
stp x29, x30, [sp,#-64]!
mov x29, sp
adr x3, .LKConstant256
// Load constants table pointer
str q8, [sp, #16]
// Cache constants table in registers v16 - v31
ld1 {v16.4s-v19.4s}, [x3], #64
ld1 {v0.4s}, [x0], #16
ld1 {v20.4s-v23.4s}, [x3], #64
add x2, x1, x2, lsl #6
ld1 {v1.4s}, [x0]
ld1 {v24.4s-v27.4s}, [x3], #64
sub x0, x0, #16
str q9, [sp, #32]
str q10, [sp, #48]
ld1 {v28.4s-v31.4s}, [x3], #64
// Cache constants table in registers v16 - v31
// below taken from https://github.com/minio/sha256-simd/blob/master/sha256block_arm64.s not sure if worth integrating
//.inst 0x4cdf2870 // ld1 {v16.4s-v19.4s}, [x3], #64
//.inst 0x4cdf7800 // ld1 {v0.4s}, [x0], #16
//.inst 0x4cdf2874 // ld1 {v20.4s-v23.4s}, [x3], #64
//.inst 0x4c407801 // ld1 {v1.4s}, [x0]
//.inst 0x4cdf2878 // ld1 {v24.4s-v27.4s}, [x3], #64
//.inst 0xd1004000 // sub x0, x0, #0x10
//.inst 0x4cdf287c // ld1 {v28.4s-v31.4s}, [x3], #64
.Lsha256loop:
ld1 {v5.16b-v8.16b}, [x1], #64
mov v2.16b, v0.16b
mov v3.16b, v1.16b
rev32 v5.16b, v5.16b
rev32 v6.16b, v6.16b
add v9.4s, v5.4s, v16.4s
rev32 v7.16b, v7.16b
add v10.4s, v6.4s, v17.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v5.4s, v6.4s
rev32 v8.16b, v8.16b
add v9.4s, v7.4s, v18.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v6.4s, v7.4s
sha256su1 v5.4s, v7.4s, v8.4s
add v10.4s, v8.4s, v19.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v7.4s, v8.4s
sha256su1 v6.4s, v8.4s, v5.4s
add v9.4s, v5.4s, v20.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v8.4s, v5.4s
sha256su1 v7.4s, v5.4s, v6.4s
add v10.4s, v6.4s, v21.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v5.4s, v6.4s
sha256su1 v8.4s, v6.4s, v7.4s
add v9.4s, v7.4s, v22.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v6.4s, v7.4s
sha256su1 v5.4s, v7.4s, v8.4s
add v10.4s, v8.4s, v23.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v7.4s, v8.4s
sha256su1 v6.4s, v8.4s, v5.4s
add v9.4s, v5.4s, v24.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v8.4s, v5.4s
sha256su1 v7.4s, v5.4s, v6.4s
add v10.4s, v6.4s, v25.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v5.4s, v6.4s
sha256su1 v8.4s, v6.4s, v7.4s
add v9.4s, v7.4s, v26.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v6.4s, v7.4s
sha256su1 v5.4s, v7.4s, v8.4s
add v10.4s, v8.4s, v27.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su0 v7.4s, v8.4s
sha256su1 v6.4s, v8.4s, v5.4s
add v9.4s, v5.4s, v28.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
sha256su0 v8.4s, v5.4s
sha256su1 v7.4s, v5.4s, v6.4s
add v10.4s, v6.4s, v29.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
sha256su1 v8.4s, v6.4s, v7.4s
add v9.4s, v7.4s, v30.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
add v10.4s, v8.4s, v31.4s
mov v4.16b, v2.16b
sha256h q2, q3, v9.4s
sha256h2 q3, q4, v9.4s
mov v4.16b, v2.16b
sha256h q2, q3, v10.4s
sha256h2 q3, q4, v10.4s
//cmp x1, x2
sub x4, x1, x2 // iterator--
add v1.4s, v1.4s, v3.4s
add v0.4s, v0.4s, v2.4s
//b.ne .Lsha256loop
cbnz x4,.Lsha256loop //any more blocks? actually only dealing with one block in one round "sha256_block_data_order(r,data,1)"
//.Lsha256epilog:
st1 {v0.4s,v1.4s}, [x0]
ldr q10, [sp, #48]
ldr q9, [sp, #32]
ldr q8, [sp, #16]
ldr x29, [sp], #64
ret
.align 5
.LKConstant256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size sha256_block_data_order,.-sha256_block_data_order
.align 2