Skip to content

Commit 537c144

Browse files
Ard Biesheuvel authored and herbertx (Herbert Xu) committed
crypto: arm64/gcm - implement native driver using v8 Crypto Extensions
Currently, the AES-GCM implementation for arm64 systems that support the ARMv8 Crypto Extensions is based on the generic GCM module, which combines the AES-CTR implementation using AES instructions with the PMULL based GHASH driver. This is suboptimal, given the fact that the input data needs to be loaded twice, once for the encryption and again for the MAC calculation. On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing for the AES instructions, AES executes at less than 1 cycle per byte, which means that any cycles wasted on loading the data twice hurt even more. So implement a new GCM driver that combines the AES and PMULL instructions at the block level. This improves performance on Cortex-A57 by ~37% (from 3.5 cpb to 2.6 cpb) Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent ec808bb commit 537c144

File tree

3 files changed

+591
-26
lines changed

3 files changed

+591
-26
lines changed

arch/arm64/crypto/Kconfig

Lines changed: 3 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -29,10 +29,12 @@ config CRYPTO_SHA2_ARM64_CE
2929
select CRYPTO_SHA256_ARM64
3030

3131
config CRYPTO_GHASH_ARM64_CE
32-
tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
32+
tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
3333
depends on KERNEL_MODE_NEON
3434
select CRYPTO_HASH
3535
select CRYPTO_GF128MUL
36+
select CRYPTO_AES
37+
select CRYPTO_AES_ARM64
3638

3739
config CRYPTO_CRCT10DIF_ARM64_CE
3840
tristate "CRCT10DIF digest algorithm using PMULL instructions"

arch/arm64/crypto/ghash-ce-core.S

Lines changed: 175 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -77,3 +77,178 @@ CPU_LE( rev64 T1.16b, T1.16b )
7777
st1 {XL.2d}, [x1]
7878
ret
7979
ENDPROC(pmull_ghash_update)
80+
81+
KS .req v8
82+
CTR .req v9
83+
INP .req v10
84+
85+
.macro load_round_keys, rounds, rk
86+
cmp \rounds, #12
87+
blo 2222f /* 128 bits */
88+
beq 1111f /* 192 bits */
89+
ld1 {v17.4s-v18.4s}, [\rk], #32
90+
1111: ld1 {v19.4s-v20.4s}, [\rk], #32
91+
2222: ld1 {v21.4s-v24.4s}, [\rk], #64
92+
ld1 {v25.4s-v28.4s}, [\rk], #64
93+
ld1 {v29.4s-v31.4s}, [\rk]
94+
.endm
95+
96+
.macro enc_round, state, key
97+
aese \state\().16b, \key\().16b
98+
aesmc \state\().16b, \state\().16b
99+
.endm
100+
101+
.macro enc_block, state, rounds
102+
cmp \rounds, #12
103+
b.lo 2222f /* 128 bits */
104+
b.eq 1111f /* 192 bits */
105+
enc_round \state, v17
106+
enc_round \state, v18
107+
1111: enc_round \state, v19
108+
enc_round \state, v20
109+
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
110+
enc_round \state, \key
111+
.endr
112+
aese \state\().16b, v30.16b
113+
eor \state\().16b, \state\().16b, v31.16b
114+
.endm
115+
116+
.macro pmull_gcm_do_crypt, enc
117+
ld1 {SHASH.2d}, [x4]
118+
ld1 {XL.2d}, [x1]
119+
ldr x8, [x5, #8] // load lower counter
120+
121+
movi MASK.16b, #0xe1
122+
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
123+
CPU_LE( rev x8, x8 )
124+
shl MASK.2d, MASK.2d, #57
125+
eor SHASH2.16b, SHASH2.16b, SHASH.16b
126+
127+
.if \enc == 1
128+
ld1 {KS.16b}, [x7]
129+
.endif
130+
131+
0: ld1 {CTR.8b}, [x5] // load upper counter
132+
ld1 {INP.16b}, [x3], #16
133+
rev x9, x8
134+
add x8, x8, #1
135+
sub w0, w0, #1
136+
ins CTR.d[1], x9 // set lower counter
137+
138+
.if \enc == 1
139+
eor INP.16b, INP.16b, KS.16b // encrypt input
140+
st1 {INP.16b}, [x2], #16
141+
.endif
142+
143+
rev64 T1.16b, INP.16b
144+
145+
cmp w6, #12
146+
b.ge 2f // AES-192/256?
147+
148+
1: enc_round CTR, v21
149+
150+
ext T2.16b, XL.16b, XL.16b, #8
151+
ext IN1.16b, T1.16b, T1.16b, #8
152+
153+
enc_round CTR, v22
154+
155+
eor T1.16b, T1.16b, T2.16b
156+
eor XL.16b, XL.16b, IN1.16b
157+
158+
enc_round CTR, v23
159+
160+
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
161+
eor T1.16b, T1.16b, XL.16b
162+
163+
enc_round CTR, v24
164+
165+
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
166+
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
167+
168+
enc_round CTR, v25
169+
170+
ext T1.16b, XL.16b, XH.16b, #8
171+
eor T2.16b, XL.16b, XH.16b
172+
eor XM.16b, XM.16b, T1.16b
173+
174+
enc_round CTR, v26
175+
176+
eor XM.16b, XM.16b, T2.16b
177+
pmull T2.1q, XL.1d, MASK.1d
178+
179+
enc_round CTR, v27
180+
181+
mov XH.d[0], XM.d[1]
182+
mov XM.d[1], XL.d[0]
183+
184+
enc_round CTR, v28
185+
186+
eor XL.16b, XM.16b, T2.16b
187+
188+
enc_round CTR, v29
189+
190+
ext T2.16b, XL.16b, XL.16b, #8
191+
192+
aese CTR.16b, v30.16b
193+
194+
pmull XL.1q, XL.1d, MASK.1d
195+
eor T2.16b, T2.16b, XH.16b
196+
197+
eor KS.16b, CTR.16b, v31.16b
198+
199+
eor XL.16b, XL.16b, T2.16b
200+
201+
.if \enc == 0
202+
eor INP.16b, INP.16b, KS.16b
203+
st1 {INP.16b}, [x2], #16
204+
.endif
205+
206+
cbnz w0, 0b
207+
208+
CPU_LE( rev x8, x8 )
209+
st1 {XL.2d}, [x1]
210+
str x8, [x5, #8] // store lower counter
211+
212+
.if \enc == 1
213+
st1 {KS.16b}, [x7]
214+
.endif
215+
216+
ret
217+
218+
2: b.eq 3f // AES-192?
219+
enc_round CTR, v17
220+
enc_round CTR, v18
221+
3: enc_round CTR, v19
222+
enc_round CTR, v20
223+
b 1b
224+
.endm
225+
226+
/*
227+
* void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
228+
* struct ghash_key const *k, u8 ctr[],
229+
* int rounds, u8 ks[])
230+
*/
231+
ENTRY(pmull_gcm_encrypt)
232+
pmull_gcm_do_crypt 1
233+
ENDPROC(pmull_gcm_encrypt)
234+
235+
/*
236+
* void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
237+
* struct ghash_key const *k, u8 ctr[],
238+
* int rounds)
239+
*/
240+
ENTRY(pmull_gcm_decrypt)
241+
pmull_gcm_do_crypt 0
242+
ENDPROC(pmull_gcm_decrypt)
243+
244+
/*
245+
* void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
246+
*/
247+
ENTRY(pmull_gcm_encrypt_block)
248+
cbz x2, 0f
249+
load_round_keys w3, x2
250+
0: ld1 {v0.16b}, [x1]
251+
enc_block v0, w3
252+
st1 {v0.16b}, [x0]
253+
ret
254+
ENDPROC(pmull_gcm_encrypt_block)

0 commit comments

Comments
 (0)