Skip to content

Commit

Permalink
[AArch64] Use m4 macros in gcm-hash.asm and add documentation comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mamonet committed Mar 21, 2021
1 parent 3f43c14 commit 03b8ba3
Showing 1 changed file with 113 additions and 107 deletions.
220 changes: 113 additions & 107 deletions arm64/crypto/gcm-hash.asm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
C arm/v8/gcm-hash.asm
C arm64/crypto/gcm-hash.asm

ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
Expand Down Expand Up @@ -38,30 +38,42 @@ ifelse(`
C gcm_set_key() assigns H value in the middle element of the table
define(`H_Idx', `128')

C common register usage:
C common SIMD register usage:
define(`POLY', `v6')
C temporary register that assists the reduction procedure
define(`T', `v7')
C permanent register that holds the 16-byte result of pmull
define(`F', `v16')
C permanent register that holds the 16-byte result of pmull2,
C its value is accumulated on 'F' register immediately
define(`F1', `v17')
C permanent register that holds the 16-byte result of pmull
define(`R', `v18')
C permanent register that holds the 16-byte result of pmull2,
C its value is accumulated on 'R' register immediately
define(`R1', `v19')

C common macros:
.macro PMUL in, param1, param2
pmull F.1q,\param2\().1d,\in\().1d
pmull2 F1.1q,\param2\().2d,\in\().2d
pmull R.1q,\param1\().1d,\in\().1d
pmull2 R1.1q,\param1\().2d,\in\().2d
C long multiply of six 64-bit polynomials and sum
C R = (in.l × param1.l) + (in.h × param1.h)
C F = (in.l × param2.l) + (in.h × param2.h)
C PMUL(in, param1, param2)
define(`PMUL', m4_assert_numargs(3)`
pmull F.1q,$3.1d,$1.1d
pmull2 F1.1q,$3.2d,$1.2d
pmull R.1q,$2.1d,$1.1d
pmull2 R1.1q,$2.2d,$1.2d
eor F.16b,F.16b,F1.16b
eor R.16b,R.16b,R1.16b
.endm

.macro REDUCTION out
')
C Reduce 'R' and 'F' values to 128-bit output
C REDUCTION(out)
define(`REDUCTION', m4_assert_numargs(1)`
pmull T.1q,F.1d,POLY.1d
eor R.16b,R.16b,T.16b
ext R.16b,R.16b,R.16b,#8
eor \out\().16b,F.16b,R.16b
.endm
eor $1.16b,F.16b,R.16b
')

C void gcm_init_key (union gcm_block *table)

Expand Down Expand Up @@ -101,13 +113,14 @@ define(`H3L', `v28')
define(`H4M', `v29')
define(`H4L', `v30')

.macro PMUL_PARAM in, param1, param2
pmull2 Hp.1q,\in\().2d,POLY.2d
eor Hm.16b,\in\().16b,Hp.16b
ext \param1\().16b,Hm.16b,\in\().16b,#8
ext \param2\().16b,\in\().16b,Hm.16b,#8
ext \param1\().16b,\param1\().16b,\param1\().16b,#8
.endm
C PMUL_PARAM(in, param1, param2)
define(`PMUL_PARAM', m4_assert_numargs(3)`
pmull2 Hp.1q,$1.2d,POLY.2d
eor Hm.16b,$1.16b,Hp.16b
ext $2.16b,Hm.16b,$1.16b,#8
ext $3.16b,$1.16b,Hm.16b,#8
ext $2.16b,$2.16b,$2.16b,#8
')

PROLOGUE(_nettle_gcm_init_key)
add x1,TABLE,#16*H_Idx
Expand All @@ -120,6 +133,8 @@ PROLOGUE(_nettle_gcm_init_key)
IF_LE(`
rev64 H.16b,H.16b
')
C --- calculate H = H × x mod R(X); R(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---

dup EMSB.16b,H.b[7]
mov x1,#0xC200000000000000
mov x2,#1
Expand All @@ -136,36 +151,36 @@ IF_LE(`

dup POLY.2d,POLY.d[0]

C --- calculate H^2 = H*H ---
C --- calculate H^2 = H × H ---

PMUL_PARAM H,H1M,H1L
PMUL_PARAM(H,H1M,H1L)

PMUL H,H1M,H1L
PMUL(H,H1M,H1L)

REDUCTION H2
REDUCTION(H2)

PMUL_PARAM H2,H2M,H2L
PMUL_PARAM(H2,H2M,H2L)

C we store to the table as doubleword-vectors in current memory endianness
C because it's our own strictly internal data structure and what gcm_hash
C can most naturally use
st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64

C --- calculate H^3 = H^1*H^2 ---
C --- calculate H^3 = H^1 × H^2 ---

PMUL H2,H1M,H1L
PMUL(H2,H1M,H1L)

REDUCTION H3
REDUCTION(H3)

PMUL_PARAM H3,H3M,H3L
PMUL_PARAM(H3,H3M,H3L)

C --- calculate H^4 = H^2*H^2 ---
C --- calculate H^4 = H^2 × H^2 ---

PMUL H2,H2M,H2L
PMUL(H2,H2M,H2L)

REDUCTION H4
REDUCTION(H4)

PMUL_PARAM H4,H4M,H4L
PMUL_PARAM(H4,H4M,H4L)

st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]

Expand All @@ -180,7 +195,6 @@ define(`DATA', `x3')

define(`D', `v0')
define(`C0', `v1')
define(`C0D', `d1')
define(`C1', `v2')
define(`C2', `v3')
define(`C3', `v4')
Expand All @@ -197,16 +211,52 @@ define(`H3L', `v29')
define(`H4M', `v30')
define(`H4L', `v31')

.macro PMUL_SUM in, param1, param2
pmull F2.1q,\param2\().1d,\in\().1d
pmull2 F3.1q,\param2\().2d,\in\().2d
pmull R2.1q,\param1\().1d,\in\().1d
pmull2 R3.1q,\param1\().2d,\in\().2d
C PMUL_SUM(in, param1, param2)
define(`PMUL_SUM', m4_assert_numargs(3)`
pmull F2.1q,$3.1d,$1.1d
pmull2 F3.1q,$3.2d,$1.2d
pmull R2.1q,$2.1d,$1.1d
pmull2 R3.1q,$2.2d,$1.2d
eor F2.16b,F2.16b,F3.16b
eor R2.16b,R2.16b,R3.16b
eor F.16b,F.16b,F2.16b
eor R.16b,R.16b,R2.16b
.endm
')

C LOAD_REV_PARTIAL_BLOCK(out)
C Load the final partial block, i.e. the trailing (LENGTH mod 16) bytes at
C DATA, into the SIMD register 'out', leaving zero in every byte position
C that is not read.
C
C Layout of the result: within each 64-bit half the first byte read from
C memory ends up in the most significant position. A leading group of
C 8 bytes (present when bit 3 of LENGTH is set) is fetched with a single
C ldr of the d view of 'out' and passed through rev64 inside IF_LE
C (presumably the little-endian-only path -- IF_LE is defined elsewhere);
C the remaining (LENGTH mod 8) bytes are read one at a time and shifted
C into place starting at bit 56.
C
C Precondition: (LENGTH mod 16) must be nonzero. The byte loop decrements
C its counter before testing it, so it must never be entered with a count
C of zero; the visible caller guards the expansion with 'tst LENGTH,#15'.
C
C Argument: 'out' is expected to be a vN register name; the d-register
C name is built by replacing its first character with 'd'.
C Clobbers: x4, x5, x6, x7 and the condition flags. DATA is advanced past
C every byte read. LENGTH is only read.
C The labels below are literal (not made unique per expansion), so the
C macro can be expanded only once per assembled file.
define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)`
C bit 3 of LENGTH clear: fewer than 8 trailing bytes, skip the 8-byte load
tbz LENGTH,3,Lless_8_bytes
C fetch 8 bytes through the d view of the output register
ldr `d'substr($1,1,len($1)),[DATA],#8
IF_LE(`
rev64 $1.16b,$1.16b
')
C clear the high doubleword
mov x7,#0
mov $1.d[1],x7
C exactly 8 trailing bytes: nothing left for the byte loop
tst LENGTH,#7
b.eq Lload_done
Lless_8_bytes:
C x6 = accumulated doubleword, x5 = current shift, x4 = bytes left to read
mov x6,#0
mov x5,#64
and x4,LENGTH,#7
Lload_byte_loop:
mov x7,#0
ldrb w7,[DATA],#1
C shift goes 56, 48, ... so the first byte lands in the top byte of x6
sub x5,x5,#8
lsl x7,x7,x5
orr x6,x6,x7
subs x4,x4,#1
b.ne Lload_byte_loop
C with a leading 8-byte group already in d[0], the tail goes to d[1]
tbz LENGTH,3,Lstore_hi_dw
mov $1.d[1],x6
b Lload_done
Lstore_hi_dw:
C fewer than 8 bytes in total: tail goes to d[0] and d[1] is zeroed
mov x7,#0
mov $1.d[0],x6
mov $1.d[1],x7
Lload_done:
')

C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C size_t length, const uint8_t *data)
Expand All @@ -221,13 +271,13 @@ IF_LE(`
')

ands x4,LENGTH,#-64
b.eq L2x
b.eq L1_block

add x5,TABLE,#64
ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]

L4x_loop:
L4_blocks_loop:
ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
IF_LE(`
rev64 C0.16b,C0.16b
Expand All @@ -238,98 +288,54 @@ IF_LE(`

eor C0.16b,C0.16b,D.16b

PMUL C1,H3M,H3L
PMUL_SUM C2,H2M,H2L
PMUL_SUM C3,H1M,H1L
PMUL_SUM C0,H4M,H4L
PMUL(C1,H3M,H3L)
PMUL_SUM(C2,H2M,H2L)
PMUL_SUM(C3,H1M,H1L)
PMUL_SUM(C0,H4M,H4L)

REDUCTION D
REDUCTION(D)

subs x4,x4,#64
b.ne L4x_loop
b.ne L4_blocks_loop

and LENGTH,LENGTH,#63

L2x:
tst LENGTH,#-32
b.eq L1x

ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]

ld1 {C0.2d,C1.2d},[DATA],#32
IF_LE(`
rev64 C0.16b,C0.16b
rev64 C1.16b,C1.16b
')

eor C0.16b,C0.16b,D.16b

PMUL C1,H1M,H1L
PMUL_SUM C0,H2M,H2L

REDUCTION D

and LENGTH,LENGTH,#31

L1x:
tst LENGTH,#-16
b.eq Lmod
L1_block:
ands x4,LENGTH,#-16
b.eq Lpartial

ld1 {H1M.2d,H1L.2d},[TABLE]

L1_block_loop:
ld1 {C0.2d},[DATA],#16
IF_LE(`
rev64 C0.16b,C0.16b
')

eor C0.16b,C0.16b,D.16b

PMUL C0,H1M,H1L
PMUL(C0,H1M,H1L)

REDUCTION(D)

REDUCTION D
subs x4,x4,#16
b.ne L1_block_loop

Lmod:
Lpartial:
tst LENGTH,#15
b.eq Ldone
b.eq Lghash_done

ld1 {H1M.2d,H1L.2d},[TABLE]
LOAD_REV_PARTIAL_BLOCK(C0)

tbz LENGTH,3,Lmod_8
ldr C0D,[DATA],#8
IF_LE(`
rev64 C0.16b,C0.16b
')
mov x7,#0
mov C0.d[1],x7
Lmod_8:
tst LENGTH,#7
b.eq Lmod_8_done
mov x6,#0
mov x5,#64
and x4,LENGTH,#7
Lmod_8_loop:
mov x7,#0
ldrb w7,[DATA],#1
sub x5,x5,#8
lsl x7,x7,x5
orr x6,x6,x7
subs x4,x4,#1
b.ne Lmod_8_loop
tbz LENGTH,3,Lmod_8_load
mov C0.d[1],x6
b Lmod_8_done
Lmod_8_load:
mov x7,#0
mov C0.d[0],x6
mov C0.d[1],x7
Lmod_8_done:
eor C0.16b,C0.16b,D.16b

PMUL C0,H1M,H1L
PMUL(C0,H1M,H1L)

REDUCTION D
REDUCTION(D)

Ldone:
Lghash_done:
IF_LE(`
rev64 D.16b,D.16b
')
Expand Down

0 comments on commit 03b8ba3

Please sign in to comment.