|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
| 2 | +/* |
| 3 | + * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. |
| 4 | + */ |
| 5 | + |
| 6 | +#include <asm/asm.h> |
| 7 | +#include <asm/regdef.h> |
| 8 | +#include <linux/linkage.h> |
| 9 | + |
| 10 | +.text |
| 11 | + |
| 12 | +/* ChaCha quarter-round, updating a, b, c, d in place; rotri.w by 16/20/24/25 == rotate left by 16/12/8/7 */ |
| 13 | +.macro QR a b c d |
| 14 | + add.w \a, \a, \b |
| 15 | + xor \d, \d, \a |
| 16 | + rotri.w \d, \d, 16 |
| 17 | + |
| 18 | + add.w \c, \c, \d |
| 19 | + xor \b, \b, \c |
| 20 | + rotri.w \b, \b, 20 |
| 21 | + |
| 22 | + add.w \a, \a, \b |
| 23 | + xor \d, \d, \a |
| 24 | + rotri.w \d, \d, 24 |
| 25 | + |
| 26 | + add.w \c, \c, \d |
| 27 | + xor \b, \b, \c |
| 28 | + rotri.w \b, \b, 25 |
| 29 | +.endm |
| 30 | + |
| 31 | +/* |
| 32 | + * Very basic LoongArch implementation of ChaCha20. Produces a given positive |
| 33 | + * number of blocks of output with a nonce of 0, taking an input key and |
| 34 | + * 8-byte counter. Importantly does not spill to the stack. Its arguments |
| 35 | + * are: |
| 36 | + * |
| 37 | + * a0: output bytes |
| 38 | + * a1: 32-byte key input |
| 39 | + * a2: 8-byte counter input/output |
| 40 | + * a3: number of 64-byte blocks to write to output |
| 41 | + */ |
| 42 | +SYM_FUNC_START(__arch_chacha20_blocks_nostack) |
| 43 | + |
| 44 | +/* We don't need a frame pointer, so fp is reused as s9 */ |
| 45 | +#define s9 fp |
| 46 | + |
| 47 | +#define output a0 |
| 48 | +#define key a1 |
| 49 | +#define counter a2 |
| 50 | +#define nblocks a3 |
| 51 | +#define i a4 |
| 52 | +#define state0 s0 |
| 53 | +#define state1 s1 |
| 54 | +#define state2 s2 |
| 55 | +#define state3 s3 |
| 56 | +#define state4 s4 |
| 57 | +#define state5 s5 |
| 58 | +#define state6 s6 |
| 59 | +#define state7 s7 |
| 60 | +#define state8 s8 |
| 61 | +#define state9 s9 |
| 62 | +#define state10 a5 |
| 63 | +#define state11 a6 |
| 64 | +#define state12 a7 |
| 65 | +#define state13 t0 |
| 66 | +#define state14 t1 |
| 67 | +#define state15 t2 |
| 68 | +#define cnt_lo t3 |
| 69 | +#define cnt_hi t4 |
| 70 | +#define copy0 t5 |
| 71 | +#define copy1 t6 |
| 72 | +#define copy2 t7 |
| 73 | + |
| 74 | +/* Reuse i as copy3: i is only live inside the permute loop */ |
| 75 | +#define copy3 i |
| 76 | + |
| 77 | + /* |
| 78 | + * The ABI requires s0-s9 saved, and sp aligned to 16-byte. |
| 79 | + * This does not violate the stack-less requirement: no sensitive data |
| 80 | + * is spilled onto the stack. |
| 81 | + */ |
| 82 | + PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN |
| 83 | + REG_S s0, sp, 0 |
| 84 | + REG_S s1, sp, SZREG |
| 85 | + REG_S s2, sp, SZREG * 2 |
| 86 | + REG_S s3, sp, SZREG * 3 |
| 87 | + REG_S s4, sp, SZREG * 4 |
| 88 | + REG_S s5, sp, SZREG * 5 |
| 89 | + REG_S s6, sp, SZREG * 6 |
| 90 | + REG_S s7, sp, SZREG * 7 |
| 91 | + REG_S s8, sp, SZREG * 8 |
| 92 | + REG_S s9, sp, SZREG * 9 |
| 93 | + |
| 94 | + li.w copy0, 0x61707865 |
| 95 | + li.w copy1, 0x3320646e |
| 96 | + li.w copy2, 0x79622d32 |
| 97 | + |
| 98 | + ld.w cnt_lo, counter, 0 |
| 99 | + ld.w cnt_hi, counter, 4 |
| 100 | + |
| 101 | +.Lblock: |
| 102 | + /* state[0,1,2,3] = "expand 32-byte k" */ |
| 103 | + move state0, copy0 |
| 104 | + move state1, copy1 |
| 105 | + move state2, copy2 |
| 106 | + li.w state3, 0x6b206574 |
| 107 | + |
| 108 | + /* state[4,5,..,11] = key */ |
| 109 | + ld.w state4, key, 0 |
| 110 | + ld.w state5, key, 4 |
| 111 | + ld.w state6, key, 8 |
| 112 | + ld.w state7, key, 12 |
| 113 | + ld.w state8, key, 16 |
| 114 | + ld.w state9, key, 20 |
| 115 | + ld.w state10, key, 24 |
| 116 | + ld.w state11, key, 28 |
| 117 | + |
| 118 | + /* state[12,13] = counter */ |
| 119 | + move state12, cnt_lo |
| 120 | + move state13, cnt_hi |
| 121 | + |
| 122 | + /* state[14,15] = 0 */ |
| 123 | + move state14, zero |
| 124 | + move state15, zero |
| 125 | + |
| 126 | + li.w i, 10 |
| 127 | +.Lpermute: |
| 128 | + /* odd round */ |
| 129 | + QR state0, state4, state8, state12 |
| 130 | + QR state1, state5, state9, state13 |
| 131 | + QR state2, state6, state10, state14 |
| 132 | + QR state3, state7, state11, state15 |
| 133 | + |
| 134 | + /* even round */ |
| 135 | + QR state0, state5, state10, state15 |
| 136 | + QR state1, state6, state11, state12 |
| 137 | + QR state2, state7, state8, state13 |
| 138 | + QR state3, state4, state9, state14 |
| 139 | + |
| 140 | + addi.w i, i, -1 |
| 141 | + bnez i, .Lpermute |
| 142 | + |
| 143 | + /* |
| 144 | + * copy[3] = "te k", materialize it here because copy[3] shares the |
| 145 | + * same register with i which just became dead. |
| 146 | + */ |
| 147 | + li.w copy3, 0x6b206574 |
| 148 | + |
| 149 | + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ |
| 150 | + add.w state0, state0, copy0 |
| 151 | + add.w state1, state1, copy1 |
| 152 | + add.w state2, state2, copy2 |
| 153 | + add.w state3, state3, copy3 |
| 154 | + st.w state0, output, 0 |
| 155 | + st.w state1, output, 4 |
| 156 | + st.w state2, output, 8 |
| 157 | + st.w state3, output, 12 |
| 158 | + |
| 159 | + /* from now on state[0,1,2,3] are scratch registers */ |
| 160 | + |
| 161 | + /* state[0,1,2,3] = key words 0..3 (first 16 bytes of key) */ |
| 162 | + ld.w state0, key, 0 |
| 163 | + ld.w state1, key, 4 |
| 164 | + ld.w state2, key, 8 |
| 165 | + ld.w state3, key, 12 |
| 166 | + |
| 167 | + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ |
| 168 | + add.w state4, state4, state0 |
| 169 | + add.w state5, state5, state1 |
| 170 | + add.w state6, state6, state2 |
| 171 | + add.w state7, state7, state3 |
| 172 | + st.w state4, output, 16 |
| 173 | + st.w state5, output, 20 |
| 174 | + st.w state6, output, 24 |
| 175 | + st.w state7, output, 28 |
| 176 | + |
| 177 | + /* state[0,1,2,3] = key words 4..7 (last 16 bytes of key) */ |
| 178 | + ld.w state0, key, 16 |
| 179 | + ld.w state1, key, 20 |
| 180 | + ld.w state2, key, 24 |
| 181 | + ld.w state3, key, 28 |
| 182 | + |
| 183 | + /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */ |
| 184 | + add.w state8, state8, state0 |
| 185 | + add.w state9, state9, state1 |
| 186 | + add.w state10, state10, state2 |
| 187 | + add.w state11, state11, state3 |
| 188 | + st.w state8, output, 32 |
| 189 | + st.w state9, output, 36 |
| 190 | + st.w state10, output, 40 |
| 191 | + st.w state11, output, 44 |
| 192 | + |
| 193 | + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ |
| 194 | + add.w state12, state12, cnt_lo |
| 195 | + add.w state13, state13, cnt_hi |
| 196 | + st.w state12, output, 48 |
| 197 | + st.w state13, output, 52 |
| 198 | + st.w state14, output, 56 |
| 199 | + st.w state15, output, 60 |
| 200 | + |
| 201 | + /* ++counter: state0 (scratch) = 1 iff cnt_lo wrapped to 0, carried into cnt_hi */ |
| 202 | + addi.w cnt_lo, cnt_lo, 1 |
| 203 | + sltui state0, cnt_lo, 1 |
| 204 | + add.w cnt_hi, cnt_hi, state0 |
| 205 | + |
| 206 | + /* output += 64 */ |
| 207 | + PTR_ADDI output, output, 64 |
| 208 | + /* --nblocks */ |
| 209 | + PTR_ADDI nblocks, nblocks, -1 |
| 210 | + bnez nblocks, .Lblock |
| 211 | + |
| 212 | + /* counter = [cnt_lo, cnt_hi] */ |
| 213 | + st.w cnt_lo, counter, 0 |
| 214 | + st.w cnt_hi, counter, 4 |
| 215 | + |
| 216 | + /* |
| 217 | + * Zero out the potentially sensitive regs, in case nothing uses these |
| 218 | + * again. At this point copy[0,1,2,3] just contain "expand 32-byte k" and |
| 219 | + * state[0,...,9] are s0-s9, which we'll restore in the epilogue, so we |
| 220 | + * only need to zero state[10,...,15]. |
| 221 | + */ |
| 222 | + move state10, zero |
| 223 | + move state11, zero |
| 224 | + move state12, zero |
| 225 | + move state13, zero |
| 226 | + move state14, zero |
| 227 | + move state15, zero |
| 228 | + |
| 229 | + REG_L s0, sp, 0 |
| 230 | + REG_L s1, sp, SZREG |
| 231 | + REG_L s2, sp, SZREG * 2 |
| 232 | + REG_L s3, sp, SZREG * 3 |
| 233 | + REG_L s4, sp, SZREG * 4 |
| 234 | + REG_L s5, sp, SZREG * 5 |
| 235 | + REG_L s6, sp, SZREG * 6 |
| 236 | + REG_L s7, sp, SZREG * 7 |
| 237 | + REG_L s8, sp, SZREG * 8 |
| 238 | + REG_L s9, sp, SZREG * 9 |
| 239 | + PTR_ADDI sp, sp, -((-SZREG * 10) & STACK_ALIGN) |
| 240 | + |
| 241 | + jr ra |
| 242 | +SYM_FUNC_END(__arch_chacha20_blocks_nostack) |
0 commit comments