From 9d909cb67e70fd792926ac1e2ab305b2cc96bc27 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 15 Apr 2021 23:49:40 +0300 Subject: [PATCH] mpi/ec: add fast reduction functions for NIST curves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * configure.ac (ASM_DISABLED): New. * mpi/Makefile.am: Add 'ec-nist.c' and 'ec-inline.h'. * mpi/ec-nist.c: New. * mpi/ec-inline.h: New. * mpi/ec-internal.h (_gcry_mpi_ec_nist192_mod) (_gcry_mpi_ec_nist224_mod, _gcry_mpi_ec_nist256_mod) (_gcry_mpi_ec_nist384_mod, _gcry_mpi_ec_nist521_mod): New. * mpi/ec.c (ec_addm, ec_subm, ec_mulm, ec_mul2): Use 'ctx->mod'. (field_table): Add 'mod' function; Add NIST reduction functions. (ec_p_init): Setup ctx->mod; Setup function pointers from field_table only if pointer is not NULL; Resize ctx->a and ctx->b only if set. * mpi/mpi-internal.h (RESIZE_AND_CLEAR_IF_NEEDED): New. * mpi/mpiutil.c (_gcry_mpi_resize): Clear all unused limbs also in realloc case. * src/ec-context.h (mpi_ec_ctx_s): Add 'mod' function. -- Benchmark on AMD Ryzen 7 5800X (x86_64): Before: NIST-P192 | nanosecs/iter cycles/iter auto Mhz mult | 283346 1369473 4833 keygen | 1688442 8185744 4848 sign | 549683 2662984 4845 verify | 615284 2984325 4850 = NIST-P224 | nanosecs/iter cycles/iter auto Mhz mult | 516443 2501173 4843 keygen | 2859746 13866802 4849 sign | 918472 4455043 4850 verify | 1057940 5131372 4850 = NIST-P256 | nanosecs/iter cycles/iter auto Mhz mult | 423536 2054040 4850 keygen | 2383097 11557572 4850 sign | 774346 3754243 4848 verify | 864934 4196315 4852 = NIST-P384 | nanosecs/iter cycles/iter auto Mhz mult | 929985 4511881 4852 keygen | 5230788 25367299 4850 sign | 1671432 8109726 4852 verify | 1902729 9228568 4850 = NIST-P521 | nanosecs/iter cycles/iter auto Mhz mult | 2123546 10300952 4851 keygen | 12019340 58297774 4850 sign | 3886988 18853054 4850 verify | 4507885 21864015 4850 After: NIST-P192 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 186679 905603 4851 +51% keygen | 1161423 5623822 4842 +46% sign | 389531 1887557 4846 +41% verify | 412936 2000461 4844 +49% = NIST-P224 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 260621 1256327 4821 +99% keygen | 1557845 7531677 4835 +84% sign | 521678 2527083 4844 +76% verify | 554084 2677949 4833 +92% = NIST-P256 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 319045 1542061 4833 +33% keygen | 1834822 8898950 4850 +30% sign | 612866 2972630 4850 +26% verify | 664821 3222597 4847 +30% = NIST-P384 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 593894 2875260 4841 +57% keygen | 3526600 17089717 4846 +48% sign | 1178098 5710151 4847 +42% verify | 1260185 6107449 4846 +51% = NIST-P521 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 1160220 5621946 4846 +83% keygen | 6862975 33247351 4844 +75%ยด sign | 2287366 11096711 4851 +70% verify | 2455858 11888045 4841 +84% Benchmark on AMD Ryzen 7 5800X (i386): Before: NIST-P192 | nanosecs/iter cycles/iter auto Mhz mult | 648039 3143236 4850 keygen | 3554452 17244822 4852 sign | 1163173 5641932 4850 verify | 1300076 6305673 4850 = NIST-P224 | nanosecs/iter cycles/iter auto Mhz mult | 798607 3874405 4851 keygen | 4657604 22589864 4850 sign | 1515803 7352049 4850 verify | 1635470 7935373 4852 = NIST-P256 | nanosecs/iter cycles/iter auto Mhz mult | 927033 4496283 4850 keygen | 5313601 25771983 4850 sign | 1735795 8418514 4850 verify | 1945804 9438212 4851 = NIST-P384 | nanosecs/iter cycles/iter auto Mhz mult | 2301781 11164473 4850 keygen | 12856001 62353242 4850 sign | 4161041 
20180651 4850 verify | 4705961 22827478 4851 = NIST-P521 | nanosecs/iter cycles/iter auto Mhz mult | 6066635 29422721 4850 keygen | 32995868 160046407 4850 sign | 10503306 50945387 4850 verify | 12225252 59294323 4850 After: NIST-P192 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 413605 2007498 4854 +57% keygen | 2479429 12010926 4844 +44% sign | 825111 3997147 4844 +41% verify | 890206 4318723 4851 +46% = NIST-P224 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 551703 2676454 4851 +45% keygen | 3257022 15781844 4845 +43% sign | 1085678 5258894 4844 +40% verify | 1172195 5678499 4844 +40% = NIST-P256 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 720395 3497486 4855 +29% keygen | 4217758 20461257 4851 +26% sign | 1404350 6814131 4852 +24% verify | 1515136 7353955 4854 +28% = NIST-P384 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 1525742 7400771 4851 +51% keygen | 9046660 43877889 4850 +42% sign | 2974641 14408703 4844 +40% verify | 3265285 15834951 4849 +44% = NIST-P521 | nanosecs/iter cycles/iter auto Mhz speed-up mult | 3289348 15968678 4855 +84% keygen | 19354174 93873531 4850 +70% sign | 6351493 30830140 4854 +65% verify | 6979292 33854215 4851 +75% Signed-off-by: Jussi Kivilinna --- configure.ac | 3 + mpi/Makefile.am | 2 +- mpi/ec-inline.h | 1047 ++++++++++++++++++++++++++++++++++++++++++++ mpi/ec-internal.h | 16 + mpi/ec-nist.c | 795 +++++++++++++++++++++++++++++++++ mpi/ec.c | 90 +++- mpi/mpi-internal.h | 5 + mpi/mpiutil.c | 2 +- src/ec-context.h | 1 + 9 files changed, 1943 insertions(+), 18 deletions(-) create mode 100644 mpi/ec-inline.h create mode 100644 mpi/ec-nist.c diff --git a/configure.ac b/configure.ac index 37947ecbc..6fdca24ad 100644 --- a/configure.ac +++ b/configure.ac @@ -546,6 +546,9 @@ AC_ARG_ENABLE([asm], [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) +if test "$try_asm_modules" != yes ; then + AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure]) +fi # Implementation of the --enable-m-guard switch. AC_MSG_CHECKING([whether memory guard is requested]) diff --git a/mpi/Makefile.am b/mpi/Makefile.am index d06594e18..adb8e6f50 100644 --- a/mpi/Makefile.am +++ b/mpi/Makefile.am @@ -175,5 +175,5 @@ libmpi_la_SOURCES = longlong.h \ mpih-mul.c \ mpih-const-time.c \ mpiutil.c \ - ec.c ec-internal.h ec-ed25519.c + ec.c ec-internal.h ec-ed25519.c ec-nist.c ec-inline.h EXTRA_libmpi_la_SOURCES = asm-common-aarch64.h diff --git a/mpi/ec-inline.h b/mpi/ec-inline.h new file mode 100644 index 000000000..25c3b40d3 --- /dev/null +++ b/mpi/ec-inline.h @@ -0,0 +1,1047 @@ +/* ec-inline.h - EC inline addition/substraction helpers + * Copyright (C) 2021 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifndef GCRY_EC_INLINE_H +#define GCRY_EC_INLINE_H + +#include "mpi-internal.h" +#include "longlong.h" +#include "ec-context.h" +#include "../cipher/bithelp.h" +#include "../cipher/bufhelp.h" + + +#if BYTES_PER_MPI_LIMB == 8 + +/* 64-bit limb definitions for 64-bit architectures. */ + +#define LIMBS_PER_LIMB64 1 +#define LOAD64(x, pos) ((x)[pos]) +#define STORE64(x, pos, v) ((x)[pos] = (mpi_limb_t)(v)) +#define LIMB_TO64(v) ((mpi_limb_t)(v)) +#define LIMB_FROM64(v) ((mpi_limb_t)(v)) +#define HIBIT_LIMB64(v) ((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 1)) +#define HI32_LIMB64(v) (u32)((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 32)) +#define LO32_LIMB64(v) ((u32)(v)) +#define LIMB64_C(hi, lo) (((mpi_limb_t)(u32)(hi) << 32) | (u32)(lo)) +#define STORE64_COND(x, pos, mask1, val1, mask2, val2) \ + ((x)[(pos)] = ((mask1) & (val1)) | ((mask2) & (val2))) + +typedef mpi_limb_t mpi_limb64_t; + +static inline u32 +LOAD32(mpi_ptr_t x, unsigned int pos) +{ + unsigned int shr = (pos % 2) * 32; + return (x[pos / 2] >> shr); +} + +static inline mpi_limb64_t +LIMB64_HILO(u32 hi, u32 lo) +{ + mpi_limb64_t v = hi; + return (v << 32) | lo; +} + + +/* x86-64 addition/subtraction helpers. */ +#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 + +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("addq %8, %2\n" \ + "adcq %7, %1\n" \ + "adcq %6, %0\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B2)), \ + "1" ((mpi_limb_t)(B1)), \ + "2" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB3_LIMB64(A3, A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("subq %8, %2\n" \ + "sbbq %7, %1\n" \ + "sbbq %6, %0\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B2)), \ + "1" ((mpi_limb_t)(B1)), \ + "2" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("addq %11, %3\n" \ + "adcq %10, %2\n" \ + "adcq %9, %1\n" \ + "adcq %8, %0\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B3)), \ + "1" ((mpi_limb_t)(B2)), \ + "2" ((mpi_limb_t)(B1)), \ + "3" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C3)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("subq %11, %3\n" \ + "sbbq %10, %2\n" \ + "sbbq %9, %1\n" \ + "sbbq %8, %0\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B3)), \ + "1" ((mpi_limb_t)(B2)), \ + "2" ((mpi_limb_t)(B1)), \ + "3" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C3)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("addq %14, %4\n" \ + "adcq %13, %3\n" \ + "adcq %12, %2\n" \ + "adcq %11, %1\n" \ + "adcq %10, %0\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B4)), \ + "1" ((mpi_limb_t)(B3)), \ + "2" ((mpi_limb_t)(B2)), \ + "3" ((mpi_limb_t)(B1)), \ + "4" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C4)), \ + "g" ((mpi_limb_t)(C3)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, 
C3, C2, C1, C0) \ + __asm__ ("subq %14, %4\n" \ + "sbbq %13, %3\n" \ + "sbbq %12, %2\n" \ + "sbbq %11, %1\n" \ + "sbbq %10, %0\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B4)), \ + "1" ((mpi_limb_t)(B3)), \ + "2" ((mpi_limb_t)(B2)), \ + "3" ((mpi_limb_t)(B1)), \ + "4" ((mpi_limb_t)(B0)), \ + "g" ((mpi_limb_t)(C4)), \ + "g" ((mpi_limb_t)(C3)), \ + "g" ((mpi_limb_t)(C2)), \ + "g" ((mpi_limb_t)(C1)), \ + "g" ((mpi_limb_t)(C0)) \ + : "cc") + +#endif /* __x86_64__ */ + + +/* ARM AArch64 addition/subtraction helpers. */ +#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4 + +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("adds %2, %5, %8\n" \ + "adcs %1, %4, %7\n" \ + "adc %0, %3, %6\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("subs %2, %5, %8\n" \ + "sbcs %1, %4, %7\n" \ + "sbc %0, %3, %6\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("adds %3, %7, %11\n" \ + "adcs %2, %6, %10\n" \ + "adcs %1, %5, %9\n" \ + "adc %0, %4, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("subs %3, %7, %11\n" \ + "sbcs %2, %6, %10\n" \ + "sbcs %1, %5, %9\n" \ + "sbc %0, %4, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("adds %4, %9, %14\n" \ + "adcs %3, %8, %13\n" \ + "adcs %2, %7, %12\n" \ + "adcs %1, %6, %11\n" \ + "adc %0, %5, %10\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B4)), \ + "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("subs %4, %9, %14\n" \ + "sbcs %3, %8, %13\n" \ + "sbcs %2, %7, %12\n" \ + "sbcs %1, %6, %11\n" \ + "sbc %0, %5, %10\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B4)), \ + "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#endif /* 
__aarch64__ */ + + +/* PowerPC64 addition/subtraction helpers. */ +#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4 + +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("addc %2, %8, %5\n" \ + "adde %1, %7, %4\n" \ + "adde %0, %6, %3\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc", "r0") + +#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("subfc %2, %8, %5\n" \ + "subfe %1, %7, %4\n" \ + "subfe %0, %6, %3\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc", "r0") + +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("addc %3, %11, %7\n" \ + "adde %2, %10, %6\n" \ + "adde %1, %9, %5\n" \ + "adde %0, %8, %4\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("subfc %3, %11, %7\n" \ + "subfe %2, %10, %6\n" \ + "subfe %1, %9, %5\n" \ + "subfe %0, %8, %4\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("addc %4, %14, %9\n" \ + "adde %3, %13, %8\n" \ + "adde %2, %12, %7\n" \ + "adde %1, %11, %6\n" \ + "adde %0, %10, %5\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B4)), \ + "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("subfc %4, %14, %9\n" \ + "subfe %3, %13, %8\n" \ + "subfe %2, %12, %7\n" \ + "subfe %1, %11, %6\n" \ + "subfe %0, %10, %5\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B4)), \ + "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#endif /* __powerpc__ */ + + +/* s390x/zSeries addition/subtraction helpers. 
*/ +#if defined (__s390x__) && defined(HAVE_CPU_ARCH_S390X) && __GNUC__ >= 4 + +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("algr %2, %8\n" \ + "alcgr %1, %7\n" \ + "alcgr %0, %6\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B2)), \ + "1" ((mpi_limb_t)(B1)), \ + "2" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB3_LIMB64(A3, A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + __asm__ ("slgr %2, %8\n" \ + "slbgr %1, %7\n" \ + "slbgr %0, %6\n" \ + : "=r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B2)), \ + "1" ((mpi_limb_t)(B1)), \ + "2" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("algr %3, %11\n" \ + "alcgr %2, %10\n" \ + "alcgr %1, %9\n" \ + "alcgr %0, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B3)), \ + "1" ((mpi_limb_t)(B2)), \ + "2" ((mpi_limb_t)(B1)), \ + "3" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("slgr %3, %11\n" \ + "slbgr %2, %10\n" \ + "slbgr %1, %9\n" \ + "slbgr %0, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B3)), \ + "1" ((mpi_limb_t)(B2)), \ + "2" ((mpi_limb_t)(B1)), \ + "3" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("algr %4, %14\n" \ + "alcgr %3, %13\n" \ + "alcgr %2, %12\n" \ + "alcgr %1, %11\n" \ + "alcgr %0, %10\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B4)), \ + "1" ((mpi_limb_t)(B3)), \ + "2" ((mpi_limb_t)(B2)), \ + "3" ((mpi_limb_t)(B1)), \ + "4" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) \ + __asm__ ("slgr %4, %14\n" \ + "slbgr %3, %13\n" \ + "slbgr %2, %12\n" \ + "slbgr %1, %11\n" \ + "slbgr %0, %10\n" \ + : "=r" (A4), \ + "=&r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "0" ((mpi_limb_t)(B4)), \ + "1" ((mpi_limb_t)(B3)), \ + "2" ((mpi_limb_t)(B2)), \ + "3" ((mpi_limb_t)(B1)), \ + "4" ((mpi_limb_t)(B0)), \ + "r" ((mpi_limb_t)(C4)), \ + "r" ((mpi_limb_t)(C3)), \ + "r" ((mpi_limb_t)(C2)), \ + "r" ((mpi_limb_t)(C1)), \ + "r" ((mpi_limb_t)(C0)) \ + : "cc") + +#endif /* __s390x__ */ + + +/* Common 64-bit arch addition/subtraction macros. */ + +#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \ + add_ssaaaa(A1, A0, B1, B0, C1, C0) + +#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \ + sub_ddmmss(A1, A0, B1, B0, C1, C0) + +#endif /* BYTES_PER_MPI_LIMB == 8 */ + + +#if BYTES_PER_MPI_LIMB == 4 + +/* 64-bit limb definitions for 32-bit architectures. 
*/ + +#define LIMBS_PER_LIMB64 2 +#define LIMB_FROM64(v) ((v).lo) +#define HIBIT_LIMB64(v) ((v).hi >> (BITS_PER_MPI_LIMB - 1)) +#define HI32_LIMB64(v) ((v).hi) +#define LO32_LIMB64(v) ((v).lo) +#define LOAD32(x, pos) ((x)[pos]) +#define LIMB64_C(hi, lo) { (lo), (hi) } + +typedef struct +{ + mpi_limb_t lo; + mpi_limb_t hi; +} mpi_limb64_t; + +static inline mpi_limb64_t +LOAD64(const mpi_ptr_t x, unsigned int pos) +{ + mpi_limb64_t v; + v.lo = x[pos * 2 + 0]; + v.hi = x[pos * 2 + 1]; + return v; +} + +static inline void +STORE64(mpi_ptr_t x, unsigned int pos, mpi_limb64_t v) +{ + x[pos * 2 + 0] = v.lo; + x[pos * 2 + 1] = v.hi; +} + +static inline void +STORE64_COND(mpi_ptr_t x, unsigned int pos, mpi_limb_t mask1, + mpi_limb64_t val1, mpi_limb_t mask2, mpi_limb64_t val2) +{ + x[pos * 2 + 0] = (mask1 & val1.lo) | (mask2 & val2.lo); + x[pos * 2 + 1] = (mask1 & val1.hi) | (mask2 & val2.hi); +} + +static inline mpi_limb64_t +LIMB_TO64(mpi_limb_t x) +{ + mpi_limb64_t v; + v.lo = x; + v.hi = 0; + return v; +} + +static inline mpi_limb64_t +LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo) +{ + mpi_limb64_t v; + v.lo = lo; + v.hi = hi; + return v; +} + + +/* i386 addition/subtraction helpers. */ +#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 + +#define ADD4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \ + __asm__ ("addl %11, %3\n" \ + "adcl %10, %2\n" \ + "adcl %9, %1\n" \ + "adcl %8, %0\n" \ + : "=r" (a3), \ + "=&r" (a2), \ + "=&r" (a1), \ + "=&r" (a0) \ + : "0" ((mpi_limb_t)(b3)), \ + "1" ((mpi_limb_t)(b2)), \ + "2" ((mpi_limb_t)(b1)), \ + "3" ((mpi_limb_t)(b0)), \ + "g" ((mpi_limb_t)(c3)), \ + "g" ((mpi_limb_t)(c2)), \ + "g" ((mpi_limb_t)(c1)), \ + "g" ((mpi_limb_t)(c0)) \ + : "cc") + +#define ADD6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \ + c5, c4, c3, c2, c1, c0) do { \ + mpi_limb_t __carry6_32; \ + __asm__ ("addl %10, %3\n" \ + "adcl %9, %2\n" \ + "adcl %8, %1\n" \ + "sbbl %0, %0\n" \ + : "=r" (__carry6_32), \ + "=&r" (a2), \ + "=&r" (a1), \ + "=&r" (a0) \ + : "0" ((mpi_limb_t)(0)), \ + "1" ((mpi_limb_t)(b2)), \ + "2" ((mpi_limb_t)(b1)), \ + "3" ((mpi_limb_t)(b0)), \ + "g" ((mpi_limb_t)(c2)), \ + "g" ((mpi_limb_t)(c1)), \ + "g" ((mpi_limb_t)(c0)) \ + : "cc"); \ + __asm__ ("addl $1, %3\n" \ + "adcl %10, %2\n" \ + "adcl %9, %1\n" \ + "adcl %8, %0\n" \ + : "=r" (a5), \ + "=&r" (a4), \ + "=&r" (a3), \ + "=&r" (__carry6_32) \ + : "0" ((mpi_limb_t)(b5)), \ + "1" ((mpi_limb_t)(b4)), \ + "2" ((mpi_limb_t)(b3)), \ + "3" ((mpi_limb_t)(__carry6_32)), \ + "g" ((mpi_limb_t)(c5)), \ + "g" ((mpi_limb_t)(c4)), \ + "g" ((mpi_limb_t)(c3)) \ + : "cc"); \ + } while (0) + +#define SUB4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \ + __asm__ ("subl %11, %3\n" \ + "sbbl %10, %2\n" \ + "sbbl %9, %1\n" \ + "sbbl %8, %0\n" \ + : "=r" (a3), \ + "=&r" (a2), \ + "=&r" (a1), \ + "=&r" (a0) \ + : "0" ((mpi_limb_t)(b3)), \ + "1" ((mpi_limb_t)(b2)), \ + "2" ((mpi_limb_t)(b1)), \ + "3" ((mpi_limb_t)(b0)), \ + "g" ((mpi_limb_t)(c3)), \ + "g" ((mpi_limb_t)(c2)), \ + "g" ((mpi_limb_t)(c1)), \ + "g" ((mpi_limb_t)(c0)) \ + : "cc") + +#define SUB6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \ + c5, c4, c3, c2, c1, c0) do { \ + mpi_limb_t __borrow6_32; \ + __asm__ ("subl %10, %3\n" \ + "sbbl %9, %2\n" \ + "sbbl %8, %1\n" \ + "sbbl %0, %0\n" \ + : "=r" (__borrow6_32), \ + "=&r" (a2), \ + "=&r" (a1), \ + "=&r" (a0) \ + : "0" ((mpi_limb_t)(0)), \ + "1" ((mpi_limb_t)(b2)), \ + "2" ((mpi_limb_t)(b1)), \ + "3" ((mpi_limb_t)(b0)), \ + "g" ((mpi_limb_t)(c2)), \ + "g" 
((mpi_limb_t)(c1)), \ + "g" ((mpi_limb_t)(c0)) \ + : "cc"); \ + __asm__ ("addl $1, %3\n" \ + "sbbl %10, %2\n" \ + "sbbl %9, %1\n" \ + "sbbl %8, %0\n" \ + : "=r" (a5), \ + "=&r" (a4), \ + "=&r" (a3), \ + "=&r" (__borrow6_32) \ + : "0" ((mpi_limb_t)(b5)), \ + "1" ((mpi_limb_t)(b4)), \ + "2" ((mpi_limb_t)(b3)), \ + "3" ((mpi_limb_t)(__borrow6_32)), \ + "g" ((mpi_limb_t)(c5)), \ + "g" ((mpi_limb_t)(c4)), \ + "g" ((mpi_limb_t)(c3)) \ + : "cc"); \ + } while (0) + +#endif /* __i386__ */ + + +/* ARM addition/subtraction helpers. */ +#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS + +#define ADD4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("adds %3, %7, %11\n" \ + "adcs %2, %6, %10\n" \ + "adcs %1, %5, %9\n" \ + "adc %0, %4, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "Ir" ((mpi_limb_t)(C3)), \ + "Ir" ((mpi_limb_t)(C2)), \ + "Ir" ((mpi_limb_t)(C1)), \ + "Ir" ((mpi_limb_t)(C0)) \ + : "cc") + +#define ADD6_LIMB32(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \ + C5, C4, C3, C2, C1, C0) do { \ + mpi_limb_t __carry6_32; \ + __asm__ ("adds %3, %7, %10\n" \ + "adcs %2, %6, %9\n" \ + "adcs %1, %5, %8\n" \ + "adc %0, %4, %4\n" \ + : "=r" (__carry6_32), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(0)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "Ir" ((mpi_limb_t)(C2)), \ + "Ir" ((mpi_limb_t)(C1)), \ + "Ir" ((mpi_limb_t)(C0)) \ + : "cc"); \ + ADD4_LIMB32(A5, A4, A3, __carry6_32, B5, B4, B3, __carry6_32, \ + C5, C4, C3, 0xffffffffU); \ + } while (0) + +#define SUB4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \ + __asm__ ("subs %3, %7, %11\n" \ + "sbcs %2, %6, %10\n" \ + "sbcs %1, %5, %9\n" \ + "sbc %0, %4, %8\n" \ + : "=r" (A3), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(B3)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "Ir" ((mpi_limb_t)(C3)), \ + "Ir" ((mpi_limb_t)(C2)), \ + "Ir" ((mpi_limb_t)(C1)), \ + "Ir" ((mpi_limb_t)(C0)) \ + : "cc") + + +#define SUB6_LIMB32(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \ + C5, C4, C3, C2, C1, C0) do { \ + mpi_limb_t __borrow6_32; \ + __asm__ ("subs %3, %7, %10\n" \ + "sbcs %2, %6, %9\n" \ + "sbcs %1, %5, %8\n" \ + "sbc %0, %4, %4\n" \ + : "=r" (__borrow6_32), \ + "=&r" (A2), \ + "=&r" (A1), \ + "=&r" (A0) \ + : "r" ((mpi_limb_t)(0)), \ + "r" ((mpi_limb_t)(B2)), \ + "r" ((mpi_limb_t)(B1)), \ + "r" ((mpi_limb_t)(B0)), \ + "Ir" ((mpi_limb_t)(C2)), \ + "Ir" ((mpi_limb_t)(C1)), \ + "Ir" ((mpi_limb_t)(C0)) \ + : "cc"); \ + SUB4_LIMB32(A5, A4, A3, __borrow6_32, B5, B4, B3, 0, \ + C5, C4, C3, -__borrow6_32); \ + } while (0) + +#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ + + +/* Common 32-bit arch addition/subtraction macros. 
*/ + +#if defined(ADD4_LIMB32) +/* A[0..1] = B[0..1] + C[0..1] */ +#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \ + ADD4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \ + B1.hi, B1.lo, B0.hi, B0.lo, \ + C1.hi, C1.lo, C0.hi, C0.lo) +#else +/* A[0..1] = B[0..1] + C[0..1] */ +#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) do { \ + mpi_limb_t __carry2_0, __carry2_1; \ + add_ssaaaa(__carry2_0, A0.lo, 0, B0.lo, 0, C0.lo); \ + add_ssaaaa(__carry2_1, A0.hi, 0, B0.hi, 0, C0.hi); \ + add_ssaaaa(__carry2_1, A0.hi, __carry2_1, A0.hi, 0, __carry2_0); \ + add_ssaaaa(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \ + add_ssaaaa(A1.hi, A1.lo, A1.hi, A1.lo, 0, __carry2_1); \ + } while (0) +#endif + +#if defined(ADD6_LIMB32) +/* A[0..2] = B[0..2] + C[0..2] */ +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + ADD6_LIMB32(A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \ + B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \ + C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo) +#endif + +#if defined(ADD6_LIMB32) +/* A[0..3] = B[0..3] + C[0..3] */ +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \ + mpi_limb_t __carry4; \ + ADD6_LIMB32(__carry4, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \ + 0, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \ + 0, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo); \ + ADD4_LIMB32(A3.hi, A3.lo, A2.hi, __carry4, \ + B3.hi, B3.lo, B2.hi, __carry4, \ + C3.hi, C3.lo, C2.hi, 0xffffffffU); \ + } while (0) +#endif + +#if defined(SUB4_LIMB32) +/* A[0..1] = B[0..1] - C[0..1] */ +#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \ + SUB4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \ + B1.hi, B1.lo, B0.hi, B0.lo, \ + C1.hi, C1.lo, C0.hi, C0.lo) +#else +/* A[0..1] = B[0..1] - C[0..1] */ +#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) do { \ + mpi_limb_t __borrow2_0, __borrow2_1; \ + sub_ddmmss(__borrow2_0, A0.lo, 0, B0.lo, 0, C0.lo); \ + sub_ddmmss(__borrow2_1, A0.hi, 0, B0.hi, 0, C0.hi); \ + sub_ddmmss(__borrow2_1, A0.hi, __borrow2_1, A0.hi, 0, -__borrow2_0); \ + sub_ddmmss(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \ + sub_ddmmss(A1.hi, A1.lo, A1.hi, A1.lo, 0, -__borrow2_1); \ + } while (0) +#endif + +#if defined(SUB6_LIMB32) +/* A[0..2] = B[0..2] - C[0..2] */ +#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \ + SUB6_LIMB32(A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \ + B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \ + C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo) +#endif + +#if defined(SUB6_LIMB32) +/* A[0..3] = B[0..3] - C[0..3] */ +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \ + mpi_limb_t __borrow4; \ + SUB6_LIMB32(__borrow4, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \ + 0, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \ + 0, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo); \ + SUB4_LIMB32(A3.hi, A3.lo, A2.hi, __borrow4, \ + B3.hi, B3.lo, B2.hi, 0, \ + C3.hi, C3.lo, C2.hi, -__borrow4); \ + } while (0) +#endif + +#endif /* BYTES_PER_MPI_LIMB == 4 */ + + +/* Common definitions. */ +#define BITS_PER_MPI_LIMB64 (BITS_PER_MPI_LIMB * LIMBS_PER_LIMB64) +#define BYTES_PER_MPI_LIMB64 (BYTES_PER_MPI_LIMB * LIMBS_PER_LIMB64) + + +/* Common addition/subtraction macros. 
*/ + +#ifndef ADD3_LIMB64 +/* A[0..2] = B[0..2] + C[0..2] */ +#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \ + mpi_limb64_t __carry3; \ + ADD2_LIMB64(__carry3, A0, zero, B0, zero, C0); \ + ADD2_LIMB64(A2, A1, B2, B1, C2, C1); \ + ADD2_LIMB64(A2, A1, A2, A1, zero, __carry3); \ + } while (0) +#endif + +#ifndef ADD4_LIMB64 +/* A[0..3] = B[0..3] + C[0..3] */ +#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \ + mpi_limb64_t __carry4; \ + ADD3_LIMB64(__carry4, A1, A0, zero, B1, B0, zero, C1, C0); \ + ADD2_LIMB64(A3, A2, B3, B2, C3, C2); \ + ADD2_LIMB64(A3, A2, A3, A2, zero, __carry4); \ + } while (0) +#endif + +#ifndef ADD5_LIMB64 +/* A[0..4] = B[0..4] + C[0..4] */ +#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) do { \ + mpi_limb64_t __carry5; \ + ADD4_LIMB64(__carry5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \ + ADD2_LIMB64(A4, A3, B4, B3, C4, C3); \ + ADD2_LIMB64(A4, A3, A4, A3, zero, __carry5); \ + } while (0) +#endif + +#ifndef ADD7_LIMB64 +/* A[0..6] = B[0..6] + C[0..6] */ +#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \ + C6, C5, C4, C3, C2, C1, C0) do { \ + mpi_limb64_t __carry7; \ + ADD4_LIMB64(__carry7, A2, A1, A0, zero, B2, B1, B0, \ + zero, C2, C1, C0); \ + ADD5_LIMB64(A6, A5, A4, A3, __carry7, B6, B5, B4, B3, \ + __carry7, C6, C5, C4, C3, LIMB64_HILO(-1, -1)); \ + } while (0) +#endif + +#ifndef SUB3_LIMB64 +/* A[0..2] = B[0..2] - C[0..2] */ +#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \ + mpi_limb64_t __borrow3; \ + SUB2_LIMB64(__borrow3, A0, zero, B0, zero, C0); \ + SUB2_LIMB64(A2, A1, B2, B1, C2, C1); \ + SUB2_LIMB64(A2, A1, A2, A1, zero, LIMB_TO64(-LIMB_FROM64(__borrow3))); \ + } while (0) +#endif + +#ifndef SUB4_LIMB64 +/* A[0..3] = B[0..3] - C[0..3] */ +#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \ + mpi_limb64_t __borrow4; \ + SUB3_LIMB64(__borrow4, A1, A0, zero, B1, B0, zero, C1, C0); \ + SUB2_LIMB64(A3, A2, B3, B2, C3, C2); \ + SUB2_LIMB64(A3, A2, A3, A2, zero, LIMB_TO64(-LIMB_FROM64(__borrow4))); \ + } while (0) +#endif + +#ifndef SUB5_LIMB64 +/* A[0..4] = B[0..4] - C[0..4] */ +#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \ + C4, C3, C2, C1, C0) do { \ + mpi_limb64_t __borrow5; \ + SUB4_LIMB64(__borrow5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \ + SUB2_LIMB64(A4, A3, B4, B3, C4, C3); \ + SUB2_LIMB64(A4, A3, A4, A3, zero, LIMB_TO64(-LIMB_FROM64(__borrow5))); \ + } while (0) +#endif + +#ifndef SUB7_LIMB64 +/* A[0..6] = B[0..6] - C[0..6] */ +#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \ + C6, C5, C4, C3, C2, C1, C0) do { \ + mpi_limb64_t __borrow7; \ + SUB4_LIMB64(__borrow7, A2, A1, A0, zero, B2, B1, B0, \ + zero, C2, C1, C0); \ + SUB5_LIMB64(A6, A5, A4, A3, __borrow7, B6, B5, B4, B3, zero, \ + C6, C5, C4, C3, LIMB_TO64(-LIMB_FROM64(__borrow7))); \ + } while (0) +#endif + + +#if defined(WORDS_BIGENDIAN) || (BITS_PER_MPI_LIMB64 != BITS_PER_MPI_LIMB) +#define LOAD64_UNALIGNED(x, pos) \ + LIMB64_HILO(LOAD32(x, 2 * (pos) + 2), LOAD32(x, 2 * (pos) + 1)) +#else +#define LOAD64_UNALIGNED(x, pos) \ + buf_get_le64((const byte *)(&(x)[pos]) + 4) +#endif + + +/* Helper functions. */ + +static inline int +mpi_nbits_more_than (gcry_mpi_t w, unsigned int nbits) +{ + unsigned int nbits_nlimbs; + mpi_limb_t wlimb; + unsigned int n; + + nbits_nlimbs = (nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; + + /* Note: Assumes that 'w' is normalized. 
*/ + + if (w->nlimbs > nbits_nlimbs) + return 1; + if (w->nlimbs < nbits_nlimbs) + return 0; + if ((nbits % BITS_PER_MPI_LIMB) == 0) + return 0; + + wlimb = w->d[nbits_nlimbs - 1]; + if (wlimb == 0) + log_bug ("mpi_nbits_more_than: input mpi not normalized\n"); + + count_leading_zeros (n, wlimb); + + return (BITS_PER_MPI_LIMB - n) > (nbits % BITS_PER_MPI_LIMB); +} + +#endif /* GCRY_EC_INLINE_H */ diff --git a/mpi/ec-internal.h b/mpi/ec-internal.h index 759335aad..2296d55dc 100644 --- a/mpi/ec-internal.h +++ b/mpi/ec-internal.h @@ -20,6 +20,22 @@ #ifndef GCRY_EC_INTERNAL_H #define GCRY_EC_INTERNAL_H +#include + void _gcry_mpi_ec_ed25519_mod (gcry_mpi_t a); +#ifndef ASM_DISABLED +void _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx); +void _gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx); +void _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx); +void _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx); +void _gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx); +#else +# define _gcry_mpi_ec_nist192_mod NULL +# define _gcry_mpi_ec_nist224_mod NULL +# define _gcry_mpi_ec_nist256_mod NULL +# define _gcry_mpi_ec_nist384_mod NULL +# define _gcry_mpi_ec_nist521_mod NULL +#endif + #endif /*GCRY_EC_INTERNAL_H*/ diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c new file mode 100644 index 000000000..955d2b7c8 --- /dev/null +++ b/mpi/ec-nist.c @@ -0,0 +1,795 @@ +/* ec-nist.c - NIST optimized elliptic curve functions + * Copyright (C) 2021 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include + + +#ifndef ASM_DISABLED + + +#include "mpi-internal.h" +#include "longlong.h" +#include "g10lib.h" +#include "context.h" +#include "ec-context.h" +#include "ec-inline.h" + + +/* These variables are used to generate masks from conditional operation + * flag parameters. Use of volatile prevents compiler optimizations from + * converting AND-masking to conditional branches. */ +static volatile mpi_limb_t vzero = 0; +static volatile mpi_limb_t vone = 1; + + +static inline +void prefetch(const void *tab, size_t len) +{ + const volatile byte *vtab = tab; + + if (len > 0 * 64) + (void)vtab[0 * 64]; + if (len > 1 * 64) + (void)vtab[1 * 64]; + if (len > 2 * 64) + (void)vtab[2 * 64]; + if (len > 3 * 64) + (void)vtab[3 * 64]; + if (len > 4 * 64) + (void)vtab[4 * 64]; + if (len > 5 * 64) + (void)vtab[5 * 64]; + if (len > 6 * 64) + (void)vtab[6 * 64]; + if (len > 7 * 64) + (void)vtab[7 * 64]; + if (len > 8 * 64) + (void)vtab[8 * 64]; + if (len > 9 * 64) + (void)vtab[9 * 64]; + if (len > 10 * 64) + (void)vtab[10 * 64]; + (void)vtab[len - 1]; +} + + +/* Fast reduction routines for NIST curves. 
*/ + +void +_gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx) +{ + static const mpi_limb64_t p_mult[3][4] = + { + { /* P * 1 */ + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xfffffffeU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 2 */ + LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xfffffffdU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000001U) + }, + { /* P * 3 */ + LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xfffffffcU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000002U) + } + }; + const mpi_limb64_t zero = LIMB_TO64(0); + mpi_ptr_t wp; + mpi_ptr_t pp; + mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64; + mpi_limb64_t s[wsize + 1]; + mpi_limb64_t o[wsize + 1]; + mpi_limb_t mask1; + mpi_limb_t mask2; + int carry; + + MPN_NORMALIZE (w->d, w->nlimbs); + if (mpi_nbits_more_than (w, 2 * 192)) + log_bug ("W must be less than m^2\n"); + + RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64); + RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64); + + pp = ctx->p->d; + wp = w->d; + + prefetch (p_mult, sizeof(p_mult)); + + /* See "FIPS 186-4, D.2.1 Curve P-192". */ + + s[0] = LOAD64(wp, 3); + ADD3_LIMB64 (s[3], s[2], s[1], + zero, zero, LOAD64(wp, 3), + zero, LOAD64(wp, 4), LOAD64(wp, 4)); + + ADD4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + zero, LOAD64(wp, 5), LOAD64(wp, 5), LOAD64(wp, 5)); + + ADD4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + zero, LOAD64(wp, 2), LOAD64(wp, 1), LOAD64(wp, 0)); + + /* mod p: + * 's[3]' holds carry value (0..2). Subtract (carry + 1) * p. Result will be + * with in range -p...p. Handle result being negative with addition and + * conditional store. 
*/ + + carry = LO32_LIMB64(s[3]); + + SUB4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + p_mult[carry][3], p_mult[carry][2], + p_mult[carry][1], p_mult[carry][0]); + + ADD4_LIMB64 (o[3], o[2], o[1], o[0], + s[3], s[2], s[1], s[0], + zero, LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0)); + mask1 = vzero - (LO32_LIMB64(o[3]) >> 31); + mask2 = (LO32_LIMB64(o[3]) >> 31) - vone; + + STORE64_COND(wp, 0, mask2, o[0], mask1, s[0]); + STORE64_COND(wp, 1, mask2, o[1], mask1, s[1]); + STORE64_COND(wp, 2, mask2, o[2], mask1, s[2]); + + w->nlimbs = 192 / BITS_PER_MPI_LIMB; + MPN_NORMALIZE (wp, w->nlimbs); +} + +void +_gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx) +{ + static const mpi_limb64_t p_mult[5][4] = + { + { /* P * -1 */ + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000000U) + }, + { /* P * 0 */ + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 1 */ + LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU) + }, + { /* P * 2 */ + LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xffffffffU) + }, + { /* P * 3 */ + LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xffffffffU) + } + }; + const mpi_limb64_t zero = LIMB_TO64(0); + mpi_ptr_t wp; + mpi_ptr_t pp; + mpi_size_t wsize = (224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_size_t psize = ctx->p->nlimbs; + mpi_limb64_t s[wsize]; + mpi_limb64_t d[wsize]; + mpi_limb_t mask1; + mpi_limb_t mask2; + int carry; + + MPN_NORMALIZE (w->d, w->nlimbs); + if (mpi_nbits_more_than (w, 2 * 224)) + log_bug ("W must be less than m^2\n"); + + RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64); + RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64); + ctx->p->nlimbs = psize; + + pp = ctx->p->d; + wp = w->d; + + prefetch (p_mult, sizeof(p_mult)); + + /* See "FIPS 186-4, D.2.2 Curve P-224". */ + + /* "S1 + S2" with 64-bit limbs: + * [0:A10]:[ A9: A8]:[ A7:0]:[0:0] + * + [0:0]:[A13:A12]:[A11:0]:[0:0] + * => s[3]:s[2]:s[1]:s[0] + */ + s[0] = zero; + ADD3_LIMB64 (s[3], s[2], s[1], + LIMB64_HILO(0, LOAD32(wp, 10)), + LOAD64(wp, 8 / 2), + LIMB64_HILO(LOAD32(wp, 7), 0), + zero, + LOAD64(wp, 12 / 2), + LIMB64_HILO(LOAD32(wp, 11), 0)); + + /* "T + S1 + S2" */ + ADD4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + LIMB64_HILO(0, LOAD32(wp, 6)), + LOAD64(wp, 4 / 2), + LOAD64(wp, 2 / 2), + LOAD64(wp, 0 / 2)); + + /* "D1 + D2" with 64-bit limbs: + * [0:A13]:[A12:A11]:[A10: A9]:[ A8: A7] + * + [0:0]:[ 0: 0]:[ 0:A13]:[A12:A11] + * => d[3]:d[2]:d[1]:d[0] + */ + ADD4_LIMB64 (d[3], d[2], d[1], d[0], + LIMB64_HILO(0, LOAD32(wp, 13)), + LOAD64_UNALIGNED(wp, 11 / 2), + LOAD64_UNALIGNED(wp, 9 / 2), + LOAD64_UNALIGNED(wp, 7 / 2), + zero, + zero, + LIMB64_HILO(0, LOAD32(wp, 13)), + LOAD64_UNALIGNED(wp, 11 / 2)); + + /* "T + S1 + S2 - D1 - D2" */ + SUB4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + d[3], d[2], d[1], d[0]); + + /* mod p: + * Upper 32-bits of 's[3]' holds carry value (-2..2). + * Subtract (carry + 1) * p. Result will be with in range -p...p. + * Handle result being negative with addition and conditional store. 
*/ + + carry = HI32_LIMB64(s[3]); + + SUB4_LIMB64 (s[3], s[2], s[1], s[0], + s[3], s[2], s[1], s[0], + p_mult[carry + 2][3], p_mult[carry + 2][2], + p_mult[carry + 2][1], p_mult[carry + 2][0]); + + ADD4_LIMB64 (d[3], d[2], d[1], d[0], + s[3], s[2], s[1], s[0], + LOAD64(pp, 3), LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0)); + + mask1 = vzero - (HI32_LIMB64(d[3]) >> 31); + mask2 = (HI32_LIMB64(d[3]) >> 31) - vone; + + STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]); + STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]); + STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]); + STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]); + + w->nlimbs = wsize * LIMBS_PER_LIMB64; + MPN_NORMALIZE (wp, w->nlimbs); +} + +void +_gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx) +{ + static const mpi_limb64_t p_mult[11][5] = + { + { /* P * -3 */ + LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xfffffffcU), + LIMB64_C(0xffffffffU, 0xfffffffdU) + }, + { /* P * -2 */ + LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xfffffffdU), + LIMB64_C(0xffffffffU, 0xfffffffeU) + }, + { /* P * -1 */ + LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xfffffffeU), + LIMB64_C(0xffffffffU, 0xffffffffU) + }, + { /* P * 0 */ + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 1 */ + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000001U), + LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 2 */ + LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0x00000001U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffeU, 0x00000002U), + LIMB64_C(0x00000000U, 0x00000001U) + }, + { /* P * 3 */ + LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0x00000002U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffdU, 0x00000003U), + LIMB64_C(0x00000000U, 0x00000002U) + }, + { /* P * 4 */ + LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0x00000003U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffcU, 0x00000004U), + LIMB64_C(0x00000000U, 0x00000003U) + }, + { /* P * 5 */ + LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0x00000004U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffbU, 0x00000005U), + LIMB64_C(0x00000000U, 0x00000004U) + }, + { /* P * 6 */ + LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0x00000005U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffaU, 0x00000006U), + LIMB64_C(0x00000000U, 0x00000005U) + }, + { /* P * 7 */ + LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0x00000006U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffff9U, 0x00000007U), + LIMB64_C(0x00000000U, 0x00000006U) + } + }; + const mpi_limb64_t zero = LIMB_TO64(0); + mpi_ptr_t wp; + mpi_ptr_t pp; + mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_size_t psize = ctx->p->nlimbs; + mpi_limb64_t s[wsize + 1]; + mpi_limb64_t t[wsize + 1]; + mpi_limb64_t d[wsize + 1]; + mpi_limb_t mask1; + mpi_limb_t mask2; + int carry; + + MPN_NORMALIZE (w->d, w->nlimbs); + if (mpi_nbits_more_than (w, 2 * 256)) + log_bug ("W must be less than m^2\n"); + + RESIZE_AND_CLEAR_IF_NEEDED 
(w, wsize * 2 * LIMBS_PER_LIMB64); + RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64); + ctx->p->nlimbs = psize; + + pp = ctx->p->d; + wp = w->d; + + prefetch (p_mult, sizeof(p_mult)); + + /* See "FIPS 186-4, D.2.3 Curve P-256". */ + + /* "S1 + S2" with 64-bit limbs: + * [A15:A14]:[A13:A12]:[A11:0]:[0:0] + * + [0:A15]:[A14:A13]:[A12:0]:[0:0] + * => s[4]:s[3]:s[2]:s[1]:s[0] + */ + s[0] = zero; + ADD4_LIMB64 (s[4], s[3], s[2], s[1], + zero, + LOAD64(wp, 14 / 2), + LOAD64(wp, 12 / 2), + LIMB64_HILO(LOAD32(wp, 11), 0), + zero, + LIMB64_HILO(0, LOAD32(wp, 15)), + LOAD64_UNALIGNED(wp, 13 / 2), + LIMB64_HILO(LOAD32(wp, 12), 0)); + + /* "S3 + S4" with 64-bit limbs: + * [A15:A14]:[ 0: 0]:[ 0:A10]:[ A9:A8] + * + [A8:A13]:[A15:A14]:[A13:A11]:[A10:A9] + * => t[4]:t[3]:t[2]:t[1]:t[0] + */ + ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0], + zero, + LOAD64(wp, 14 / 2), + zero, + LIMB64_HILO(0, LOAD32(wp, 10)), + LOAD64(wp, 8 / 2), + zero, + LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 13)), + LOAD64(wp, 14 / 2), + LIMB64_HILO(LOAD32(wp, 13), LOAD32(wp, 11)), + LOAD64_UNALIGNED(wp, 9 / 2)); + + /* "2*S1 + 2*S2" */ + ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0]); + + /* "T + S3 + S4" */ + ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0], + t[4], t[3], t[2], t[1], t[0], + zero, + LOAD64(wp, 6 / 2), + LOAD64(wp, 4 / 2), + LOAD64(wp, 2 / 2), + LOAD64(wp, 0 / 2)); + + /* "2*S1 + 2*S2 - D3" with 64-bit limbs: + * s[4]: s[3]: s[2]: s[1]: s[0] + * - [A12:0]:[A10:A9]:[A8:A15]:[A14:A13] + * => s[4]:s[3]:s[2]:s[1]:s[0] + */ + SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0], + zero, + LIMB64_HILO(LOAD32(wp, 12), 0), + LOAD64_UNALIGNED(wp, 9 / 2), + LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 15)), + LOAD64_UNALIGNED(wp, 13 / 2)); + + /* "T + 2*S1 + 2*S2 + S3 + S4 - D3" */ + ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0], + t[4], t[3], t[2], t[1], t[0]); + + /* "D1 + D2" with 64-bit limbs: + * [0:A13]:[A12:A11] + [A15:A14]:[A13:A12] => d[2]:d[1]:d[0] + * [A10:A8] + [A11:A9] => d[4]:d[3] + */ + ADD3_LIMB64 (d[2], d[1], d[0], + zero, + LIMB64_HILO(0, LOAD32(wp, 13)), + LOAD64_UNALIGNED(wp, 11 / 2), + zero, + LOAD64(wp, 14 / 2), + LOAD64(wp, 12 / 2)); + ADD2_LIMB64 (d[4], d[3], + zero, LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 8)), + zero, LIMB64_HILO(LOAD32(wp, 11), LOAD32(wp, 9))); + + /* "D1 + D2 + D4" with 64-bit limbs: + * d[4]: d[3]: d[2]: d[1]: d[0] + * - [A13:0]:[A11:A10]:[A9:0]:[A15:A14] + * => d[4]:d[3]:d[2]:d[1]:d[0] + */ + ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0], + d[4], d[3], d[2], d[1], d[0], + zero, + LIMB64_HILO(LOAD32(wp, 13), 0), + LOAD64(wp, 10 / 2), + LIMB64_HILO(LOAD32(wp, 9), 0), + LOAD64(wp, 14 / 2)); + + /* "T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4" */ + SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0], + d[4], d[3], d[2], d[1], d[0]); + + /* mod p: + * 's[4]' holds carry value (-4..6). Subtract (carry + 1) * p. Result + * will be with in range -p...p. Handle result being negative with + * addition and conditional store. 
*/ + + carry = LO32_LIMB64(s[4]); + + SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0], + s[4], s[3], s[2], s[1], s[0], + p_mult[carry + 4][4], p_mult[carry + 4][3], + p_mult[carry + 4][2], p_mult[carry + 4][1], + p_mult[carry + 4][0]); + + ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0], + s[4], s[3], s[2], s[1], s[0], + zero, + LOAD64(pp, 3), LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0)); + + mask1 = vzero - (LO32_LIMB64(d[4]) >> 31); + mask2 = (LO32_LIMB64(d[4]) >> 31) - vone; + + STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]); + STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]); + STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]); + STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]); + + w->nlimbs = wsize * LIMBS_PER_LIMB64; + MPN_NORMALIZE (wp, w->nlimbs); +} + +void +_gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx) +{ + static const mpi_limb64_t p_mult[11][7] = + { + { /* P * -2 */ + LIMB64_C(0xfffffffeU, 0x00000002U), LIMB64_C(0x00000001U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffeU) + }, + { /* P * -1 */ + LIMB64_C(0xffffffffU, 0x00000001U), LIMB64_C(0x00000000U, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0xffffffffU, 0xffffffffU) + }, + { /* P * 0 */ + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U), + LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 1 */ + LIMB64_C(0x00000000U, 0xffffffffU), LIMB64_C(0xffffffffU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000000U) + }, + { /* P * 2 */ + LIMB64_C(0x00000001U, 0xfffffffeU), LIMB64_C(0xfffffffeU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000001U) + }, + { /* P * 3 */ + LIMB64_C(0x00000002U, 0xfffffffdU), LIMB64_C(0xfffffffdU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000002U) + }, + { /* P * 4 */ + LIMB64_C(0x00000003U, 0xfffffffcU), LIMB64_C(0xfffffffcU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000003U) + }, + { /* P * 5 */ + LIMB64_C(0x00000004U, 0xfffffffbU), LIMB64_C(0xfffffffbU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000004U) + }, + { /* P * 6 */ + LIMB64_C(0x00000005U, 0xfffffffaU), LIMB64_C(0xfffffffaU, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000005U) + }, + { /* P * 7 */ + LIMB64_C(0x00000006U, 0xfffffff9U), LIMB64_C(0xfffffff9U, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffff8U), LIMB64_C(0xffffffffU, 0xffffffffU), + 
LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000006U) + }, + { /* P * 8 */ + LIMB64_C(0x00000007U, 0xfffffff8U), LIMB64_C(0xfffffff8U, 0x00000000U), + LIMB64_C(0xffffffffU, 0xfffffff7U), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU), + LIMB64_C(0x00000000U, 0x00000007U) + }, + }; + const mpi_limb64_t zero = LIMB_TO64(0); + mpi_ptr_t wp; + mpi_ptr_t pp; + mpi_size_t wsize = (384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_size_t psize = ctx->p->nlimbs; +#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN) + mpi_limb_t wp_shr32[wsize * LIMBS_PER_LIMB64]; +#endif + mpi_limb64_t s[wsize + 1]; + mpi_limb64_t t[wsize + 1]; + mpi_limb64_t d[wsize + 1]; + mpi_limb64_t x[wsize + 1]; + mpi_limb_t mask1; + mpi_limb_t mask2; + int carry; + + MPN_NORMALIZE (w->d, w->nlimbs); + if (mpi_nbits_more_than (w, 2 * 384)) + log_bug ("W must be less than m^2\n"); + + RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64); + RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64); + ctx->p->nlimbs = psize; + + pp = ctx->p->d; + wp = w->d; + + prefetch (p_mult, sizeof(p_mult)); + + /* See "FIPS 186-4, D.2.4 Curve P-384". */ + +#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB +# ifdef WORDS_BIGENDIAN +# define LOAD64_SHR32(idx) LOAD64(wp_shr32, ((idx) / 2 - wsize)) + _gcry_mpih_rshift (wp_shr32, wp + 384 / BITS_PER_MPI_LIMB, + wsize * LIMBS_PER_LIMB64, 32); +# else +# define LOAD64_SHR32(idx) LOAD64_UNALIGNED(wp, idx / 2) +#endif +#else +# define LOAD64_SHR32(idx) LIMB64_HILO(LOAD32(wp, (idx) + 1), LOAD32(wp, idx)) +#endif + + /* "S1 + S1" with 64-bit limbs: + * [0:A23]:[A22:A21] + * + [0:A23]:[A22:A21] + * => s[3]:s[2] + */ + ADD2_LIMB64 (s[3], s[2], + LIMB64_HILO(0, LOAD32(wp, 23)), + LOAD64_SHR32(21), + LIMB64_HILO(0, LOAD32(wp, 23)), + LOAD64_SHR32(21)); + + /* "S5 + S6" with 64-bit limbs: + * [A23:A22]:[A21:A20]:[ 0:0]:[0: 0] + * + [ 0: 0]:[A23:A22]:[A21:0]:[0:A20] + * => x[4]:x[3]:x[2]:x[1]:x[0] + */ + x[0] = LIMB64_HILO(0, LOAD32(wp, 20)); + x[1] = LIMB64_HILO(LOAD32(wp, 21), 0); + ADD3_LIMB64 (x[4], x[3], x[2], + zero, LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), + zero, zero, LOAD64(wp, 22 / 2)); + + /* "D2 + D3" with 64-bit limbs: + * [0:A23]:[A22:A21]:[A20:0] + * + [0:A23]:[A23:0]:[0:0] + * => d[2]:d[1]:d[0] + */ + d[0] = LIMB64_HILO(LOAD32(wp, 20), 0); + ADD2_LIMB64 (d[2], d[1], + LIMB64_HILO(0, LOAD32(wp, 23)), + LOAD64_SHR32(21), + LIMB64_HILO(0, LOAD32(wp, 23)), + LIMB64_HILO(LOAD32(wp, 23), 0)); + + /* "2*S1 + S5 + S6" with 64-bit limbs: + * s[4]:s[3]:s[2]:s[1]:s[0] + * + x[4]:x[3]:x[2]:x[1]:x[0] + * => s[4]:s[3]:s[2]:s[1]:s[0] + */ + s[0] = x[0]; + s[1] = x[1]; + ADD3_LIMB64(s[4], s[3], s[2], + zero, s[3], s[2], + x[4], x[3], x[2]); + + /* "T + S2" with 64-bit limbs: + * [A11:A10]:[ A9: A8]:[ A7: A6]:[ A5: A4]:[ A3: A2]:[ A1: A0] + * + [A23:A22]:[A21:A20]:[A19:A18]:[A17:A16]:[A15:A14]:[A13:A12] + * => t[6]:t[5]:t[4]:t[3]:t[2]:t[1]:t[0] + */ + ADD7_LIMB64 (t[6], t[5], t[4], t[3], t[2], t[1], t[0], + zero, + LOAD64(wp, 10 / 2), LOAD64(wp, 8 / 2), LOAD64(wp, 6 / 2), + LOAD64(wp, 4 / 2), LOAD64(wp, 2 / 2), LOAD64(wp, 0 / 2), + zero, + LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), LOAD64(wp, 18 / 2), + LOAD64(wp, 16 / 2), LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2)); + + /* "2*S1 + S4 + S5 + S6" with 64-bit limbs: + * s[6]: s[5]: s[4]: s[3]: s[2]: s[1]: s[0] + * + [A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]:[A20:0]:[A23:0] + * => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0] 
+ */ + ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0], + zero, zero, s[4], s[3], s[2], s[1], s[0], + zero, + LOAD64(wp, 18 / 2), LOAD64(wp, 16 / 2), + LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2), + LIMB64_HILO(LOAD32(wp, 20), 0), + LIMB64_HILO(LOAD32(wp, 23), 0)); + + /* "D1 + D2 + D3" with 64-bit limbs: + * d[6]: d[5]: d[4]: d[3]: d[2]: d[1]: d[0] + * + [A22:A21]:[A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23] + * => d[6]:d[5]:d[4]:d[3]:d[2]:d[1]:d[0] + */ + ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0], + zero, zero, zero, zero, d[2], d[1], d[0], + zero, + LOAD64_SHR32(21), + LOAD64_SHR32(19), + LOAD64_SHR32(17), + LOAD64_SHR32(15), + LOAD64_SHR32(13), + LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23))); + + /* "2*S1 + S3 + S4 + S5 + S6" with 64-bit limbs: + * s[6]: s[5]: s[4]: s[3]: s[2]: s[1]: s[0] + * + [A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]:[A22:A21] + * => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0] + */ + ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0], + s[6], s[5], s[4], s[3], s[2], s[1], s[0], + zero, + LOAD64_SHR32(19), + LOAD64_SHR32(17), + LOAD64_SHR32(15), + LOAD64_SHR32(13), + LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)), + LOAD64_SHR32(21)); + + /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6" */ + ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0], + s[6], s[5], s[4], s[3], s[2], s[1], s[0], + t[6], t[5], t[4], t[3], t[2], t[1], t[0]); + + /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3" */ + SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0], + s[6], s[5], s[4], s[3], s[2], s[1], s[0], + d[6], d[5], d[4], d[3], d[2], d[1], d[0]); + +#undef LOAD64_SHR32 + + /* mod p: + * 's[6]' holds carry value (-3..7). Subtract (carry + 1) * p. Result + * will be with in range -p...p. Handle result being negative with + * addition and conditional store. */ + + carry = LO32_LIMB64(s[6]); + + SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0], + s[6], s[5], s[4], s[3], s[2], s[1], s[0], + p_mult[carry + 3][6], p_mult[carry + 3][5], + p_mult[carry + 3][4], p_mult[carry + 3][3], + p_mult[carry + 3][2], p_mult[carry + 3][1], + p_mult[carry + 3][0]); + + ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0], + s[6], s[5], s[4], s[3], s[2], s[1], s[0], + zero, + LOAD64(pp, 5), LOAD64(pp, 4), + LOAD64(pp, 3), LOAD64(pp, 2), + LOAD64(pp, 1), LOAD64(pp, 0)); + + mask1 = vzero - (LO32_LIMB64(d[6]) >> 31); + mask2 = (LO32_LIMB64(d[6]) >> 31) - vone; + + STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]); + STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]); + STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]); + STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]); + STORE64_COND(wp, 4, mask2, d[4], mask1, s[4]); + STORE64_COND(wp, 5, mask2, d[5], mask1, s[5]); + + w->nlimbs = wsize * LIMBS_PER_LIMB64; + MPN_NORMALIZE (wp, w->nlimbs); + +#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN) + wipememory(wp_shr32, sizeof(wp_shr32)); +#endif +} + +void +_gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx) +{ + mpi_size_t wsize = (521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; + mpi_limb_t s[wsize]; + mpi_limb_t cy; + mpi_ptr_t wp; + + MPN_NORMALIZE (w->d, w->nlimbs); + if (mpi_nbits_more_than (w, 2 * 521)) + log_bug ("W must be less than m^2\n"); + + RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2); + + wp = w->d; + + /* See "FIPS 186-4, D.2.5 Curve P-521". 
*/ + + _gcry_mpih_rshift (s, wp + wsize - 1, wsize, 521 % BITS_PER_MPI_LIMB); + s[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1; + wp[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1; + _gcry_mpih_add_n (wp, wp, s, wsize); + + /* "mod p" */ + cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize); + _gcry_mpih_add_n (s, wp, ctx->p->d, wsize); + mpih_set_cond (wp, s, wsize, (cy != 0UL)); + + w->nlimbs = wsize; + MPN_NORMALIZE (wp, w->nlimbs); +} + +#endif /* !ASM_DISABLED */ diff --git a/mpi/ec.c b/mpi/ec.c index 4fabf9b44..ae1d036a5 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -285,7 +285,7 @@ static void ec_addm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) { mpi_add (w, u, v); - ec_mod (w, ctx); + ctx->mod (w, ctx); } static void @@ -294,14 +294,14 @@ ec_subm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ec) mpi_sub (w, u, v); while (w->sign) mpi_add (w, w, ec->p); - /*ec_mod (w, ec);*/ + /*ctx->mod (w, ec);*/ } static void ec_mulm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx) { mpi_mul (w, u, v); - ec_mod (w, ctx); + ctx->mod (w, ctx); } /* W = 2 * U mod P. */ @@ -309,7 +309,7 @@ static void ec_mul2 (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx) { mpi_lshift (w, u, 1); - ec_mod (w, ctx); + ctx->mod (w, ctx); } static void @@ -585,6 +585,7 @@ struct field_table { void (* mulm) (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx); void (* mul2) (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx); void (* pow2) (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx); + void (* mod) (gcry_mpi_t w, mpi_ec_t ctx); }; static const struct field_table field_table[] = { @@ -594,7 +595,8 @@ static const struct field_table field_table[] = { ec_subm_25519, ec_mulm_25519, ec_mul2_25519, - ec_pow2_25519 + ec_pow2_25519, + NULL }, { "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE" @@ -603,7 +605,55 @@ static const struct field_table field_table[] = { ec_subm_448, ec_mulm_448, ec_mul2_448, - ec_pow2_448 + ec_pow2_448, + NULL + }, + { + "0xfffffffffffffffffffffffffffffffeffffffffffffffff", + NULL, + NULL, + NULL, + NULL, + NULL, + _gcry_mpi_ec_nist192_mod + }, + { + "0xffffffffffffffffffffffffffffffff000000000000000000000001", + NULL, + NULL, + NULL, + NULL, + NULL, + _gcry_mpi_ec_nist224_mod + }, + { + "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff", + NULL, + NULL, + NULL, + NULL, + NULL, + _gcry_mpi_ec_nist256_mod + }, + { + "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe" + "ffffffff0000000000000000ffffffff", + NULL, + NULL, + NULL, + NULL, + NULL, + _gcry_mpi_ec_nist384_mod + }, + { + "0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" + "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", + NULL, + NULL, + NULL, + NULL, + NULL, + _gcry_mpi_ec_nist521_mod }, { NULL, NULL, NULL, NULL, NULL, NULL }, }; @@ -757,6 +807,7 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model, ctx->mulm = ec_mulm; ctx->mul2 = ec_mul2; ctx->pow2 = ec_pow2; + ctx->mod = ec_mod; for (i=0; field_table[i].p; i++) { @@ -769,18 +820,25 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model, if (!mpi_cmp (p, f_p)) { - ctx->addm = field_table[i].addm; - ctx->subm = field_table[i].subm; - ctx->mulm = field_table[i].mulm; - ctx->mul2 = field_table[i].mul2; - ctx->pow2 = field_table[i].pow2; + ctx->addm = field_table[i].addm ? field_table[i].addm : ctx->addm; + ctx->subm = field_table[i].subm ? field_table[i].subm : ctx->subm; + ctx->mulm = field_table[i].mulm ? field_table[i].mulm : ctx->mulm; + ctx->mul2 = field_table[i].mul2 ? 
field_table[i].mul2 : ctx->mul2; + ctx->pow2 = field_table[i].pow2 ? field_table[i].pow2 : ctx->pow2; + ctx->mod = field_table[i].mod ? field_table[i].mod : ctx->mod; _gcry_mpi_release (f_p); - mpi_resize (ctx->a, ctx->p->nlimbs); - ctx->a->nlimbs = ctx->p->nlimbs; - - mpi_resize (ctx->b, ctx->p->nlimbs); - ctx->b->nlimbs = ctx->p->nlimbs; + if (ctx->a) + { + mpi_resize (ctx->a, ctx->p->nlimbs); + ctx->a->nlimbs = ctx->p->nlimbs; + } + + if (ctx->b) + { + mpi_resize (ctx->b, ctx->p->nlimbs); + ctx->b->nlimbs = ctx->p->nlimbs; + } for (i=0; i< DIM(ctx->t.scratch) && ctx->t.scratch[i]; i++) ctx->t.scratch[i]->nlimbs = ctx->p->nlimbs; diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h index 8ccdeada5..11fcbde44 100644 --- a/mpi/mpi-internal.h +++ b/mpi/mpi-internal.h @@ -79,6 +79,11 @@ typedef int mpi_size_t; /* (must be a signed type) */ if( (a)->alloced < (b) ) \ mpi_resize((a), (b)); \ } while(0) +#define RESIZE_AND_CLEAR_IF_NEEDED(a,b) \ + do { \ + if( (a)->nlimbs < (b) ) \ + mpi_resize((a), (b)); \ + } while(0) /* Copy N limbs from S to D. */ #define MPN_COPY( d, s, n) \ diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c index 5320f4d84..a5583c792 100644 --- a/mpi/mpiutil.c +++ b/mpi/mpiutil.c @@ -197,7 +197,7 @@ _gcry_mpi_resize (gcry_mpi_t a, unsigned nlimbs) if (a->d) { a->d = xrealloc (a->d, nlimbs * sizeof (mpi_limb_t)); - for (i=a->alloced; i < nlimbs; i++) + for (i=a->nlimbs; i < nlimbs; i++) a->d[i] = 0; } else diff --git a/src/ec-context.h b/src/ec-context.h index d1c648041..479862f6f 100644 --- a/src/ec-context.h +++ b/src/ec-context.h @@ -74,6 +74,7 @@ struct mpi_ec_ctx_s void (* mulm) (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx); void (* pow2) (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx); void (* mul2) (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx); + void (* mod) (gcry_mpi_t w, mpi_ec_t ctx); };
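
The P-521 routine above exploits the pseudo-Mersenne shape of the prime: because
2^521 is congruent to 1 modulo p, the upper 521 bits of the double-width value can
simply be folded (added) onto the lower 521 bits, leaving at most one conditional
correction by p. Below is a minimal stand-alone sketch of the same folding idea,
scaled down to the machine-word Mersenne prime 2^61 - 1 so it fits in plain C; it
is not part of the patch, the helper name mod_mersenne61 is made up for
illustration, and unsigned __int128 assumes GCC or Clang.

#include <stdint.h>
#include <stdio.h>

/* Reduce x modulo the Mersenne prime p = 2^61 - 1, for x < p*p.
 * Mirrors the folding in _gcry_mpi_ec_nist521_mod: split x at bit 61,
 * add the two halves (2^61 is congruent to 1 mod p), then correct by
 * subtracting p at most twice. */
static uint64_t
mod_mersenne61 (unsigned __int128 x)
{
  const uint64_t p = ((uint64_t)1 << 61) - 1;
  uint64_t lo = (uint64_t)x & p;        /* x mod 2^61 */
  uint64_t hi = (uint64_t)(x >> 61);    /* floor(x / 2^61), below 2^61 */
  uint64_t r = lo + hi;                 /* fold; no overflow since r < 2^62 */

  if (r >= p)
    r -= p;
  if (r >= p)
    r -= p;
  return r;
}

int
main (void)
{
  const uint64_t p = ((uint64_t)1 << 61) - 1;
  uint64_t a = p - 12345;
  uint64_t b = p - 67890;
  unsigned __int128 x = (unsigned __int128)a * b;  /* double-width product */

  /* (p - 12345) * (p - 67890) is congruent to 12345 * 67890 (mod p),
   * so both lines print the same value. */
  printf ("%llu\n", (unsigned long long) mod_mersenne61 (x));
  printf ("%llu\n", (unsigned long long) (12345ULL * 67890ULL));
  return 0;
}

The real routine performs the same split with _gcry_mpih_rshift and bit masks,
and replaces the final branches with a borrow-driven mpih_set_cond so the
correction does not leak through timing.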
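
From the caller's side nothing changes: once ec_p_init matches the curve prime
against field_table, ctx->mod points at the corresponding
_gcry_mpi_ec_nistXXX_mod routine and ec_addm/ec_mulm/ec_mul2 reduce through it,
on builds where the inline helpers are available (see the ASM_DISABLED guard).
The following is a rough usage sketch against the public libgcrypt API, assuming
a patched library is installed; build with something like
"gcc demo.c $(libgcrypt-config --cflags --libs)".

#include <stdio.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_ctx_t ctx;
  gcry_mpi_point_t G, Q;
  gcry_mpi_t k, qx, qy;
  unsigned char *hex;
  gcry_error_t err;

  if (!gcry_check_version (NULL))
    return 1;
  gcry_control (GCRYCTL_DISABLE_SECMEM, 0);
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  /* Field arithmetic behind this context goes through ctx->mulm and
   * ctx->mod, i.e. the NIST P-384 fast reduction on a patched build. */
  err = gcry_mpi_ec_new (&ctx, NULL, "NIST P-384");
  if (err)
    {
      fprintf (stderr, "gcry_mpi_ec_new: %s\n", gcry_strerror (err));
      return 1;
    }

  G = gcry_mpi_ec_get_point ("g", ctx, 1);  /* copy of the generator */
  Q = gcry_mpi_point_new (0);
  k = gcry_mpi_set_ui (NULL, 123456789);

  gcry_mpi_ec_mul (Q, k, G, ctx);           /* Q = [k]G */

  qx = gcry_mpi_new (0);
  qy = gcry_mpi_new (0);
  if (!gcry_mpi_ec_get_affine (qx, qy, Q, ctx)
      && !gcry_mpi_aprint (GCRYMPI_FMT_HEX, &hex, NULL, qx))
    {
      printf ("Q.x = %s\n", (char *) hex);
      gcry_free (hex);
    }

  gcry_mpi_release (qx);
  gcry_mpi_release (qy);
  gcry_mpi_release (k);
  gcry_mpi_point_release (Q);
  gcry_mpi_point_release (G);
  gcry_ctx_release (ctx);
  return 0;
}

gcry_mpi_ec_mul drives repeated ec_addm/ec_mulm calls internally, which is
exactly where the new per-curve reduction functions are exercised.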