From 9164a1b6582e2fc833c760a3403d26b9b0b3b7b3 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sat, 28 Nov 2020 15:58:22 -0800 Subject: [PATCH] Optimization: special-case zero modulus limbs in modinv64 Both the field and scalar modulus can be written in signed{30,62} notation with one or more zero limbs. Make use of this in the update_de function to avoid a few wide multiplications when that is the case. This doesn't appear to be a win in the 32-bit implementation, so only do it for the 64-bit one. --- src/modinv64_impl.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index 3ab21cdc0255b..281cdb9117846 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -338,22 +338,28 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */ cd += (int128_t)u * d1 + (int128_t)v * e1; ce += (int128_t)q * d1 + (int128_t)r * e1; - cd += (int128_t)modinfo->modulus.v[1] * md; - ce += (int128_t)modinfo->modulus.v[1] * me; + if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[1] * md; + ce += (int128_t)modinfo->modulus.v[1] * me; + } d->v[0] = (int64_t)cd & M62; cd >>= 62; e->v[0] = (int64_t)ce & M62; ce >>= 62; /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */ cd += (int128_t)u * d2 + (int128_t)v * e2; ce += (int128_t)q * d2 + (int128_t)r * e2; - cd += (int128_t)modinfo->modulus.v[2] * md; - ce += (int128_t)modinfo->modulus.v[2] * me; + if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[2] * md; + ce += (int128_t)modinfo->modulus.v[2] * me; + } d->v[1] = (int64_t)cd & M62; cd >>= 62; e->v[1] = (int64_t)ce & M62; ce >>= 62; /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */ cd += (int128_t)u * d3 + (int128_t)v * e3; ce += (int128_t)q * d3 + (int128_t)r * e3; - cd += (int128_t)modinfo->modulus.v[3] * md; - ce += (int128_t)modinfo->modulus.v[3] * me; + if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[3] * md; + ce += (int128_t)modinfo->modulus.v[3] * me; + } d->v[2] = (int64_t)cd & M62; cd >>= 62; e->v[2] = (int64_t)ce & M62; ce >>= 62; /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */