From 7f7a2ed3a83aa778118dd36de04ee8ee1b42f48e Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Thu, 20 Sep 2018 22:24:57 +0000
Subject: [PATCH 1/6] ecmult_gen_impl: eliminate scratch memory used when
 generating context

---
 src/bench_ecmult.c    |  2 +-
 src/ecmult_gen_impl.h |  2 +-
 src/group.h           |  2 +-
 src/group_impl.h      | 39 +++++++++++++++++++++++++++------------
 src/tests.c           |  2 +-
 5 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c
index 52d0476a30ffb..c96f7fb6e43ab 100644
--- a/src/bench_ecmult.c
+++ b/src/bench_ecmult.c
@@ -172,7 +172,7 @@ int main(int argc, char **argv) {
             secp256k1_scalar_add(&data.seckeys[i], &data.seckeys[i - 1], &data.seckeys[i - 1]);
         }
     }
-    secp256k1_ge_set_all_gej_var(data.pubkeys, pubkeys_gej, POINTS, &data.ctx->error_callback);
+    secp256k1_ge_set_all_gej_var(data.pubkeys, pubkeys_gej, POINTS);
     free(pubkeys_gej);
 
     for (i = 1; i <= 8; ++i) {
diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index 714f02e94c981..d64505dc00107 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -77,7 +77,7 @@ static void secp256k1_ecmult_gen_context_build(secp256k1_ecmult_gen_context *ctx
                 secp256k1_gej_add_var(&numsbase, &numsbase, &nums_gej, NULL);
             }
         }
-        secp256k1_ge_set_all_gej_var(prec, precj, 1024, cb);
+        secp256k1_ge_set_all_gej_var(prec, precj, 1024);
     }
     for (j = 0; j < 64; j++) {
         for (i = 0; i < 16; i++) {
diff --git a/src/group.h b/src/group.h
index 3947ea2ddafa3..0911df2cb51e6 100644
--- a/src/group.h
+++ b/src/group.h
@@ -65,7 +65,7 @@ static void secp256k1_ge_neg(secp256k1_ge *r, const secp256k1_ge *a);
 static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a);
 
 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
-static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len, const secp256k1_callback *cb);
+static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len);
 
 /** Set a batch of group elements equal to the inputs given in jacobian
  *  coordinates (with known z-ratios). zr must contain the known z-ratios such
diff --git a/src/group_impl.h b/src/group_impl.h
index b1ace87b6ffd0..006a4548876a5 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -126,30 +126,45 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) {
     r->y = a->y;
 }
 
-static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len, const secp256k1_callback *cb) {
-    secp256k1_fe *az;
-    secp256k1_fe *azi;
+static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len) {
+    secp256k1_fe u;
     size_t i;
-    size_t count = 0;
-    az = (secp256k1_fe *)checked_malloc(cb, sizeof(secp256k1_fe) * len);
+    size_t last_i = SIZE_MAX;
+
     for (i = 0; i < len; i++) {
         if (!a[i].infinity) {
-            az[count++] = a[i].z;
+            /* Use destination's x coordinates as scratch space */
+            if (last_i == SIZE_MAX) {
+                r[i].x = a[i].z;
+            } else {
+                secp256k1_fe_mul(&r[i].x, &r[last_i].x, &a[i].z);
+            }
+            last_i = i;
         }
     }
+    if (last_i == SIZE_MAX) {
+        return;
+    }
+    secp256k1_fe_inv_var(&u, &r[last_i].x);
 
-    azi = (secp256k1_fe *)checked_malloc(cb, sizeof(secp256k1_fe) * count);
-    secp256k1_fe_inv_all_var(azi, az, count);
-    free(az);
+    i = last_i;
+    while (i > 0) {
+        i--;
+        if (!a[i].infinity) {
+            secp256k1_fe_mul(&r[last_i].x, &r[i].x, &u);
+            secp256k1_fe_mul(&u, &u, &a[last_i].z);
+            last_i = i;
+        }
+    }
+    VERIFY_CHECK(!a[last_i].infinity);
+    r[last_i].x = u;
 
-    count = 0;
     for (i = 0; i < len; i++) {
         r[i].infinity = a[i].infinity;
         if (!a[i].infinity) {
-            secp256k1_ge_set_gej_zinv(&r[i], &a[i], &azi[count++]);
+            secp256k1_ge_set_gej_zinv(&r[i], &a[i], &r[i].x);
         }
     }
-    free(azi);
 }
 
 static void secp256k1_ge_set_table_gej_var(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zr, size_t len) {
diff --git a/src/tests.c b/src/tests.c
index c72a742d87f8d..589cf85e1844f 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -2104,7 +2104,7 @@ void test_ge(void) {
             }
         }
         secp256k1_ge_set_table_gej_var(ge_set_table, gej, zr, 4 * runs + 1);
-        secp256k1_ge_set_all_gej_var(ge_set_all, gej, 4 * runs + 1, &ctx->error_callback);
+        secp256k1_ge_set_all_gej_var(ge_set_all, gej, 4 * runs + 1);
         for (i = 0; i < 4 * runs + 1; i++) {
             secp256k1_fe s;
             random_fe_non_zero(&s);

From 47045270fa90f81205d989f7107769bce1e71c4d Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Thu, 20 Sep 2018 23:34:02 +0000
Subject: [PATCH 2/6] ecmult_impl: eliminate scratch memory used when
 generating context

---
 src/ecmult_impl.h | 115 +++++++++++++++++++++++++++++++++++++++-------
 src/group.h       |   5 --
 src/group_impl.h  |  18 --------
 src/tests.c       |   4 --
 4 files changed, 99 insertions(+), 43 deletions(-)

diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index d5fb6c5b61dd2..74c350fcde902 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -137,24 +137,107 @@ static void secp256k1_ecmult_odd_multiples_table_globalz_windowa(secp256k1_ge *p
     secp256k1_ge_globalz_set_table_gej(ECMULT_TABLE_SIZE(WINDOW_A), pre, globalz, prej, zr);
 }
 
-static void secp256k1_ecmult_odd_multiples_table_storage_var(int n, secp256k1_ge_storage *pre, const secp256k1_gej *a, const secp256k1_callback *cb) {
-    secp256k1_gej *prej = (secp256k1_gej*)checked_malloc(cb, sizeof(secp256k1_gej) * n);
-    secp256k1_ge *prea = (secp256k1_ge*)checked_malloc(cb, sizeof(secp256k1_ge) * n);
-    secp256k1_fe *zr = (secp256k1_fe*)checked_malloc(cb, sizeof(secp256k1_fe) * n);
+static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp256k1_ge_storage *pre, const secp256k1_gej *a) {
+    secp256k1_gej d;
+    secp256k1_ge a_ge, d_ge, p_ge;
+    secp256k1_ge last_ge;
+    secp256k1_gej pj;
+    secp256k1_fe zi;
+    secp256k1_fe zr;
+    secp256k1_fe dx_over_dz_squared;
     int i;
 
-    /* Compute the odd multiples in Jacobian form. */
-    secp256k1_ecmult_odd_multiples_table(n, prej, zr, a);
-    /* Convert them in batch to affine coordinates. */
-    secp256k1_ge_set_table_gej_var(prea, prej, zr, n);
-    /* Convert them to compact storage form. */
-    for (i = 0; i < n; i++) {
-        secp256k1_ge_to_storage(&pre[i], &prea[i]);
+    VERIFY_CHECK(!a->infinity);
+
+    secp256k1_gej_double_var(&d, a, NULL);
+
+    /* First, we perform all the additions in an isomorphic curve obtained by multiplying
+     * all `z` coordinates by 1/`d.z`. In these coordinates `d` is affine so we can use
+     * `secp256k1_gej_add_ge_var` to perform the additions. For each addition, we store
+     * the resulting y-coordinate and the z-ratio, since we only have enough memory to
+     * store two field elements. These are sufficient to efficiently undo the isomorphism
+     * and recompute all the `x`s.
+     */
+    d_ge.x = d.x;
+    d_ge.y = d.y;
+    d_ge.infinity = 0;
+
+    secp256k1_ge_set_gej_zinv(&a_ge, a, &d.z);
+    pj.x = a_ge.x;
+    pj.y = a_ge.y;
+    pj.z = a->z;
+    pj.infinity = 0;
+
+    zr = d.z;
+    secp256k1_fe_normalize_var(&zr);
+    secp256k1_fe_to_storage(&pre[0].x, &zr);
+    secp256k1_fe_normalize_var(&pj.y);
+    secp256k1_fe_to_storage(&pre[0].y, &pj.y);
+
+    for (i = 1; i < n; i++) {
+        secp256k1_gej_add_ge_var(&pj, &pj, &d_ge, &zr);
+        secp256k1_fe_normalize_var(&zr);
+        secp256k1_fe_to_storage(&pre[i].x, &zr);
+        secp256k1_fe_normalize_var(&pj.y);
+        secp256k1_fe_to_storage(&pre[i].y, &pj.y);
     }
 
-    free(prea);
-    free(prej);
-    free(zr);
+    /* Map `pj` back to our curve by multiplying its z-coordinate by `d.z`. */
+    secp256k1_fe_mul(&pj.z, &pj.z, &d.z);
+    /* Directly set `pre[n - 1]` to `pj`, saving the inverted z-coordinate so
+     * that we can combine it with the saved z-ratios to compute the other zs
+     * without any more inversions. */
+    secp256k1_fe_inv_var(&zi, &pj.z);
+    secp256k1_ge_set_gej_zinv(&p_ge, &pj, &zi);
+    secp256k1_ge_from_storage(&last_ge, &pre[n - 1]);
+    secp256k1_ge_to_storage(&pre[n - 1], &p_ge);
+
+    /* Compute the actual x-coordinate of D, which will be needed below. */
+    secp256k1_fe_inv_var(&d.z, &d.z);
+    secp256k1_fe_sqr(&dx_over_dz_squared, &d.z);
+    secp256k1_fe_mul(&dx_over_dz_squared, &dx_over_dz_squared, &d.x);
+
+    i = n - 1;
+    while (i > 0) {
+        secp256k1_fe zi2, zi3;
+        i--;
+        /* For the remaining points, we extract the z-ratio from the stored
+         * x-coordinate, compute its z^-1 from that, and compute the full
+         * point from that. The z-ratio for the next iteration is stored in
+         * the x-coordinate at the end of the loop. */
+        secp256k1_fe_mul(&zi, &zi, &last_ge.x);
+        secp256k1_fe_sqr(&zi2, &zi);
+        secp256k1_fe_mul(&zi3, &zi2, &zi);
+        /* To compute the actual x-coordinate, we use the stored z ratio and
+         * y-coordinate, which we obtained from `secp256k1_gej_add_ge_var`
+         * in the loop above, as well as the inverse of the square of its
+         * z-coordinate. We store the latter in the `zi2` variable, which is
+         * computed iteratively starting from the overall Z inverse then
+         * multiplying by each z-ratio in turn.
+         *
+         * Denoting the z-ratio as `rzr` (though the actual variable binding
+         * is `last_ge.x`), we observe that it equal to `h` from the inside
+         * of the above `gej_add_ge_var` call. This satisfies
+         *
+         *    rzr = d_x * z^2 - x
+         *
+         * where `d_x` is the x coordinate of `D` and `(x, z)` are Jacobian
+         * coordinates of our desired point.
+         *
+         * Rearranging and dividing by `z^2` to convert to affine, we get
+         *
+         *     x = d_x - rzr / z^2
+         *       = d_x - rzr * zi2
+         */
+        secp256k1_fe_mul(&p_ge.x, &last_ge.x, &zi2);
+        secp256k1_fe_negate(&p_ge.x, &p_ge.x, 1);
+        secp256k1_fe_add(&p_ge.x, &dx_over_dz_squared);
+        /* y is stored_y/z^3, as we expect */
+        secp256k1_ge_from_storage(&last_ge, &pre[i]);
+        secp256k1_fe_mul(&p_ge.y, &last_ge.y, &zi3);
+        /* Store */
+        secp256k1_ge_to_storage(&pre[i], &p_ge);
+    }
 }
 
 /** The following two macro retrieves a particular odd multiple from a table
@@ -202,7 +285,7 @@ static void secp256k1_ecmult_context_build(secp256k1_ecmult_context *ctx, const
     ctx->pre_g = (secp256k1_ge_storage (*)[])checked_malloc(cb, sizeof((*ctx->pre_g)[0]) * ECMULT_TABLE_SIZE(WINDOW_G));
 
     /* precompute the tables with odd multiples */
-    secp256k1_ecmult_odd_multiples_table_storage_var(ECMULT_TABLE_SIZE(WINDOW_G), *ctx->pre_g, &gj, cb);
+    secp256k1_ecmult_odd_multiples_table_storage_var(ECMULT_TABLE_SIZE(WINDOW_G), *ctx->pre_g, &gj);
 
 #ifdef USE_ENDOMORPHISM
     {
@@ -216,7 +299,7 @@ static void secp256k1_ecmult_context_build(secp256k1_ecmult_context *ctx, const
         for (i = 0; i < 128; i++) {
             secp256k1_gej_double_var(&g_128j, &g_128j, NULL);
         }
-        secp256k1_ecmult_odd_multiples_table_storage_var(ECMULT_TABLE_SIZE(WINDOW_G), *ctx->pre_g_128, &g_128j, cb);
+        secp256k1_ecmult_odd_multiples_table_storage_var(ECMULT_TABLE_SIZE(WINDOW_G), *ctx->pre_g_128, &g_128j);
     }
 #endif
 }
diff --git a/src/group.h b/src/group.h
index 0911df2cb51e6..8e122ab429c56 100644
--- a/src/group.h
+++ b/src/group.h
@@ -67,11 +67,6 @@ static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a);
 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
 static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len);
 
-/** Set a batch of group elements equal to the inputs given in jacobian
- *  coordinates (with known z-ratios). zr must contain the known z-ratios such
- *  that mul(a[i].z, zr[i+1]) == a[i+1].z. zr[0] is ignored. */
-static void secp256k1_ge_set_table_gej_var(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zr, size_t len);
-
 /** Bring a batch inputs given in jacobian coordinates (with known z-ratios) to
  *  the same global z "denominator". zr must contain the known z-ratios such
  *  that mul(a[i].z, zr[i+1]) == a[i+1].z. zr[0] is ignored. The x and y
diff --git a/src/group_impl.h b/src/group_impl.h
index 006a4548876a5..5caf421b5e182 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -167,24 +167,6 @@ static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a
     }
 }
 
-static void secp256k1_ge_set_table_gej_var(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zr, size_t len) {
-    size_t i = len - 1;
-    secp256k1_fe zi;
-
-    if (len > 0) {
-        /* Compute the inverse of the last z coordinate, and use it to compute the last affine output. */
-        secp256k1_fe_inv(&zi, &a[i].z);
-        secp256k1_ge_set_gej_zinv(&r[i], &a[i], &zi);
-
-        /* Work out way backwards, using the z-ratios to scale the x/y values. */
-        while (i > 0) {
-            secp256k1_fe_mul(&zi, &zi, &zr[i]);
-            i--;
-            secp256k1_ge_set_gej_zinv(&r[i], &a[i], &zi);
-        }
-    }
-}
-
 static void secp256k1_ge_globalz_set_table_gej(size_t len, secp256k1_ge *r, secp256k1_fe *globalz, const secp256k1_gej *a, const secp256k1_fe *zr) {
     size_t i = len - 1;
     secp256k1_fe zs;
diff --git a/src/tests.c b/src/tests.c
index 589cf85e1844f..3414a0f4cee3c 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -2095,7 +2095,6 @@ void test_ge(void) {
     /* Test batch gej -> ge conversion with and without known z ratios. */
     {
         secp256k1_fe *zr = (secp256k1_fe *)checked_malloc(&ctx->error_callback, (4 * runs + 1) * sizeof(secp256k1_fe));
-        secp256k1_ge *ge_set_table = (secp256k1_ge *)checked_malloc(&ctx->error_callback, (4 * runs + 1) * sizeof(secp256k1_ge));
         secp256k1_ge *ge_set_all = (secp256k1_ge *)checked_malloc(&ctx->error_callback, (4 * runs + 1) * sizeof(secp256k1_ge));
         for (i = 0; i < 4 * runs + 1; i++) {
             /* Compute gej[i + 1].z / gez[i].z (with gej[n].z taken to be 1). */
@@ -2103,16 +2102,13 @@ void test_ge(void) {
                 secp256k1_fe_mul(&zr[i + 1], &zinv[i], &gej[i + 1].z);
             }
         }
-        secp256k1_ge_set_table_gej_var(ge_set_table, gej, zr, 4 * runs + 1);
         secp256k1_ge_set_all_gej_var(ge_set_all, gej, 4 * runs + 1);
         for (i = 0; i < 4 * runs + 1; i++) {
             secp256k1_fe s;
             random_fe_non_zero(&s);
             secp256k1_gej_rescale(&gej[i], &s);
-            ge_equals_gej(&ge_set_table[i], &gej[i]);
             ge_equals_gej(&ge_set_all[i], &gej[i]);
         }
-        free(ge_set_table);
         free(ge_set_all);
         free(zr);
     }

From 84740acd2a185514f1f5be84ca3fae52ca1f6576 Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Mon, 1 Oct 2018 21:00:41 +0000
Subject: [PATCH 3/6] ecmult_impl: save one fe_inv_var

---
 src/ecmult_impl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index 74c350fcde902..bf6bb63fd10d4 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -183,6 +183,7 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
     }
 
     /* Map `pj` back to our curve by multiplying its z-coordinate by `d.z`. */
+    zr = pj.z; /* save pj.z so we can use it to extract (d.z)^-1 from zi */
     secp256k1_fe_mul(&pj.z, &pj.z, &d.z);
     /* Directly set `pre[n - 1]` to `pj`, saving the inverted z-coordinate so
      * that we can combine it with the saved z-ratios to compute the other zs
@@ -193,7 +194,7 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
     secp256k1_ge_to_storage(&pre[n - 1], &p_ge);
 
     /* Compute the actual x-coordinate of D, which will be needed below. */
-    secp256k1_fe_inv_var(&d.z, &d.z);
+    secp256k1_fe_mul(&d.z, &zi, &zr);  /* d.z = 1/d.z */
     secp256k1_fe_sqr(&dx_over_dz_squared, &d.z);
     secp256k1_fe_mul(&dx_over_dz_squared, &dx_over_dz_squared, &d.x);
 

From ffd3b346fe7250c488e6d3c8653c314cb00722a0 Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Wed, 17 Oct 2018 19:50:37 +0000
Subject: [PATCH 4/6] add `secp256k1_ge_set_all_gej_var` test which deals with
 many infinite points

---
 src/tests.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/tests.c b/src/tests.c
index 3414a0f4cee3c..61bb7fce8727e 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -2113,6 +2113,22 @@ void test_ge(void) {
         free(zr);
     }
 
+    /* Test batch gej -> ge conversion with many infinities. */
+    for (i = 0; i < 4 * runs + 1; i++) {
+        random_group_element_test(&ge[i]);
+        /* randomly set half the points to infinity */
+        if(secp256k1_fe_is_odd(&ge[i].x)) {
+            secp256k1_ge_set_infinity(&ge[i]);
+        }
+        secp256k1_gej_set_ge(&gej[i], &ge[i]);
+    }
+    /* batch invert */
+    secp256k1_ge_set_all_gej_var(ge, gej, 4 * runs + 1);
+    /* check result */
+    for (i = 0; i < 4 * runs + 1; i++) {
+        ge_equals_gej(&ge[i], &gej[i]);
+    }
+
     free(ge);
     free(gej);
     free(zinv);

From efa783f8f0addb8f1e2ee0f1b9999673c6414acd Mon Sep 17 00:00:00 2001
From: Peter Dettman <peter.dettman@gmail.com>
Date: Fri, 9 Nov 2018 14:50:17 +0700
Subject: [PATCH 5/6] Store z-ratios in the 'x' coord they'll recover

---
 src/ecmult_impl.h | 49 +++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index bf6bb63fd10d4..0856e549db2ae 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -139,8 +139,7 @@ static void secp256k1_ecmult_odd_multiples_table_globalz_windowa(secp256k1_ge *p
 
 static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp256k1_ge_storage *pre, const secp256k1_gej *a) {
     secp256k1_gej d;
-    secp256k1_ge a_ge, d_ge, p_ge;
-    secp256k1_ge last_ge;
+    secp256k1_ge d_ge, p_ge;
     secp256k1_gej pj;
     secp256k1_fe zi;
     secp256k1_fe zr;
@@ -162,51 +161,48 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
     d_ge.y = d.y;
     d_ge.infinity = 0;
 
-    secp256k1_ge_set_gej_zinv(&a_ge, a, &d.z);
-    pj.x = a_ge.x;
-    pj.y = a_ge.y;
+    secp256k1_ge_set_gej_zinv(&p_ge, a, &d.z);
+    pj.x = p_ge.x;
+    pj.y = p_ge.y;
     pj.z = a->z;
     pj.infinity = 0;
 
-    zr = d.z;
-    secp256k1_fe_normalize_var(&zr);
-    secp256k1_fe_to_storage(&pre[0].x, &zr);
-    secp256k1_fe_normalize_var(&pj.y);
-    secp256k1_fe_to_storage(&pre[0].y, &pj.y);
-
-    for (i = 1; i < n; i++) {
+    for (i = 0; i < (n - 1); i++) {
+        secp256k1_fe_normalize_var(&pj.y);
+        secp256k1_fe_to_storage(&pre[i].y, &pj.y);
         secp256k1_gej_add_ge_var(&pj, &pj, &d_ge, &zr);
         secp256k1_fe_normalize_var(&zr);
         secp256k1_fe_to_storage(&pre[i].x, &zr);
-        secp256k1_fe_normalize_var(&pj.y);
-        secp256k1_fe_to_storage(&pre[i].y, &pj.y);
     }
 
-    /* Map `pj` back to our curve by multiplying its z-coordinate by `d.z`. */
-    zr = pj.z; /* save pj.z so we can use it to extract (d.z)^-1 from zi */
-    secp256k1_fe_mul(&pj.z, &pj.z, &d.z);
+    /* Invert d.z in the same batch, preserving pj.z so we can extract 1/d.z */
+    secp256k1_fe_mul(&zi, &pj.z, &d.z);
+    secp256k1_fe_inv_var(&zi, &zi);
+
     /* Directly set `pre[n - 1]` to `pj`, saving the inverted z-coordinate so
      * that we can combine it with the saved z-ratios to compute the other zs
      * without any more inversions. */
-    secp256k1_fe_inv_var(&zi, &pj.z);
     secp256k1_ge_set_gej_zinv(&p_ge, &pj, &zi);
-    secp256k1_ge_from_storage(&last_ge, &pre[n - 1]);
     secp256k1_ge_to_storage(&pre[n - 1], &p_ge);
 
     /* Compute the actual x-coordinate of D, which will be needed below. */
-    secp256k1_fe_mul(&d.z, &zi, &zr);  /* d.z = 1/d.z */
+    secp256k1_fe_mul(&d.z, &zi, &pj.z);  /* d.z = 1/d.z */
     secp256k1_fe_sqr(&dx_over_dz_squared, &d.z);
     secp256k1_fe_mul(&dx_over_dz_squared, &dx_over_dz_squared, &d.x);
 
     i = n - 1;
     while (i > 0) {
         secp256k1_fe zi2, zi3;
+        const secp256k1_fe *rzr;
         i--;
+
+        secp256k1_ge_from_storage(&p_ge, &pre[i]);
+
         /* For the remaining points, we extract the z-ratio from the stored
          * x-coordinate, compute its z^-1 from that, and compute the full
-         * point from that. The z-ratio for the next iteration is stored in
-         * the x-coordinate at the end of the loop. */
-        secp256k1_fe_mul(&zi, &zi, &last_ge.x);
+         * point from that. */
+        rzr = &p_ge.x;
+        secp256k1_fe_mul(&zi, &zi, rzr);
         secp256k1_fe_sqr(&zi2, &zi);
         secp256k1_fe_mul(&zi3, &zi2, &zi);
         /* To compute the actual x-coordinate, we use the stored z ratio and
@@ -217,7 +213,7 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
          * multiplying by each z-ratio in turn.
          *
          * Denoting the z-ratio as `rzr` (though the actual variable binding
-         * is `last_ge.x`), we observe that it equal to `h` from the inside
+         * is `p_ge.x`), we observe that it equal to `h` from the inside
          * of the above `gej_add_ge_var` call. This satisfies
          *
          *    rzr = d_x * z^2 - x
@@ -230,12 +226,11 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
          *     x = d_x - rzr / z^2
          *       = d_x - rzr * zi2
          */
-        secp256k1_fe_mul(&p_ge.x, &last_ge.x, &zi2);
+        secp256k1_fe_mul(&p_ge.x, rzr, &zi2);
         secp256k1_fe_negate(&p_ge.x, &p_ge.x, 1);
         secp256k1_fe_add(&p_ge.x, &dx_over_dz_squared);
         /* y is stored_y/z^3, as we expect */
-        secp256k1_ge_from_storage(&last_ge, &pre[i]);
-        secp256k1_fe_mul(&p_ge.y, &last_ge.y, &zi3);
+        secp256k1_fe_mul(&p_ge.y, &p_ge.y, &zi3);
         /* Store */
         secp256k1_ge_to_storage(&pre[i], &p_ge);
     }

From b3bf5f99a3251e3d72ffde1f39158af6ea133e33 Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Sat, 10 Nov 2018 13:42:55 +0000
Subject: [PATCH 6/6] ecmult_impl: expand comment to explain how effective
 affine interacts with everything

---
 src/ecmult_impl.h | 52 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index 0856e549db2ae..c00578beda92a 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -190,6 +190,26 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
     secp256k1_fe_sqr(&dx_over_dz_squared, &d.z);
     secp256k1_fe_mul(&dx_over_dz_squared, &dx_over_dz_squared, &d.x);
 
+    /* Going into the second loop, we have set `pre[n-1]` to its final affine
+     * form, but still need to set `pre[i]` for `i` in 0 through `n-2`. We
+     * have `zi = (p.z * d.z)^-1`, where
+     *
+     *     `p.z` is the z-coordinate of the point on the isomorphic curve
+     *           which was ultimately assigned to `pre[n-1]`.
+     *     `d.z` is the multiplier that must be applied to all z-coordinates
+     *           to move from our isomorphic curve back to secp256k1; so the
+     *           product `p.z * d.z` is the z-coordinate of the secp256k1
+     *           point assigned to `pre[n-1]`.
+     *
+     * All subsequent inverse-z-coordinates can be obtained by multiplying this
+     * factor by successive z-ratios, which is much more efficient than directly
+     * computing each one.
+     *
+     * Importantly, these inverse-zs will be coordinates of points on secp256k1,
+     * while our other stored values come from computations on the isomorphic
+     * curve. So in the below loop, we will take care not to actually use `zi`
+     * or any derived values until we're back on secp256k1.
+     */
     i = n - 1;
     while (i > 0) {
         secp256k1_fe zi2, zi3;
@@ -198,7 +218,7 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
 
         secp256k1_ge_from_storage(&p_ge, &pre[i]);
 
-        /* For the remaining points, we extract the z-ratio from the stored
+        /* For each remaining point, we extract the z-ratio from the stored
          * x-coordinate, compute its z^-1 from that, and compute the full
          * point from that. */
         rzr = &p_ge.x;
@@ -212,19 +232,31 @@ static void secp256k1_ecmult_odd_multiples_table_storage_var(const int n, secp25
          * computed iteratively starting from the overall Z inverse then
          * multiplying by each z-ratio in turn.
          *
-         * Denoting the z-ratio as `rzr` (though the actual variable binding
-         * is `p_ge.x`), we observe that it equal to `h` from the inside
-         * of the above `gej_add_ge_var` call. This satisfies
+         * Denoting the z-ratio as `rzr`, we observe that it is equal to `h`
+         * from the inside of the above `gej_add_ge_var` call. This satisfies
+         *
+         *    rzr = d_x * z^2 - x * d_z^2
+         *
+         * where `(d_x, d_z)` are Jacobian coordinates of `D` and `(x, z)`
+         * are Jacobian coordinates of our desired point -- except both are on
+         * the isomorphic curve that we were using when we called `gej_add_ge_var`.
+         * To get back to secp256k1, we must multiply both `z`s by `d_z`, or
+         * equivalently divide both `x`s by `d_z^2`. Our equation then becomes
+         *
+         *    rzr = d_x * z^2 / d_z^2 - x
+         *
+         * (The left-hand-side, being a ratio of z-coordinates, is unaffected
+         * by the isomorphism.)
          *
-         *    rzr = d_x * z^2 - x
+         * Rearranging to solve for `x`, we have
          *
-         * where `d_x` is the x coordinate of `D` and `(x, z)` are Jacobian
-         * coordinates of our desired point.
+         *     x = d_x * z^2 / d_z^2 - rzr
          *
-         * Rearranging and dividing by `z^2` to convert to affine, we get
+         * But what we actually want is the affine coordinate `X = x/z^2`,
+         * which will satisfy
          *
-         *     x = d_x - rzr / z^2
-         *       = d_x - rzr * zi2
+         *     X = d_x / d_z^2 - rzr / z^2
+         *       = dx_over_dz_squared - rzr * zi2
          */
         secp256k1_fe_mul(&p_ge.x, rzr, &zi2);
         secp256k1_fe_negate(&p_ge.x, &p_ge.x, 1);