From 4583c472f53c912dbc50466b8cae222a3c582176 Mon Sep 17 00:00:00 2001 From: Koute Date: Thu, 30 Mar 2023 15:16:18 +0900 Subject: [PATCH] Support SIMD on Rust stable (#520) * Remove dependency on `packed_simd` * Support SIMD on stable Rust * Move `packed_simd.rs` to `vector` module * Add comment header to `packed_simd.rs` * Initialize SIMD registers using intrinsics instead of `transmute` * Use a splat inside of `unpack_pair` * Update README: the AVX2 backend now works on stable Rust * Add a CI job to also build the AVX2 SIMD backend on Rust stable * Added SIMD MSRV test --- .github/workflows/rust.yml | 16 +- Cargo.toml | 6 +- README.md | 12 +- build.rs | 8 + src/backend/vector/avx2/constants.rs | 670 +++++++++++++-------------- src/backend/vector/avx2/field.rs | 198 ++++---- src/backend/vector/ifma/constants.rs | 662 +++++++++++++------------- src/backend/vector/ifma/field.rs | 132 +++--- src/backend/vector/mod.rs | 32 +- src/backend/vector/packed_simd.rs | 311 +++++++++++++ src/lib.rs | 9 +- 11 files changed, 1194 insertions(+), 862 deletions(-) create mode 100644 src/backend/vector/packed_simd.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index d3ea8671d..be98f9751 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -55,7 +55,7 @@ jobs: - run: cargo build --target thumbv7em-none-eabi --release - run: cargo build --target thumbv7em-none-eabi --release --features serde - build-simd: + build-simd-nightly: name: Build simd backend (nightly) runs-on: ubuntu-latest steps: @@ -69,6 +69,16 @@ jobs: RUSTFLAGS: '--cfg curve25519_dalek_backend="simd" -C target_feature=+avx512ifma' run: cargo build --target x86_64-unknown-linux-gnu + test-simd-avx2: + name: Test simd backend (avx2) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@stable + - env: + RUSTFLAGS: '--cfg curve25519_dalek_backend="simd" -C target_feature=+avx2' + run: cargo test --target x86_64-unknown-linux-gnu + build-docs: name: Build docs runs-on: ubuntu-latest @@ -151,6 +161,10 @@ jobs: # deps and the stated MSRV - uses: dtolnay/rust-toolchain@1.60.0 - run: cargo build --no-default-features --features serde + # Also make sure the AVX2 build works + - env: + RUSTFLAGS: '--cfg curve25519_dalek_backend="simd" -C target_feature=+avx2' + run: cargo build --target x86_64-unknown-linux-gnu bench: name: Check that benchmarks compile diff --git a/Cargo.toml b/Cargo.toml index dde08ecd3..a1dafcb24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" [build-dependencies] platforms = "3.0.2" +rustc_version = "0.4.0" [[bench]] name = "dalek_benchmarks" @@ -57,11 +58,6 @@ zeroize = { version = "1", default-features = false, optional = true } [target.'cfg(curve25519_dalek_backend = "fiat")'.dependencies] fiat-crypto = "0.1.19" -# The original packed_simd package was orphaned, see -# https://github.com/rust-lang/packed_simd/issues/303#issuecomment-701361161 -[target.'cfg(curve25519_dalek_backend = "simd")'.dependencies] -packed_simd = { version = "0.3.8", package = "packed_simd_2", features = ["into_bits"] } - [features] default = ["alloc", "precomputed-tables", "zeroize"] alloc = ["zeroize?/alloc"] diff --git a/README.md b/README.md index 574e6f37d..429bae9ac 100644 --- a/README.md +++ b/README.md @@ -155,15 +155,15 @@ $ cargo build --target i686-unknown-linux-gnu Target backend selection within `simd` must be done manually by setting the `RUSTFLAGS` environment variable to one of the below options: -| CPU feature | `RUSTFLAGS` | -| :--- | :--- | -| avx2 | `-C target_feature=+avx2` | -| avx512ifma | `-C target_feature=+avx512ifma` | +| CPU feature | `RUSTFLAGS` | Requires nightly? | +| :--- | :--- | :--- | +| avx2 | `-C target_feature=+avx2` | no | +| avx512ifma | `-C target_feature=+avx512ifma` | yes | Or you can use `-C target_cpu=native` if you don't know what to set. -The `simd` backend also requires using nightly, e.g. by running `cargo -+nightly build`, to build. +The AVX512 backend requires Rust nightly. If enabled and when compiled on a non-nightly +compiler it will fall back to using the AVX2 backend. # Documentation diff --git a/build.rs b/build.rs index ca28d72b4..80c0eb1fb 100644 --- a/build.rs +++ b/build.rs @@ -19,6 +19,14 @@ fn main() { DalekBits::Dalek64 => println!("cargo:rustc-cfg=curve25519_dalek_bits=\"64\""), DalekBits::Dalek32 => println!("cargo:rustc-cfg=curve25519_dalek_bits=\"32\""), } + + if rustc_version::version_meta() + .expect("failed to detect rustc version") + .channel + == rustc_version::Channel::Nightly + { + println!("cargo:rustc-cfg=nightly"); + } } // Deterministic cfg(curve25519_dalek_bits) when this is not explicitly set. diff --git a/src/backend/vector/avx2/constants.rs b/src/backend/vector/avx2/constants.rs index ad80f67c2..25c7bde21 100644 --- a/src/backend/vector/avx2/constants.rs +++ b/src/backend/vector/avx2/constants.rs @@ -11,7 +11,7 @@ //! This module contains constants used by the AVX2 backend. -use packed_simd::u32x8; +use crate::backend::vector::packed_simd::u32x8; use crate::backend::vector::avx2::edwards::{CachedPoint, ExtendedPoint}; use crate::backend::vector::avx2::field::FieldElement2625x4; @@ -21,27 +21,27 @@ use crate::window::NafLookupTable8; /// The identity element as an `ExtendedPoint`. pub(crate) static EXTENDEDPOINT_IDENTITY: ExtendedPoint = ExtendedPoint(FieldElement2625x4([ - u32x8::new(0, 1, 0, 0, 1, 0, 0, 0), - u32x8::splat(0), - u32x8::splat(0), - u32x8::splat(0), - u32x8::splat(0), + u32x8::new_const(0, 1, 0, 0, 1, 0, 0, 0), + u32x8::splat_const::<0>(), + u32x8::splat_const::<0>(), + u32x8::splat_const::<0>(), + u32x8::splat_const::<0>(), ])); /// The identity element as a `CachedPoint`. pub(crate) static CACHEDPOINT_IDENTITY: CachedPoint = CachedPoint(FieldElement2625x4([ - u32x8::new(121647, 121666, 0, 0, 243332, 67108845, 0, 33554431), - u32x8::new(67108864, 0, 33554431, 0, 0, 67108863, 0, 33554431), - u32x8::new(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), - u32x8::new(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), - u32x8::new(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), + u32x8::new_const(121647, 121666, 0, 0, 243332, 67108845, 0, 33554431), + u32x8::new_const(67108864, 0, 33554431, 0, 0, 67108863, 0, 33554431), + u32x8::new_const(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), + u32x8::new_const(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), + u32x8::new_const(67108863, 0, 33554431, 0, 0, 67108863, 0, 33554431), ])); /// The low limbs of (2p, 2p, 2p, 2p), so that /// ```ascii,no_run /// (2p, 2p, 2p, 2p) = [P_TIMES_2_LO, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI] /// ``` -pub(crate) static P_TIMES_2_LO: u32x8 = u32x8::new( +pub(crate) static P_TIMES_2_LO: u32x8 = u32x8::new_const( 67108845 << 1, 67108845 << 1, 33554431 << 1, @@ -56,7 +56,7 @@ pub(crate) static P_TIMES_2_LO: u32x8 = u32x8::new( /// ```ascii,no_run /// (2p, 2p, 2p, 2p) = [P_TIMES_2_LO, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI] /// ``` -pub(crate) static P_TIMES_2_HI: u32x8 = u32x8::new( +pub(crate) static P_TIMES_2_HI: u32x8 = u32x8::new_const( 67108863 << 1, 67108863 << 1, 33554431 << 1, @@ -71,7 +71,7 @@ pub(crate) static P_TIMES_2_HI: u32x8 = u32x8::new( /// ```ascii,no_run /// (16p, 16p, 16p, 16p) = [P_TIMES_16_LO, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI] /// ``` -pub(crate) static P_TIMES_16_LO: u32x8 = u32x8::new( +pub(crate) static P_TIMES_16_LO: u32x8 = u32x8::new_const( 67108845 << 4, 67108845 << 4, 33554431 << 4, @@ -86,7 +86,7 @@ pub(crate) static P_TIMES_16_LO: u32x8 = u32x8::new( /// ```ascii,no_run /// (16p, 16p, 16p, 16p) = [P_TIMES_16_LO, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI] /// ``` -pub(crate) static P_TIMES_16_HI: u32x8 = u32x8::new( +pub(crate) static P_TIMES_16_HI: u32x8 = u32x8::new_const( 67108863 << 4, 67108863 << 4, 33554431 << 4, @@ -101,1090 +101,1090 @@ pub(crate) static P_TIMES_16_HI: u32x8 = u32x8::new( #[cfg(feature = "precomputed-tables")] pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = NafLookupTable8([ CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 3571425, 10045002, 19036563, 1096096, 243332, 65897020, 0, 28963681, ), - u32x8::new( + u32x8::new_const( 30896895, 63055514, 1614915, 5095970, 0, 53791688, 0, 31258312, ), - u32x8::new( + u32x8::new_const( 13347627, 40339464, 2236269, 11185503, 0, 22520087, 0, 8659512, ), - u32x8::new( + u32x8::new_const( 11125413, 29139905, 32037254, 28360723, 0, 64556417, 0, 9635759, ), - u32x8::new( + u32x8::new_const( 33268144, 47262491, 4336918, 15795740, 0, 22027545, 0, 4846528, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 47099681, 31447946, 29365447, 24740513, 42991046, 18317844, 16051644, 21404226, ), - u32x8::new( + u32x8::new_const( 31708133, 28909527, 2366091, 13703791, 469246, 54159622, 2601402, 32988002, ), - u32x8::new( + u32x8::new_const( 63432457, 30251794, 15163516, 18491340, 28144087, 35605455, 13682295, 18474872, ), - u32x8::new( + u32x8::new_const( 12221607, 4967598, 26061980, 26008006, 20226147, 9726961, 17410, 18051083, ), - u32x8::new( + u32x8::new_const( 60569645, 62487085, 11911242, 21920922, 4092105, 38186967, 22431483, 31366585, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 18147205, 62587998, 2554617, 536692, 11924528, 26674131, 17645433, 24341419, ), - u32x8::new( + u32x8::new_const( 11573357, 27579485, 31491870, 29000885, 10800976, 51902791, 28076395, 20464029, ), - u32x8::new( + u32x8::new_const( 56031649, 10856669, 11791193, 26769430, 25306956, 5922200, 6630685, 9385098, ), - u32x8::new( + u32x8::new_const( 31319348, 23906711, 16290213, 32142166, 61106354, 17181823, 3548308, 12022566, ), - u32x8::new( + u32x8::new_const( 5904298, 50218605, 11826440, 5492249, 10379071, 3472255, 172742, 31948344, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 10625852, 15193821, 22918394, 23676410, 53695416, 54987793, 10067515, 11747680, ), - u32x8::new( + u32x8::new_const( 65013325, 1309652, 29616320, 28922974, 60360891, 19621771, 9938982, 30406429, ), - u32x8::new( + u32x8::new_const( 54967954, 65931918, 5595602, 25719523, 64909864, 30566415, 15945272, 8495317, ), - u32x8::new( + u32x8::new_const( 1167157, 55265018, 11507029, 31641054, 43497904, 2367338, 12937761, 27517066, ), - u32x8::new( + u32x8::new_const( 656704, 2544994, 13006713, 480979, 38471594, 62541240, 25353597, 11531760, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 22176662, 3984313, 27495285, 4110608, 2909584, 30594106, 15677919, 2549183, ), - u32x8::new( + u32x8::new_const( 33979105, 62269905, 2071511, 6894756, 53189950, 47232857, 6408191, 6123225, ), - u32x8::new( + u32x8::new_const( 32553873, 63948030, 12612401, 3633166, 24054373, 37626618, 14481327, 8520484, ), - u32x8::new( + u32x8::new_const( 56552486, 10749438, 12034813, 28811946, 1445640, 36755601, 12104575, 10257833, ), - u32x8::new( + u32x8::new_const( 22795808, 48761311, 1136056, 9380768, 1411523, 5341811, 27318329, 9686767, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 21157200, 39156966, 20473176, 4934657, 61478183, 45121537, 5429856, 13035023, ), - u32x8::new( + u32x8::new_const( 7954529, 58789246, 31440083, 7054221, 38438565, 36856107, 1364112, 14548122, ), - u32x8::new( + u32x8::new_const( 26120083, 36321360, 4919997, 31687496, 33757765, 36237559, 15243054, 32163861, ), - u32x8::new( + u32x8::new_const( 25878307, 46544824, 19455951, 2414935, 16844726, 56521560, 32680554, 26660660, ), - u32x8::new( + u32x8::new_const( 48360220, 43407178, 12187042, 24925816, 7423722, 25746484, 12814654, 17395963, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 63153652, 32195955, 4087908, 8431689, 30392384, 47203165, 8986649, 9053039, ), - u32x8::new( + u32x8::new_const( 63659241, 47988767, 2931872, 19953600, 11747107, 51610101, 20952181, 13364887, ), - u32x8::new( + u32x8::new_const( 3659197, 58790649, 5930099, 2605312, 28477896, 580728, 20579735, 2610622, ), - u32x8::new( + u32x8::new_const( 41781607, 17161358, 10690531, 24368015, 47027031, 36742339, 5414694, 13156365, ), - u32x8::new( + u32x8::new_const( 13237853, 51182423, 8954802, 29006542, 22643989, 56896541, 22830593, 10289708, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 1401265, 58846825, 30911620, 32239180, 15391552, 15200821, 6339309, 16403588, ), - u32x8::new( + u32x8::new_const( 55913797, 29541724, 1664461, 21709410, 38470488, 47097092, 17674945, 32666066, ), - u32x8::new( + u32x8::new_const( 22844482, 10797709, 27548106, 31638735, 34500968, 26611503, 19727211, 13160873, ), - u32x8::new( + u32x8::new_const( 31485204, 14496164, 13981208, 10276888, 5748808, 35024436, 2740987, 7479021, ), - u32x8::new( + u32x8::new_const( 58541207, 14866135, 32344041, 545930, 62661488, 6941250, 27940205, 11976112, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 39849808, 44781685, 15697329, 24387845, 12501486, 50260092, 23199481, 31929024, ), - u32x8::new( + u32x8::new_const( 24823070, 27956017, 27034296, 10316465, 47664045, 11152446, 15719183, 30181617, ), - u32x8::new( + u32x8::new_const( 20771189, 19969144, 31433937, 19185213, 27565920, 10384445, 2893359, 9255362, ), - u32x8::new( + u32x8::new_const( 42894974, 11925545, 32134441, 32738810, 55916336, 32479272, 19563550, 5511385, ), - u32x8::new( + u32x8::new_const( 17857161, 47809169, 14564114, 27997751, 33024640, 38669671, 31956536, 27313245, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 58237774, 15917425, 18872208, 19394230, 17374297, 6101419, 4839741, 6596900, ), - u32x8::new( + u32x8::new_const( 66947393, 15744215, 18368993, 17750160, 41006525, 9205497, 2629667, 32170865, ), - u32x8::new( + u32x8::new_const( 66481381, 1919414, 28338762, 7372967, 33819153, 4156199, 27126309, 12739816, ), - u32x8::new( + u32x8::new_const( 44117158, 58545296, 22521371, 11809712, 28998792, 50731010, 30215699, 25748377, ), - u32x8::new( + u32x8::new_const( 23561284, 4160244, 9035405, 24895184, 39761639, 59253416, 8684759, 22487864, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 12671134, 56419053, 16092401, 30038207, 4002647, 47822606, 7151311, 28430768, ), - u32x8::new( + u32x8::new_const( 61041684, 35765374, 30598048, 19666539, 44150175, 40140037, 290469, 28442674, ), - u32x8::new( + u32x8::new_const( 18847796, 1371617, 33316881, 13199936, 43646578, 17068881, 12074900, 1537415, ), - u32x8::new( + u32x8::new_const( 10052225, 38316070, 27469797, 5297537, 50725570, 20435349, 10339121, 2779737, ), - u32x8::new( + u32x8::new_const( 18372189, 15466385, 24762130, 22217964, 23503887, 47844464, 10415034, 2606889, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 55082775, 45300503, 16032654, 5964396, 17743504, 24634761, 19493066, 5184611, ), - u32x8::new( + u32x8::new_const( 50172633, 35093294, 10040575, 23616256, 4543900, 61852191, 4049821, 7423669, ), - u32x8::new( + u32x8::new_const( 20295398, 40009376, 10487190, 15670429, 51972856, 58649552, 20436392, 3432497, ), - u32x8::new( + u32x8::new_const( 35189420, 54117751, 12825868, 6283038, 27540739, 30648758, 22658912, 9466689, ), - u32x8::new( + u32x8::new_const( 51737549, 40725785, 17409814, 25201086, 21156239, 34176168, 26814520, 5956424, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 8211442, 8014184, 6260823, 22108096, 32182620, 51844847, 2466270, 28582231, ), - u32x8::new( + u32x8::new_const( 27199739, 3848333, 31738017, 10892045, 4963982, 65391770, 32551997, 28906469, ), - u32x8::new( + u32x8::new_const( 16606846, 32207068, 26404535, 7614129, 45416902, 65584718, 13821785, 2646060, ), - u32x8::new( + u32x8::new_const( 36090634, 57981287, 32247670, 22837502, 31003861, 55448117, 6062915, 20369975, ), - u32x8::new( + u32x8::new_const( 27381403, 50578107, 522631, 29521058, 31137497, 40220737, 27628049, 1824195, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 59402443, 17056879, 29262689, 6131785, 52551472, 43367471, 29423199, 18899208, ), - u32x8::new( + u32x8::new_const( 5749414, 43514612, 11365899, 21514624, 65591890, 60945892, 19841732, 5628567, ), - u32x8::new( + u32x8::new_const( 19334369, 52500268, 12307673, 5267367, 3212103, 9035822, 29142161, 30520954, ), - u32x8::new( + u32x8::new_const( 57261330, 6819646, 22089161, 9800373, 55155453, 62250856, 13766735, 25244545, ), - u32x8::new( + u32x8::new_const( 54370226, 61888301, 24496089, 2540581, 65637506, 60274355, 18154273, 11687259, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 12521903, 26014045, 13995625, 33360175, 23605474, 7376434, 27229267, 17195036, ), - u32x8::new( + u32x8::new_const( 59482891, 10074423, 574357, 3857753, 61377787, 50306685, 5241065, 20234396, ), - u32x8::new( + u32x8::new_const( 23674717, 6997172, 20771841, 16858511, 40565304, 29973136, 7049812, 14585010, ), - u32x8::new( + u32x8::new_const( 1427477, 13295732, 31762066, 31499740, 60419925, 54666164, 22009424, 8089609, ), - u32x8::new( + u32x8::new_const( 58154031, 41593020, 15342328, 957047, 38937260, 37037498, 24871992, 32973409, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 30654745, 51286025, 21206982, 2433562, 12780105, 31732574, 33087964, 33081189, ), - u32x8::new( + u32x8::new_const( 66640017, 42720009, 16567620, 15300745, 1530367, 33001123, 20930247, 21042661, ), - u32x8::new( + u32x8::new_const( 15003356, 5294119, 22985605, 18928772, 32628461, 18230172, 14773298, 27193722, ), - u32x8::new( + u32x8::new_const( 27555, 65346287, 17017174, 7837720, 21499787, 42855613, 22474984, 13675085, ), - u32x8::new( + u32x8::new_const( 24164369, 50130116, 5973149, 24152073, 1577334, 25400030, 18648484, 32228854, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 49518649, 59119280, 31670678, 20396561, 61728330, 651402, 176032, 9529498, ), - u32x8::new( + u32x8::new_const( 61765532, 9082232, 32794568, 15526956, 48543100, 32614212, 19001206, 25680229, ), - u32x8::new( + u32x8::new_const( 32086091, 10373081, 8996131, 31822823, 35788988, 49973190, 30542040, 17858455, ), - u32x8::new( + u32x8::new_const( 48130197, 58121889, 27753291, 29923268, 54448075, 43300790, 9336565, 15770022, ), - u32x8::new( + u32x8::new_const( 57725546, 20557498, 9366233, 16023566, 16189031, 2837363, 24315301, 27003505, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 28286608, 10767548, 18220739, 5413236, 48253387, 58255702, 11864864, 28527159, ), - u32x8::new( + u32x8::new_const( 45038176, 58655197, 25648758, 10951484, 42564382, 34542843, 23146954, 22234334, ), - u32x8::new( + u32x8::new_const( 14858710, 24978793, 15040559, 4379220, 47621477, 40271440, 15650420, 1998736, ), - u32x8::new( + u32x8::new_const( 24106391, 9626149, 344505, 25253814, 34579800, 59687089, 25718289, 25904133, ), - u32x8::new( + u32x8::new_const( 1981195, 37751302, 26132048, 1764722, 13288231, 28808622, 12531301, 18292949, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 13869851, 31448904, 14963539, 7581293, 20536485, 35021083, 21257574, 33356609, ), - u32x8::new( + u32x8::new_const( 36903364, 18429241, 11097857, 5943856, 60583077, 40015815, 30509523, 31915271, ), - u32x8::new( + u32x8::new_const( 49161801, 40681915, 67892, 25454357, 22779677, 25798439, 15964829, 5863227, ), - u32x8::new( + u32x8::new_const( 60810637, 4496471, 5217137, 14095116, 50942411, 50712663, 2507380, 26844507, ), - u32x8::new( + u32x8::new_const( 34579752, 53519385, 10859797, 18816024, 42552864, 39478521, 6783896, 17277037, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 43287109, 27900723, 33182187, 2766754, 17041989, 1018260, 33392790, 4830032, ), - u32x8::new( + u32x8::new_const( 60194178, 30788903, 24728888, 14513195, 20897010, 28843233, 20111980, 17475240, ), - u32x8::new( + u32x8::new_const( 46042274, 19257042, 4628173, 31649727, 27388316, 66631493, 11541886, 6408028, ), - u32x8::new( + u32x8::new_const( 57024680, 49536568, 32050358, 31321917, 17437691, 49672356, 2884755, 20493991, ), - u32x8::new( + u32x8::new_const( 59553007, 46782643, 29001173, 1814088, 21930692, 51319706, 14965872, 30748046, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 16441817, 36111849, 6900424, 602234, 46522199, 16441484, 8135070, 21726541, ), - u32x8::new( + u32x8::new_const( 37711225, 32701959, 11679112, 13125533, 32154135, 9407918, 26554289, 620848, ), - u32x8::new( + u32x8::new_const( 19233407, 30086864, 14679568, 2797374, 4892806, 7993077, 247658, 5632804, ), - u32x8::new( + u32x8::new_const( 37427262, 26675495, 27125659, 13496131, 50718473, 40115609, 28505351, 27837393, ), - u32x8::new( + u32x8::new_const( 196819, 18410429, 7070012, 21691388, 29763371, 24754123, 9727048, 10930179, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 28319289, 40734650, 16225680, 24739184, 64272368, 35356897, 7866648, 13635853, ), - u32x8::new( + u32x8::new_const( 34165295, 48328447, 27041670, 23643655, 48949950, 52963288, 30411133, 6045174, ), - u32x8::new( + u32x8::new_const( 18583559, 41649834, 9813585, 26098520, 25682734, 26733526, 19276490, 10654728, ), - u32x8::new( + u32x8::new_const( 34867476, 52715968, 5694571, 13380978, 15134994, 1831255, 8608001, 17266401, ), - u32x8::new( + u32x8::new_const( 59925903, 44282172, 27802465, 1855069, 14234749, 36635487, 11302294, 10938429, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 8373273, 49064494, 4932071, 32997499, 38472880, 29335908, 14504412, 22460029, ), - u32x8::new( + u32x8::new_const( 31795930, 50785923, 25835990, 25790073, 65669841, 11360450, 9969157, 9008164, ), - u32x8::new( + u32x8::new_const( 50262498, 45869261, 16124434, 15336007, 882762, 42522623, 11277198, 26296377, ), - u32x8::new( + u32x8::new_const( 42332732, 59129236, 14452816, 567985, 208061, 34722729, 32008143, 14828749, ), - u32x8::new( + u32x8::new_const( 17937794, 36846032, 32102665, 4442466, 19745435, 31633451, 7146411, 15812027, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 30741269, 38648744, 12562645, 30092623, 25073992, 28730659, 27911745, 30000958, ), - u32x8::new( + u32x8::new_const( 2859794, 25991700, 17776078, 27091930, 2328322, 60061146, 18581824, 18039008, ), - u32x8::new( + u32x8::new_const( 58206333, 17917354, 1972306, 11853766, 2655376, 60543390, 18416710, 13287440, ), - u32x8::new( + u32x8::new_const( 62746330, 61423885, 21246577, 2266675, 60099139, 14804707, 14772234, 20679434, ), - u32x8::new( + u32x8::new_const( 26987698, 15488817, 715616, 2339565, 51980752, 17333865, 21965103, 10839820, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 18672548, 57660959, 16042910, 19519287, 62865851, 17580961, 26628347, 23774759, ), - u32x8::new( + u32x8::new_const( 368070, 3464471, 25888304, 30370559, 52396053, 45426828, 28745251, 9246829, ), - u32x8::new( + u32x8::new_const( 29090099, 57950037, 23104657, 4903923, 10987778, 56163684, 23621539, 10332760, ), - u32x8::new( + u32x8::new_const( 53338235, 44851161, 21606845, 31069622, 4243630, 34464392, 11286454, 5802022, ), - u32x8::new( + u32x8::new_const( 46710757, 63389067, 11642865, 1980986, 12967337, 28162061, 3854192, 30432268, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 12179834, 41005450, 12809619, 33525228, 4624405, 46957889, 16968743, 11827816, ), - u32x8::new( + u32x8::new_const( 51521162, 12466775, 31791271, 15303651, 49798465, 62714504, 6509600, 12918560, ), - u32x8::new( + u32x8::new_const( 20445559, 1756449, 28848701, 7920171, 9835040, 5900071, 28757409, 12376688, ), - u32x8::new( + u32x8::new_const( 18259496, 14281012, 21767026, 10232236, 20000226, 12400540, 4104902, 23570543, ), - u32x8::new( + u32x8::new_const( 3687440, 26546648, 13328821, 26841081, 49822734, 22334054, 244496, 24862543, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 59523541, 62195428, 3853227, 13954801, 12387708, 47627615, 27221350, 17899572, ), - u32x8::new( + u32x8::new_const( 63193587, 36343307, 14595132, 6880795, 1364792, 37648434, 3259017, 20536046, ), - u32x8::new( + u32x8::new_const( 30362834, 10440372, 9574624, 11729232, 63861613, 21748389, 5530846, 2721586, ), - u32x8::new( + u32x8::new_const( 18339760, 1550632, 17170271, 25732971, 28459263, 63142237, 21642345, 31557672, ), - u32x8::new( + u32x8::new_const( 10611282, 5204623, 18049257, 214175, 19432723, 49809070, 26010406, 27449522, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 19770733, 26478685, 9464541, 29158041, 28604307, 45196604, 7586524, 6641859, ), - u32x8::new( + u32x8::new_const( 65654484, 52230498, 30886612, 19112823, 47271809, 38942611, 16020035, 10773481, ), - u32x8::new( + u32x8::new_const( 27464323, 54451016, 20646645, 17732915, 23008717, 53626684, 3253189, 15614410, ), - u32x8::new( + u32x8::new_const( 52381752, 40693008, 7063024, 28469981, 51159478, 44543211, 19941777, 5985451, ), - u32x8::new( + u32x8::new_const( 13553668, 35524849, 14788737, 1883845, 12385775, 47958835, 29135466, 1776722, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 36719806, 20827965, 23175373, 32996806, 42041892, 65708790, 5467143, 20884008, ), - u32x8::new( + u32x8::new_const( 43256281, 40770646, 17244063, 31959819, 64366384, 43544617, 25057754, 12628720, ), - u32x8::new( + u32x8::new_const( 17337782, 58472057, 27906934, 15305274, 30292418, 39284317, 16946773, 24806712, ), - u32x8::new( + u32x8::new_const( 6485126, 32447403, 16261486, 13561940, 49439635, 10738368, 16419889, 8897231, ), - u32x8::new( + u32x8::new_const( 44812203, 40122262, 25496058, 2759794, 25295304, 52178368, 24154195, 29334408, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 42307254, 57217102, 1088936, 3832827, 33905401, 23130334, 6958056, 12622851, ), - u32x8::new( + u32x8::new_const( 3881189, 14870059, 19712830, 6071598, 38147944, 60776394, 3427938, 13765703, ), - u32x8::new( + u32x8::new_const( 7666911, 24227591, 17077136, 22967588, 6874639, 30915523, 11451695, 24292224, ), - u32x8::new( + u32x8::new_const( 13659529, 31984463, 28764736, 20506164, 64729627, 49321636, 28284636, 25472371, ), - u32x8::new( + u32x8::new_const( 39360308, 42281399, 9446504, 868960, 49227724, 21351115, 30561851, 11292096, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 7071115, 46444090, 5387916, 15432877, 27226682, 41506862, 2398278, 3978240, ), - u32x8::new( + u32x8::new_const( 51009614, 54216973, 24368938, 31392616, 38456150, 62313644, 6729154, 99724, ), - u32x8::new( + u32x8::new_const( 17474332, 62857913, 2619930, 30659308, 18268181, 32809239, 22826292, 24561895, ), - u32x8::new( + u32x8::new_const( 38187020, 67003092, 14118280, 16500577, 18808560, 64983716, 25712929, 32518261, ), - u32x8::new( + u32x8::new_const( 25735813, 62284262, 10824872, 20558596, 48149681, 31162667, 22608274, 26285185, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 963440, 63742255, 10230323, 25515008, 32506414, 6105697, 25980317, 24645129, ), - u32x8::new( + u32x8::new_const( 7162189, 8101249, 14679265, 33443386, 2002396, 8541405, 19442276, 4795881, ), - u32x8::new( + u32x8::new_const( 8116694, 51463069, 4415528, 25599140, 55805721, 39582709, 6719436, 30033839, ), - u32x8::new( + u32x8::new_const( 14468202, 42181869, 25188826, 9639755, 47546189, 62711146, 32762447, 18338064, ), - u32x8::new( + u32x8::new_const( 33880058, 32810909, 8969931, 13095238, 38360605, 40138517, 9246134, 4928058, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 63655588, 17883670, 9410246, 26162761, 5000571, 7349225, 23785252, 32751089, ), - u32x8::new( + u32x8::new_const( 28568737, 10733123, 9342397, 21570673, 54096560, 32467591, 20494687, 21511513, ), - u32x8::new( + u32x8::new_const( 47675157, 47932807, 29250946, 15672208, 59760469, 9945465, 14939287, 18437405, ), - u32x8::new( + u32x8::new_const( 37985267, 8609815, 31573002, 3373596, 47828883, 20834216, 13248616, 24154292, ), - u32x8::new( + u32x8::new_const( 5543543, 29553242, 3386453, 30501150, 25058089, 15236571, 8814395, 32462955, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 39158670, 15322548, 20495103, 3312736, 14557171, 12985179, 8044741, 3176899, ), - u32x8::new( + u32x8::new_const( 24673290, 29693310, 21412266, 18324699, 2154518, 40329021, 17500543, 3954277, ), - u32x8::new( + u32x8::new_const( 36758685, 38738957, 165513, 14691866, 3070475, 10424235, 17096536, 16896898, ), - u32x8::new( + u32x8::new_const( 59790459, 43094586, 8720681, 10423589, 1122030, 31545615, 4463786, 31811293, ), - u32x8::new( + u32x8::new_const( 49778992, 60881044, 20509974, 5832494, 64155961, 31483358, 4511231, 20307815, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 2863373, 40876242, 26865913, 24067353, 15726407, 40919070, 12953902, 9931535, ), - u32x8::new( + u32x8::new_const( 60934877, 42512204, 21649141, 21945190, 52211954, 60984193, 7046207, 5363493, ), - u32x8::new( + u32x8::new_const( 4205971, 64068464, 18197273, 7327176, 51527794, 21166920, 20669933, 11828242, ), - u32x8::new( + u32x8::new_const( 59782815, 49617225, 15379924, 457923, 9320508, 21498914, 3242540, 31563182, ), - u32x8::new( + u32x8::new_const( 27714753, 8664670, 3366162, 26338598, 56775518, 25796006, 13129151, 21388876, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 59276548, 49972346, 16795002, 33455915, 48430097, 53857205, 18627071, 32474471, ), - u32x8::new( + u32x8::new_const( 42160315, 50705892, 13530540, 28012698, 19833221, 55886870, 20191784, 9644313, ), - u32x8::new( + u32x8::new_const( 20372416, 28414713, 24084234, 31804096, 33815377, 36131001, 17251241, 18291088, ), - u32x8::new( + u32x8::new_const( 56234667, 14920441, 2033267, 29572003, 1724043, 45519699, 17873735, 501988, ), - u32x8::new( + u32x8::new_const( 50031659, 31517850, 15697583, 1016845, 43104661, 54769582, 8008601, 27257051, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 52951491, 66542164, 14853573, 30444631, 12045973, 24321813, 16545674, 18160646, ), - u32x8::new( + u32x8::new_const( 60107911, 1126003, 5947677, 19486116, 41119984, 30860440, 7935395, 13354438, ), - u32x8::new( + u32x8::new_const( 17841328, 11063269, 1664538, 26687568, 6268968, 22280371, 17275484, 4523163, ), - u32x8::new( + u32x8::new_const( 15886041, 56799482, 15446552, 21712778, 1005290, 17827215, 4978741, 6854882, ), - u32x8::new( + u32x8::new_const( 34319277, 47731002, 20321804, 28544575, 29591814, 63376351, 24754545, 26001714, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 66783087, 5234346, 46102, 8566476, 19947339, 20180418, 25398238, 3726678, ), - u32x8::new( + u32x8::new_const( 63890180, 46380965, 20674069, 5366544, 59661487, 48406612, 31533614, 7071217, ), - u32x8::new( + u32x8::new_const( 13104676, 1406631, 24326736, 19854367, 61039528, 11019904, 31967425, 19219275, ), - u32x8::new( + u32x8::new_const( 39003597, 30143957, 15351834, 8639435, 57309582, 61436794, 15830475, 10090318, ), - u32x8::new( + u32x8::new_const( 45923044, 6700175, 99413, 21263025, 23762647, 53905481, 6063914, 10065424, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 42822326, 57678669, 4052879, 25452667, 54049411, 2373092, 22337016, 7701046, ), - u32x8::new( + u32x8::new_const( 44382355, 43307377, 16761537, 30373573, 49790216, 23230748, 25655306, 10519391, ), - u32x8::new( + u32x8::new_const( 919475, 59371245, 1273450, 25558666, 9724711, 8556709, 25755845, 10887647, ), - u32x8::new( + u32x8::new_const( 25465699, 44651158, 17658392, 11257418, 29735193, 22885150, 7094716, 26828565, ), - u32x8::new( + u32x8::new_const( 48237389, 47661599, 27054393, 7328070, 27280193, 65616691, 23062005, 4170709, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 26535281, 60238317, 30343788, 25790743, 37993933, 24614372, 9523840, 10401918, ), - u32x8::new( + u32x8::new_const( 2783987, 29468958, 4697011, 19804475, 37246678, 46797720, 10261254, 18942252, ), - u32x8::new( + u32x8::new_const( 58135580, 60247753, 25301938, 6844561, 20949454, 39844754, 4552026, 919057, ), - u32x8::new( + u32x8::new_const( 6694071, 44126261, 32285330, 31370180, 24603698, 53328179, 13971149, 5325636, ), - u32x8::new( + u32x8::new_const( 64879487, 582094, 17982081, 19190425, 24951286, 26923842, 29077174, 33286062, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 54863941, 67016431, 1224043, 23371240, 62940074, 52101083, 13523637, 30366406, ), - u32x8::new( + u32x8::new_const( 36324581, 25407485, 18258623, 4698602, 50300544, 2658516, 26300935, 2611030, ), - u32x8::new( + u32x8::new_const( 27183975, 21791014, 18105064, 9875199, 58118912, 54198635, 6400311, 14767984, ), - u32x8::new( + u32x8::new_const( 33918318, 42937962, 14809334, 22136592, 10636588, 29082337, 29829692, 28549776, ), - u32x8::new( + u32x8::new_const( 61080905, 854212, 12202487, 20004503, 9256495, 6903981, 20567109, 347423, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 41391822, 34336880, 22362564, 14247996, 12115604, 41583344, 7639288, 28910945, ), - u32x8::new( + u32x8::new_const( 62066617, 59758859, 26665947, 11614812, 65737664, 45704543, 30324810, 12868376, ), - u32x8::new( + u32x8::new_const( 17491771, 43589814, 9454919, 26047850, 52629282, 39304244, 3868968, 19296062, ), - u32x8::new( + u32x8::new_const( 17826638, 30413590, 32534225, 32741469, 15012391, 14365713, 33039233, 14791399, ), - u32x8::new( + u32x8::new_const( 64115596, 59197067, 32739005, 23275744, 32954320, 22241406, 20788442, 4942942, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 31956192, 59570132, 2784352, 4237732, 47222312, 4860927, 18658867, 15279314, ), - u32x8::new( + u32x8::new_const( 63240583, 28160478, 23524941, 13390861, 66437406, 57718120, 33345312, 28896298, ), - u32x8::new( + u32x8::new_const( 39026193, 46239965, 21440243, 25070488, 64012383, 60999016, 16517060, 29565907, ), - u32x8::new( + u32x8::new_const( 18118181, 60161496, 4212092, 23976240, 36277753, 62363144, 5816868, 16964362, ), - u32x8::new( + u32x8::new_const( 18196138, 62490693, 281468, 7934713, 56027312, 62015725, 4837237, 32932252, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 29885826, 51028067, 30418143, 33438769, 62542283, 39442528, 31535876, 143299, ), - u32x8::new( + u32x8::new_const( 17143063, 56709783, 14451852, 15782104, 32762665, 14047066, 26295037, 5432487, ), - u32x8::new( + u32x8::new_const( 75151, 533606, 7539077, 30926189, 38410914, 23771680, 4872443, 29199566, ), - u32x8::new( + u32x8::new_const( 61522396, 48934708, 16223126, 207380, 11171993, 47975147, 14164574, 352966, ), - u32x8::new( + u32x8::new_const( 15449006, 56530757, 26796528, 12045834, 63738697, 40667227, 33001582, 9101885, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 43331297, 18431341, 25801195, 17267698, 19365485, 57295202, 22218985, 21284590, ), - u32x8::new( + u32x8::new_const( 2429849, 19152559, 10762172, 22564684, 21880390, 66866426, 20357935, 22641906, ), - u32x8::new( + u32x8::new_const( 19771185, 31652693, 3666117, 28136958, 23624283, 55101502, 6313920, 6783662, ), - u32x8::new( + u32x8::new_const( 3487137, 7092443, 11001876, 26196524, 47319246, 44542068, 17594073, 15027760, ), - u32x8::new( + u32x8::new_const( 49563607, 32191113, 4991283, 25400512, 46539152, 4155103, 32368171, 201203, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 20548943, 14334571, 4073874, 6368588, 53208883, 56484515, 15970071, 25561889, ), - u32x8::new( + u32x8::new_const( 49915097, 44030795, 11202344, 29284344, 60258023, 66225712, 8075764, 12383512, ), - u32x8::new( + u32x8::new_const( 45248912, 4933668, 9592153, 5819559, 31030983, 38174071, 32435814, 7442522, ), - u32x8::new( + u32x8::new_const( 62688129, 48218381, 22089545, 12897361, 21050881, 34278889, 7569163, 3225449, ), - u32x8::new( + u32x8::new_const( 19050183, 51089071, 32935757, 22640195, 66122318, 47144608, 18743677, 25177079, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 41186817, 46681702, 31819867, 32997133, 38559207, 27147015, 30293819, 16762988, ), - u32x8::new( + u32x8::new_const( 24154689, 51762873, 23883879, 13510519, 55338250, 61224161, 11663149, 30803960, ), - u32x8::new( + u32x8::new_const( 18104238, 14117824, 11724021, 21362053, 65704761, 35530242, 13498058, 33522849, ), - u32x8::new( + u32x8::new_const( 63812888, 23995539, 28920539, 24005193, 26412223, 36582218, 4251418, 26160309, ), - u32x8::new( + u32x8::new_const( 16822053, 66064082, 3482145, 31979593, 45937188, 54475379, 612917, 7976478, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 46509314, 55327128, 8944536, 274914, 26432930, 53829300, 21192572, 3569894, ), - u32x8::new( + u32x8::new_const( 20919764, 64356651, 30642344, 17215170, 20335124, 11203745, 18663316, 19024174, ), - u32x8::new( + u32x8::new_const( 59297055, 53842463, 3680204, 9806710, 54004169, 51484914, 29807998, 20134199, ), - u32x8::new( + u32x8::new_const( 14781592, 22628010, 26877930, 25880359, 30434803, 190607, 30184292, 8991040, ), - u32x8::new( + u32x8::new_const( 64400983, 64591751, 854562, 28216111, 20010398, 50414793, 9803872, 22687008, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 15091184, 32550863, 8818643, 4244752, 43123513, 64565526, 408838, 13206998, ), - u32x8::new( + u32x8::new_const( 16405061, 60379639, 31489017, 20949281, 27568751, 38734986, 8364264, 12451020, ), - u32x8::new( + u32x8::new_const( 16005217, 58008076, 1406778, 26546927, 39571784, 56365493, 31274296, 8918790, ), - u32x8::new( + u32x8::new_const( 23271122, 19453469, 27718201, 32742670, 234332, 36785342, 22601675, 14331046, ), - u32x8::new( + u32x8::new_const( 40636025, 22442705, 22115403, 23745859, 41164945, 61012, 12499614, 542137, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 62776018, 32835413, 17373246, 17187309, 54469193, 21770290, 15923753, 28996575, ), - u32x8::new( + u32x8::new_const( 59385210, 63082298, 12568449, 8509004, 9483342, 16105238, 5756054, 26890758, ), - u32x8::new( + u32x8::new_const( 53987996, 38201748, 5521661, 19060159, 18663191, 9093637, 27786835, 31189196, ), - u32x8::new( + u32x8::new_const( 65872678, 43635130, 27903055, 25020300, 65772737, 38110437, 5213502, 21909342, ), - u32x8::new( + u32x8::new_const( 4438979, 9680838, 10212446, 4764184, 13235684, 58245995, 20264570, 21024049, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 60835961, 48209103, 31049052, 4688268, 12426713, 59829045, 22302488, 29008521, ), - u32x8::new( + u32x8::new_const( 50401667, 29716596, 23531224, 7581281, 49071895, 6952617, 14934683, 8218256, ), - u32x8::new( + u32x8::new_const( 1601446, 36631413, 31774811, 29625330, 56786114, 8331539, 23129509, 19783344, ), - u32x8::new( + u32x8::new_const( 59514327, 64513110, 1772300, 5701338, 5737511, 16147555, 9461515, 5703271, ), - u32x8::new( + u32x8::new_const( 33072974, 54300426, 11940114, 1308663, 15627555, 4931627, 28443714, 20924342, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 18135013, 20358426, 4922557, 10015355, 65729669, 34786528, 26248549, 29194359, ), - u32x8::new( + u32x8::new_const( 797666, 34997544, 24316856, 25107230, 24612576, 4761401, 15307321, 32404252, ), - u32x8::new( + u32x8::new_const( 16501152, 60565831, 9487105, 9316022, 24986054, 31917592, 3962024, 2501883, ), - u32x8::new( + u32x8::new_const( 63356796, 50432342, 18044926, 30566881, 42032028, 31415202, 13524600, 16119907, ), - u32x8::new( + u32x8::new_const( 3927286, 57022374, 9265437, 21620772, 19481940, 3806938, 24836192, 14572399, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 10785787, 46564798, 368445, 33181384, 5319843, 52687136, 30347110, 29837357, ), - u32x8::new( + u32x8::new_const( 56436732, 47859251, 24141084, 22250712, 59046084, 4963427, 33463413, 17168859, ), - u32x8::new( + u32x8::new_const( 15512044, 6366740, 4737504, 27644548, 30307977, 25037929, 14593903, 12836490, ), - u32x8::new( + u32x8::new_const( 63878897, 34013023, 5860752, 7244096, 3689461, 57012135, 18389096, 11589351, ), - u32x8::new( + u32x8::new_const( 4682110, 36302830, 653422, 22316819, 14081831, 5657024, 11088376, 24110612, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 39907267, 45940262, 24887471, 18342609, 878445, 40456159, 12019082, 345107, ), - u32x8::new( + u32x8::new_const( 12794982, 28893944, 9447505, 11387200, 16961963, 13916996, 10893728, 25898006, ), - u32x8::new( + u32x8::new_const( 44934162, 53465865, 3583620, 1102334, 53917811, 63478576, 2426066, 10389549, ), - u32x8::new( + u32x8::new_const( 45096036, 37595344, 19367718, 20257175, 10280866, 41653449, 27665642, 375926, ), - u32x8::new( + u32x8::new_const( 45847901, 24064074, 32494820, 32204556, 10720704, 51079060, 1297436, 29853825, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 66303987, 36060363, 16494578, 24962147, 11971403, 49538586, 25060560, 1964341, ), - u32x8::new( + u32x8::new_const( 25988481, 27641502, 24909517, 27237087, 66646363, 52777626, 16360849, 10459972, ), - u32x8::new( + u32x8::new_const( 43930529, 34374176, 31225968, 8807030, 10394758, 35904854, 25325589, 19335583, ), - u32x8::new( + u32x8::new_const( 25094697, 34380951, 20051185, 32287161, 11739332, 53887441, 30517319, 26601892, ), - u32x8::new( + u32x8::new_const( 8868546, 35635502, 32513071, 28248087, 51946989, 14222744, 19198839, 23261841, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 51218008, 5070126, 11046681, 5320810, 61212079, 34104447, 23895089, 6460727, ), - u32x8::new( + u32x8::new_const( 39843528, 46278671, 10426120, 25624792, 66658766, 37140083, 28933107, 12969597, ), - u32x8::new( + u32x8::new_const( 59635793, 40220191, 5751421, 173680, 58321825, 740337, 1412847, 7682623, ), - u32x8::new( + u32x8::new_const( 975962, 56440763, 20812276, 22631115, 49095824, 19883130, 2419746, 31043648, ), - u32x8::new( + u32x8::new_const( 66208703, 39669328, 22525915, 3748897, 65994776, 34533552, 8126286, 18326047, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 64176557, 3912400, 19351673, 30068471, 31190055, 24221683, 33142424, 28698542, ), - u32x8::new( + u32x8::new_const( 34784792, 4109933, 3867193, 19557314, 2112512, 32715890, 24550117, 16595976, ), - u32x8::new( + u32x8::new_const( 35542761, 48024875, 10925431, 31526577, 66577735, 23189821, 13375709, 1735095, ), - u32x8::new( + u32x8::new_const( 59699254, 43854093, 29783239, 24777271, 19600372, 39924461, 2896720, 1472185, ), - u32x8::new( + u32x8::new_const( 56389656, 35980854, 33172342, 1370336, 23707480, 57654949, 7850973, 12655016, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 38372660, 57101970, 7044964, 12732710, 57535705, 6043201, 30858914, 10946592, ), - u32x8::new( + u32x8::new_const( 21023468, 6946992, 26403324, 23901823, 35695559, 23440687, 4763891, 6514074, ), - u32x8::new( + u32x8::new_const( 28662273, 30933699, 9352242, 26354829, 37402243, 3145176, 8770289, 525937, ), - u32x8::new( + u32x8::new_const( 54933102, 36695832, 3281859, 4755022, 23043294, 32794379, 15618886, 23602412, ), - u32x8::new( + u32x8::new_const( 9931565, 29897140, 2480737, 24193701, 7833615, 2284939, 893926, 13421882, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 22917795, 22088359, 28978099, 19794863, 60542318, 29878494, 31053731, 9080720, ), - u32x8::new( + u32x8::new_const( 23679072, 52547035, 28424916, 20647332, 4008761, 28267029, 12961289, 1589095, ), - u32x8::new( + u32x8::new_const( 55616194, 26678929, 14998265, 23274397, 54625466, 46244264, 28627706, 33030665, ), - u32x8::new( + u32x8::new_const( 11527330, 6449415, 26531607, 3472938, 41541592, 62607682, 19862690, 20564723, ), - u32x8::new( + u32x8::new_const( 32843805, 49066843, 28425824, 19521495, 48792073, 48242878, 27392443, 13175986, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 16185025, 61537525, 2961305, 1492442, 25123147, 3095034, 31896958, 33089615, ), - u32x8::new( + u32x8::new_const( 64748157, 18336595, 16522231, 25426312, 65718949, 35485695, 30554083, 10205918, ), - u32x8::new( + u32x8::new_const( 39626934, 39271045, 16420458, 9826240, 56483981, 27128085, 3783403, 13360006, ), - u32x8::new( + u32x8::new_const( 30793778, 66771960, 17241420, 6564573, 61102581, 29974476, 32385512, 9011754, ), - u32x8::new( + u32x8::new_const( 28068166, 11862220, 14323567, 12380617, 52090465, 16029056, 24495309, 21409233, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 59411973, 57437124, 11695483, 17586857, 16108987, 43449109, 31098002, 6248476, ), - u32x8::new( + u32x8::new_const( 42258047, 61595931, 29308533, 11742653, 43042345, 27373650, 30165249, 21929989, ), - u32x8::new( + u32x8::new_const( 49907221, 9620337, 21888081, 20981082, 56288861, 61562203, 33223566, 3582446, ), - u32x8::new( + u32x8::new_const( 57535017, 41003416, 22080416, 14463796, 65518565, 18127889, 24370863, 33332664, ), - u32x8::new( + u32x8::new_const( 66655380, 6430175, 471782, 11947673, 30596400, 18898659, 15930721, 4211851, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 6757410, 65455566, 13584784, 11362173, 10797127, 24451471, 19541370, 29309435, ), - u32x8::new( + u32x8::new_const( 40360156, 17685025, 18326181, 3846903, 13693365, 63049479, 31900359, 23385063, ), - u32x8::new( + u32x8::new_const( 52455038, 57513503, 22163311, 27095042, 48610726, 66454160, 12085341, 26357004, ), - u32x8::new( + u32x8::new_const( 22097042, 14063840, 6705778, 14342902, 66139825, 20702105, 31279090, 7495745, ), - u32x8::new( + u32x8::new_const( 27360710, 49314837, 18774847, 7146436, 37066216, 42004961, 22409916, 10524446, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 1497507, 33054449, 11839906, 2960428, 40538463, 18884538, 25018820, 4073970, ), - u32x8::new( + u32x8::new_const( 54484385, 43640735, 2808257, 20710708, 39840730, 27222424, 21783544, 11848522, ), - u32x8::new( + u32x8::new_const( 45765237, 48200555, 9299019, 9393151, 34818188, 56098995, 13575233, 21012731, ), - u32x8::new( + u32x8::new_const( 4265428, 49627650, 24960282, 9425650, 47883651, 2797524, 11853190, 22877329, ), - u32x8::new( + u32x8::new_const( 25008173, 64199503, 380047, 12107343, 12329448, 11914399, 764281, 29687002, ), ])), CachedPoint(FieldElement2625x4([ - u32x8::new( + u32x8::new_const( 35889734, 23047226, 4022841, 7017445, 7274086, 53316179, 25100176, 15310676, ), - u32x8::new( + u32x8::new_const( 42409427, 30270106, 6823853, 31551384, 40645017, 66489807, 18021817, 32669351, ), - u32x8::new( + u32x8::new_const( 39827134, 43680850, 28297996, 20258133, 26058742, 52643238, 22238331, 21690533, ), - u32x8::new( + u32x8::new_const( 60808002, 17499995, 30042246, 29310584, 48219954, 29389518, 8680514, 17844709, ), - u32x8::new( + u32x8::new_const( 6452896, 50116553, 9532047, 26821214, 44524351, 50428429, 21904953, 12608048, ), ])), diff --git a/src/backend/vector/avx2/field.rs b/src/backend/vector/avx2/field.rs index 2612c75d8..9f278723d 100644 --- a/src/backend/vector/avx2/field.rs +++ b/src/backend/vector/avx2/field.rs @@ -40,8 +40,8 @@ const C_LANES64: u8 = 0b00_11_00_00; #[allow(unused)] const D_LANES64: u8 = 0b11_00_00_00; +use crate::backend::vector::packed_simd::{u32x8, u64x4}; use core::ops::{Add, Mul, Neg}; -use packed_simd::{i32x8, u32x8, u64x4, IntoBits}; use crate::backend::serial::u64::field::FieldElement51; use crate::backend::vector::avx2::constants::{ @@ -61,12 +61,12 @@ use crate::backend::vector::avx2::constants::{ fn unpack_pair(src: u32x8) -> (u32x8, u32x8) { let a: u32x8; let b: u32x8; - let zero = i32x8::new(0, 0, 0, 0, 0, 0, 0, 0); + let zero = u32x8::splat(0); unsafe { use core::arch::x86_64::_mm256_unpackhi_epi32; use core::arch::x86_64::_mm256_unpacklo_epi32; - a = _mm256_unpacklo_epi32(src.into_bits(), zero.into_bits()).into_bits(); - b = _mm256_unpackhi_epi32(src.into_bits(), zero.into_bits()).into_bits(); + a = _mm256_unpacklo_epi32(src.into(), zero.into()).into(); + b = _mm256_unpackhi_epi32(src.into(), zero.into()).into(); } (a, b) } @@ -89,13 +89,13 @@ fn repack_pair(x: u32x8, y: u32x8) -> u32x8 { // Input: x = (a0, 0, b0, 0, c0, 0, d0, 0) // Input: y = (a1, 0, b1, 0, c1, 0, d1, 0) - let x_shuffled = _mm256_shuffle_epi32(x.into_bits(), 0b11_01_10_00); - let y_shuffled = _mm256_shuffle_epi32(y.into_bits(), 0b10_00_11_01); + let x_shuffled = _mm256_shuffle_epi32(x.into(), 0b11_01_10_00); + let y_shuffled = _mm256_shuffle_epi32(y.into(), 0b10_00_11_01); // x' = (a0, b0, 0, 0, c0, d0, 0, 0) // y' = ( 0, 0, a1, b1, 0, 0, c1, d1) - _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into_bits() + _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into() } } @@ -180,7 +180,7 @@ impl ConditionallySelectable for FieldElement2625x4 { } impl FieldElement2625x4 { - pub const ZERO: FieldElement2625x4 = FieldElement2625x4([u32x8::splat(0); 5]); + pub const ZERO: FieldElement2625x4 = FieldElement2625x4([u32x8::splat_const::<0>(); 5]); /// Split this vector into an array of four (serial) field /// elements. @@ -188,14 +188,14 @@ impl FieldElement2625x4 { pub fn split(&self) -> [FieldElement51; 4] { let mut out = [FieldElement51::ZERO; 4]; for i in 0..5 { - let a_2i = self.0[i].extract(0) as u64; // - let b_2i = self.0[i].extract(1) as u64; // - let a_2i_1 = self.0[i].extract(2) as u64; // `. - let b_2i_1 = self.0[i].extract(3) as u64; // | pre-swapped to avoid - let c_2i = self.0[i].extract(4) as u64; // | a cross lane shuffle - let d_2i = self.0[i].extract(5) as u64; // .' - let c_2i_1 = self.0[i].extract(6) as u64; // - let d_2i_1 = self.0[i].extract(7) as u64; // + let a_2i = self.0[i].extract::<0>() as u64; // + let b_2i = self.0[i].extract::<1>() as u64; // + let a_2i_1 = self.0[i].extract::<2>() as u64; // `. + let b_2i_1 = self.0[i].extract::<3>() as u64; // | pre-swapped to avoid + let c_2i = self.0[i].extract::<4>() as u64; // | a cross lane shuffle + let d_2i = self.0[i].extract::<5>() as u64; // .' + let c_2i_1 = self.0[i].extract::<6>() as u64; // + let d_2i_1 = self.0[i].extract::<7>() as u64; // out[0].0[i] = a_2i + (a_2i_1 << 26); out[1].0[i] = b_2i + (b_2i_1 << 26); @@ -233,7 +233,7 @@ impl FieldElement2625x4 { // Note that this gets turned into a generic LLVM // shuffle-by-constants, which can be lowered to a simpler // instruction than a generic permute. - _mm256_permutevar8x32_epi32(x.into_bits(), c.into_bits()).into_bits() + _mm256_permutevar8x32_epi32(x.into(), c.into()).into() } } @@ -279,38 +279,29 @@ impl FieldElement2625x4 { // which does not require a shuffle immediate but *is* lowered // to immediate shuffles anyways). match control { - Lanes::C => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), C_LANES as i32).into_bits() - } - Lanes::D => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), D_LANES as i32).into_bits() - } + Lanes::C => _mm256_blend_epi32(x.into(), y.into(), C_LANES as i32).into(), + Lanes::D => _mm256_blend_epi32(x.into(), y.into(), D_LANES as i32).into(), Lanes::AD => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | D_LANES) as i32) - .into_bits() + _mm256_blend_epi32(x.into(), y.into(), (A_LANES | D_LANES) as i32).into() } Lanes::AB => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | B_LANES) as i32) - .into_bits() + _mm256_blend_epi32(x.into(), y.into(), (A_LANES | B_LANES) as i32).into() } Lanes::AC => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | C_LANES) as i32) - .into_bits() + _mm256_blend_epi32(x.into(), y.into(), (A_LANES | C_LANES) as i32).into() } Lanes::CD => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), (C_LANES | D_LANES) as i32) - .into_bits() + _mm256_blend_epi32(x.into(), y.into(), (C_LANES | D_LANES) as i32).into() } Lanes::BC => { - _mm256_blend_epi32(x.into_bits(), y.into_bits(), (B_LANES | C_LANES) as i32) - .into_bits() + _mm256_blend_epi32(x.into(), y.into(), (B_LANES | C_LANES) as i32).into() } Lanes::ABCD => _mm256_blend_epi32( - x.into_bits(), - y.into_bits(), + x.into(), + y.into(), (A_LANES | B_LANES | C_LANES | D_LANES) as i32, ) - .into_bits(), + .into(), } } } @@ -413,7 +404,7 @@ impl FieldElement2625x4 { /// The coefficients of the result are bounded with \\( b < 0.0002 \\). #[inline] pub fn reduce(&self) -> FieldElement2625x4 { - let shifts = i32x8::new(26, 26, 25, 25, 26, 26, 25, 25); + let shifts = u32x8::new(26, 26, 25, 25, 26, 26, 25, 25); let masks = u32x8::new( (1 << 26) - 1, (1 << 26) - 1, @@ -436,8 +427,8 @@ impl FieldElement2625x4 { use core::arch::x86_64::_mm256_shuffle_epi32; use core::arch::x86_64::_mm256_srlv_epi32; - let c = _mm256_srlv_epi32(v.into_bits(), shifts.into_bits()); - _mm256_shuffle_epi32(c, 0b01_00_11_10).into_bits() + let c = _mm256_srlv_epi32(v.into(), shifts.into()); + _mm256_shuffle_epi32(c, 0b01_00_11_10).into() } }; @@ -458,7 +449,7 @@ impl FieldElement2625x4 { let combine = |v_lo: u32x8, v_hi: u32x8| -> u32x8 { unsafe { use core::arch::x86_64::_mm256_blend_epi32; - _mm256_blend_epi32(v_lo.into_bits(), v_hi.into_bits(), 0b11_00_11_00).into_bits() + _mm256_blend_epi32(v_lo.into(), v_hi.into(), 0b11_00_11_00).into() } }; @@ -488,17 +479,17 @@ impl FieldElement2625x4 { // // c98 = (c(x9), c(y9), c(x8), c(y8), c(z9), c(w9), c(z8), c(w8)); // c9_spread = (c(x9), c(x8), c(y9), c(y8), c(z9), c(z8), c(w9), c(w8)). - let c9_spread = _mm256_shuffle_epi32(c98.into_bits(), 0b11_01_10_00); + let c9_spread = _mm256_shuffle_epi32(c98.into(), 0b11_01_10_00); // Since the carryouts are bounded by 2^7, their products with 19 // are bounded by 2^11.25. This means that // // c9_19_spread = (19*c(x9), 0, 19*c(y9), 0, 19*c(z9), 0, 19*c(w9), 0). - let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into_bits()); + let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into()); // Unshuffle: // c9_19 = (19*c(x9), 19*c(y9), 0, 0, 19*c(z9), 19*c(w9), 0, 0). - _mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into_bits() + _mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into() }; // Add the final carryin. @@ -531,11 +522,11 @@ impl FieldElement2625x4 { debug_assert!(i < 9); if i % 2 == 0 { // Even limbs have 26 bits - z[i + 1] += z[i] >> 26; + z[i + 1] += z[i].shr::<26>(); z[i] &= LOW_26_BITS; } else { // Odd limbs have 25 bits - z[i + 1] += z[i] >> 25; + z[i + 1] += z[i].shr::<25>(); z[i] &= LOW_25_BITS; } }; @@ -558,17 +549,14 @@ impl FieldElement2625x4 { // big. To ensure c < 2^32, we would need z[9] < 2^57. // Instead, we split the carry in two, with c = c_0 + c_1*2^26. - let c = z[9] >> 25; + let c = z[9].shr::<25>(); z[9] &= LOW_25_BITS; let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26; - let mut c1: u64x4 = c >> 26; // c1 < 2^(39-26) = 2^13; + let mut c1: u64x4 = c.shr::<26>(); // c1 < 2^(39-26) = 2^13; - unsafe { - use core::arch::x86_64::_mm256_mul_epu32; - let x19 = u64x4::splat(19); - c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25 - c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25 - } + let x19 = u64x4::splat(19); + c0 = u32x8::from(c0).mul32(u32x8::from(x19)); + c1 = u32x8::from(c1).mul32(u32x8::from(x19)); z[0] += c0; // z0 < 2^26 + 2^30.25 < 2^30.33 z[1] += c1; // z1 < 2^25 + 2^17.25 < 2^25.0067 @@ -582,11 +570,11 @@ impl FieldElement2625x4 { // // So the packed result is bounded with b = 0.007. FieldElement2625x4([ - repack_pair(z[0].into_bits(), z[1].into_bits()), - repack_pair(z[2].into_bits(), z[3].into_bits()), - repack_pair(z[4].into_bits(), z[5].into_bits()), - repack_pair(z[6].into_bits(), z[7].into_bits()), - repack_pair(z[8].into_bits(), z[9].into_bits()), + repack_pair(z[0].into(), z[1].into()), + repack_pair(z[2].into(), z[3].into()), + repack_pair(z[4].into(), z[5].into()), + repack_pair(z[6].into(), z[7].into()), + repack_pair(z[8].into(), z[9].into()), ]) } @@ -603,14 +591,12 @@ impl FieldElement2625x4 { pub fn square_and_negate_D(&self) -> FieldElement2625x4 { #[inline(always)] fn m(x: u32x8, y: u32x8) -> u64x4 { - use core::arch::x86_64::_mm256_mul_epu32; - unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() } + x.mul32(y) } #[inline(always)] fn m_lo(x: u32x8, y: u32x8) -> u32x8 { - use core::arch::x86_64::_mm256_mul_epu32; - unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() } + x.mul32(y).into() } let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0); @@ -621,14 +607,14 @@ impl FieldElement2625x4 { let (x6, x7) = unpack_pair(self.0[3]); let (x8, x9) = unpack_pair(self.0[4]); - let x0_2 = x0 << 1; - let x1_2 = x1 << 1; - let x2_2 = x2 << 1; - let x3_2 = x3 << 1; - let x4_2 = x4 << 1; - let x5_2 = x5 << 1; - let x6_2 = x6 << 1; - let x7_2 = x7 << 1; + let x0_2 = x0.shl::<1>(); + let x1_2 = x1.shl::<1>(); + let x2_2 = x2.shl::<1>(); + let x3_2 = x3.shl::<1>(); + let x4_2 = x4.shl::<1>(); + let x5_2 = x5.shl::<1>(); + let x6_2 = x6.shl::<1>(); + let x7_2 = x7.shl::<1>(); let x5_19 = m_lo(v19, x5); let x6_19 = m_lo(v19, x6); @@ -636,16 +622,16 @@ impl FieldElement2625x4 { let x8_19 = m_lo(v19, x8); let x9_19 = m_lo(v19, x9); - let mut z0 = m(x0, x0) + m(x2_2, x8_19) + m(x4_2, x6_19) + ((m(x1_2, x9_19) + m(x3_2, x7_19) + m(x5, x5_19)) << 1); - let mut z1 = m(x0_2, x1) + m(x3_2, x8_19) + m(x5_2, x6_19) + ((m(x2, x9_19) + m(x4, x7_19)) << 1); - let mut z2 = m(x0_2, x2) + m(x1_2, x1) + m(x4_2, x8_19) + m(x6, x6_19) + ((m(x3_2, x9_19) + m(x5_2, x7_19)) << 1); - let mut z3 = m(x0_2, x3) + m(x1_2, x2) + m(x5_2, x8_19) + ((m(x4, x9_19) + m(x6, x7_19)) << 1); - let mut z4 = m(x0_2, x4) + m(x1_2, x3_2) + m(x2, x2) + m(x6_2, x8_19) + ((m(x5_2, x9_19) + m(x7, x7_19)) << 1); - let mut z5 = m(x0_2, x5) + m(x1_2, x4) + m(x2_2, x3) + m(x7_2, x8_19) + ((m(x6, x9_19)) << 1); - let mut z6 = m(x0_2, x6) + m(x1_2, x5_2) + m(x2_2, x4) + m(x3_2, x3) + m(x8, x8_19) + ((m(x7_2, x9_19)) << 1); - let mut z7 = m(x0_2, x7) + m(x1_2, x6) + m(x2_2, x5) + m(x3_2, x4) + ((m(x8, x9_19)) << 1); - let mut z8 = m(x0_2, x8) + m(x1_2, x7_2) + m(x2_2, x6) + m(x3_2, x5_2) + m(x4, x4) + ((m(x9, x9_19)) << 1); - let mut z9 = m(x0_2, x9) + m(x1_2, x8) + m(x2_2, x7) + m(x3_2, x6) + m(x4_2, x5) ; + let mut z0 = m(x0, x0) + m(x2_2, x8_19) + m(x4_2, x6_19) + ((m(x1_2, x9_19) + m(x3_2, x7_19) + m(x5, x5_19)).shl::<1>()); + let mut z1 = m(x0_2, x1) + m(x3_2, x8_19) + m(x5_2, x6_19) + ((m(x2, x9_19) + m(x4, x7_19)).shl::<1>()); + let mut z2 = m(x0_2, x2) + m(x1_2, x1) + m(x4_2, x8_19) + m(x6, x6_19) + ((m(x3_2, x9_19) + m(x5_2, x7_19)).shl::<1>()); + let mut z3 = m(x0_2, x3) + m(x1_2, x2) + m(x5_2, x8_19) + ((m(x4, x9_19) + m(x6, x7_19)).shl::<1>()); + let mut z4 = m(x0_2, x4) + m(x1_2, x3_2) + m(x2, x2) + m(x6_2, x8_19) + ((m(x5_2, x9_19) + m(x7, x7_19)).shl::<1>()); + let mut z5 = m(x0_2, x5) + m(x1_2, x4) + m(x2_2, x3) + m(x7_2, x8_19) + ((m(x6, x9_19)).shl::<1>()); + let mut z6 = m(x0_2, x6) + m(x1_2, x5_2) + m(x2_2, x4) + m(x3_2, x3) + m(x8, x8_19) + ((m(x7_2, x9_19)).shl::<1>()); + let mut z7 = m(x0_2, x7) + m(x1_2, x6) + m(x2_2, x5) + m(x3_2, x4) + ((m(x8, x9_19)).shl::<1>()); + let mut z8 = m(x0_2, x8) + m(x1_2, x7_2) + m(x2_2, x6) + m(x3_2, x5_2) + m(x4, x4) + ((m(x9, x9_19)).shl::<1>()); + let mut z9 = m(x0_2, x9) + m(x1_2, x8) + m(x2_2, x7) + m(x3_2, x6) + m(x4_2, x5) ; // The biggest z_i is bounded as z_i < 249*2^(51 + 2*b); // if b < 1.5 we get z_i < 4485585228861014016. @@ -670,7 +656,7 @@ impl FieldElement2625x4 { let negate_D = |x: u64x4, p: u64x4| -> u64x4 { unsafe { use core::arch::x86_64::_mm256_blend_epi32; - _mm256_blend_epi32(x.into_bits(), (p - x).into_bits(), D_LANES64 as i32).into_bits() + _mm256_blend_epi32(x.into(), (p - x).into(), D_LANES64 as i32).into() } }; @@ -741,30 +727,26 @@ impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 { /// The coefficients of the result are bounded with \\( b < 0.007 \\). #[inline] fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 { - unsafe { - use core::arch::x86_64::_mm256_mul_epu32; - - let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0); - - let (b0, b1) = unpack_pair(self.0[0]); - let (b2, b3) = unpack_pair(self.0[1]); - let (b4, b5) = unpack_pair(self.0[2]); - let (b6, b7) = unpack_pair(self.0[3]); - let (b8, b9) = unpack_pair(self.0[4]); - - FieldElement2625x4::reduce64([ - _mm256_mul_epu32(b0.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b1.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b2.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b3.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b4.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b5.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b6.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b7.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b8.into_bits(), consts.into_bits()).into_bits(), - _mm256_mul_epu32(b9.into_bits(), consts.into_bits()).into_bits(), - ]) - } + let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0); + + let (b0, b1) = unpack_pair(self.0[0]); + let (b2, b3) = unpack_pair(self.0[1]); + let (b4, b5) = unpack_pair(self.0[2]); + let (b6, b7) = unpack_pair(self.0[3]); + let (b8, b9) = unpack_pair(self.0[4]); + + FieldElement2625x4::reduce64([ + b0.mul32(consts), + b1.mul32(consts), + b2.mul32(consts), + b3.mul32(consts), + b4.mul32(consts), + b5.mul32(consts), + b6.mul32(consts), + b7.mul32(consts), + b8.mul32(consts), + b9.mul32(consts), + ]) } } @@ -786,14 +768,12 @@ impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 { fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 { #[inline(always)] fn m(x: u32x8, y: u32x8) -> u64x4 { - use core::arch::x86_64::_mm256_mul_epu32; - unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() } + x.mul32(y) } #[inline(always)] fn m_lo(x: u32x8, y: u32x8) -> u32x8 { - use core::arch::x86_64::_mm256_mul_epu32; - unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() } + x.mul32(y).into() } let (x0, x1) = unpack_pair(self.0[0]); diff --git a/src/backend/vector/ifma/constants.rs b/src/backend/vector/ifma/constants.rs index 47b9b263d..66ace9643 100644 --- a/src/backend/vector/ifma/constants.rs +++ b/src/backend/vector/ifma/constants.rs @@ -9,7 +9,7 @@ //! This module contains constants used by the IFMA backend. -use packed_simd::u64x4; +use crate::backend::vector::packed_simd::u64x4; #[cfg(feature = "precomputed-tables")] use crate::window::NafLookupTable8; @@ -19,58 +19,58 @@ use super::field::{F51x4Reduced, F51x4Unreduced}; /// The identity element as an `ExtendedPoint`. pub(crate) static EXTENDEDPOINT_IDENTITY: ExtendedPoint = ExtendedPoint(F51x4Unreduced([ - u64x4::new(0, 1, 1, 0), - u64x4::new(0, 0, 0, 0), - u64x4::new(0, 0, 0, 0), - u64x4::new(0, 0, 0, 0), - u64x4::new(0, 0, 0, 0), + u64x4::new_const(0, 1, 1, 0), + u64x4::new_const(0, 0, 0, 0), + u64x4::new_const(0, 0, 0, 0), + u64x4::new_const(0, 0, 0, 0), + u64x4::new_const(0, 0, 0, 0), ])); /// The identity element as a `CachedPoint`. pub(crate) static CACHEDPOINT_IDENTITY: CachedPoint = CachedPoint(F51x4Reduced([ - u64x4::new(121647, 121666, 243332, 2251799813685229), - u64x4::new(2251799813685248, 0, 0, 2251799813685247), - u64x4::new(2251799813685247, 0, 0, 2251799813685247), - u64x4::new(2251799813685247, 0, 0, 2251799813685247), - u64x4::new(2251799813685247, 0, 0, 2251799813685247), + u64x4::new_const(121647, 121666, 243332, 2251799813685229), + u64x4::new_const(2251799813685248, 0, 0, 2251799813685247), + u64x4::new_const(2251799813685247, 0, 0, 2251799813685247), + u64x4::new_const(2251799813685247, 0, 0, 2251799813685247), + u64x4::new_const(2251799813685247, 0, 0, 2251799813685247), ])); /// Odd multiples of the Ed25519 basepoint: #[cfg(feature = "precomputed-tables")] pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = NafLookupTable8([ CachedPoint(F51x4Reduced([ - u64x4::new(1277522120965857, 73557767439946, 243332, 1943719795065404), - u64x4::new(108375142003455, 341984820733594, 0, 2097709862669256), - u64x4::new(150073485536043, 750646439938056, 0, 581130035634455), - u64x4::new(2149983732744869, 1903255931888577, 0, 646644904824193), - u64x4::new(291045673509296, 1060034214701851, 0, 325245010451737), + u64x4::new_const(1277522120965857, 73557767439946, 243332, 1943719795065404), + u64x4::new_const(108375142003455, 341984820733594, 0, 2097709862669256), + u64x4::new_const(150073485536043, 750646439938056, 0, 581130035634455), + u64x4::new_const(2149983732744869, 1903255931888577, 0, 646644904824193), + u64x4::new_const(291045673509296, 1060034214701851, 0, 325245010451737), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1970681836121889, 1660307753655178, 1077207637163462, 1436413309977108, ), - u64x4::new( + u64x4::new_const( 158785710838757, 919645875412951, 174577133496574, 2213787394009350, ), - u64x4::new( + u64x4::new_const( 1017606396438281, 1240932851489554, 918203302506967, 1239827708070863, ), - u64x4::new( + u64x4::new_const( 1748989883612327, 1745367742532782, 1168385548387, 1211387683826673, ), - u64x4::new( + u64x4::new_const( 799349980018733, 1471088235739693, 1505351346057417, @@ -78,31 +78,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 171437462972293, 36016853025886, 1184164975342640, 1633525003912147, ), - u64x4::new( + u64x4::new_const( 2113383632509037, 1946216474924125, 1884174984466256, 1373317790955847, ), - u64x4::new( + u64x4::new_const( 791293623466401, 1796466048084189, 444977763198796, 629823271230872, ), - u64x4::new( + u64x4::new_const( 1093217720067380, 2157024270666135, 238122980108466, 806820763806847, ), - u64x4::new( + u64x4::new_const( 793658959468458, 368578641413741, 11592529764159, @@ -110,31 +110,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1538027396670268, 1588896993892061, 675619548648376, 788373514423313, ), - u64x4::new( + u64x4::new_const( 1987517656073805, 1940987929951188, 666993851697339, 2040540928108427, ), - u64x4::new( + u64x4::new_const( 375514548584082, 1726008037083790, 1070069155000872, 570111103756303, ), - u64x4::new( + u64x4::new_const( 772223645372213, 2123395244967674, 868238486911408, 1846639042240362, ), - u64x4::new( + u64x4::new_const( 872865734460736, 32277956842850, 1701451131455402, @@ -142,31 +142,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1845177363882902, 275858237213625, 1052127336883600, 171072805852218, ), - u64x4::new( + u64x4::new_const( 139016783952609, 462699304987089, 430046471494974, 410922720999257, ), - u64x4::new( + u64x4::new_const( 846403935976337, 243817706931454, 971825428236901, 571800039596794, ), - u64x4::new( + u64x4::new_const( 807642685434918, 1933536976438782, 812324278898440, 688391556487313, ), - u64x4::new( + u64x4::new_const( 76239450396192, 629532732688863, 1833302026979779, @@ -174,31 +174,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1373931604989264, 331159264656614, 364391529321767, 874765630865409, ), - u64x4::new( + u64x4::new_const( 2109908262150241, 473400816504190, 91544045127333, 976307977609515, ), - u64x4::new( + u64x4::new_const( 330175435673491, 2126511895885904, 1022944071588421, 2158480209801463, ), - u64x4::new( + u64x4::new_const( 1305666795527971, 162063591028664, 2193154870675382, 1789166662611800, ), - u64x4::new( + u64x4::new_const( 817858592500508, 1672743239440202, 859976879916778, @@ -206,31 +206,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 274334925170164, 565841102587251, 603083835949120, 607539210240861, ), - u64x4::new( + u64x4::new_const( 196754662972649, 1339063476699167, 1406077076979491, 896902435668469, ), - u64x4::new( + u64x4::new_const( 397962210956733, 174839587476217, 1381082665748936, 175195877334136, ), - u64x4::new( + u64x4::new_const( 717429432748391, 1635309821746318, 363374010274647, 882908746261699, ), - u64x4::new( + u64x4::new_const( 600946602802781, 1946596133370711, 1532135183320341, @@ -238,31 +238,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2074443704000945, 2163534804938345, 425423840926528, 1100826171404853, ), - u64x4::new( + u64x4::new_const( 111700142796101, 1456893872751964, 1186145518682968, 2192182627706116, ), - u64x4::new( + u64x4::new_const( 1848722121856066, 2123239575044749, 1323870754599272, 883211262889775, ), - u64x4::new( + u64x4::new_const( 938263017712916, 689670293631396, 183944529557576, 501908638166580, ), - u64x4::new( + u64x4::new_const( 2170571907220631, 36636756989655, 1875035480138608, @@ -270,31 +270,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1053429956874064, 1636640618139765, 1556890827801070, 2142720579528828, ), - u64x4::new( + u64x4::new_const( 1814240918422814, 692326274601777, 1054896561802157, 2025454041705534, ), - u64x4::new( + u64x4::new_const( 2109495823888757, 1287497869997176, 194170063200096, 621116840113213, ), - u64x4::new( + u64x4::new_const( 2156505873679998, 2197064359737385, 1312887672223536, 369862818895912, ), - u64x4::new( + u64x4::new_const( 977381163563657, 1878897311974033, 2144566861359744, @@ -302,31 +302,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1266492498289486, 1301524759372145, 324789537938521, 442710471023019, ), - u64x4::new( + u64x4::new_const( 1232722320001345, 1191193089162455, 176474006074813, 2158950213252857, ), - u64x4::new( + u64x4::new_const( 1901782191467749, 494791441598902, 1820415815322129, 854954583485223, ), - u64x4::new( + u64x4::new_const( 1511383667649702, 792536415032464, 2027741263854728, 1727944381044738, ), - u64x4::new( + u64x4::new_const( 606355788891204, 1670687521471220, 582824350365415, @@ -334,31 +334,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1079942762813598, 2015830004785901, 479916361323351, 1907956590950158, ), - u64x4::new( + u64x4::new_const( 2053400302939156, 1319799126867070, 19493088767391, 1908755581402373, ), - u64x4::new( + u64x4::new_const( 2235858054780980, 885832711204321, 810332865560178, 103174191215441, ), - u64x4::new( + u64x4::new_const( 1843466881032833, 355511728384038, 693846715794114, 186545012724117, ), - u64x4::new( + u64x4::new_const( 1661758432892509, 1491022339899281, 698941123765263, @@ -366,31 +366,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1075933251927831, 400263885306647, 1308157532880528, 347933379126665, ), - u64x4::new( + u64x4::new_const( 673811632329433, 1584860147186478, 271778891257244, 498194055154207, ), - u64x4::new( + u64x4::new_const( 703783427747558, 1051624728592032, 1371463103351544, 230351033002960, ), - u64x4::new( + u64x4::new_const( 860729466483372, 421647596766583, 1520613871336707, 635298775280054, ), - u64x4::new( + u64x4::new_const( 1168352891728845, 1691216293752089, 1799491997061519, @@ -398,31 +398,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 420156727446514, 1483649215777128, 165508610199900, 1918121104840431, ), - u64x4::new( + u64x4::new_const( 2129902293682427, 730952770435213, 2184527544565390, 1939880362232986, ), - u64x4::new( + u64x4::new_const( 1771978364905086, 510975579746524, 927564335219142, 177574146260558, ), - u64x4::new( + u64x4::new_const( 2164104536437514, 1532598873799015, 406875369182421, 1367005937406517, ), - u64x4::new( + u64x4::new_const( 35073200082587, 1981124717036219, 1854087014063833, @@ -430,31 +430,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1963785875777739, 411497142699119, 1974557512687408, 1268304422747183, ), - u64x4::new( + u64x4::new_const( 762752575978150, 1443822019541748, 1331556159904338, 377726798263780, ), - u64x4::new( + u64x4::new_const( 825953972847841, 353487068141356, 1955697322427207, 2048226560172078, ), - u64x4::new( + u64x4::new_const( 1482378558684434, 657691905625918, 923870001994493, 1694132799397736, ), - u64x4::new( + u64x4::new_const( 1643904759603122, 170495566698285, 1218312703413378, @@ -462,31 +462,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 939230507241903, 2238763473105245, 1827325199528162, 1153939339775538, ), - u64x4::new( + u64x4::new_const( 38544505283339, 258889431497015, 351721979677947, 1357907379592829, ), - u64x4::new( + u64x4::new_const( 1393974676373341, 1131355528938676, 473104915298872, 978783482501776, ), - u64x4::new( + u64x4::new_const( 2131516168980501, 2113911780991092, 1477027502354261, 542884524860340, ), - u64x4::new( + u64x4::new_const( 1029606261349423, 64226378557628, 1669131167474348, @@ -494,31 +494,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1423176501543193, 163313632579593, 2220495688893001, 2220041045291870, ), - u64x4::new( + u64x4::new_const( 1111834224023697, 1026815658023689, 1404605100939775, 1412149108248227, ), - u64x4::new( + u64x4::new_const( 1542537854906076, 1270288391129127, 991419278941933, 1824939809581980, ), - u64x4::new( + u64x4::new_const( 1142003215657891, 525980550896367, 1508270666157963, 917719462309053, ), - u64x4::new( + u64x4::new_const( 400851268057105, 1620818232405188, 1251478578139510, @@ -526,31 +526,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2125383272208441, 1368790097335984, 11813369275978, 639513785921674, ), - u64x4::new( + u64x4::new_const( 2200806265616284, 1041996387620216, 1275149397833084, 1723371028064068, ), - u64x4::new( + u64x4::new_const( 603720163891275, 2135593511176153, 2049641644431548, 1198460677818310, ), - u64x4::new( + u64x4::new_const( 1862491879401621, 2008116580769441, 626566325260235, 1058308304975798, ), - u64x4::new( + u64x4::new_const( 628557314314858, 1075323332046522, 1631772244117095, @@ -558,31 +558,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1222773123817104, 363276129291452, 796237592807883, 1914425291893078, ), - u64x4::new( + u64x4::new_const( 1721259057429088, 734941709009373, 1553365830564638, 1492120931079419, ), - u64x4::new( + u64x4::new_const( 1009354843273686, 293884504384873, 1050281954944357, 134132942667344, ), - u64x4::new( + u64x4::new_const( 23119363298711, 1694754778833445, 1725925193393496, 1738396998222001, ), - u64x4::new( + u64x4::new_const( 1753692057254667, 118428526447110, 840961387840295, @@ -590,31 +590,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1004186117579547, 508771992330056, 1426571663072421, 2238524171903259, ), - u64x4::new( + u64x4::new_const( 744764613007812, 398885442368825, 2047459490294949, 2141797621077959, ), - u64x4::new( + u64x4::new_const( 4556204156489, 1708213022802363, 1071381560923933, 393474529142567, ), - u64x4::new( + u64x4::new_const( 350116198213005, 945907227204695, 168267474358731, 1801504420122711, ), - u64x4::new( + u64x4::new_const( 728788674520360, 1262722049156121, 455259596607008, @@ -622,31 +622,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2226818917892677, 185673745808179, 2240952219732549, 324137961621908, ), - u64x4::new( + u64x4::new_const( 1659527641857410, 973964060249383, 1349692151487730, 1172743533370593, ), - u64x4::new( + u64x4::new_const( 310591478467746, 2123977244137170, 774562885265820, 430035546191685, ), - u64x4::new( + u64x4::new_const( 2150863173197992, 2101978317708856, 193592648406011, 1375328504508580, ), - u64x4::new( + u64x4::new_const( 1946235834250479, 121741431658675, 1004342690620100, @@ -654,31 +654,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 463079632200153, 40415275714025, 545935352782679, 1458043501600908, ), - u64x4::new( + u64x4::new_const( 783771976559993, 880839641726471, 1782028201271831, 41664413404590, ), - u64x4::new( + u64x4::new_const( 985129151724159, 187728621410000, 16620051933318, 378011085567733, ), - u64x4::new( + u64x4::new_const( 1820372198168638, 905710046480679, 1912961774249737, 1868135861067161, ), - u64x4::new( + u64x4::new_const( 474460473983187, 1455684425673661, 652771171116843, @@ -686,31 +686,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1088886980746809, 1660218575261626, 527921875040240, 915086639857889, ), - u64x4::new( + u64x4::new_const( 1814735788528175, 1586698876186367, 2040856637532862, 405684812785624, ), - u64x4::new( + u64x4::new_const( 658578559700999, 1751442070931114, 1293623371490094, 715026719042518, ), - u64x4::new( + u64x4::new_const( 382156225644820, 897982285504960, 577673183555858, 1158728558309719, ), - u64x4::new( + u64x4::new_const( 1865791902475663, 124491617513788, 758484125168765, @@ -718,31 +718,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 330985690350617, 2214424721795630, 973374650780848, 1507267060932964, ), - u64x4::new( + u64x4::new_const( 1733823971011290, 1730742552292995, 669018866977489, 604527664126146, ), - u64x4::new( + u64x4::new_const( 1082092498645474, 1029182053935309, 756799947765834, 1764720030308351, ), - u64x4::new( + u64x4::new_const( 969912105693756, 38116887248276, 2148030115687613, 995140534653865, ), - u64x4::new( + u64x4::new_const( 2154373397460354, 298128883464656, 479587543632539, @@ -750,31 +750,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 843064865526549, 2019481782959016, 1873125524281672, 2013330239022371, ), - u64x4::new( + u64x4::new_const( 1192932403815186, 1818108671859220, 1247005102016258, 1210577394628058, ), - u64x4::new( + u64x4::new_const( 132359273326717, 795492788299178, 1235924489372816, 891705064411550, ), - u64x4::new( + u64x4::new_const( 1425833709104858, 152114045731085, 991347902581315, 1387773338707683, ), - u64x4::new( + u64x4::new_const( 48024203807922, 157005564892977, 1474053161953744, @@ -782,31 +782,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1076621484026788, 1309917234320927, 1786998180233659, 1595497085944737, ), - u64x4::new( + u64x4::new_const( 1737334672694726, 2038133716999447, 1929061192400917, 620544235219084, ), - u64x4::new( + u64x4::new_const( 1550527313469747, 329096759623509, 1585214659209474, 693419841748324, ), - u64x4::new( + u64x4::new_const( 1450010875912315, 2085047082180569, 757421110771886, 389367139787400, ), - u64x4::new( + u64x4::new_const( 781339490566117, 132941783448971, 258650459725225, @@ -814,31 +814,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 859638991542650, 2249840007426442, 1138753070862357, 793751342318913, ), - u64x4::new( + u64x4::new_const( 2133476133447306, 1027010646129239, 436851910892865, 866949948830344, ), - u64x4::new( + u64x4::new_const( 1936003572431223, 531513680252193, 1929877059408416, 830585477662503, ), - u64x4::new( + u64x4::new_const( 1460760405777960, 686673748420916, 275475330051554, 1581792376993692, ), - u64x4::new( + u64x4::new_const( 894482039456784, 1801274480988632, 16407898635278, @@ -846,31 +846,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 258585746227669, 936490904651492, 1826793887434108, 1201219990633823, ), - u64x4::new( + u64x4::new_const( 979462791643635, 461762372210187, 218708929991480, 1378150755760178, ), - u64x4::new( + u64x4::new_const( 642542170229970, 787135445552820, 371168855880557, 182642566486693, ), - u64x4::new( + u64x4::new_const( 1152277399721904, 1726910452705576, 1452393215705343, 2117799581546845, ), - u64x4::new( + u64x4::new_const( 1211265143925330, 14373046151823, 1745528818271507, @@ -878,31 +878,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 635154614562157, 1956763034454109, 509123035953043, 445727657534780, ), - u64x4::new( + u64x4::new_const( 2072765509783252, 1282639891593570, 1075086397362049, 722996110178195, ), - u64x4::new( + u64x4::new_const( 1385572918825603, 1190035835509576, 218317841176013, 1047865370756924, ), - u64x4::new( + u64x4::new_const( 473991569426488, 1910588123704592, 1338270051770806, 401676861680875, ), - u64x4::new( + u64x4::new_const( 992455353618436, 126422733426929, 1955248037756399, @@ -910,31 +910,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1555272991526078, 2214378187116349, 366893798097444, 1401502118355702, ), - u64x4::new( + u64x4::new_const( 1157229521930713, 2144787187506262, 1681597469697840, 847499096518697, ), - u64x4::new( + u64x4::new_const( 1872802655800758, 1027119609820793, 1137278714788290, 1664750301179485, ), - u64x4::new( + u64x4::new_const( 1091289858897030, 910126419483563, 1101920147235731, 597083075893952, ), - u64x4::new( + u64x4::new_const( 1711011533670315, 185206680336278, 1620960612579784, @@ -942,31 +942,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 73077300235958, 257216723095630, 466947267713785, 847105214181598, ), - u64x4::new( + u64x4::new_const( 1322905631406309, 407458059314731, 230045063190376, 923800751267786, ), - u64x4::new( + u64x4::new_const( 1146027205000415, 1541328763727623, 768510249199119, 1630223587589059, ), - u64x4::new( + u64x4::new_const( 1930368769879433, 1376145403022159, 1898149855343131, 1709421930518180, ), - u64x4::new( + u64x4::new_const( 633944191571764, 58314960742839, 2050971151574988, @@ -974,31 +974,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 361576929158539, 1035682890165818, 160945739362874, 266975208626222, ), - u64x4::new( + u64x4::new_const( 1635371797076046, 2106722851965197, 451585919077206, 6692426667180, ), - u64x4::new( + u64x4::new_const( 175820543533852, 2057511393764025, 1531846543720469, 1648320903946519, ), - u64x4::new( + u64x4::new_const( 947461770620940, 1107335044817620, 1725565474111216, 2182263619949220, ), - u64x4::new( + u64x4::new_const( 726444888601221, 1379664085279206, 1517215633290417, @@ -1006,31 +1006,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 686545355846512, 1712283265573167, 1743509592736302, 1653906616429153, ), - u64x4::new( + u64x4::new_const( 985108805667149, 2244347650874753, 1304749057936860, 321846134330589, ), - u64x4::new( + u64x4::new_const( 296321076156886, 1717929256240029, 450933772486425, 2015536856431605, ), - u64x4::new( + u64x4::new_const( 1690393512821866, 646913049470189, 2198650647576397, 1230646705710442, ), - u64x4::new( + u64x4::new_const( 601961913448442, 878806578800541, 620497587492381, @@ -1038,31 +1038,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 631510982676132, 1755753187697174, 1596201246674299, 2197888384902121, ), - u64x4::new( + u64x4::new_const( 626957678275745, 1447583371478595, 1375375216702128, 1443613232818823, ), - u64x4::new( + u64x4::new_const( 1962997804660501, 1051744123184519, 1002558639300437, 1237313314603385, ), - u64x4::new( + u64x4::new_const( 2118828335274995, 226398203764759, 889099617161107, 1620967117678504, ), - u64x4::new( + u64x4::new_const( 227261019362935, 2046897556746842, 591524060355369, @@ -1070,31 +1070,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1375403119051662, 222313965014452, 539873444241395, 213198095917915, ), - u64x4::new( + u64x4::new_const( 1436952871599114, 1229749762725246, 1174441562267670, 265367077740349, ), - u64x4::new( + u64x4::new_const( 11107426165917, 985954476039181, 1147329112365579, 1133931640328107, ), - u64x4::new( + u64x4::new_const( 585235055006843, 699515259687482, 299559608721134, 2134819767146767, ), - u64x4::new( + u64x4::new_const( 1376401105588528, 391412107507860, 302743651807545, @@ -1102,31 +1102,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1802940904616205, 1615132760193234, 869321663313735, 666494072545310, ), - u64x4::new( + u64x4::new_const( 1452849320020701, 1472716813676364, 472862999490802, 359937983286145, ), - u64x4::new( + u64x4::new_const( 1221198323133843, 491718521756528, 1387135774113906, 793779904904008, ), - u64x4::new( + u64x4::new_const( 1032129287829151, 30730741946697, 217603185195068, 2118169309744162, ), - u64x4::new( + u64x4::new_const( 225899335574721, 1767553399797342, 881082465669982, @@ -1134,31 +1134,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1127093564374276, 2245188499702906, 1250041622887441, 2179324911668149, ), - u64x4::new( + u64x4::new_const( 908019210866875, 1879900391060964, 1355047706206597, 647218945377302, ), - u64x4::new( + u64x4::new_const( 1616265604422592, 2134336781521657, 1157711219915601, 1227494173135033, ), - u64x4::new( + u64x4::new_const( 136450294813355, 1984543542455033, 1199486053011083, 33687889941331, ), - u64x4::new( + u64x4::new_const( 1053447012707371, 68239344331930, 537448158443925, @@ -1166,31 +1166,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 996806463322563, 2043104667851348, 1110361398300309, 1218740346887957, ), - u64x4::new( + u64x4::new_const( 399141907016839, 1307691109658227, 532535384961264, 896201194398872, ), - u64x4::new( + u64x4::new_const( 111705272106160, 1790972382466021, 1159338112559144, 303544352897203, ), - u64x4::new( + u64x4::new_const( 1036600573322969, 1457119922663674, 334117653665514, 460023361701263, ), - u64x4::new( + u64x4::new_const( 1363773215189933, 1915594049343802, 1661249423378694, @@ -1198,31 +1198,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 3093919631215, 574886478077610, 1704446919728971, 250093147254210, ), - u64x4::new( + u64x4::new_const( 1387413348737796, 360142717826981, 2116185073015983, 474541388374100, ), - u64x4::new( + u64x4::new_const( 1632539630892580, 1332404016215719, 2145297637794728, 1289783723173504, ), - u64x4::new( + u64x4::new_const( 1030244179060173, 579782698595797, 1062365251139982, 677149839815546, ), - u64x4::new( + u64x4::new_const( 6671539419876, 1426937459653775, 406942403696343, @@ -1230,31 +1230,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 271984148441782, 1708099625818957, 1499011822959235, 516808451044836, ), - u64x4::new( + u64x4::new_const( 1124847751346323, 2038336022958449, 1721698491022600, 705944403212572, ), - u64x4::new( + u64x4::new_const( 85459783780275, 1715213099986669, 1728445509034791, 730657630359717, ), - u64x4::new( + u64x4::new_const( 1185034652652387, 755472578204310, 476118360897817, 1800434542785310, ), - u64x4::new( + u64x4::new_const( 1815589628676941, 491778500674079, 1547664984392513, @@ -1262,31 +1262,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2036337168672113, 1730787524684269, 639134121311693, 698060925015524, ), - u64x4::new( + u64x4::new_const( 315211075189491, 1329055848835358, 688621136402134, 1271193060119448, ), - u64x4::new( + u64x4::new_const( 1697984374314012, 459330773536457, 305481314707918, 61676911066002, ), - u64x4::new( + u64x4::new_const( 2166631826859191, 2105217187401781, 937587962768434, 357397435365683, ), - u64x4::new( + u64x4::new_const( 1206757093145471, 1287847622009294, 1951336140421622, @@ -1294,31 +1294,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 82144190081093, 1568417433687791, 907555979158442, 2037855062523867, ), - u64x4::new( + u64x4::new_const( 1225315484058853, 315317868015613, 1765025920288384, 175223259828436, ), - u64x4::new( + u64x4::new_const( 1215010304871271, 662713408454950, 429517658575616, 991062684008811, ), - u64x4::new( + u64x4::new_const( 993837615254894, 1485561584889450, 2001836754226476, 1915943063896801, ), - u64x4::new( + u64x4::new_const( 818895101625673, 1342479472068804, 1380235330010671, @@ -1326,31 +1326,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1500726307559118, 956166860173424, 512663951564436, 1940180717699824, ), - u64x4::new( + u64x4::new_const( 1789521472720825, 779456898652427, 2035063615853504, 863582140589407, ), - u64x4::new( + u64x4::new_const( 634508890793787, 1748041666732214, 259642099961634, 1294936839797812, ), - u64x4::new( + u64x4::new_const( 2183334898697038, 2197242820694806, 2217225409073703, 992633998226449, ), - u64x4::new( + u64x4::new_const( 2197077498155916, 1562008797791883, 1395088759904208, @@ -1358,31 +1358,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 186854731652320, 284389440026580, 1252175415119400, 1025377410100223, ), - u64x4::new( + u64x4::new_const( 1578732129417607, 898645497852382, 2237766074482974, 1939197790303592, ), - u64x4::new( + u64x4::new_const( 1438830390640145, 1682452015845597, 1108441197232223, 1984134492898664, ), - u64x4::new( + u64x4::new_const( 282668727301669, 1609018289552856, 390363439795705, 1138459124667912, ), - u64x4::new( + u64x4::new_const( 18889015928490, 532489638086725, 324621535996080, @@ -1390,31 +1390,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2041327051605378, 2244037852176483, 2116336876147147, 9616672544864, ), - u64x4::new( + u64x4::new_const( 969847387559191, 1059119127679639, 1764630094670633, 364568045311834, ), - u64x4::new( + u64x4::new_const( 505938893153679, 2075421412172902, 326984153045666, 1959549727324704, ), - u64x4::new( + u64x4::new_const( 1088715617911260, 13917085151028, 950568481355929, 23687195265771, ), - u64x4::new( + u64x4::new_const( 1798284568673198, 808382292203333, 2214698741961545, @@ -1422,31 +1422,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1731488929623777, 1158815615106413, 1491090861948525, 1428384712900962, ), - u64x4::new( + u64x4::new_const( 722237139522457, 1514290328911535, 1366197913116230, 1519472657321210, ), - u64x4::new( + u64x4::new_const( 246028966932273, 1888239319448405, 423720022211163, 455243905681470, ), - u64x4::new( + u64x4::new_const( 738323403716001, 1758018973481179, 1180718299482318, 1008495946606708, ), - u64x4::new( + u64x4::new_const( 334959381596119, 1704599537529481, 2172191232106896, @@ -1454,31 +1454,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 273393076768079, 427388720298603, 1071733376018227, 1715429388968611, ), - u64x4::new( + u64x4::new_const( 751776629892313, 1965239102856011, 541955408230119, 831043488876080, ), - u64x4::new( + u64x4::new_const( 643718536393104, 390543998404644, 2176730661486279, 499459234889079, ), - u64x4::new( + u64x4::new_const( 1482404333915009, 865527293526285, 507957951411713, 216456252558825, ), - u64x4::new( + u64x4::new_const( 2210281256300231, 1519357818277551, 1257866936775246, @@ -1486,31 +1486,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2135395168187905, 2214400157568614, 2032983817870823, 1124945109072647, ), - u64x4::new( + u64x4::new_const( 1602820011758145, 906675633903289, 782700735390986, 2067218823525601, ), - u64x4::new( + u64x4::new_const( 786785748926382, 1433583123655616, 905839404290873, 2249680349963778, ), - u64x4::new( + u64x4::new_const( 1940824582370584, 1610961256326291, 285307858781375, 1755588655461194, ), - u64x4::new( + u64x4::new_const( 233682812055333, 2146114223476434, 41132209533476, @@ -1518,31 +1518,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 600257696476418, 18449221564824, 1422209458591138, 239571584769716, ), - u64x4::new( + u64x4::new_const( 2056372917056980, 1155290566623531, 1252473955568148, 1276690716882081, ), - u64x4::new( + u64x4::new_const( 246974369025311, 658117221519903, 2000380937898441, 1351183273924850, ), - u64x4::new( + u64x4::new_const( 1803747363753112, 1736801515030186, 2025633577199091, 603378480769167, ), - u64x4::new( + u64x4::new_const( 57348749438551, 1893551220299655, 657926732731806, @@ -1550,31 +1550,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 591809128842736, 284860517232591, 27436696863545, 886306697195798, ), - u64x4::new( + u64x4::new_const( 2113192175751749, 1405882509906423, 561316282804847, 835573846576266, ), - u64x4::new( + u64x4::new_const( 94407289485409, 1781534171669004, 2098782516531528, 598529921520053, ), - u64x4::new( + u64x4::new_const( 1860137004504786, 2197323407480349, 1516772733981532, 961740253777086, ), - u64x4::new( + u64x4::new_const( 1484139612868217, 1593557644636881, 838834937143441, @@ -1582,31 +1582,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1165898865828562, 1153420815042389, 1068625028915785, 1945927229911090, ), - u64x4::new( + u64x4::new_const( 843454394017146, 571029655293754, 386282254545998, 1804608237584150, ), - u64x4::new( + u64x4::new_const( 370552451091100, 1279105656351124, 1864742949668631, 2093071521726981, ), - u64x4::new( + u64x4::new_const( 1872542389052198, 1679083953574330, 349872262454465, 1470311090717925, ), - u64x4::new( + u64x4::new_const( 685345654160323, 319718985807814, 1359932285384164, @@ -1614,31 +1614,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 2083666668832889, 314624387816655, 1496694646480345, 1946728950459189, ), - u64x4::new( + u64x4::new_const( 1579153761571203, 508771185291380, 1002249659402007, 551517831173801, ), - u64x4::new( + u64x4::new_const( 2132371471626150, 1988122278556533, 1552195130653890, 1327637750292755, ), - u64x4::new( + u64x4::new_const( 118937099181527, 382610380973142, 634951529106471, 382740054041699, ), - u64x4::new( + u64x4::new_const( 801287519643470, 87822941589258, 1908825350108451, @@ -1646,31 +1646,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 330347226380261, 672119116965146, 1761510370768005, 1959200302484704, ), - u64x4::new( + u64x4::new_const( 1631876583009250, 1684917718484264, 1027256947805920, 2174612545251129, ), - u64x4::new( + u64x4::new_const( 636668855699872, 625187713984839, 265886954766790, 167898557908504, ), - u64x4::new( + u64x4::new_const( 1210974548180860, 2051308710365526, 907620584086428, 1081788677970850, ), - u64x4::new( + u64x4::new_const( 621792955460854, 1450945504745382, 1666728650687828, @@ -1678,31 +1678,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 24725936182267, 2226765032752574, 2036560083102883, 2002351185719584, ), - u64x4::new( + u64x4::new_const( 1620080779405308, 1493220053370419, 2245691691038916, 1152182628629603, ), - u64x4::new( + u64x4::new_const( 317928527147500, 1855194218440212, 979380281964169, 861442286685289, ), - u64x4::new( + u64x4::new_const( 393308472784625, 486143087279967, 1234071346236405, 777748237119399, ), - u64x4::new( + u64x4::new_const( 43850412814718, 1497656407486446, 744128331046695, @@ -1710,31 +1710,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1670169946550211, 1230951698726438, 806586940221293, 23159779184607, ), - u64x4::new( + u64x4::new_const( 634011340979302, 764182085034744, 731065727766955, 1737985776442180, ), - u64x4::new( + u64x4::new_const( 240492712141842, 73976435954441, 162810587166835, 697230894340912, ), - u64x4::new( + u64x4::new_const( 1299745598348388, 1359436039694544, 1856609816731554, 25228008461513, ), - u64x4::new( + u64x4::new_const( 2180690501932381, 2161211192848458, 87069466793408, @@ -1742,31 +1742,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1106932458043379, 1675181364231371, 1681785724775243, 131824742557210, ), - u64x4::new( + u64x4::new_const( 1671649414647169, 1827849994880670, 1097958057111899, 701956891169434, ), - u64x4::new( + u64x4::new_const( 2095539283710881, 591029812888096, 1699571518315654, 1297589045812566, ), - u64x4::new( + u64x4::new_const( 1345612272298537, 2166754730876055, 2047982622154948, 1785222806258129, ), - u64x4::new( + u64x4::new_const( 2181915268829890, 1895697064378670, 1288412327355885, @@ -1774,31 +1774,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 741330264098392, 357073519729966, 1603572339180975, 433572083688575, ), - u64x4::new( + u64x4::new_const( 699685108971208, 1719650727634959, 1941668009419214, 870374958347891, ), - u64x4::new( + u64x4::new_const( 385971389331537, 11655507719711, 94814615497633, 515572102810609, ), - u64x4::new( + u64x4::new_const( 1396688200590426, 1518748475144123, 162386454324368, 2083303971579002, ), - u64x4::new( + u64x4::new_const( 1511688632419263, 251584258592336, 545345887993880, @@ -1806,31 +1806,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1298668855706029, 2017860934939344, 2224150456036391, 1925926576297971, ), - u64x4::new( + u64x4::new_const( 259522963883544, 1312469129541229, 1647530465049600, 1113737129047154, ), - u64x4::new( + u64x4::new_const( 733193298663145, 2115712816303403, 897628702762311, 116440277571901, ), - u64x4::new( + u64x4::new_const( 1998719395229750, 1662774553684237, 194395608126452, 98796702872301, ), - u64x4::new( + u64x4::new_const( 2226158244229144, 91961728239158, 526869903032152, @@ -1838,31 +1838,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 472779569333556, 854477760843410, 2070906720349401, 734613359834689, ), - u64x4::new( + u64x4::new_const( 1771897100487404, 1604024196006064, 319699348925383, 437152129592623, ), - u64x4::new( + u64x4::new_const( 627618365135361, 1768642666037955, 588564169143939, 35295037750744, ), - u64x4::new( + u64x4::new_const( 220241884231278, 319104161410840, 1048165719448798, 1583931089774347, ), - u64x4::new( + u64x4::new_const( 166479451884333, 1623611819962804, 59990366193679, @@ -1870,31 +1870,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 1944687327687331, 1328410791053991, 2083980670913902, 609396833380574, ), - u64x4::new( + u64x4::new_const( 1907563845734496, 1385619047697883, 869817384774457, 106642388505109, ), - u64x4::new( + u64x4::new_const( 1006516581737154, 1561918369633937, 1921172883211450, 2216650451558824, ), - u64x4::new( + u64x4::new_const( 1780506017391778, 233064930371847, 1332962603425752, 1380075261612354, ), - u64x4::new( + u64x4::new_const( 1907624789747741, 1310065402098523, 1838275780706825, @@ -1902,31 +1902,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 198729830692545, 100156148743413, 2140568641558859, 2220606475942394, ), - u64x4::new( + u64x4::new_const( 1108788217903741, 1706330932366163, 2050449866410661, 684907598542847, ), - u64x4::new( + u64x4::new_const( 1101958322366646, 659427843062405, 253899933868173, 896574852821269, ), - u64x4::new( + u64x4::new_const( 1157052140740658, 440541103447032, 2173354981480949, 604768603561932, ), - u64x4::new( + u64x4::new_const( 961238337866054, 830849154351308, 1643852412409441, @@ -1934,31 +1934,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 784870637473285, 1180234052037572, 2086951602998715, 419328169540373, ), - u64x4::new( + u64x4::new_const( 1966862397394559, 788036164772123, 2024355635709481, 1471696676696146, ), - u64x4::new( + u64x4::new_const( 1468884300957205, 1408016588131185, 2229595828577885, 240413942963547, ), - u64x4::new( + u64x4::new_const( 1481791691942441, 970648959691160, 1635500996148197, 2236917233261585, ), - u64x4::new( + u64x4::new_const( 31660820731028, 801794768903647, 1069092619607344, @@ -1966,31 +1966,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 911659428682786, 762502588057038, 1311399152500807, 1966922911783311, ), - u64x4::new( + u64x4::new_const( 1229849228728540, 258161307933217, 2140796867375541, 1569345075547911, ), - u64x4::new( + u64x4::new_const( 1487354676143742, 1818317546165791, 811033554173350, 1768788663337616, ), - u64x4::new( + u64x4::new_const( 450017165913234, 962535873747168, 2099104262993585, 503030952485785, ), - u64x4::new( + u64x4::new_const( 1259958681304518, 479589250923541, 1503904042161640, @@ -1998,31 +1998,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 794562643024291, 198670993088241, 1678984629358943, 273399517554618, ), - u64x4::new( + u64x4::new_const( 188458991574433, 1389872130156447, 1461868931574746, 795140878721432, ), - u64x4::new( + u64x4::new_const( 624046647169653, 630363741191019, 911018499983500, 1410140563046579, ), - u64x4::new( + u64x4::new_const( 1675056174405076, 632544713589250, 795454163559811, 1535271563341780, ), - u64x4::new( + u64x4::new_const( 25504547444781, 812510098987855, 51290042016232, @@ -2030,31 +2030,31 @@ pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = Naf ), ])), CachedPoint(F51x4Reduced([ - u64x4::new( + u64x4::new_const( 269968325452358, 470932785179706, 1684444304834150, 1027482126748243, ), - u64x4::new( + u64x4::new_const( 457941065342419, 2117377568137882, 1209423706730905, 2192403099717071, ), - u64x4::new( + u64x4::new_const( 1899046404863678, 1359500336071762, 1492389156724726, 1455627081827750, ), - u64x4::new( + u64x4::new_const( 2016101061876546, 1967000012916571, 582539481696050, 1197538178790094, ), - u64x4::new( + u64x4::new_const( 639684852217504, 1799941252757449, 1470016556327743, diff --git a/src/backend/vector/ifma/field.rs b/src/backend/vector/ifma/field.rs index fcfbb69cd..fd1955315 100644 --- a/src/backend/vector/ifma/field.rs +++ b/src/backend/vector/ifma/field.rs @@ -11,8 +11,8 @@ #![allow(non_snake_case)] +use crate::backend::vector::packed_simd::u64x4; use core::ops::{Add, Mul, Neg}; -use packed_simd::{u64x4, IntoBits}; use crate::backend::serial::u64::field::FieldElement51; @@ -20,14 +20,14 @@ use crate::backend::serial::u64::field::FieldElement51; #[inline(always)] unsafe fn madd52lo(z: u64x4, x: u64x4, y: u64x4) -> u64x4 { use core::arch::x86_64::_mm256_madd52lo_epu64; - _mm256_madd52lo_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits() + _mm256_madd52lo_epu64(z.into(), x.into(), y.into()).into() } /// A wrapper around `vpmadd52huq` that works on `u64x4`. #[inline(always)] unsafe fn madd52hi(z: u64x4, x: u64x4, y: u64x4) -> u64x4 { use core::arch::x86_64::_mm256_madd52hi_epu64; - _mm256_madd52hi_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits() + _mm256_madd52hi_epu64(z.into(), x.into(), y.into()).into() } /// A vector of four field elements in radix 2^51, with unreduced coefficients. @@ -59,16 +59,16 @@ fn shuffle_lanes(x: u64x4, control: Shuffle) -> u64x4 { use core::arch::x86_64::_mm256_permute4x64_epi64 as perm; match control { - Shuffle::AAAA => perm(x.into_bits(), 0b00_00_00_00).into_bits(), - Shuffle::BBBB => perm(x.into_bits(), 0b01_01_01_01).into_bits(), - Shuffle::BADC => perm(x.into_bits(), 0b10_11_00_01).into_bits(), - Shuffle::BACD => perm(x.into_bits(), 0b11_10_00_01).into_bits(), - Shuffle::ADDA => perm(x.into_bits(), 0b00_11_11_00).into_bits(), - Shuffle::CBCB => perm(x.into_bits(), 0b01_10_01_10).into_bits(), - Shuffle::ABDC => perm(x.into_bits(), 0b10_11_01_00).into_bits(), - Shuffle::ABAB => perm(x.into_bits(), 0b01_00_01_00).into_bits(), - Shuffle::DBBD => perm(x.into_bits(), 0b11_01_01_11).into_bits(), - Shuffle::CACA => perm(x.into_bits(), 0b00_10_00_10).into_bits(), + Shuffle::AAAA => perm(x.into(), 0b00_00_00_00).into(), + Shuffle::BBBB => perm(x.into(), 0b01_01_01_01).into(), + Shuffle::BADC => perm(x.into(), 0b10_11_00_01).into(), + Shuffle::BACD => perm(x.into(), 0b11_10_00_01).into(), + Shuffle::ADDA => perm(x.into(), 0b00_11_11_00).into(), + Shuffle::CBCB => perm(x.into(), 0b01_10_01_10).into(), + Shuffle::ABDC => perm(x.into(), 0b10_11_01_00).into(), + Shuffle::ABAB => perm(x.into(), 0b01_00_01_00).into(), + Shuffle::DBBD => perm(x.into(), 0b11_01_01_11).into(), + Shuffle::CACA => perm(x.into(), 0b00_10_00_10).into(), } } } @@ -90,18 +90,18 @@ fn blend_lanes(x: u64x4, y: u64x4, control: Lanes) -> u64x4 { use core::arch::x86_64::_mm256_blend_epi32 as blend; match control { - Lanes::D => blend(x.into_bits(), y.into_bits(), 0b11_00_00_00).into_bits(), - Lanes::C => blend(x.into_bits(), y.into_bits(), 0b00_11_00_00).into_bits(), - Lanes::AB => blend(x.into_bits(), y.into_bits(), 0b00_00_11_11).into_bits(), - Lanes::AC => blend(x.into_bits(), y.into_bits(), 0b00_11_00_11).into_bits(), - Lanes::AD => blend(x.into_bits(), y.into_bits(), 0b11_00_00_11).into_bits(), - Lanes::BCD => blend(x.into_bits(), y.into_bits(), 0b11_11_11_00).into_bits(), + Lanes::D => blend(x.into(), y.into(), 0b11_00_00_00).into(), + Lanes::C => blend(x.into(), y.into(), 0b00_11_00_00).into(), + Lanes::AB => blend(x.into(), y.into(), 0b00_00_11_11).into(), + Lanes::AC => blend(x.into(), y.into(), 0b00_11_00_11).into(), + Lanes::AD => blend(x.into(), y.into(), 0b11_00_00_11).into(), + Lanes::BCD => blend(x.into(), y.into(), 0b11_11_11_00).into(), } } } impl F51x4Unreduced { - pub const ZERO: F51x4Unreduced = F51x4Unreduced([u64x4::splat(0); 5]); + pub const ZERO: F51x4Unreduced = F51x4Unreduced([u64x4::splat_const::<0>(); 5]); pub fn new( x0: &FieldElement51, @@ -122,32 +122,32 @@ impl F51x4Unreduced { let x = &self.0; [ FieldElement51([ - x[0].extract(0), - x[1].extract(0), - x[2].extract(0), - x[3].extract(0), - x[4].extract(0), + x[0].extract::<0>(), + x[1].extract::<0>(), + x[2].extract::<0>(), + x[3].extract::<0>(), + x[4].extract::<0>(), ]), FieldElement51([ - x[0].extract(1), - x[1].extract(1), - x[2].extract(1), - x[3].extract(1), - x[4].extract(1), + x[0].extract::<1>(), + x[1].extract::<1>(), + x[2].extract::<1>(), + x[3].extract::<1>(), + x[4].extract::<1>(), ]), FieldElement51([ - x[0].extract(2), - x[1].extract(2), - x[2].extract(2), - x[3].extract(2), - x[4].extract(2), + x[0].extract::<2>(), + x[1].extract::<2>(), + x[2].extract::<2>(), + x[3].extract::<2>(), + x[4].extract::<2>(), ]), FieldElement51([ - x[0].extract(3), - x[1].extract(3), - x[2].extract(3), - x[3].extract(3), - x[4].extract(3), + x[0].extract::<3>(), + x[1].extract::<3>(), + x[2].extract::<3>(), + x[3].extract::<3>(), + x[4].extract::<3>(), ]), ] } @@ -291,64 +291,64 @@ impl F51x4Reduced { z1_2 = madd52hi(z1_2, x[0], x[0]); z2_4 = madd52hi(z2_4, x[0], x[1]); - let mut z2_1 = z2_4 << 2; + let mut z2_1 = z2_4.shl::<2>(); z2_2 = madd52lo(z2_2, x[0], x[2]); z2_1 = madd52lo(z2_1, x[1], x[1]); z3_4 = madd52hi(z3_4, x[0], x[2]); - let mut z3_1 = z3_4 << 2; + let mut z3_1 = z3_4.shl::<2>(); z3_2 = madd52lo(z3_2, x[1], x[2]); z3_2 = madd52lo(z3_2, x[0], x[3]); z3_2 = madd52hi(z3_2, x[1], x[1]); z4_4 = madd52hi(z4_4, x[1], x[2]); z4_4 = madd52hi(z4_4, x[0], x[3]); - let mut z4_1 = z4_4 << 2; + let mut z4_1 = z4_4.shl::<2>(); z4_2 = madd52lo(z4_2, x[1], x[3]); z4_2 = madd52lo(z4_2, x[0], x[4]); z4_1 = madd52lo(z4_1, x[2], x[2]); z5_4 = madd52hi(z5_4, x[1], x[3]); z5_4 = madd52hi(z5_4, x[0], x[4]); - let mut z5_1 = z5_4 << 2; + let mut z5_1 = z5_4.shl::<2>(); z5_2 = madd52lo(z5_2, x[2], x[3]); z5_2 = madd52lo(z5_2, x[1], x[4]); z5_2 = madd52hi(z5_2, x[2], x[2]); z6_4 = madd52hi(z6_4, x[2], x[3]); z6_4 = madd52hi(z6_4, x[1], x[4]); - let mut z6_1 = z6_4 << 2; + let mut z6_1 = z6_4.shl::<2>(); z6_2 = madd52lo(z6_2, x[2], x[4]); z6_1 = madd52lo(z6_1, x[3], x[3]); z7_4 = madd52hi(z7_4, x[2], x[4]); - let mut z7_1 = z7_4 << 2; + let mut z7_1 = z7_4.shl::<2>(); z7_2 = madd52lo(z7_2, x[3], x[4]); z7_2 = madd52hi(z7_2, x[3], x[3]); z8_4 = madd52hi(z8_4, x[3], x[4]); - let mut z8_1 = z8_4 << 2; + let mut z8_1 = z8_4.shl::<2>(); z8_1 = madd52lo(z8_1, x[4], x[4]); let mut z9_1 = u64x4::splat(0); z9_2 = madd52hi(z9_2, x[4], x[4]); - z5_1 += z5_2 << 1; - z6_1 += z6_2 << 1; - z7_1 += z7_2 << 1; - z9_1 += z9_2 << 1; + z5_1 += z5_2.shl::<1>(); + z6_1 += z6_2.shl::<1>(); + z7_1 += z7_2.shl::<1>(); + z9_1 += z9_2.shl::<1>(); let mut t0 = u64x4::splat(0); let mut t1 = u64x4::splat(0); let r19 = u64x4::splat(19); t0 = madd52hi(t0, r19, z9_1); - t1 = madd52lo(t1, r19, z9_1 >> 52); + t1 = madd52lo(t1, r19, z9_1.shr::<52>()); - z4_2 = madd52lo(z4_2, r19, z8_1 >> 52); - z3_2 = madd52lo(z3_2, r19, z7_1 >> 52); - z2_2 = madd52lo(z2_2, r19, z6_1 >> 52); - z1_2 = madd52lo(z1_2, r19, z5_1 >> 52); + z4_2 = madd52lo(z4_2, r19, z8_1.shr::<52>()); + z3_2 = madd52lo(z3_2, r19, z7_1.shr::<52>()); + z2_2 = madd52lo(z2_2, r19, z6_1.shr::<52>()); + z1_2 = madd52lo(z1_2, r19, z5_1.shr::<52>()); z0_2 = madd52lo(z0_2, r19, t0 + t1); z1_2 = madd52hi(z1_2, r19, z5_1); @@ -387,11 +387,11 @@ impl From for F51x4Reduced { let r19 = u64x4::splat(19); // Compute carryouts in parallel - let c0 = x.0[0] >> 51; - let c1 = x.0[1] >> 51; - let c2 = x.0[2] >> 51; - let c3 = x.0[3] >> 51; - let c4 = x.0[4] >> 51; + let c0 = x.0[0].shr::<51>(); + let c1 = x.0[1].shr::<51>(); + let c2 = x.0[2].shr::<51>(); + let c3 = x.0[3].shr::<51>(); + let c4 = x.0[4].shr::<51>(); unsafe { F51x4Reduced([ @@ -581,12 +581,12 @@ impl<'a, 'b> Mul<&'b F51x4Reduced> for &'a F51x4Reduced { // Wave 6 t0 = madd52hi(t0, r19, z9); - t1 = madd52lo(t1, r19, z9 >> 52); + t1 = madd52lo(t1, r19, z9.shr::<52>()); z3_1 = madd52lo(z3_1, x[0], y[3]); z4_2 = madd52hi(z4_2, x[0], y[3]); - z1_2 = madd52lo(z1_2, r19, z5 >> 52); - z2_2 = madd52lo(z2_2, r19, z6 >> 52); - z3_2 = madd52lo(z3_2, r19, z7 >> 52); + z1_2 = madd52lo(z1_2, r19, z5.shr::<52>()); + z2_2 = madd52lo(z2_2, r19, z6.shr::<52>()); + z3_2 = madd52lo(z3_2, r19, z7.shr::<52>()); z0_1 = madd52lo(z0_1, r19, z5); // Wave 7 @@ -601,7 +601,7 @@ impl<'a, 'b> Mul<&'b F51x4Reduced> for &'a F51x4Reduced { // Wave 8 z3_1 = madd52lo(z3_1, r19, z8); - z4_2 = madd52lo(z4_2, r19, z8 >> 52); + z4_2 = madd52lo(z4_2, r19, z8.shr::<52>()); F51x4Unreduced([ z0_1 + z0_2 + z0_2, diff --git a/src/backend/vector/mod.rs b/src/backend/vector/mod.rs index b05cffb36..51c9e81e3 100644 --- a/src/backend/vector/mod.rs +++ b/src/backend/vector/mod.rs @@ -11,28 +11,44 @@ #![doc = include_str!("../../../docs/parallel-formulas.md")] -#[cfg(not(any(target_feature = "avx2", target_feature = "avx512ifma", docsrs)))] +#[cfg(not(any( + target_feature = "avx2", + all(target_feature = "avx512ifma", nightly), + docsrs +)))] compile_error!("'simd' backend selected without target_feature=+avx2 or +avx512ifma"); +#[allow(missing_docs)] +pub mod packed_simd; + #[cfg(any( - all(target_feature = "avx2", not(target_feature = "avx512ifma")), + all( + target_feature = "avx2", + not(all(target_feature = "avx512ifma", nightly)) + ), all(docsrs, target_arch = "x86_64") ))] pub mod avx2; #[cfg(any( - all(target_feature = "avx2", not(target_feature = "avx512ifma")), + all( + target_feature = "avx2", + not(all(target_feature = "avx512ifma", nightly)) + ), all(docsrs, target_arch = "x86_64") ))] pub(crate) use self::avx2::{edwards::CachedPoint, edwards::ExtendedPoint}; -#[cfg(any(target_feature = "avx512ifma", all(docsrs, target_arch = "x86_64")))] +#[cfg(any( + all(target_feature = "avx512ifma", nightly), + all(docsrs, target_arch = "x86_64") +))] pub mod ifma; -#[cfg(target_feature = "avx512ifma")] +#[cfg(all(target_feature = "avx512ifma", nightly))] pub(crate) use self::ifma::{edwards::CachedPoint, edwards::ExtendedPoint}; #[cfg(any( target_feature = "avx2", - target_feature = "avx512ifma", + all(target_feature = "avx512ifma", nightly), all(docsrs, target_arch = "x86_64") ))] #[allow(missing_docs)] @@ -43,12 +59,12 @@ pub mod scalar_mul; #[cfg(any( all( target_feature = "avx2", - not(target_feature = "avx512ifma"), + not(all(target_feature = "avx512ifma", nightly)), feature = "precomputed-tables" ), all(docsrs, target_arch = "x86_64") ))] pub(crate) use self::avx2::constants::BASEPOINT_ODD_LOOKUP_TABLE; -#[cfg(all(target_feature = "avx512ifma", feature = "precomputed-tables"))] +#[cfg(all(target_feature = "avx512ifma", nightly, feature = "precomputed-tables"))] pub(crate) use self::ifma::constants::BASEPOINT_ODD_LOOKUP_TABLE; diff --git a/src/backend/vector/packed_simd.rs b/src/backend/vector/packed_simd.rs new file mode 100644 index 000000000..6a3484d72 --- /dev/null +++ b/src/backend/vector/packed_simd.rs @@ -0,0 +1,311 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// See LICENSE for licensing information. + +///! This module defines wrappers over platform-specific SIMD types to make them +///! more convenient to use. +///! +///! UNSAFETY: Everything in this module assumes that we're running on hardware +///! which supports at least AVX2. This invariant *must* be enforced +///! by the callers of this code. +use core::ops::{Add, AddAssign, BitAnd, BitAndAssign, BitXor, BitXorAssign, Sub}; + +macro_rules! impl_shared { + ( + $ty:ident, + $lane_ty:ident, + $add_intrinsic:ident, + $sub_intrinsic:ident, + $shl_intrinsic:ident, + $shr_intrinsic:ident, + $extract_intrinsic:ident + ) => { + #[allow(non_camel_case_types)] + #[derive(Copy, Clone, Debug)] + #[repr(transparent)] + pub struct $ty(core::arch::x86_64::__m256i); + + impl From<$ty> for core::arch::x86_64::__m256i { + #[inline] + fn from(value: $ty) -> core::arch::x86_64::__m256i { + value.0 + } + } + + impl From for $ty { + #[inline] + fn from(value: core::arch::x86_64::__m256i) -> $ty { + $ty(value) + } + } + + impl PartialEq for $ty { + #[inline] + fn eq(&self, rhs: &$ty) -> bool { + unsafe { + // This compares each pair of 8-bit packed integers and returns either 0xFF or + // 0x00 depending on whether they're equal. + // + // So the values are equal if (and only if) this returns a value that's filled + // with only 0xFF. + // + // Pseudocode of what this does: + // self.0 + // .bytes() + // .zip(rhs.0.bytes()) + // .map(|a, b| if a == b { 0xFF } else { 0x00 }) + // .join(); + let m = core::arch::x86_64::_mm256_cmpeq_epi8(self.0, rhs.0); + + // Now we need to reduce the 256-bit value to something on which we can branch. + // + // This will just take the most significant bit of every 8-bit packed integer + // and build an `i32` out of it. If the values we previously compared were + // equal then all off the most significant bits will be equal to 1, which means + // that this will return 0xFFFFFFFF, which is equal to -1 when represented as + // an `i32`. + core::arch::x86_64::_mm256_movemask_epi8(m) == -1 + } + } + } + + impl Eq for $ty {} + + impl Add for $ty { + type Output = Self; + + #[inline] + fn add(self, rhs: $ty) -> Self { + unsafe { core::arch::x86_64::$add_intrinsic(self.0, rhs.0).into() } + } + } + + impl AddAssign for $ty { + #[inline] + fn add_assign(&mut self, rhs: $ty) { + *self = *self + rhs + } + } + + impl Sub for $ty { + type Output = Self; + + #[inline] + fn sub(self, rhs: $ty) -> Self { + unsafe { core::arch::x86_64::$sub_intrinsic(self.0, rhs.0).into() } + } + } + + impl BitAnd for $ty { + type Output = Self; + + #[inline] + fn bitand(self, rhs: $ty) -> Self { + unsafe { core::arch::x86_64::_mm256_and_si256(self.0, rhs.0).into() } + } + } + + impl BitXor for $ty { + type Output = Self; + + #[inline] + fn bitxor(self, rhs: $ty) -> Self { + unsafe { core::arch::x86_64::_mm256_xor_si256(self.0, rhs.0).into() } + } + } + + impl BitAndAssign for $ty { + #[inline] + fn bitand_assign(&mut self, rhs: $ty) { + *self = *self & rhs; + } + } + + impl BitXorAssign for $ty { + #[inline] + fn bitxor_assign(&mut self, rhs: $ty) { + *self = *self ^ rhs; + } + } + + #[allow(dead_code)] + impl $ty { + #[inline] + pub fn shl(self) -> Self { + unsafe { core::arch::x86_64::$shl_intrinsic(self.0, N).into() } + } + + #[inline] + pub fn shr(self) -> Self { + unsafe { core::arch::x86_64::$shr_intrinsic(self.0, N).into() } + } + + #[inline] + pub fn extract(self) -> $lane_ty { + unsafe { core::arch::x86_64::$extract_intrinsic(self.0, N) as $lane_ty } + } + } + }; +} + +macro_rules! impl_conv { + ($src:ident => $($dst:ident),+) => { + $( + impl From<$src> for $dst { + #[inline] + fn from(value: $src) -> $dst { + $dst(value.0) + } + } + )+ + } +} + +// We define SIMD functionality over packed unsigned integer types. However, all the integer +// intrinsics deal with signed integers. So we cast unsigned to signed, pack it into SIMD, do +// add/sub/shl/shr arithmetic, and finally cast back to unsigned at the end. Why is this equivalent +// to doing the same thing on unsigned integers? Shl/shr is clear, because casting does not change +// the bits of the integer. But what about add/sub? This is due to the following: +// +// 1) Rust uses two's complement to represent signed integers. So we're assured that the values +// we cast into SIMD and extract out at the end are two's complement. +// +// https://doc.rust-lang.org/reference/types/numeric.html +// +// 2) Wrapping add/sub is compatible between two's complement signed and unsigned integers. +// That is, for all x,y: u64 (or any unsigned integer type), +// +// x.wrapping_add(y) == (x as i64).wrapping_add(y as i64) as u64, and +// x.wrapping_sub(y) == (x as i64).wrapping_sub(y as i64) as u64 +// +// https://julesjacobs.com/2019/03/20/why-twos-complement-works.html +// +// 3) The add/sub functions we use for SIMD are indeed wrapping. The docs indicate that +// __mm256_add/sub compile to vpaddX/vpsubX instructions where X = w, d, or q depending on +// the bitwidth. From x86 docs: +// +// When an individual result is too large to be represented in X bits (overflow), the +// result is wrapped around and the low X bits are written to the destination operand +// (that is, the carry is ignored). +// +// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq +// https://www.felixcloutier.com/x86/psubb:psubw:psubd +// https://www.felixcloutier.com/x86/psubq + +impl_shared!( + u64x4, + u64, + _mm256_add_epi64, + _mm256_sub_epi64, + _mm256_slli_epi64, + _mm256_srli_epi64, + _mm256_extract_epi64 +); +impl_shared!( + u32x8, + u32, + _mm256_add_epi32, + _mm256_sub_epi32, + _mm256_slli_epi32, + _mm256_srli_epi32, + _mm256_extract_epi32 +); + +impl_conv!(u64x4 => u32x8); + +#[allow(dead_code)] +impl u64x4 { + /// A constified variant of `new`. + /// + /// Should only be called from `const` contexts. At runtime `new` is going to be faster. + #[inline] + pub const fn new_const(x0: u64, x1: u64, x2: u64, x3: u64) -> Self { + // SAFETY: Transmuting between an array and a SIMD type is safe + // https://rust-lang.github.io/unsafe-code-guidelines/layout/packed-simd-vectors.html + unsafe { Self(core::mem::transmute([x0, x1, x2, x3])) } + } + + /// A constified variant of `splat`. + /// + /// Should only be called from `const` contexts. At runtime `splat` is going to be faster. + #[inline] + pub const fn splat_const() -> Self { + Self::new_const(N, N, N, N) + } + + /// Constructs a new instance. + #[inline] + pub fn new(x0: u64, x1: u64, x2: u64, x3: u64) -> Self { + unsafe { + // _mm256_set_epi64 sets the underlying vector in reverse order of the args + Self(core::arch::x86_64::_mm256_set_epi64x( + x3 as i64, x2 as i64, x1 as i64, x0 as i64, + )) + } + } + + /// Constructs a new instance with all of the elements initialized to the given value. + #[inline] + pub fn splat(x: u64) -> Self { + unsafe { Self(core::arch::x86_64::_mm256_set1_epi64x(x as i64)) } + } +} + +#[allow(dead_code)] +impl u32x8 { + /// A constified variant of `new`. + /// + /// Should only be called from `const` contexts. At runtime `new` is going to be faster. + #[inline] + pub const fn new_const( + x0: u32, + x1: u32, + x2: u32, + x3: u32, + x4: u32, + x5: u32, + x6: u32, + x7: u32, + ) -> Self { + // SAFETY: Transmuting between an array and a SIMD type is safe + // https://rust-lang.github.io/unsafe-code-guidelines/layout/packed-simd-vectors.html + unsafe { Self(core::mem::transmute([x0, x1, x2, x3, x4, x5, x6, x7])) } + } + + /// A constified variant of `splat`. + /// + /// Should only be called from `const` contexts. At runtime `splat` is going to be faster. + #[inline] + pub const fn splat_const() -> Self { + Self::new_const(N, N, N, N, N, N, N, N) + } + + /// Constructs a new instance. + #[inline] + pub fn new(x0: u32, x1: u32, x2: u32, x3: u32, x4: u32, x5: u32, x6: u32, x7: u32) -> Self { + unsafe { + // _mm256_set_epi32 sets the underlying vector in reverse order of the args + Self(core::arch::x86_64::_mm256_set_epi32( + x7 as i32, x6 as i32, x5 as i32, x4 as i32, x3 as i32, x2 as i32, x1 as i32, + x0 as i32, + )) + } + } + + /// Constructs a new instance with all of the elements initialized to the given value. + #[inline] + pub fn splat(x: u32) -> Self { + unsafe { Self(core::arch::x86_64::_mm256_set1_epi32(x as i32)) } + } + + /// Multiplies the low unsigned 32-bits from each packed 64-bit element + /// and returns the unsigned 64-bit results. + /// + /// (This ignores the upper 32-bits from each packed 64-bits!) + #[inline] + pub fn mul32(self, rhs: u32x8) -> u64x4 { + // NOTE: This ignores the upper 32-bits from each packed 64-bits. + unsafe { core::arch::x86_64::_mm256_mul_epu32(self.0, rhs.0).into() } + } +} diff --git a/src/lib.rs b/src/lib.rs index abdf980cd..83ccdadd4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,14 @@ // - Henry de Valence #![no_std] -#![cfg_attr(curve25519_dalek_backend = "simd", feature(stdsimd))] +#![cfg_attr( + all( + curve25519_dalek_backend = "simd", + target_feature = "avx512ifma", + nightly + ), + feature(stdsimd) +)] #![cfg_attr(docsrs, feature(doc_auto_cfg, doc_cfg, doc_cfg_hide))] #![cfg_attr(docsrs, doc(cfg_hide(docsrs)))] //------------------------------------------------------------------------