fix: restructure use of rayon to avoid deadlocks
- perf: Use rayon only in the places where the work is CPU bound.
- fix: Do not use custom rayon thread pools, as multiple pools can deadlock against each other (cross-pool deadlocks).
- fix: Proxy `BELLMAN_NUM_CPUS` to `RAYON_NUM_THREADS` for now, and deprecate it.
- refactor: Use `RAYON_NUM_THREADS` as the baseline for determining the number of threads to use.
- perf: Use `in_place_scope` instead of `scope` to avoid spawning unnecessary threads (see the sketch after this list).
- ci: Add a CI job that exercises the currently deadlocking `groth16_bench` test (run with `--ignored`).
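A minimal sketch of the pattern described above (illustrative only, not code from this commit): work is spawned onto the single global rayon pool via `in_place_scope`, so there is no second pool to deadlock against, and the calling thread runs the scope body itself rather than handing it off to a freshly spawned thread.

```rust
// Sketch: run two independent computations on the global rayon pool.
// `rayon::scope` may migrate the closure onto a pool worker thread, while
// `rayon::in_place_scope` (rayon >= 1.5) keeps it on the calling thread;
// both block until all spawned work inside the scope has completed.
fn sum_in_parallel(values: &[u64]) -> u64 {
    let (lo, hi) = values.split_at(values.len() / 2);
    let (mut left, mut right) = (0u64, 0u64);
    rayon::in_place_scope(|s| {
        s.spawn(|_| left = lo.iter().sum());
        s.spawn(|_| right = hi.iter().sum());
    });
    left + right
}
```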

Co-authored-by: nemo <nemo@protocol.ai>
dignifiedquire and cryptonemo committed Sep 10, 2021
1 parent 7f93cc7 commit f0f8fe9
Showing 10 changed files with 344 additions and 221 deletions.
26 changes: 26 additions & 0 deletions .circleci/config.yml
@@ -92,6 +92,21 @@ commands:
          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features gpu,blst
          no_output_timeout: 30m

  test_target_blst_gpu_ignored:
    parameters:
      target:
        type: string
    steps:
      - *restore-workspace
      - *restore-cache
      - run:
          name: Test blst ignored (<< parameters.target >>)
          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst -- --ignored
          no_output_timeout: 30m
      - run:
          name: Show results
          command: cat aggregation.csv

jobs:

cargo_fetch:
@@ -164,6 +179,14 @@ jobs:
      - test_target_blst_gpu:
          target: "x86_64-unknown-linux-gnu"

  test_blst_gpu_ignored_x86_64-unknown-linux-gnu:
    executor: default
    steps:
      - set-env-path
      - install-gpu-deps
      - test_target_blst_gpu_ignored:
          target: "x86_64-unknown-linux-gnu"

  rustfmt:
    executor: default
    steps:
@@ -271,6 +294,9 @@ workflows:
      - test_blst_gpu_x86_64-unknown-linux-gnu:
          requires:
            - cargo_fetch
      - test_blst_gpu_ignored_x86_64-unknown-linux-gnu:
          requires:
            - cargo_fetch

#- coverage_run:
# name: coverage_default_features
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -23,7 +23,7 @@ byteorder = "1"
log = "0.4.8"
lazy_static = "1.4.0"
rand = "0.7"
rayon = "1.3.0"
rayon = "1.5.0"
memmap = "0.7.0"
thiserror = "1.0.10"
rustc-hash = "1.1.0"
9 changes: 9 additions & 0 deletions README.md
@@ -66,6 +66,15 @@ The gpu extension contains some env vars that may be set externally to this library
env::set_var("BELLMAN_CPU_UTILIZATION", "0.5");
```

- `RAYON_NUM_THREADS`

Restricts the library to roughly twice the specified number of threads (best effort). Previously this was configured with `BELLMAN_NUM_CPUS`, which is now deprecated. The default is the number of logical cores reported on the machine.

```rust
// Example
env::set_var("RAYON_NUM_THREADS", "6");
```
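
The proxying of the deprecated variable is not shown in the hunks above; the following is a minimal sketch of what such a shim could look like (the helper name and the log message are hypothetical, not the library's actual code):

```rust
use std::env;

// Hypothetical helper: honor the deprecated `BELLMAN_NUM_CPUS` variable by
// forwarding it to `RAYON_NUM_THREADS`, but only when the latter has not
// been set explicitly.
fn proxy_bellman_num_cpus() {
    if env::var("RAYON_NUM_THREADS").is_err() {
        if let Ok(num_cpus) = env::var("BELLMAN_NUM_CPUS") {
            eprintln!("BELLMAN_NUM_CPUS is deprecated, please use RAYON_NUM_THREADS");
            env::set_var("RAYON_NUM_THREADS", num_cpus);
        }
    }
}
```

For the forwarded value to take effect, it must be set before rayon's global thread pool is first used, since `RAYON_NUM_THREADS` is only read at pool initialization.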

#### Supported / Tested Cards

Depending on the size of the proof being passed to the GPU for work, certain cards will not be able to allocate enough memory for either the FFT or the Multiexp kernel. Below is a list of devices that work for small sets. In the future we will add the cutoff point at which a given card will not be able to allocate enough memory to utilize the GPU.
12 changes: 6 additions & 6 deletions src/groth16/aggregate/macros.rs
@@ -3,10 +3,10 @@ macro_rules! try_par {
$(
let mut $name = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name = &mut $name;
s.execute(move || {
s.spawn(move |_| {
*$name = Some($f);
});)+
});
@@ -21,10 +21,10 @@ macro_rules! par {
$(
let mut $name = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name = &mut $name;
s.execute(move || {
s.spawn(move |_| {
*$name = Some($f);
});)+
});
@@ -38,11 +38,11 @@
let mut $name1 = None;
let mut $name2 = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name1 = &mut $name1;
let $name2 = &mut $name2;
s.execute(move || {
s.spawn(move |_| {
let (a, b) = $f;
*$name1 = Some(a);
*$name2 = Some(b);
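For reference, a hand-expanded sketch of what a two-binding `try_par!` call roughly becomes after this change (illustrative only; the exact invocation syntax and the error-propagation step after the scope are assumptions, since that part of the macro is not shown in the hunk above):

```rust
fn compute_x() -> Result<u64, String> { Ok(1) }
fn compute_y() -> Result<u64, String> { Ok(2) }

// Roughly equivalent to `try_par! { let x = compute_x(), let y = compute_y() }`:
// both closures run on the global rayon pool inside `in_place_scope`, and the
// calling thread blocks until they finish (and may help execute them).
fn expanded() -> Result<(u64, u64), String> {
    let mut x = None;
    let mut y = None;
    rayon::in_place_scope(|s| {
        let x = &mut x;
        s.spawn(move |_| *x = Some(compute_x()));
        let y = &mut y;
        s.spawn(move |_| *y = Some(compute_y()));
    });
    let x = x.unwrap()?;
    let y = y.unwrap()?;
    Ok((x, y))
}
```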
80 changes: 40 additions & 40 deletions src/groth16/aggregate/verify.rs
@@ -132,46 +132,46 @@ pub fn verify_aggregate_proof<E: Engine + std::fmt::Debug, R: rand::RngCore + Se
// 5. compute the middle part of the final pairing equation, the one
// with the public inputs
let middle = {
// We want to compute MUL(i:0 -> l) S_i ^ (SUM(j:0 -> n) ai,j * r^j)
// this table keeps tracks of incremental computation of each i-th
// exponent to later multiply with S_i
// The index of the table is i, which is an index of the public
// input element
// We incrementally build the r vector and the table
// NOTE: in this version it's not r^2j but simply r^j

let l = public_inputs[0].len();
let mut g_ic = pvk.ic_projective[0];
g_ic.mul_assign(r_sum);

let powers = r_vec_receiver.recv().unwrap();

let now = Instant::now();
// now we do the multi exponentiation
let getter = |i: usize| -> <E::Fr as PrimeField>::Repr {
// i denotes the column of the public input, and j denotes which public input
let mut c = public_inputs[0][i];
for j in 1..public_inputs.len() {
let mut ai = public_inputs[j][i];
ai.mul_assign(&powers[j]);
c.add_assign(&ai);
}
c.into_repr()
};

let totsi = par_multiscalar::<_, E::G1Affine>(
&ScalarList::Getter(getter, l),
&pvk.multiscalar.at_point(1),
std::mem::size_of::<<E::Fr as PrimeField>::Repr>() * 8,
);

g_ic.add_assign(&totsi);

let ml = E::miller_loop(&[(&g_ic.into_affine().prepare(), &pvk.gamma_g2)]);
let elapsed = now.elapsed().as_millis();
debug!("table generation: {}ms", elapsed);

ml
// We want to compute MUL(i:0 -> l) S_i ^ (SUM(j:0 -> n) ai,j * r^j)
// this table keeps tracks of incremental computation of each i-th
// exponent to later multiply with S_i
// The index of the table is i, which is an index of the public
// input element
// We incrementally build the r vector and the table
// NOTE: in this version it's not r^2j but simply r^j

let l = public_inputs[0].len();
let mut g_ic = pvk.ic_projective[0];
g_ic.mul_assign(r_sum);

let powers = r_vec_receiver.recv().unwrap();

let now = Instant::now();
// now we do the multi exponentiation
let getter = |i: usize| -> <E::Fr as PrimeField>::Repr {
// i denotes the column of the public input, and j denotes which public input
let mut c = public_inputs[0][i];
for j in 1..public_inputs.len() {
let mut ai = public_inputs[j][i];
ai.mul_assign(&powers[j]);
c.add_assign(&ai);
}
c.into_repr()
};

let totsi = par_multiscalar::<_, E::G1Affine>(
&ScalarList::Getter(getter, l),
&pvk.multiscalar.at_point(1),
std::mem::size_of::<<E::Fr as PrimeField>::Repr>() * 8,
);

g_ic.add_assign(&totsi);

let ml = E::miller_loop(&[(&g_ic.into_affine().prepare(), &pvk.gamma_g2)]);
let elapsed = now.elapsed().as_millis();
debug!("table generation: {}ms", elapsed);

ml
}
};
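
In standard notation, the quantity the comments above describe (with $a_{i,j}$ the $i$-th public input of proof $j$, and $S_i$ the corresponding verification-key base) is

$$\prod_{i=0}^{l} S_i^{\;\sum_{j=0}^{n} a_{i,j}\, r^{j}},$$

which the code evaluates as one multi-exponentiation: the getter produces the scalar $\sum_{j} a_{i,j} r^{j}$ for each $i$, and `par_multiscalar` combines those scalars with the bases.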

