fix: restructure use of rayon to avoid deadlocks
- perf: Use rayon only in the places where the work is CPU bound.
- fix: Do not use custom rayon thread pools, as multiple pools can deadlock against each other (cross-pool deadlocks).
- fix: Proxy `BELLMAN_NUM_CPUS` to `RAYON_NUM_THREADS` for now, and deprecate it.
- refactor: Use `RAYON_NUM_THREADS` as the baseline for determining the number of threads to use.
- perf: Use `in_place_scope` instead of `scope` to avoid spawning unnecessary threads (see the sketch after this list).
- ci: Add a CI job that exercises the currently deadlocking `groth16_bench` test (run with `--ignored`).
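A minimal sketch of the pattern described above (illustrative only, not code from this commit): work is spawned onto the single global rayon pool via `in_place_scope`, so there is no second pool to deadlock against, and the calling thread runs the scope body itself rather than handing it off to a freshly spawned thread.

```rust
// Sketch: run two independent computations on the global rayon pool.
// `rayon::scope` may migrate the closure onto a pool worker thread, while
// `rayon::in_place_scope` (rayon >= 1.5) keeps it on the calling thread;
// both block until all spawned work inside the scope has completed.
fn sum_in_parallel(values: &[u64]) -> u64 {
    let (lo, hi) = values.split_at(values.len() / 2);
    let (mut left, mut right) = (0u64, 0u64);
    rayon::in_place_scope(|s| {
        s.spawn(|_| left = lo.iter().sum());
        s.spawn(|_| right = hi.iter().sum());
    });
    left + right
}
```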

Co-authored-by: nemo <nemo@protocol.ai>
dignifiedquire and cryptonemo committed Sep 10, 2021
1 parent 7f93cc7 commit f0f8fe9
Showing 10 changed files with 344 additions and 221 deletions.
26 changes: 26 additions & 0 deletions .circleci/config.yml
@@ -92,6 +92,21 @@ commands:
          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features gpu,blst
          no_output_timeout: 30m

  test_target_blst_gpu_ignored:
    parameters:
      target:
        type: string
    steps:
      - *restore-workspace
      - *restore-cache
      - run:
          name: Test blst ignored (<< parameters.target >>)
          command: TARGET=<< parameters.target >> cargo test --release --no-default-features --features blst -- --ignored
          no_output_timeout: 30m
      - run:
          name: Show results
          command: cat aggregation.csv

jobs:

cargo_fetch:
@@ -164,6 +179,14 @@ jobs:
      - test_target_blst_gpu:
          target: "x86_64-unknown-linux-gnu"

  test_blst_gpu_ignored_x86_64-unknown-linux-gnu:
    executor: default
    steps:
      - set-env-path
      - install-gpu-deps
      - test_target_blst_gpu_ignored:
          target: "x86_64-unknown-linux-gnu"

  rustfmt:
    executor: default
    steps:
@@ -271,6 +294,9 @@ workflows:
      - test_blst_gpu_x86_64-unknown-linux-gnu:
          requires:
            - cargo_fetch
      - test_blst_gpu_ignored_x86_64-unknown-linux-gnu:
          requires:
            - cargo_fetch

#- coverage_run:
# name: coverage_default_features
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -23,7 +23,7 @@ byteorder = "1"
log = "0.4.8"
lazy_static = "1.4.0"
rand = "0.7"
rayon = "1.3.0"
rayon = "1.5.0"
memmap = "0.7.0"
thiserror = "1.0.10"
rustc-hash = "1.1.0"
9 changes: 9 additions & 0 deletions README.md
@@ -66,6 +66,15 @@ The gpu extension contains some env vars that may be set externally to this library
env::set_var("BELLMAN_CPU_UTILIZATION", "0.5");
```

- `RAYON_NUM_THREADS`

Restricts the library to roughly twice the specified number of threads (best effort). Previously this was configured with `BELLMAN_NUM_CPUS`, which is now deprecated. The default is the number of logical cores reported on the machine.

```rust
// Example
env::set_var("RAYON_NUM_THREADS", "6");
```
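
The proxying of the deprecated variable is not shown in the hunks above; the following is a minimal sketch of what such a shim could look like (the helper name and the log message are hypothetical, not the library's actual code):

```rust
use std::env;

// Hypothetical helper: honor the deprecated `BELLMAN_NUM_CPUS` variable by
// forwarding it to `RAYON_NUM_THREADS`, but only when the latter has not
// been set explicitly.
fn proxy_bellman_num_cpus() {
    if env::var("RAYON_NUM_THREADS").is_err() {
        if let Ok(num_cpus) = env::var("BELLMAN_NUM_CPUS") {
            eprintln!("BELLMAN_NUM_CPUS is deprecated, please use RAYON_NUM_THREADS");
            env::set_var("RAYON_NUM_THREADS", num_cpus);
        }
    }
}
```

For the forwarded value to take effect, it must be set before rayon's global thread pool is first used, since `RAYON_NUM_THREADS` is only read at pool initialization.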

#### Supported / Tested Cards

Depending on the size of the proof being passed to the GPU for work, certain cards will not be able to allocate enough memory for either the FFT or the Multiexp kernel. Below is a list of devices that work for small sets. In the future we will add the cutoff point at which a given card will not be able to allocate enough memory to utilize the GPU.
12 changes: 6 additions & 6 deletions src/groth16/aggregate/macros.rs
@@ -3,10 +3,10 @@ macro_rules! try_par {
$(
let mut $name = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name = &mut $name;
s.execute(move || {
s.spawn(move |_| {
*$name = Some($f);
});)+
});
@@ -21,10 +21,10 @@ macro_rules! par {
$(
let mut $name = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name = &mut $name;
s.execute(move || {
s.spawn(move |_| {
*$name = Some($f);
});)+
});
@@ -38,11 +38,11 @@
let mut $name1 = None;
let mut $name2 = None;
)+
crate::multicore::THREAD_POOL.scoped(|s| {
rayon::in_place_scope(|s| {
$(
let $name1 = &mut $name1;
let $name2 = &mut $name2;
s.execute(move || {
s.spawn(move |_| {
let (a, b) = $f;
*$name1 = Some(a);
*$name2 = Some(b);
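For reference, a hand-expanded sketch of what a two-binding `try_par!` call roughly becomes after this change (illustrative only; the exact invocation syntax and the error-propagation step after the scope are assumptions, since that part of the macro is not shown in the hunk above):

```rust
fn compute_x() -> Result<u64, String> { Ok(1) }
fn compute_y() -> Result<u64, String> { Ok(2) }

// Roughly equivalent to `try_par! { let x = compute_x(), let y = compute_y() }`:
// both closures run on the global rayon pool inside `in_place_scope`, and the
// calling thread blocks until they finish (and may help execute them).
fn expanded() -> Result<(u64, u64), String> {
    let mut x = None;
    let mut y = None;
    rayon::in_place_scope(|s| {
        let x = &mut x;
        s.spawn(move |_| *x = Some(compute_x()));
        let y = &mut y;
        s.spawn(move |_| *y = Some(compute_y()));
    });
    let x = x.unwrap()?;
    let y = y.unwrap()?;
    Ok((x, y))
}
```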
80 changes: 40 additions & 40 deletions src/groth16/aggregate/verify.rs
@@ -132,46 +132,46 @@ pub fn verify_aggregate_proof<E: Engine + std::fmt::Debug, R: rand::RngCore + Se
// 5. compute the middle part of the final pairing equation, the one
// with the public inputs
let middle = {
// We want to compute MUL(i:0 -> l) S_i ^ (SUM(j:0 -> n) ai,j * r^j)
// this table keeps tracks of incremental computation of each i-th
// exponent to later multiply with S_i
// The index of the table is i, which is an index of the public
// input element
// We incrementally build the r vector and the table
// NOTE: in this version it's not r^2j but simply r^j

let l = public_inputs[0].len();
let mut g_ic = pvk.ic_projective[0];
g_ic.mul_assign(r_sum);

let powers = r_vec_receiver.recv().unwrap();

let now = Instant::now();
// now we do the multi exponentiation
let getter = |i: usize| -> <E::Fr as PrimeField>::Repr {
// i denotes the column of the public input, and j denotes which public input
let mut c = public_inputs[0][i];
for j in 1..public_inputs.len() {
let mut ai = public_inputs[j][i];
ai.mul_assign(&powers[j]);
c.add_assign(&ai);
}
c.into_repr()
};

let totsi = par_multiscalar::<_, E::G1Affine>(
&ScalarList::Getter(getter, l),
&pvk.multiscalar.at_point(1),
std::mem::size_of::<<E::Fr as PrimeField>::Repr>() * 8,
);

g_ic.add_assign(&totsi);

let ml = E::miller_loop(&[(&g_ic.into_affine().prepare(), &pvk.gamma_g2)]);
let elapsed = now.elapsed().as_millis();
debug!("table generation: {}ms", elapsed);

ml
// We want to compute MUL(i:0 -> l) S_i ^ (SUM(j:0 -> n) ai,j * r^j)
// this table keeps tracks of incremental computation of each i-th
// exponent to later multiply with S_i
// The index of the table is i, which is an index of the public
// input element
// We incrementally build the r vector and the table
// NOTE: in this version it's not r^2j but simply r^j

let l = public_inputs[0].len();
let mut g_ic = pvk.ic_projective[0];
g_ic.mul_assign(r_sum);

let powers = r_vec_receiver.recv().unwrap();

let now = Instant::now();
// now we do the multi exponentiation
let getter = |i: usize| -> <E::Fr as PrimeField>::Repr {
// i denotes the column of the public input, and j denotes which public input
let mut c = public_inputs[0][i];
for j in 1..public_inputs.len() {
let mut ai = public_inputs[j][i];
ai.mul_assign(&powers[j]);
c.add_assign(&ai);
}
c.into_repr()
};

let totsi = par_multiscalar::<_, E::G1Affine>(
&ScalarList::Getter(getter, l),
&pvk.multiscalar.at_point(1),
std::mem::size_of::<<E::Fr as PrimeField>::Repr>() * 8,
);

g_ic.add_assign(&totsi);

let ml = E::miller_loop(&[(&g_ic.into_affine().prepare(), &pvk.gamma_g2)]);
let elapsed = now.elapsed().as_millis();
debug!("table generation: {}ms", elapsed);

ml
}
};
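
In standard notation, the quantity the comments above describe (with $a_{i,j}$ the $i$-th public input of proof $j$, and $S_i$ the corresponding verification-key base) is

$$\prod_{i=0}^{l} S_i^{\;\sum_{j=0}^{n} a_{i,j}\, r^{j}},$$

which the code evaluates as one multi-exponentiation: the getter produces the scalar $\sum_{j} a_{i,j} r^{j}$ for each $i$, and `par_multiscalar` combines those scalars with the bases.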

