Commit

feat: add support for CUDA
Neptune now also runs on CUDA.

Closes #104.
vmx committed Sep 22, 2021
1 parent e70ff72 commit 5bf574e
Showing 19 changed files with 716 additions and 333 deletions.
70 changes: 51 additions & 19 deletions .circleci/config.yml
@@ -3,7 +3,7 @@ version: 2.1
executors:
default:
machine:
image: ubuntu-1604-cuda-10.1:201909-23
image: ubuntu-2004-cuda-11.2:202103-01
working_directory: ~/gpuci
resource_class: gpu.nvidia.medium

@@ -18,6 +18,22 @@ restore-cache: &restore-cache
- repo-source-{{ .Branch }}-{{ .Revision }}

commands:
set-env-path:
steps:
- run:
name: Set the PATH env variable
command: |
echo 'export PATH="$HOME:~/.cargo/bin:/usr/local/cuda-11.2/bin:$PATH"' | tee --append $BASH_ENV
source $BASH_ENV
install-gpu-deps:
steps:
- run:
name: Install libraries for GPU tests
command: |
sudo apt update
sudo apt install -y ocl-icd-opencl-dev
test_target:
parameters:
target:
@@ -35,19 +51,31 @@ commands:
no_output_timeout: 30m
- run:
name: Test (opencl) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features opencl -- --test-threads=1
command: TARGET=<< parameters.target >> cargo test --release --features opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m
# Darwin CI doesn't support CUDA
- when:
condition:
not:
equal: [ darwin, << parameters.target >> ]
steps:
- run:
name: Test (cuda) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features cuda,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m
- run:
name: Test (cuda,opencl) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features cuda,opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m

jobs:
cargo_fetch:
executor: default
steps:
- checkout
- run: curl https://sh.rustup.rs -sSf | sh -s -- -y
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: echo $BASH_ENV
- set-env-path
- run: echo $HOME
- run: source $BASH_ENV
- run: cargo --version
- run: rustc --version
- run:
@@ -77,12 +105,14 @@ jobs:

test_x86_64-unknown-linux-gnu:
executor: default
environment:
RUST_LOG: debug
# Build the kernel only for the single architecture that is used on CI. This should reduce
# the overall compile-time significantly.
NEPTUNE_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
steps:
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- run: sudo apt-get update -y
- run: apt-cache search opencl
- run: sudo apt install -y ocl-icd-opencl-dev
- set-env-path
- install-gpu-deps
- test_target:
target: "x86_64-unknown-linux-gnu"

@@ -109,8 +139,7 @@ jobs:
steps:
- *restore-workspace
- *restore-cache
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- set-env-path
- run:
name: Run cargo fmt
command: cargo fmt --all -- --check
@@ -120,22 +149,25 @@
steps:
- *restore-workspace
- *restore-cache
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- run: sudo apt-get update -y
- run: apt-cache search opencl
- run: sudo apt install -y ocl-icd-opencl-dev
- set-env-path
- install-gpu-deps
- run:
# gbench doesn't support running without GPU support, hence don't run Clippy on the whole workspace
name: Run cargo clippy, without gbench
command: cargo clippy --all-targets -- -D warnings
- run:
name: Run cargo clippy (futhark)
command: cargo clippy --workspace --all-targets --features futhark -- -D warnings
command: cargo clippy --workspace --all-targets --no-default-features --features futhark -- -D warnings
- run:
name: Run cargo clippy (opencl)
# gbench enables `futhark` by default, disable it using `--no-default-features`.
command: cargo clippy --workspace --all-targets --no-default-features --features opencl -- -D warnings
command: cargo clippy --workspace --all-targets --no-default-features --features opencl,arity2 -- -D warnings
- run:
name: Run cargo clippy (cuda)
command: cargo clippy --workspace --all-targets --no-default-features --features cuda,arity2 -- -D warnings
- run:
name: Run cargo clippy (cuda,opencl)
command: cargo clippy --workspace --all-targets --no-default-features --features cuda,opencl,arity2 -- -D warnings

workflows:
version: 2.1
25 changes: 22 additions & 3 deletions Cargo.toml
@@ -20,7 +20,7 @@ ff = "0.11.0"
generic-array = "0.14.4"
itertools = { version = "0.8.0" }
log = "0.4.8"
rust-gpu-tools = { version = "0.4.0", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", branch = "master", default-features = false, optional = true }
triton = { version = "2.1.0", package = "neptune-triton", default-features = false, features = ["opencl"], optional = true }
pairing = "0.21"

@@ -32,6 +32,14 @@ tempdir = "0.3"
rand_xorshift = "0.3.0"
serde_json = "1.0.53"

[build-dependencies]
blstrs = { git = "https://github.com/filecoin-project/blstrs", branch = "master" }
ec-gpu = { git = "https://github.com/filecoin-project/ec-gpu", branch = "master", optional = true }
ec-gpu-gen = { git = "https://github.com/filecoin-project/ec-gpu", branch = "master", optional = true }
execute = "0.2.9"
hex = "0.4"
sha2 = "0.9"

[[bench]]
name = "hash"
harness = false
@@ -46,8 +54,19 @@ codegen-units = 1

[features]
default = []
futhark = ["triton", "rust-gpu-tools"]
opencl = ["rust-gpu-tools", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
cuda = ["rust-gpu-tools/cuda", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
futhark = ["triton", "rust-gpu-tools/opencl"]
opencl = ["rust-gpu-tools/opencl", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
# The supported arities for Poseidon running on the GPU are specified at compile-time.
arity2 = []
arity4 = []
arity8 = []
arity11 = []
arity16 = []
arity24 = []
arity36 = []
# With this feature set, the strengthened version of the kernel will also be compiled.
strengthened = []

[workspace]
members = [
43 changes: 33 additions & 10 deletions README.md
@@ -17,28 +17,51 @@ selection (including especially 2, 4, and 8 — which are explicitly, rather tha
Proofs](https://github.com/filecoin-project/rust-fil-proofs) make heavy use of 8-ary merkle trees and merkle inclusion
proofs (in SNARKs).

At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds.

## Implementation Specification

Filecoin's Poseidon specification is published in the Filecoin specification document [here](https://spec.filecoin.io/#section-algorithms.crypto.poseidon). Additionally, a PDF version is mirrored in this repo [here](poseidon_spec.pdf).

## Environment variables

- `NEPTUNE_DEFAULT_GPU=<unique-id>` allows you to select the default GPU that the tree builder runs on, given its unique ID (see the sketch at the end of this section).

(The unique ID is the UUID or the hexadecimal Bus-ID, which can be found via `nvidia-smi`, `rocm-smi`, `lspci`, etc.)

- `NEPTUNE_GPU_FRAMEWORK=<cuda | opencl>` allows you to select whether the CUDA or OpenCL implementation should be used. If not set, `cuda` is used if available.

- `NEPTUNE_CUDA_NVCC_ARGS`

By default the CUDA kernel is compiled for several architectures, which may take a long time. `NEPTUNE_CUDA_NVCC_ARGS` can be used to override those arguments. The input and output files are still set automatically.

# Example for compiling the kernel for only the Turing architecture
NEPTUNE_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75"
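
Below is a minimal Rust sketch (not part of this commit) showing how a `NEPTUNE_DEFAULT_GPU` value can be resolved to a device. It mirrors the lookup done in `gbench/src/main.rs`; the `Device` and `UniqueId` types come from `rust-gpu-tools`, and the exact signatures are assumptions that may differ between versions.

    use rust_gpu_tools::{Device, UniqueId};
    use std::convert::TryFrom;

    fn main() {
        // Resolve the device the same way gbench does: either by the unique ID given
        // in NEPTUNE_DEFAULT_GPU, or by falling back to the first device found.
        let device = match std::env::var("NEPTUNE_DEFAULT_GPU") {
            Ok(id) => {
                let unique_id = UniqueId::try_from(id.as_str()).expect("Invalid unique ID!");
                Device::by_unique_id(unique_id)
                    .unwrap_or_else(|| panic!("No device with unique ID {} found!", unique_id))
            }
            Err(_) => *Device::all().first().expect("Cannot get a default device"),
        };
        // `device` would then be handed to the batch hasher / tree builder.
        let _ = device;
    }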

## Rust feature flags

Neptune also supports batch hashing and tree building, which can be performed on a GPU. The underlying GPU
implementation, [neptune-triton](https://github.com/filecoin-project/neptune-triton) is implemented in the [Futhark
Programming Language](https://futhark-lang.org/). To use `neptune-triton` GPU batch hashing, compile `neptune` with the
`futhark` feature.

Neptune now implements GPU batch hashing in pure OpenCL. The initial implementation is a bit less than 2x faster than
the Futhark implementation, so once stabilized this will likely be the preferred option. The pure OpenCL batch hashing
is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` feature.
Neptune now implements GPU batch hashing in pure CUDA/OpenCL. The initial implementation is a bit less than 2x faster than
the Futhark implementation, so once stabilized this will likely be the preferred option. The pure CUDA/OpenCL batch hashing
is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` and/or `cuda` feature.

The `futhark` and `opencl` features are mutually exclusive.
The `futhark` feature is mutually exclusive with `cuda` and `opencl`. The `cuda` and `opencl` features can be used independently or together. If both `cuda` and `opencl` are enabled, you can select which implementation to use at run time via the `NEPTUNE_GPU_FRAMEWORK` environment variable.
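
For example (a hypothetical invocation, mirroring the test command in the "Running the tests" section below), both backends can be compiled in and the OpenCL implementation forced at run time:

    NEPTUNE_GPU_FRAMEWORK=opencl cargo test --no-default-features --features blst,cuda,opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36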

At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds.
### Arities

## Implementation Specification
The CUDA/OpenCL kernel (enabled with the `cuda` and/or `opencl` feature) is generated for specific arities, which need to be specified at compile time via Rust feature flags. Available features are `arity2`, `arity4`, `arity8`, `arity11`, `arity16`, `arity24`, and `arity36`. When the `strengthened` feature is enabled, an additional strengthened version is available for each arity.

Filecoin's Poseidon specification is published in the Filecoin specification document [here](https://spec.filecoin.io/#section-algorithms.crypto.poseidon). Additionally, a PDF version is mirrored in this repo [here](poseidon_spec.pdf).
When using the `cuda` feature, the kernel is generated at compile time. The more arities are enabled, the longer the compile time. Hence, no specific arities are enabled by default; you need to set at least one yourself.
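
For example, a hypothetical build that compiles the CUDA kernel for the 8-ary variant only (mirroring the feature list of the test command in the next section):

    cargo build --no-default-features --features blst,cuda,arity8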

## Environment variables
## Running the tests

- `NEPTUNE_DEFAULT_GPU=<unique-id>` allows you to select the default GPU that tree-builder is going to run on given its unique ID.
As the compile time of the kernel depends on how many arities are used, no arities are enabled by default. In order to run the tests, all arities need to be enabled explicitly. To run all tests on, e.g., the CUDA implementation, run:

(The unique ID is the UUID or the hexadecimal Bus-ID that can be found through `nvidia-smi`, `rocm-smi`, `lspci` and etc.)
cargo test --no-default-features --features blst,cuda,arity2,arity4,arity8,arity11,arity16,arity24,arity36

## Future Work

83 changes: 83 additions & 0 deletions build.rs
@@ -0,0 +1,83 @@
/// The build script is needed to compile the CUDA kernel.

#[cfg(feature = "cuda")]
fn main() {
use std::path::PathBuf;
use std::process::Command;
use std::{env, fs};

use blstrs::Scalar as Fr;
use ec_gpu_gen::Limb32;
use sha2::{Digest, Sha256};

#[path = "src/proteus/sources.rs"]
mod sources;

let kernel_source = sources::generate_program::<Fr, Limb32>();
let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");

// Make it possible to override the default options. The source and output files are
// always set automatically.
let mut nvcc = match env::var("NEPTUNE_CUDA_NVCC_ARGS") {
Ok(args) => execute::command(format!("nvcc {}", args)),
Err(_) => {
let mut command = Command::new("nvcc");
command
.arg("--optimize=6")
.arg("--fatbin")
.arg("--gpu-architecture=sm_86")
.arg("--generate-code=arch=compute_86,code=sm_86")
.arg("--generate-code=arch=compute_80,code=sm_80")
.arg("--generate-code=arch=compute_75,code=sm_75");
command
}
};

// Hash the source and the compile flags. Use that as the filename, so that the kernel is
// only rebuilt if any of them change.
let mut hasher = Sha256::new();
hasher.update(kernel_source.as_bytes());
hasher.update(&format!("{:?}", &nvcc));
let kernel_digest = hex::encode(hasher.finalize());

let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
.iter()
.collect();
let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
.iter()
.collect();

fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
panic!(
"Cannot write kernel source at {}.",
source_path.to_str().unwrap()
)
});

// Only compile if the output doesn't exist yet.
if !fatbin_path.as_path().exists() {
let status = nvcc
.arg("--output-file")
.arg(&fatbin_path)
.arg(&source_path)
.status()
.expect("Cannot run nvcc.");

if !status.success() {
panic!(
"nvcc failed. See the kernel source at {}",
source_path.to_str().unwrap()
);
}
}

// The idea to put the path to the fatbin into a compile-time env variable is from
// https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
println!(
"cargo:rustc-env=CUDA_FATBIN={}",
fatbin_path.to_str().unwrap()
);
}

#[cfg(not(feature = "cuda"))]
fn main() {}
7 changes: 4 additions & 3 deletions gbench/Cargo.toml
@@ -16,11 +16,12 @@ ff = "0.11.0"
generic-array = "0.14.4"
log = "0.4.8"
neptune = { path = "../", default-features = false }
rust-gpu-tools = { version = "0.4.0", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", branch = "master", default-features = false, optional = true }
structopt = { version = "0.3", default-features = false }
blstrs = { git = "https://github.com/filecoin-project/blstrs", branch = "master" }

[features]
default = ["futhark"]
futhark = ["neptune/futhark", "rust-gpu-tools"]
opencl = ["neptune/opencl", "rust-gpu-tools"]
cuda = ["neptune/cuda", "rust-gpu-tools/cuda"]
futhark = ["neptune/futhark", "rust-gpu-tools/opencl"]
opencl = ["neptune/opencl", "rust-gpu-tools/opencl"]
12 changes: 5 additions & 7 deletions gbench/src/main.rs
@@ -6,7 +6,7 @@ use generic_array::GenericArray;
use log::info;
use neptune::column_tree_builder::{ColumnTreeBuilder, ColumnTreeBuilderTrait};
use neptune::{batch_hasher::Batcher, BatchHasher};
use rust_gpu_tools::opencl::{Device, UniqueId};
use rust_gpu_tools::{Device, UniqueId};
use std::convert::TryFrom;
use std::thread;
use std::time::Instant;
@@ -100,7 +100,7 @@ struct Opts {
}

fn main() {
#[cfg(all(any(feature = "gpu", feature = "opencl"), target_os = "macos"))]
#[cfg(all(any(feature = "cuda", feature = "opencl"), target_os = "macos"))]
unimplemented!("Running on macos is not recommended and may have bad consequences -- experiment at your own risk.");
env_logger::init();

@@ -121,17 +121,15 @@ fn main() {
// Comma separated list of GPU bus-ids
let gpus = std::env::var("NEPTUNE_GBENCH_GPUS");

let default_device = *Device::all().first().unwrap();
let default_device = *Device::all().first().expect("Cannot get a default device");

let devices = gpus
.map(|v| {
v.split(',')
.map(|s| UniqueId::try_from(s).expect("Invalid unique ID!"))
.map(|unique_id| {
let device = Device::by_unique_id(unique_id).unwrap_or_else(|_| {
panic!("No device with unique ID {} found!", unique_id)
});
device
Device::by_unique_id(unique_id)
.unwrap_or_else(|| panic!("No device with unique ID {} found!", unique_id))
})
.collect::<Vec<_>>()
})