Commit

feat: add support for CUDA
Neptune now also runs on CUDA.

Closes #104.
vmx committed Sep 22, 2021
1 parent e70ff72 commit 5bf574e
Showing 19 changed files with 716 additions and 333 deletions.
70 changes: 51 additions & 19 deletions .circleci/config.yml
@@ -3,7 +3,7 @@ version: 2.1
executors:
default:
machine:
image: ubuntu-1604-cuda-10.1:201909-23
image: ubuntu-2004-cuda-11.2:202103-01
working_directory: ~/gpuci
resource_class: gpu.nvidia.medium

@@ -18,6 +18,22 @@ restore-cache: &restore-cache
- repo-source-{{ .Branch }}-{{ .Revision }}

commands:
set-env-path:
steps:
- run:
name: Set the PATH env variable
command: |
echo 'export PATH="$HOME:~/.cargo/bin:/usr/local/cuda-11.2/bin:$PATH"' | tee --append $BASH_ENV
source $BASH_ENV
install-gpu-deps:
steps:
- run:
name: Install libraries for GPU tests
command: |
sudo apt update
sudo apt install -y ocl-icd-opencl-dev
test_target:
parameters:
target:
@@ -35,19 +51,31 @@ commands:
no_output_timeout: 30m
- run:
name: Test (opencl) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features opencl -- --test-threads=1
command: TARGET=<< parameters.target >> cargo test --release --features opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m
# Darwin CI doesn't support CUDA
- when:
condition:
not:
equal: [ darwin, << parameters.target >> ]
steps:
- run:
name: Test (cuda) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features cuda,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m
- run:
name: Test (cuda,opencl) (<< parameters.target >>)
command: TARGET=<< parameters.target >> cargo test --release --features cuda,opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36 -- --test-threads=1
no_output_timeout: 30m

jobs:
cargo_fetch:
executor: default
steps:
- checkout
- run: curl https://sh.rustup.rs -sSf | sh -s -- -y
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: echo $BASH_ENV
- set-env-path
- run: echo $HOME
- run: source $BASH_ENV
- run: cargo --version
- run: rustc --version
- run:
@@ -77,12 +105,14 @@ jobs:

test_x86_64-unknown-linux-gnu:
executor: default
environment:
RUST_LOG: debug
# Build the kernel only for the single architecture that is used on CI. This should reduce
# the overall compile-time significantly.
NEPTUNE_CUDA_NVCC_ARGS: --fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75
steps:
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- run: sudo apt-get update -y
- run: apt-cache search opencl
- run: sudo apt install -y ocl-icd-opencl-dev
- set-env-path
- install-gpu-deps
- test_target:
target: "x86_64-unknown-linux-gnu"

@@ -109,8 +139,7 @@ jobs:
steps:
- *restore-workspace
- *restore-cache
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- set-env-path
- run:
name: Run cargo fmt
command: cargo fmt --all -- --check
@@ -120,22 +149,25 @@
steps:
- *restore-workspace
- *restore-cache
- run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
- run: source $BASH_ENV
- run: sudo apt-get update -y
- run: apt-cache search opencl
- run: sudo apt install -y ocl-icd-opencl-dev
- set-env-path
- install-gpu-deps
- run:
# gbench doesn't support running without GPU support, hence don't run Clippy on the whole workspace
name: Run cargo clippy, without gbench
command: cargo clippy --all-targets -- -D warnings
- run:
name: Run cargo clippy (futhark)
command: cargo clippy --workspace --all-targets --features futhark -- -D warnings
command: cargo clippy --workspace --all-targets --no-default-features --features futhark -- -D warnings
- run:
name: Run cargo clippy (opencl)
# gbench enables `futhark` by default, disable it using `--no-default-features`.
command: cargo clippy --workspace --all-targets --no-default-features --features opencl -- -D warnings
command: cargo clippy --workspace --all-targets --no-default-features --features opencl,arity2 -- -D warnings
- run:
name: Run cargo clippy (cuda)
command: cargo clippy --workspace --all-targets --no-default-features --features cuda,arity2 -- -D warnings
- run:
name: Run cargo clippy (cuda,opencl)
command: cargo clippy --workspace --all-targets --no-default-features --features cuda,opencl,arity2 -- -D warnings

workflows:
version: 2.1
25 changes: 22 additions & 3 deletions Cargo.toml
@@ -20,7 +20,7 @@ ff = "0.11.0"
generic-array = "0.14.4"
itertools = { version = "0.8.0" }
log = "0.4.8"
rust-gpu-tools = { version = "0.4.0", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", branch = "master", default-features = false, optional = true }
triton = { version = "2.1.0", package = "neptune-triton", default-features = false, features = ["opencl"], optional = true }
pairing = "0.21"

@@ -32,6 +32,14 @@ tempdir = "0.3"
rand_xorshift = "0.3.0"
serde_json = "1.0.53"

[build-dependencies]
blstrs = { git = "https://github.com/filecoin-project/blstrs", branch = "master" }
ec-gpu = { git = "https://github.com/filecoin-project/ec-gpu", branch = "master", optional = true }
ec-gpu-gen = { git = "https://github.com/filecoin-project/ec-gpu", branch = "master", optional = true }
execute = "0.2.9"
hex = "0.4"
sha2 = "0.9"

[[bench]]
name = "hash"
harness = false
@@ -46,8 +54,19 @@ codegen-units = 1

[features]
default = []
futhark = ["triton", "rust-gpu-tools"]
opencl = ["rust-gpu-tools", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
cuda = ["rust-gpu-tools/cuda", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
futhark = ["triton", "rust-gpu-tools/opencl"]
opencl = ["rust-gpu-tools/opencl", "blstrs/gpu", "ec-gpu-gen", "ec-gpu"]
# The supported arities for Poseidon running on the GPU are specified at compile-time.
arity2 = []
arity4 = []
arity8 = []
arity11 = []
arity16 = []
arity24 = []
arity36 = []
# With this feature set, the strengthened version of the kernel will also be compiled.
strengthened = []

[workspace]
members = [
43 changes: 33 additions & 10 deletions README.md
@@ -17,28 +17,51 @@ selection (including especially 2, 4, and 8 — which are explicitly, rather tha
Proofs](https://github.com/filecoin-project/rust-fil-proofs) make heavy use of 8-ary merkle trees and merkle inclusion
proofs (in SNARKs).

At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds.

## Implementation Specification

Filecoin's Poseidon specification is published in the Filecoin specification document [here](https://spec.filecoin.io/#section-algorithms.crypto.poseidon). Additionally, a PDF version is mirrored in this repo [here](poseidon_spec.pdf).

## Environment variables

- `NEPTUNE_DEFAULT_GPU=<unique-id>` allows you to select the default GPU that the tree builder runs on, given its unique ID (see the sketch at the end of this section).

(The unique ID is the UUID or the hexadecimal Bus-ID, which can be found via `nvidia-smi`, `rocm-smi`, `lspci`, etc.)

- `NEPTUNE_GPU_FRAMEWORK=<cuda | opencl>` allows you to select whether the CUDA or OpenCL implementation should be used. If not set, `cuda` is used if available.

- `NEPTUNE_CUDA_NVCC_ARGS`

By default the CUDA kernel is compiled for several architectures, which may take a long time. `NEPTUNE_CUDA_NVCC_ARGS` can be used to override those arguments. The input and output files are still set automatically.

# Example for compiling the kernel for only the Turing architecture
NEPTUNE_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_75 --generate-code=arch=compute_75,code=sm_75"
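
Below is a minimal Rust sketch (not part of this commit) showing how a `NEPTUNE_DEFAULT_GPU` value can be resolved to a device. It mirrors the lookup done in `gbench/src/main.rs`; the `Device` and `UniqueId` types come from `rust-gpu-tools`, and the exact signatures are assumptions that may differ between versions.

    use rust_gpu_tools::{Device, UniqueId};
    use std::convert::TryFrom;

    fn main() {
        // Resolve the device the same way gbench does: either by the unique ID given
        // in NEPTUNE_DEFAULT_GPU, or by falling back to the first device found.
        let device = match std::env::var("NEPTUNE_DEFAULT_GPU") {
            Ok(id) => {
                let unique_id = UniqueId::try_from(id.as_str()).expect("Invalid unique ID!");
                Device::by_unique_id(unique_id)
                    .unwrap_or_else(|| panic!("No device with unique ID {} found!", unique_id))
            }
            Err(_) => *Device::all().first().expect("Cannot get a default device"),
        };
        // `device` would then be handed to the batch hasher / tree builder.
        let _ = device;
    }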

## Rust feature flags

Neptune also supports batch hashing and tree building, which can be performed on a GPU. The underlying GPU
implementation, [neptune-triton](https://github.com/filecoin-project/neptune-triton) is implemented in the [Futhark
Programming Language](https://futhark-lang.org/). To use `neptune-triton` GPU batch hashing, compile `neptune` with the
`futhark` feature.

Neptune now implements GPU batch hashing in pure OpenCL. The initial implementation is a bit less than 2x faster than
the Futhark implementation, so once stabilized this will likely be the preferred option. The pure OpenCL batch hashing
is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` feature.
Neptune now implements GPU batch hashing in pure CUDA/OpenCL. The initial implementation is a bit less than 2x faster than
the Futhark implementation, so once stabilized this will likely be the preferred option. The pure CUDA/OpenCL batch hashing
is provided by the internal `proteus` module. To use `proteus`, compile `neptune` with the `opencl` and/or `cuda` feature.

The `futhark` and `opencl` features are mutually exclusive.
The `futhark` feature is mutually exclusive with `cuda` and `opencl`. The `cuda` and `opencl` features can be used independently or together. If both `cuda` and `opencl` are enabled, you can select which implementation to use at run time via the `NEPTUNE_GPU_FRAMEWORK` environment variable.
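
For example (a hypothetical invocation, mirroring the test command in the "Running the tests" section below), both backends can be compiled in and the OpenCL implementation forced at run time:

    NEPTUNE_GPU_FRAMEWORK=opencl cargo test --no-default-features --features blst,cuda,opencl,arity2,arity4,arity8,arity11,arity16,arity24,arity36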

At the time of the 1.0.0 release, Neptune on RTX 2080Ti GPU can build 8-ary Merkle trees for 4GiB of input in 16 seconds.
### Arities

## Implementation Specification
The CUDA/OpenCL kernel (enabled with the `cuda` and/or `opencl` feature) is generated for specific arities, which need to be specified at compile time via Rust feature flags. Available features are `arity2`, `arity4`, `arity8`, `arity11`, `arity16`, `arity24`, and `arity36`. When the `strengthened` feature is enabled, an additional strengthened version is available for each arity.

Filecoin's Poseidon specification is published in the Filecoin specification document [here](https://spec.filecoin.io/#section-algorithms.crypto.poseidon). Additionally, a PDF version is mirrored in this repo [here](poseidon_spec.pdf).
When using the `cuda` feature, the kernel is generated at compile time. The more arities are enabled, the longer the compile time. Hence, no specific arities are enabled by default; you need to set at least one yourself.
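
For example, a hypothetical build that compiles the CUDA kernel for the 8-ary variant only (mirroring the feature list of the test command in the next section):

    cargo build --no-default-features --features blst,cuda,arity8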

## Environment variables
## Running the tests

- `NEPTUNE_DEFAULT_GPU=<unique-id>` allows you to select the default GPU that tree-builder is going to run on given its unique ID.
As the compile time of the kernel depends on how many arities are used, no arities are enabled by default. In order to run the tests, all arities need to be enabled explicitly. To run all tests on, e.g., the CUDA implementation, run:

(The unique ID is the UUID or the hexadecimal Bus-ID that can be found through `nvidia-smi`, `rocm-smi`, `lspci` and etc.)
cargo test --no-default-features --features blst,cuda,arity2,arity4,arity8,arity11,arity16,arity24,arity36

## Future Work

83 changes: 83 additions & 0 deletions build.rs
@@ -0,0 +1,83 @@
/// The build script is needed to compile the CUDA kernel.

#[cfg(feature = "cuda")]
fn main() {
use std::path::PathBuf;
use std::process::Command;
use std::{env, fs};

use blstrs::Scalar as Fr;
use ec_gpu_gen::Limb32;
use sha2::{Digest, Sha256};

#[path = "src/proteus/sources.rs"]
mod sources;

let kernel_source = sources::generate_program::<Fr, Limb32>();
let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");

// Make it possible to override the default options. The source and output files are
// always set automatically.
let mut nvcc = match env::var("NEPTUNE_CUDA_NVCC_ARGS") {
Ok(args) => execute::command(format!("nvcc {}", args)),
Err(_) => {
let mut command = Command::new("nvcc");
command
.arg("--optimize=6")
.arg("--fatbin")
.arg("--gpu-architecture=sm_86")
.arg("--generate-code=arch=compute_86,code=sm_86")
.arg("--generate-code=arch=compute_80,code=sm_80")
.arg("--generate-code=arch=compute_75,code=sm_75");
command
}
};

// Hash the source and the compile flags. Use that as the filename, so that the kernel is
// only rebuilt if any of them change.
let mut hasher = Sha256::new();
hasher.update(kernel_source.as_bytes());
hasher.update(&format!("{:?}", &nvcc));
let kernel_digest = hex::encode(hasher.finalize());

let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
.iter()
.collect();
let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
.iter()
.collect();

fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
panic!(
"Cannot write kernel source at {}.",
source_path.to_str().unwrap()
)
});

// Only compile if the output doesn't exist yet.
if !fatbin_path.as_path().exists() {
let status = nvcc
.arg("--output-file")
.arg(&fatbin_path)
.arg(&source_path)
.status()
.expect("Cannot run nvcc.");

if !status.success() {
panic!(
"nvcc failed. See the kernel source at {}",
source_path.to_str().unwrap()
);
}
}

// The idea to put the path to the fatbin into a compile-time env variable is from
// https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
println!(
"cargo:rustc-env=CUDA_FATBIN={}",
fatbin_path.to_str().unwrap()
);
}

#[cfg(not(feature = "cuda"))]
fn main() {}
7 changes: 4 additions & 3 deletions gbench/Cargo.toml
@@ -16,11 +16,12 @@ ff = "0.11.0"
generic-array = "0.14.4"
log = "0.4.8"
neptune = { path = "../", default-features = false }
rust-gpu-tools = { version = "0.4.0", optional = true }
rust-gpu-tools = { git = "https://github.com/filecoin-project/rust-gpu-tools", branch = "master", default-features = false, optional = true }
structopt = { version = "0.3", default-features = false }
blstrs = { git = "https://github.com/filecoin-project/blstrs", branch = "master" }

[features]
default = ["futhark"]
futhark = ["neptune/futhark", "rust-gpu-tools"]
opencl = ["neptune/opencl", "rust-gpu-tools"]
cuda = ["neptune/cuda", "rust-gpu-tools/cuda"]
futhark = ["neptune/futhark", "rust-gpu-tools/opencl"]
opencl = ["neptune/opencl", "rust-gpu-tools/opencl"]
12 changes: 5 additions & 7 deletions gbench/src/main.rs
@@ -6,7 +6,7 @@ use generic_array::GenericArray;
use log::info;
use neptune::column_tree_builder::{ColumnTreeBuilder, ColumnTreeBuilderTrait};
use neptune::{batch_hasher::Batcher, BatchHasher};
use rust_gpu_tools::opencl::{Device, UniqueId};
use rust_gpu_tools::{Device, UniqueId};
use std::convert::TryFrom;
use std::thread;
use std::time::Instant;
@@ -100,7 +100,7 @@ struct Opts {
}

fn main() {
#[cfg(all(any(feature = "gpu", feature = "opencl"), target_os = "macos"))]
#[cfg(all(any(feature = "cuda", feature = "opencl"), target_os = "macos"))]
unimplemented!("Running on macos is not recommended and may have bad consequences -- experiment at your own risk.");
env_logger::init();

@@ -121,17 +121,15 @@ fn main() {
// Comma separated list of GPU bus-ids
let gpus = std::env::var("NEPTUNE_GBENCH_GPUS");

let default_device = *Device::all().first().unwrap();
let default_device = *Device::all().first().expect("Cannot get a default device");

let devices = gpus
.map(|v| {
v.split(',')
.map(|s| UniqueId::try_from(s).expect("Invalid unique ID!"))
.map(|unique_id| {
let device = Device::by_unique_id(unique_id).unwrap_or_else(|_| {
panic!("No device with unique ID {} found!", unique_id)
});
device
Device::by_unique_id(unique_id)
.unwrap_or_else(|| panic!("No device with unique ID {} found!", unique_id))
})
.collect::<Vec<_>>()
})