Cudnn support #16

Closed · wants to merge 29 commits

Commits (29)
99d8c8f: init (M1ngXU, Nov 5, 2022)
e9ae2a6: added activations (M1ngXU, Nov 5, 2022)
88bc500: forward conv (M1ngXU, Nov 6, 2022)
9860a89: cudnn runs on correct stream, activation backward (M1ngXU, Nov 7, 2022)
f1aa74d: added convolution backward op (M1ngXU, Nov 8, 2022)
0fdc45f: added batchnorm (backward might be wrong), better tensor allocation, fmt (M1ngXU, Nov 9, 2022)
27cd864: added tensor ops (M1ngXU, Nov 11, 2022)
6040079: using cuda cudnn result to propagate cuda errors instead of panicking (M1ngXU, Nov 11, 2022)
94dd49c: removed tensors from batch norm struct (M1ngXU, Nov 11, 2022)
fd7b1a9: now only using cuda cudnn result, refactored filter (M1ngXU, Nov 11, 2022)
01bbbcd: removed activation descriptor as activation itself doesn't hold data (M1ngXU, Nov 11, 2022)
f3478fb: split modules into multiple files/dirs (M1ngXU, Nov 11, 2022)
c18c16d: more file/dir refactoring (M1ngXU, Nov 11, 2022)
616f490: refactored conv, now only using desc (M1ngXU, Nov 11, 2022)
d2ab32c: splitting batchnorm, adding docs (M1ngXU, Nov 12, 2022)
42c5ad4: added docs (M1ngXU, Nov 12, 2022)
6eca31e: finished doc (M1ngXU, Nov 12, 2022)
a615061: added softmax (M1ngXU, Nov 12, 2022)
916db90: added pooling (M1ngXU, Nov 12, 2022)
f77fccf: added custom kernel for tensor division (M1ngXU, Nov 13, 2022)
9efcadd: refactored cudnn custom kernel (M1ngXU, Nov 13, 2022)
e755961: implementing into kernel param for tensor data (M1ngXU, Nov 13, 2022)
fb0a32e: renaming as_data to get_data_ref (M1ngXU, Nov 13, 2022)
3892ba4: added scaling to division (M1ngXU, Nov 13, 2022)
73662c0: added clone_into_new to create new tensors with another data allocation (M1ngXU, Nov 13, 2022)
95e710e: refactored custom kernels, added sin/cos (M1ngXU, Nov 13, 2022)
8ef0600: added reduce and broadcasting, working on example (M1ngXU, Nov 15, 2022)
b52dbed: fixed custom kernels (M1ngXU, Dec 12, 2022)
6a23616: stash (M1ngXU, Jan 9, 2023)
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/src/cudnn/custom_kernels.cu
63 changes: 32 additions & 31 deletions Cargo.toml
@@ -1,31 +1,32 @@
 [package]
 name = "cudarc"
 version = "0.2.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 
 description = "Safe wrappers around CUDA apis"
 homepage = "https://github.com/coreylowman/cudarc"
 documentation = "https://docs.rs/cudarc"
 repository = "https://github.com/coreylowman/cudarc"
 readme = "README.md"
 
 keywords = [
     "cuda",
     "nvidia",
     "gpu",
     "nvrtc",
     "curand",
 ]
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
 default = [ "std" ]
 std = [ "no-std-compat/std" ]
 
 [dependencies]
 no-std-compat = { version = "0.4.1", features = [ "alloc" ] }
+const_panic = "0.2.6"
 
 [build-dependencies]
 find_cuda_helper = "0.2.0"
3 changes: 3 additions & 0 deletions build.rs
@@ -3,6 +3,9 @@ fn main() {
     println!("cargo:rustc-link-lib=dylib=nvrtc");
     println!("cargo:rustc-link-lib=dylib=curand");
     println!("cargo:rustc-link-lib=dylib=cudart");
+    // TODO maybe clean up
+    println!(r"cargo:rustc-link-search=native=C:\Program Files\NVIDIA\CUDNN\v8.6\lib\x64");
+    println!("cargo:rustc-link-lib=dylib=cudnn64_8");
     println!("cargo:rustc-link-lib=dylib=cublas");
     println!("cargo:rustc-link-lib=dylib=cublasLt");
 }
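
The hard-coded Windows path above is what the TODO refers to. A minimal sketch of a more portable variant, assuming a hypothetical CUDNN_LIB_DIR environment variable set by the user (not something this PR defines):

// Sketch only, not part of this PR: locate cuDNN via an env var
// instead of a hard-coded Windows path.
fn main() {
    // Fall back to the system linker search path when the variable is unset.
    if let Ok(dir) = std::env::var("CUDNN_LIB_DIR") {
        println!("cargo:rustc-link-search=native={dir}");
    }
    // The library is named cudnn64_8 on Windows and plain cudnn on Linux.
    let lib = if cfg!(windows) { "cudnn64_8" } else { "cudnn" };
    println!("cargo:rustc-link-lib=dylib={lib}");
    // Re-run this build script whenever the variable changes.
    println!("cargo:rerun-if-env-changed=CUDNN_LIB_DIR");
}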
Empty file added examples/devices.rs
Empty file.
Empty file added examples/kernels.rs
Empty file.
33 changes: 33 additions & 0 deletions src/arrays.rs
@@ -0,0 +1,33 @@
pub trait NumElements {
    const NUMEL: usize;
    type Dtype;
}

macro_rules! impl_numel_for_builtin {
    ($T:ty) => {
        impl NumElements for $T {
            type Dtype = Self;

            const NUMEL: usize = 1;
        }
    };
}

impl_numel_for_builtin!(u8);
impl_numel_for_builtin!(u16);
impl_numel_for_builtin!(u32);
impl_numel_for_builtin!(u64);
impl_numel_for_builtin!(usize);
impl_numel_for_builtin!(i8);
impl_numel_for_builtin!(i16);
impl_numel_for_builtin!(i32);
impl_numel_for_builtin!(i64);
impl_numel_for_builtin!(isize);
impl_numel_for_builtin!(f32);
impl_numel_for_builtin!(f64);

impl<T: NumElements, const M: usize> NumElements for [T; M] {
    type Dtype = T::Dtype;

    const NUMEL: usize = T::NUMEL * M;
}
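
To illustrate what this trait computes (a sketch, not part of the diff): NUMEL flattens nested array shapes at compile time, so the blanket impl above multiplies element counts through each nesting level.

// Sketch only: NUMEL for scalars and nested arrays.
fn numel_examples() {
    // A bare scalar counts as one element.
    assert_eq!(<f32 as NumElements>::NUMEL, 1);
    assert_eq!(<[f32; 4] as NumElements>::NUMEL, 4);
    // Nesting multiplies: 3 rows of 4 elements each.
    assert_eq!(<[[f32; 4]; 3] as NumElements>::NUMEL, 12);
}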
7 changes: 7 additions & 0 deletions src/cudnn/activation/mod.rs
@@ -0,0 +1,7 @@
mod mode;
mod simple_activations;
mod softmax;

pub use mode::*;
pub use simple_activations::*;
pub use softmax::*;
37 changes: 37 additions & 0 deletions src/cudnn/activation/mode.rs
@@ -0,0 +1,37 @@
use super::super::sys::*;

/// A marker type for an [ActivationMode].
///
/// # Supported modes
/// [Sigmoid], [Relu], [Tanh], [Elu]
///
/// [Relu] has its upper bound set to `f64::MAX`.
///
/// Other modes are currently not supported as they require additional
/// parameters.
///
/// # See also
/// <https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnActivationMode_t>
pub trait ActivationMode {
    fn get_activation_mode() -> cudnnActivationMode_t;
    fn get_additional_parameter() -> f64;
}
macro_rules! impl_activation_mode {
    ($name:ident : $mode:ident) => {
        pub struct $name;
        impl ActivationMode for $name {
            fn get_activation_mode() -> cudnnActivationMode_t {
                cudnnActivationMode_t::$mode
            }

            fn get_additional_parameter() -> f64 {
                f64::MAX
            }
        }
    };
}

impl_activation_mode!(Sigmoid: CUDNN_ACTIVATION_SIGMOID);
impl_activation_mode!(Relu: CUDNN_ACTIVATION_RELU);
impl_activation_mode!(Tanh: CUDNN_ACTIVATION_TANH);
impl_activation_mode!(Elu: CUDNN_ACTIVATION_ELU);
Comment on lines +34 to +37 (Owner):

i'm thinking we don't need to add this activation forward for dfdx since there are so little in cudnn. we can just write custom kernels for them (they are really easy to write)
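
To make the "really easy to write" point concrete, here is a minimal sketch of such a kernel, written as a source string that could be compiled with NVRTC (illustration only, not code from this PR; the kernel name and signature are assumed):

// Sketch only: a hand-written ReLU forward kernel as NVRTC source.
const RELU_FORWARD_SRC: &str = r#"
extern "C" __global__ void relu_forward(const float *x, float *y, size_t n) {
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // fmaxf maps NaN inputs to 0.0; note the cuDNN path in this PR
        // configures CUDNN_PROPAGATE_NAN, which propagates them instead.
        y[i] = fmaxf(x[i], 0.0f);
    }
}
"#;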

134 changes: 134 additions & 0 deletions src/cudnn/activation/simple_activations.rs
@@ -0,0 +1,134 @@
use core::marker::PhantomData;
use core::mem::MaybeUninit;

use crate::cudnn::sys::*;
use crate::prelude::*;

const NAN_PROPAGATION: cudnnNanPropagation_t = cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN;
pub struct Activation<A> {
    descriptor: cudnnActivationDescriptor_t,
    activation_mode: PhantomData<A>,
}
impl<A: ActivationMode> Activation<A> {
    pub fn create() -> CudaCudnnResult<Self> {
        let descriptor = unsafe {
            let mut descriptor = MaybeUninit::uninit();
            cudnnCreateActivationDescriptor(descriptor.as_mut_ptr()).result()?;
            descriptor.assume_init()
        };
        unsafe {
            cudnnSetActivationDescriptor(
                descriptor,
                A::get_activation_mode(),
                NAN_PROPAGATION,
                A::get_additional_parameter(),
            )
        }
        .result()?;
        Ok(Self {
            descriptor,
            activation_mode: PhantomData,
        })
    }

    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationForward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                output.get_descriptor(),
                output.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        d_input: &Tensor4D<T, N, C, H, W>,
        output: &Tensor4D<T, N, C, H, W>,
        d_output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationBackward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                d_input.get_descriptor(),
                d_input.get_data_ptr(),
                output.get_descriptor(),
                output.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                d_output.get_descriptor(),
                d_output.get_data_ptr_mut(),
            )
        }
        .result()
    }
}
impl<A> Drop for Activation<A> {
    fn drop(&mut self) {
        unsafe { cudnnDestroyActivationDescriptor(self.descriptor) }
            .result()
            .unwrap();
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_relu_activation_forward_backward() {
        let cuda = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&cuda).unwrap();
        let x = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 2.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let dy = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 3.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let mut dx = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();
        let mut y = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();

        let activation = Activation::<Relu>::create().unwrap();
        activation.forward(&cudnn_handle, &x, &mut y).unwrap();

        let out = y.get_data().as_host().unwrap();
        assert!(out[0][0][0][0].is_nan());
        assert!((out[0][0][0][1] - 2.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
        activation
            .backward(&cudnn_handle, &x, &dy, &y, &mut dx)
            .unwrap();

        let out = dx.get_data().as_host().unwrap();
        // NaNs aren't backpropagated
        assert!(out[0][0][0][0].abs() < f64::EPSILON);
        assert!((out[0][0][0][1] - 3.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
    }
}
101 changes: 101 additions & 0 deletions src/cudnn/activation/softmax.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use crate::cudnn::sys::*;
use crate::prelude::*;

/// This does the softmax activation per image.
pub struct Softmax;
Comment from @coreylowman (Owner), Jan 9, 2023:

cudnn softmax only supports along single axis right? we don't necessarily need softmax impl for dfdx since dfdx uses lower level primitives to implement it, and also supports over any axis

(a sketch of that decomposition appears after this file)

impl Softmax {
    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        x: &Tensor4D<T, N, C, H, W>,
        y: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxForward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_INSTANCE,
                &T::ONE as *const _ as *const _,
                x.get_descriptor(),
                x.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        y: &Tensor4D<T, N, C, H, W>,
        dy: &Tensor4D<T, N, C, H, W>,
        dx: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxBackward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                // must match the mode used in `forward` for correct gradients
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_INSTANCE,
                &T::ONE as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr(),
                dy.get_descriptor(),
                dy.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                dx.get_descriptor(),
                dx.get_data_ptr_mut(),
            )
        }
        .result()
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_softmax() {
        let device = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&device).unwrap();

        let mut input_allocation = device.alloc_zeros().unwrap();
        CudaRng::new(0, device.clone())
            .unwrap()
            .fill_with_normal(&mut input_allocation, 0.0, 1.0)
            .unwrap();
        let input = Tensor4D::<f64, 2, 5, 2, 1>::create_with(input_allocation).unwrap();
        let mut output = unsafe { Tensor4D::alloc_uninit(&device) }.unwrap();

        Softmax.forward(&cudnn_handle, &input, &mut output).unwrap();
        let out = output.get_data().as_host().unwrap();
        for channel in out.into_iter() {
            assert!(
                (channel
                    .into_iter()
                    .flatten()
                    .flatten()
                    .reduce(|a, b| a + b)
                    .unwrap()
                    - 1.0)
                    .abs()
                    < 0.0001
            );
        }
    }
}
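
The decomposition the comment above refers to, as a minimal CPU sketch (illustration only, not code from this PR or dfdx); the caller chooses the axis by how it slices the data:

// Softmax over one contiguous slice, built from the lower-level
// primitives max, exp, sum, and divide.
fn softmax_row(row: &mut [f64]) {
    // Subtract the max before exponentiating for numerical stability.
    let max = row.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    let mut sum = 0.0;
    for v in row.iter_mut() {
        *v = (*v - max).exp();
        sum += *v;
    }
    for v in row.iter_mut() {
        *v /= sum;
    }
}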