Cudnn support #16 (Closed)

Changes from all commits (29 commits)
99d8c8f  init (M1ngXU)
e9ae2a6  added activations (M1ngXU)
88bc500  forward conv (M1ngXU)
9860a89  cudnn runs on correct stream, activation backward (M1ngXU)
f1aa74d  added convolution backward op (M1ngXU)
0fdc45f  added batchnorm (backward might be wrong), better tensor allocation, fmt (M1ngXU)
27cd864  added tensor ops (M1ngXU)
6040079  using cuda cudnn result to propagate cuda errors instead of panicking (M1ngXU)
94dd49c  removed tensors from batch norm struct (M1ngXU)
fd7b1a9  now only using cuda cudnn result, refactored filter (M1ngXU)
01bbbcd  removed activation descriptor as activation itself doesn't hold data (M1ngXU)
f3478fb  split modules into multiple files/dirs (M1ngXU)
c18c16d  more file/dir refactoring (M1ngXU)
616f490  refactored conv, now only using desc (M1ngXU)
d2ab32c  splitting batchnorm, adding docs (M1ngXU)
42c5ad4  added docs (M1ngXU)
6eca31e  finished doc (M1ngXU)
a615061  added softmax (M1ngXU)
916db90  added pooling (M1ngXU)
f77fccf  added custom kernel for tensor division (M1ngXU)
9efcadd  refactored cudnn custom kernel (M1ngXU)
e755961  implementing into kernel param for tensor data (M1ngXU)
fb0a32e  renaming as_data to get_data_ref (M1ngXU)
3892ba4  added scaling to division (M1ngXU)
73662c0  added clone_into_new to create new tensors with another data allocation (M1ngXU)
95e710e  refactored custom kernels, added sin/cos (M1ngXU)
8ef0600  added reduce and broadcasting, working on exmaple (M1ngXU)
b52dbed  fixed custom kernels (M1ngXU)
6a23616  stash (M1ngXU)
.gitignore
@@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/src/cudnn/custom_kernels.cu
Cargo.toml
@@ -1,31 +1,32 @@
 [package]
 name = "cudarc"
 version = "0.2.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"

 description = "Safe wrappers around CUDA apis"
 homepage = "https://github.com/coreylowman/cudarc"
 documentation = "https://docs.rs/cudarc"
 repository = "https://github.com/coreylowman/cudarc"
 readme = "README.md"

 keywords = [
 "cuda",
 "nvidia",
 "gpu",
 "nvrtc",
 "curand",
 ]

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [features]
 default = [ "std" ]
 std = [ "no-std-compat/std" ]

 [dependencies]
 no-std-compat = { version = "0.4.1", features = [ "alloc" ] }
+const_panic = "0.2.6"

 [build-dependencies]
 find_cuda_helper = "0.2.0"
Empty file.
@@ -0,0 +1,33 @@
pub trait NumElements {
    const NUMEL: usize;
    type Dtype;
}

macro_rules! impl_numel_for_builtin {
    ($T:ty) => {
        impl NumElements for $T {
            type Dtype = Self;

            const NUMEL: usize = 1;
        }
    };
}

impl_numel_for_builtin!(u8);
impl_numel_for_builtin!(u16);
impl_numel_for_builtin!(u32);
impl_numel_for_builtin!(u64);
impl_numel_for_builtin!(usize);
impl_numel_for_builtin!(i8);
impl_numel_for_builtin!(i16);
impl_numel_for_builtin!(i32);
impl_numel_for_builtin!(i64);
impl_numel_for_builtin!(isize);
impl_numel_for_builtin!(f32);
impl_numel_for_builtin!(f64);

impl<T: NumElements, const M: usize> NumElements for [T; M] {
    type Dtype = T::Dtype;

    const NUMEL: usize = T::NUMEL * M;
}
@@ -0,0 +1,7 @@
mod mode;
mod simple_activations;
mod softmax;

pub use mode::*;
pub use simple_activations::*;
pub use softmax::*;
@@ -0,0 +1,37 @@
use super::super::sys::*;

/// A Marker for an [ActivationMode].
///
/// # Supported modes
/// [Sigmoid], [Relu], [Tanh], [Elu], [Swish]
///
/// [Relu] has its upper bound set to `f64::MAX`.
///
/// Other modes are currently not supported as they require additional
/// parameters.
///
/// # See also
/// <https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnActivationMode_t>
pub trait ActivationMode {
    fn get_activation_mode() -> cudnnActivationMode_t;
    fn get_additional_parameter() -> f64;
}
macro_rules! impl_activation_mode {
    ($name:ident : $mode:ident) => {
        pub struct $name;
        impl ActivationMode for $name {
            fn get_activation_mode() -> cudnnActivationMode_t {
                cudnnActivationMode_t::$mode
            }

            fn get_additional_parameter() -> f64 {
                f64::MAX
            }
        }
    };
}

impl_activation_mode!(Sigmoid: CUDNN_ACTIVATION_SIGMOID);
impl_activation_mode!(Relu: CUDNN_ACTIVATION_RELU);
impl_activation_mode!(Tanh: CUDNN_ACTIVATION_TANH);
impl_activation_mode!(Elu: CUDNN_ACTIVATION_ELU);
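The doc comment above notes that the remaining cudnn modes need an extra parameter. As a hedged sketch (not part of this PR), a clipped-ReLU marker could supply that parameter through `get_additional_parameter`, assuming the generated `sys` bindings expose `CUDNN_ACTIVATION_CLIPPED_RELU`:

```rust
// Hypothetical marker type, for illustration only: cudnn's clipped ReLU
// uses the activation descriptor's `coef` argument as the clipping
// ceiling, which is exactly the value get_additional_parameter returns.
pub struct ClippedRelu6;

impl ActivationMode for ClippedRelu6 {
    fn get_activation_mode() -> cudnnActivationMode_t {
        cudnnActivationMode_t::CUDNN_ACTIVATION_CLIPPED_RELU
    }

    fn get_additional_parameter() -> f64 {
        6.0 // clip activations at 6 (ReLU6-style); arbitrary example value
    }
}
```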
@@ -0,0 +1,134 @@
use core::marker::PhantomData;
use core::mem::MaybeUninit;

use crate::cudnn::sys::*;
use crate::prelude::*;

const NAN_PROPAGATION: cudnnNanPropagation_t = cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN;
pub struct Activation<A> {
    descriptor: cudnnActivationDescriptor_t,
    activation_mode: PhantomData<A>,
}
impl<A: ActivationMode> Activation<A> {
    pub fn create() -> CudaCudnnResult<Self> {
        let descriptor = unsafe {
            let mut descriptor = MaybeUninit::uninit();
            cudnnCreateActivationDescriptor(descriptor.as_mut_ptr()).result()?;
            descriptor.assume_init()
        };
        unsafe {
            cudnnSetActivationDescriptor(
                descriptor,
                A::get_activation_mode(),
                NAN_PROPAGATION,
                A::get_additional_parameter(),
            )
        }
        .result()?;
        Ok(Self {
            descriptor,
            activation_mode: PhantomData,
        })
    }

    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationForward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                output.get_descriptor(),
                output.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        d_input: &Tensor4D<T, N, C, H, W>,
        output: &Tensor4D<T, N, C, H, W>,
        d_output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationBackward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                d_input.get_descriptor(),
                d_input.get_data_ptr(),
                output.get_descriptor(),
                output.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                d_output.get_descriptor(),
                d_output.get_data_ptr_mut(),
            )
        }
        .result()
    }
}
impl<A> Drop for Activation<A> {
    fn drop(&mut self) {
        unsafe { cudnnDestroyActivationDescriptor(self.descriptor) }
            .result()
            .unwrap();
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_relu_activation_forward_backward() {
        let cuda = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&cuda).unwrap();
        let x = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 2.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let dy = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 3.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let mut dx = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();
        let mut y = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();

        let activation = Activation::<Relu>::create().unwrap();
        activation.forward(&cudnn_handle, &x, &mut y).unwrap();

        let out = y.get_data().as_host().unwrap();
        assert!(out[0][0][0][0].is_nan());
        assert!((out[0][0][0][1] - 2.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
        activation
            .backward(&cudnn_handle, &x, &dy, &y, &mut dx)
            .unwrap();

        let out = dx.get_data().as_host().unwrap();
        // NANs aren't backpropagated
        assert!(out[0][0][0][0].abs() < f64::EPSILON);
        assert!((out[0][0][0][1] - 3.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
    }
}
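For readers skimming the diff, a minimal usage sketch (not part of the PR, and mirroring the test above): the marker type selects the cudnn mode at compile time, the descriptor is created once in `create` and released in `Drop`, and `forward` runs element-wise on the GPU. The types used here come from this PR's prelude; the shape and values are arbitrary.

```rust
// Illustrative only; follows the same API calls as the ReLU test above,
// but with the Sigmoid marker.
fn sigmoid_usage_sketch() {
    let device = CudaDeviceBuilder::new(0).build().unwrap();
    let handle = CudnnHandle::create(&device).unwrap();

    // 1x1x1x4 input; the shape is fixed by Tensor4D's const generics.
    let x = Tensor4D::alloc_with(&device, [[[[-2.0f64, -1.0, 1.0, 2.0]]]]).unwrap();
    let mut y = unsafe { Tensor4D::alloc_uninit(&device) }.unwrap();

    let activation = Activation::<Sigmoid>::create().unwrap();
    activation.forward(&handle, &x, &mut y).unwrap();

    // y now holds 1 / (1 + e^-x) for every element of x.
    let host = y.get_data().as_host().unwrap();
    assert!((host[0][0][0][3] - 1.0 / (1.0 + (-2.0f64).exp())).abs() < 1e-10);
}
```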
@@ -0,0 +1,101 @@
use crate::cudnn::sys::*;
use crate::prelude::*;

/// This does the softmax activation per image.
pub struct Softmax;
[Review comment on `pub struct Softmax;`] cudnn softmax only supports softmax along a single axis, right? We don't necessarily need a softmax impl for dfdx, since dfdx uses lower-level primitives to implement it and also supports softmax over any axis.

impl Softmax {
    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        x: &Tensor4D<T, N, C, H, W>,
        y: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxForward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_INSTANCE,
                &T::ONE as *const _ as *const _,
                x.get_descriptor(),
                x.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        y: &Tensor4D<T, N, C, H, W>,
        dy: &Tensor4D<T, N, C, H, W>,
        dx: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxBackward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_CHANNEL,
                &T::ONE as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr(),
                dy.get_descriptor(),
                dy.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                dx.get_descriptor(),
                dx.get_data_ptr_mut(),
            )
        }
        .result()
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_softmax() {
        let device = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&device).unwrap();

        let mut input_allocation = device.alloc_zeros().unwrap();
        CudaRng::new(0, device.clone())
            .unwrap()
            .fill_with_normal(&mut input_allocation, 0.0, 1.0)
            .unwrap();
        let input = Tensor4D::<f64, 2, 5, 2, 1>::create_with(input_allocation).unwrap();
        let mut output = unsafe { Tensor4D::alloc_uninit(&device) }.unwrap();

        Softmax.forward(&cudnn_handle, &input, &mut output).unwrap();
        let out = output.get_data().as_host().unwrap();
        for channel in out.into_iter() {
            assert!(
                (channel
                    .into_iter()
                    .flatten()
                    .flatten()
                    .reduce(|a, b| a + b)
                    .unwrap()
                    - 1.0)
                    .abs()
                    < 0.0001
            );
        }
    }
}
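For context on the `CUDNN_SOFTMAX_MODE_INSTANCE` choice in `forward` (and why the test sums every value of one image and expects 1.0): instance mode normalizes over all C*H*W values of each image, whereas channel mode normalizes over C separately for each (h, w) position. A minimal CPU-side reference, purely illustrative and not part of the PR:

```rust
// Illustrative CPU reference for what MODE_INSTANCE computes on one image:
// a single softmax over all of its C*H*W values, numerically stabilized by
// subtracting the max before exponentiating.
fn softmax_per_image(image: &[f64]) -> Vec<f64> {
    let max = image.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
    let exps: Vec<f64> = image.iter().map(|v| (v - max).exp()).collect();
    let sum: f64 = exps.iter().sum();
    exps.iter().map(|e| e / sum).collect()
}
```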
[Review comment] I'm thinking we don't need to add this activation forward for dfdx, since there are so few in cudnn. We can just write custom kernels for them (they are really easy to write).