Cudnn support #16

Closed · wants to merge 29 commits

Commits (29)
99d8c8f: init (M1ngXU, Nov 5, 2022)
e9ae2a6: added activations (M1ngXU, Nov 5, 2022)
88bc500: forward conv (M1ngXU, Nov 6, 2022)
9860a89: cudnn runs on correct stream, activation backward (M1ngXU, Nov 7, 2022)
f1aa74d: added convolution backward op (M1ngXU, Nov 8, 2022)
0fdc45f: added batchnorm (backward might be wrong), better tensor allocation, fmt (M1ngXU, Nov 9, 2022)
27cd864: added tensor ops (M1ngXU, Nov 11, 2022)
6040079: using cuda cudnn result to propagate cuda errors instead of panicking (M1ngXU, Nov 11, 2022)
94dd49c: removed tensors from batch norm struct (M1ngXU, Nov 11, 2022)
fd7b1a9: now only using cuda cudnn result, refactored filter (M1ngXU, Nov 11, 2022)
01bbbcd: removed activation descriptor as activation itself doesn't hold data (M1ngXU, Nov 11, 2022)
f3478fb: split modules into multiple files/dirs (M1ngXU, Nov 11, 2022)
c18c16d: more file/dir refactoring (M1ngXU, Nov 11, 2022)
616f490: refactored conv, now only using desc (M1ngXU, Nov 11, 2022)
d2ab32c: splitting batchnorm, adding docs (M1ngXU, Nov 12, 2022)
42c5ad4: added docs (M1ngXU, Nov 12, 2022)
6eca31e: finished doc (M1ngXU, Nov 12, 2022)
a615061: added softmax (M1ngXU, Nov 12, 2022)
916db90: added pooling (M1ngXU, Nov 12, 2022)
f77fccf: added custom kernel for tensor division (M1ngXU, Nov 13, 2022)
9efcadd: refactored cudnn custom kernel (M1ngXU, Nov 13, 2022)
e755961: implementing into kernel param for tensor data (M1ngXU, Nov 13, 2022)
fb0a32e: renaming as_data to get_data_ref (M1ngXU, Nov 13, 2022)
3892ba4: added scaling to division (M1ngXU, Nov 13, 2022)
73662c0: added clone_into_new to create new tensors with another data allocation (M1ngXU, Nov 13, 2022)
95e710e: refactored custom kernels, added sin/cos (M1ngXU, Nov 13, 2022)
8ef0600: added reduce and broadcasting, working on example (M1ngXU, Nov 15, 2022)
b52dbed: fixed custom kernels (M1ngXU, Dec 12, 2022)
6a23616: stash (M1ngXU, Jan 9, 2023)
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/src/cudnn/custom_kernels.cu
63 changes: 32 additions & 31 deletions Cargo.toml
@@ -1,31 +1,32 @@
 [package]
 name = "cudarc"
 version = "0.2.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 
 description = "Safe wrappers around CUDA apis"
 homepage = "https://github.com/coreylowman/cudarc"
 documentation = "https://docs.rs/cudarc"
 repository = "https://github.com/coreylowman/cudarc"
 readme = "README.md"
 
 keywords = [
     "cuda",
     "nvidia",
     "gpu",
     "nvrtc",
     "curand",
 ]
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
 default = [ "std" ]
 std = [ "no-std-compat/std" ]
 
 [dependencies]
 no-std-compat = { version = "0.4.1", features = [ "alloc" ] }
+const_panic = "0.2.6"
 
 [build-dependencies]
 find_cuda_helper = "0.2.0"
3 changes: 3 additions & 0 deletions build.rs
@@ -3,6 +3,9 @@ fn main() {
     println!("cargo:rustc-link-lib=dylib=nvrtc");
     println!("cargo:rustc-link-lib=dylib=curand");
     println!("cargo:rustc-link-lib=dylib=cudart");
+    // TODO maybe clean up
+    println!(r"cargo:rustc-link-search=native=C:\Program Files\NVIDIA\CUDNN\v8.6\lib\x64");
+    println!("cargo:rustc-link-lib=dylib=cudnn64_8");
     println!("cargo:rustc-link-lib=dylib=cublas");
     println!("cargo:rustc-link-lib=dylib=cublasLt");
 }
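
The hard-coded Windows path above is what the TODO refers to. A minimal sketch of a more portable variant, assuming a hypothetical CUDNN_LIB_DIR environment variable set by the user (not something this PR defines):

// Sketch only, not part of this PR: locate cuDNN via an env var
// instead of a hard-coded Windows path.
fn main() {
    // Fall back to the system linker search path when the variable is unset.
    if let Ok(dir) = std::env::var("CUDNN_LIB_DIR") {
        println!("cargo:rustc-link-search=native={dir}");
    }
    // The library is named cudnn64_8 on Windows and plain cudnn on Linux.
    let lib = if cfg!(windows) { "cudnn64_8" } else { "cudnn" };
    println!("cargo:rustc-link-lib=dylib={lib}");
    // Re-run this build script whenever the variable changes.
    println!("cargo:rerun-if-env-changed=CUDNN_LIB_DIR");
}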
Empty file added examples/devices.rs
Empty file.
Empty file added examples/kernels.rs
Empty file.
33 changes: 33 additions & 0 deletions src/arrays.rs
@@ -0,0 +1,33 @@
pub trait NumElements {
    const NUMEL: usize;
    type Dtype;
}

macro_rules! impl_numel_for_builtin {
    ($T:ty) => {
        impl NumElements for $T {
            type Dtype = Self;

            const NUMEL: usize = 1;
        }
    };
}

impl_numel_for_builtin!(u8);
impl_numel_for_builtin!(u16);
impl_numel_for_builtin!(u32);
impl_numel_for_builtin!(u64);
impl_numel_for_builtin!(usize);
impl_numel_for_builtin!(i8);
impl_numel_for_builtin!(i16);
impl_numel_for_builtin!(i32);
impl_numel_for_builtin!(i64);
impl_numel_for_builtin!(isize);
impl_numel_for_builtin!(f32);
impl_numel_for_builtin!(f64);

impl<T: NumElements, const M: usize> NumElements for [T; M] {
    type Dtype = T::Dtype;

    const NUMEL: usize = T::NUMEL * M;
}
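
To illustrate what this trait computes (a sketch, not part of the diff): NUMEL flattens nested array shapes at compile time, so the blanket impl above multiplies element counts through each nesting level.

// Sketch only: NUMEL for scalars and nested arrays.
fn numel_examples() {
    // A bare scalar counts as one element.
    assert_eq!(<f32 as NumElements>::NUMEL, 1);
    assert_eq!(<[f32; 4] as NumElements>::NUMEL, 4);
    // Nesting multiplies: 3 rows of 4 elements each.
    assert_eq!(<[[f32; 4]; 3] as NumElements>::NUMEL, 12);
}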
7 changes: 7 additions & 0 deletions src/cudnn/activation/mod.rs
@@ -0,0 +1,7 @@
mod mode;
mod simple_activations;
mod softmax;

pub use mode::*;
pub use simple_activations::*;
pub use softmax::*;
37 changes: 37 additions & 0 deletions src/cudnn/activation/mode.rs
@@ -0,0 +1,37 @@
use super::super::sys::*;

/// A marker type for an [ActivationMode].
///
/// # Supported modes
/// [Sigmoid], [Relu], [Tanh], [Elu]
///
/// [Relu] has its upper bound set to `f64::MAX`.
///
/// Other modes are currently not supported as they require additional
/// parameters.
///
/// # See also
/// <https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnActivationMode_t>
pub trait ActivationMode {
    fn get_activation_mode() -> cudnnActivationMode_t;
    fn get_additional_parameter() -> f64;
}
macro_rules! impl_activation_mode {
    ($name:ident : $mode:ident) => {
        pub struct $name;
        impl ActivationMode for $name {
            fn get_activation_mode() -> cudnnActivationMode_t {
                cudnnActivationMode_t::$mode
            }

            fn get_additional_parameter() -> f64 {
                f64::MAX
            }
        }
    };
}

impl_activation_mode!(Sigmoid: CUDNN_ACTIVATION_SIGMOID);
impl_activation_mode!(Relu: CUDNN_ACTIVATION_RELU);
impl_activation_mode!(Tanh: CUDNN_ACTIVATION_TANH);
impl_activation_mode!(Elu: CUDNN_ACTIVATION_ELU);
Comment on lines +34 to +37 (Owner):

i'm thinking we don't need to add this activation forward for dfdx since there are so little in cudnn. we can just write custom kernels for them (they are really easy to write)
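
To make the "really easy to write" point concrete, here is a minimal sketch of such a kernel, written as a source string that could be compiled with NVRTC (illustration only, not code from this PR; the kernel name and signature are assumed):

// Sketch only: a hand-written ReLU forward kernel as NVRTC source.
const RELU_FORWARD_SRC: &str = r#"
extern "C" __global__ void relu_forward(const float *x, float *y, size_t n) {
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // fmaxf maps NaN inputs to 0.0; note the cuDNN path in this PR
        // configures CUDNN_PROPAGATE_NAN, which propagates them instead.
        y[i] = fmaxf(x[i], 0.0f);
    }
}
"#;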

134 changes: 134 additions & 0 deletions src/cudnn/activation/simple_activations.rs
@@ -0,0 +1,134 @@
use core::marker::PhantomData;
use core::mem::MaybeUninit;

use crate::cudnn::sys::*;
use crate::prelude::*;

const NAN_PROPAGATION: cudnnNanPropagation_t = cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN;
pub struct Activation<A> {
    descriptor: cudnnActivationDescriptor_t,
    activation_mode: PhantomData<A>,
}
impl<A: ActivationMode> Activation<A> {
    pub fn create() -> CudaCudnnResult<Self> {
        let descriptor = unsafe {
            let mut descriptor = MaybeUninit::uninit();
            cudnnCreateActivationDescriptor(descriptor.as_mut_ptr()).result()?;
            descriptor.assume_init()
        };
        unsafe {
            cudnnSetActivationDescriptor(
                descriptor,
                A::get_activation_mode(),
                NAN_PROPAGATION,
                A::get_additional_parameter(),
            )
        }
        .result()?;
        Ok(Self {
            descriptor,
            activation_mode: PhantomData,
        })
    }

    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationForward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                output.get_descriptor(),
                output.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        input: &Tensor4D<T, N, C, H, W>,
        d_input: &Tensor4D<T, N, C, H, W>,
        output: &Tensor4D<T, N, C, H, W>,
        d_output: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnActivationBackward(
                cudnn_handle.get_handle(),
                self.descriptor,
                &T::ONE as *const _ as *const _,
                input.get_descriptor(),
                input.get_data_ptr(),
                d_input.get_descriptor(),
                d_input.get_data_ptr(),
                output.get_descriptor(),
                output.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                d_output.get_descriptor(),
                d_output.get_data_ptr_mut(),
            )
        }
        .result()
    }
}
impl<A> Drop for Activation<A> {
    fn drop(&mut self) {
        unsafe { cudnnDestroyActivationDescriptor(self.descriptor) }
            .result()
            .unwrap();
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_relu_activation_forward_backward() {
        let cuda = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&cuda).unwrap();
        let x = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 2.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let dy = Tensor4D::alloc_with(&cuda, [[[[f64::NAN, 3.0]]], [[[-1.0, 0.0]]]]).unwrap();
        let mut dx = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();
        let mut y = unsafe { Tensor4D::alloc_uninit(&cuda) }.unwrap();

        let activation = Activation::<Relu>::create().unwrap();
        activation.forward(&cudnn_handle, &x, &mut y).unwrap();

        let out = y.get_data().as_host().unwrap();
        assert!(out[0][0][0][0].is_nan());
        assert!((out[0][0][0][1] - 2.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
        activation
            .backward(&cudnn_handle, &x, &dy, &y, &mut dx)
            .unwrap();

        let out = dx.get_data().as_host().unwrap();
        // NaNs aren't backpropagated
        assert!(out[0][0][0][0].abs() < f64::EPSILON);
        assert!((out[0][0][0][1] - 3.0).abs() < f64::EPSILON);
        assert!(out[1][0][0][0].abs() < f64::EPSILON);
        assert!(out[1][0][0][1].abs() < f64::EPSILON);
    }
}
101 changes: 101 additions & 0 deletions src/cudnn/activation/softmax.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use crate::cudnn::sys::*;
use crate::prelude::*;

/// This does the softmax activation per image.
pub struct Softmax;
Comment from @coreylowman (Owner), Jan 9, 2023:

cudnn softmax only supports along single axis right? we don't necessarily need softmax impl for dfdx since dfdx uses lower level primitives to implement it, and also supports over any axis

(a sketch of that decomposition appears after this file)

impl Softmax {
    pub fn forward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        x: &Tensor4D<T, N, C, H, W>,
        y: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxForward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_INSTANCE,
                &T::ONE as *const _ as *const _,
                x.get_descriptor(),
                x.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr_mut(),
            )
        }
        .result()
    }

    pub fn backward<
        T: TensorDataType,
        const N: usize,
        const C: usize,
        const H: usize,
        const W: usize,
    >(
        &self,
        cudnn_handle: &CudnnHandle,
        y: &Tensor4D<T, N, C, H, W>,
        dy: &Tensor4D<T, N, C, H, W>,
        dx: &mut Tensor4D<T, N, C, H, W>,
    ) -> CudaCudnnResult<()> {
        unsafe {
            cudnnSoftmaxBackward(
                cudnn_handle.get_handle(),
                cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_FAST,
                // must match the mode used in `forward` for correct gradients
                cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_INSTANCE,
                &T::ONE as *const _ as *const _,
                y.get_descriptor(),
                y.get_data_ptr(),
                dy.get_descriptor(),
                dy.get_data_ptr(),
                &T::ZERO as *const _ as *const _,
                dx.get_descriptor(),
                dx.get_data_ptr_mut(),
            )
        }
        .result()
    }
}

#[cfg(test)]
mod tests {
    use crate::prelude::*;

    #[test]
    fn test_softmax() {
        let device = CudaDeviceBuilder::new(0).build().unwrap();
        let cudnn_handle = CudnnHandle::create(&device).unwrap();

        let mut input_allocation = device.alloc_zeros().unwrap();
        CudaRng::new(0, device.clone())
            .unwrap()
            .fill_with_normal(&mut input_allocation, 0.0, 1.0)
            .unwrap();
        let input = Tensor4D::<f64, 2, 5, 2, 1>::create_with(input_allocation).unwrap();
        let mut output = unsafe { Tensor4D::alloc_uninit(&device) }.unwrap();

        Softmax.forward(&cudnn_handle, &input, &mut output).unwrap();
        let out = output.get_data().as_host().unwrap();
        for channel in out.into_iter() {
            assert!(
                (channel
                    .into_iter()
                    .flatten()
                    .flatten()
                    .reduce(|a, b| a + b)
                    .unwrap()
                    - 1.0)
                    .abs()
                    < 0.0001
            );
        }
    }
}
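
The decomposition the comment above refers to, as a minimal CPU sketch (illustration only, not code from this PR or dfdx); the caller chooses the axis by how it slices the data:

// Softmax over one contiguous slice, built from the lower-level
// primitives max, exp, sum, and divide.
fn softmax_row(row: &mut [f64]) {
    // Subtract the max before exponentiating for numerical stability.
    let max = row.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    let mut sum = 0.0;
    for v in row.iter_mut() {
        *v = (*v - max).exp();
        sum += *v;
    }
    for v in row.iter_mut() {
        *v /= sum;
    }
}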