coreylowman · swfsql · Jan 26, 2024 · Jan 26, 2024 · Feb 7, 2024 · Feb 6, 2024
diff --git a/dfdx-core/Cargo.toml b/dfdx-core/Cargo.toml
@@ -35,7 +35,7 @@ num-traits = { workspace = true }
 safetensors = { workspace = true, optional = true }
 memmap2 = { workspace = true, optional = true }
 half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
-gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
+gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
 rayon = { version = "1.7.0", optional = true }
 libm = { workspace = true }
 wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }

diff --git a/dfdx-core/src/data/collate.rs b/dfdx-core/src/data/collate.rs
@@ -1,4 +1,4 @@
 use std::{mem::MaybeUninit, vec::Vec};

 /// Collates `Self` into some other type.
 /// Generally similar to an unzip method;
@@ -55,6 +55,7 @@
 impl<'a, A, B> Collate for Vec<&'a (A, B)> {
     type Collated = (Vec<&'a A>, Vec<&'a B>);
     fn collated(self) -> Self::Collated {
+        #[allow(clippy::map_identity)] // not an identity: `&(A, B)` becomes `(&A, &B)`
         self.into_iter().map(|(a, b)| (a, b)).unzip()
     }
 }

diff --git a/dfdx-core/src/lib.rs b/dfdx-core/src/lib.rs
@@ -9,7 +9,7 @@
 //! The following sections provide some high level core concepts & examples, and
 //! there is more detailed documentation in each of dfdx's submodules.
 //!
 //! See [feature_flags] for details on feature flags.
 //!
 //! # Shapes & Tensors
 //!
@@ -59,7 +59,7 @@
 //! There are two options for this currently, with more planned to be added in the future:
 //!
 //! 1. [tensor::Cpu] - for tensors stored on the heap
 //! 2. [tensor::Cuda] - for tensors stored in GPU memory
 //!
 //! Both devices implement [Default], you can also create them with a certain seed
 //! and ordinal.
@@ -85,8 +85,8 @@
 //! | Unary Operations | `a.sqrt()` | `a.sqrt()` | `a.sqrt()` |
 //! | Binary Operations | `a + b` | `a + b` | `a + b` |
 //! | gemm/gemv | [tensor_ops::matmul] | `a @ b` | `a @ b` |
 //! | 2d Convolution | [tensor_ops::TryConv2D] | - | `torch.conv2d` |
 //! | 2d Transposed Convolution | [tensor_ops::TryConvTrans2D] | - | `torch.conv_transpose2d` |
 //! | Slicing | [tensor_ops::slice] | `a[...]` | `a[...]` |
 //! | Select | [tensor_ops::SelectTo] | `a[...]` | `torch.select` |
 //! | Gather | [tensor_ops::GatherTo] | `np.take` | `torch.gather` |
@@ -128,44 +128,6 @@
     pub use crate::tensor_ops::*;
 }
 
-/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn flush_denormals_to_zero() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-}
-
-/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn keep_denormals() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-}
-
 #[cfg(test)]
 pub(crate) mod tests {
     pub use num_traits::{Float, NumCast, Zero};

diff --git a/dfdx-core/src/tensor/gradients.rs b/dfdx-core/src/tensor/gradients.rs
@@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
     #[inline]
     pub(crate) fn many_and_ref<L: Shape, R: Shape>(
         &mut self,
-        ls: &Vec<impl Tensorlike<L, E, D>>,
+        ls: &[impl Tensorlike<L, E, D>],
         r: &impl Tensorlike<R, E, D>,
     ) -> (Vec<&mut D::Vec>, &D::Vec) {
         for i in 0..ls.len() {

diff --git a/dfdx-core/src/tensor_ops/mod.rs b/dfdx-core/src/tensor_ops/mod.rs
@@ -184,6 +184,7 @@ mod mul;
 mod nans_to;
 mod negate;
 mod normalize;
+mod normalize_rms;
 pub(super) mod optim;
 mod permute_to;
 mod pow;
@@ -251,6 +252,7 @@ pub use mul::{mul, TryMul};
 pub use nans_to::nans_to;
 pub use negate::negate;
 pub use normalize::normalize;
+pub use normalize_rms::normalize_rms;
 pub use optim::*;
 pub use permute_to::PermuteTo;
 pub use pow::{powf, powi};

diff --git a/dfdx-core/src/tensor_ops/normalize_rms.rs b/dfdx-core/src/tensor_ops/normalize_rms.rs
@@ -0,0 +1,136 @@
+use crate::{
+    shapes::{Axes, Dtype, ReduceShape, Shape},
+    tensor::{Error, Tape, Tensor},
+};
+
+use super::{BroadcastTo, Device, MeanTo, TryAdd, TryMul};
+
+/// Divides `t` by its root mean square along `Ax` (no mean subtraction, unlike stddev).
+/// Computes `t / (t.square().mean() + epsilon).sqrt()`; `epsilon` is added before the sqrt.
+///
+/// Normalizing a single axis:
+/// ```rust
+/// # use dfdx_core::prelude::*;
+/// # let dev: Cpu = Default::default();
+/// let t: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
+/// let _ = t.normalize_rms::<Axis<1>>(1e-5);
+/// ```
+pub fn normalize_rms<
+    Ax: Axes,
+    S: Shape + ReduceShape<Ax>,
+    E: Dtype,
+    D: Device<E>,
+    T: Tape<E, D>,
+>(
+    t: Tensor<S, E, D, T>,
+    epsilon: impl Into<f64>,
+) -> Tensor<S, E, D, T> {
+    t.normalize_rms::<Ax>(epsilon)
+}
+
+impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
+    /// See [normalize_rms]. Panics if `try_normalize_rms` returns an error.
+    pub fn normalize_rms<Ax: Axes>(self, epsilon: impl Into<f64>) -> Self
+    where
+        S: ReduceShape<Ax>,
+    {
+        self.try_normalize_rms::<Ax>(epsilon).unwrap()
+    }
+
+    /// See [normalize_rms]. Returns `Err` instead of panicking if an underlying op fails.
+    pub fn try_normalize_rms<Ax: Axes>(self, epsilon: impl Into<f64>) -> Result<Self, Error>
+    where
+        S: ReduceShape<Ax>,
+    {
+        let shape = self.shape;
+        let sq = self.retaped::<T>().try_square()?;
+        let sq_mean = sq.try_mean::<_, Ax>()?;
+        let rsqrt = sq_mean
+            .try_add(epsilon)?
+            .try_sqrt()?
+            .try_recip()?
+            .try_broadcast_like(&shape)?;
+        self.try_mul(rsqrt)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tests::*;
+    use crate::{shapes::*, tensor::*, tensor_ops::*};
+
+    #[test]
+    fn test_1d_normalize_rms_axis_last() {
+        let dev: TestDevice = Default::default();
+        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<TestDtype>();
+        let r = a.leaky_trace().normalize_rms(1e-5);
+        assert_close_to_literal!(&r, [-0.64326715, 0.0, 1.6081679]);
+        // NOTE: .exp() so we can make sure normalize is using result grad properly
+        let g = r.exp().mean().backward();
+        assert_close_to_literal!(&g.get(&a), [0.23318729, 0.107211195, 0.09327549]);
+    }
+
+    #[test]
+    fn test_2d_normalize_rms_axis_last() {
+        let dev: TestDevice = Default::default();
+        let a = dev
+            .tensor([[-2.0, 0.0, 5.0], [1.0, 2.0, 3.0]])
+            .to_dtype::<TestDtype>();
+        let r = a.leaky_trace().normalize_rms::<Axis<1>>(1e-5);
+        assert_close_to_literal!(
+            r,
+            [
+                [-0.64326715, 0.0, 1.6081679],
+                [0.46290955, 0.9258191, 1.3887286]
+            ]
+        );
+        let g = r.exp().mean().backward();
+        assert_close_to_literal!(
+            g.get(&a),
+            [
+                [0.116593644, 0.053605597, 0.046637744],
+                [0.019706108, -0.011002079, 0.0007670224]
+            ]
+        );
+    }
+
+    #[test]
+    fn test_2d_normalize_rms_axis_first() {
+        let dev: TestDevice = Default::default();
+        let a = dev
+            .tensor([[-2.0, 0.0], [1.0, 2.0], [4.0, 5.0]])
+            .to_dtype::<TestDtype>();
+        let r = a.leaky_trace().normalize_rms::<Axis<0>>(1e-5);
+        assert_close_to_literal!(
+            r,
+            [
+                [-0.7559284, 0.0],
+                [0.3779642, 0.64326715],
+                [1.5118568, 1.6081679]
+            ]
+        );
+        let g = r.exp().mean().backward();
+        assert_close_to_literal!(
+            g.get(&a),
+            [
+                [0.14153406, 0.053605597],
+                [0.03595103, -0.0043795705],
+                [0.061779693, 0.0017521679]
+            ]
+        );
+    }
+
+    #[test]
+    fn test_3d_normalize_rms_axis_last() {
+        let dev: TestDevice = Default::default();
+        let a: Tensor<Rank3<4, 2, 3>, TestDtype, _> = dev.ones();
+        let r = a.leaky_trace().normalize_rms::<Axis<2>>(1e-5);
+        assert_close_to_literal!(r, [[[1.0; 3]; 2]; 4], 1e-5);
+        let g = r.exp().mean().backward();
+        assert_close_to_literal!(g.get(&a), [[[0.0; 3]; 2]; 4], 1e-5);
+    }
+}
+
+// Implementation references:
+// - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L328
+// - https://github.com/kroggen/mamba.c/blob/7387f49e352f86a0c22041c0f66fd2a40b58a207/mamba.c#L222
diff --git a/dfdx-core/src/tensor_ops/utilities/device.rs b/dfdx-core/src/tensor_ops/utilities/device.rs
@@ -114,33 +114,61 @@ pub trait Device<E: Dtype>:
     + crate::tensor_ops::axpy::AxpyKernel<E>
 
     // conv1d
-    + super::super::conv1d::Conv1DKernel<E>
+    + NonCudnnCuda<E>
+{
+}
+
+#[cfg(feature = "cudnn")]
+pub trait NonCudnnCuda<E: Dtype> {}
+
+#[cfg(not(feature = "cudnn"))]
+pub trait NonCudnnCuda<E: Dtype>:
+    // conv1d
+    super::super::conv1d::Conv1DKernel<E>
 {
 }
 
 #[cfg(feature = "f16")]
-impl Device<f16> for crate::tensor::Cpu {}
-#[cfg(feature = "f16")]
-impl Device<AMP<f16>> for crate::tensor::Cpu {}
+mod f16_ {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cpu {}
+    impl Device<AMP<f16>> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cpu {}
+}
 impl Device<f32> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f32> for crate::tensor::Cpu {}
 impl Device<f64> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f64> for crate::tensor::Cpu {}
 
 #[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<f16> for crate::tensor::Cuda {}
-#[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<AMP<f16>> for crate::tensor::Cuda {}
-#[cfg(feature = "cuda")]
-impl Device<f32> for crate::tensor::Cuda {}
+mod cuda_f16 {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cuda {}
+    impl Device<AMP<f16>> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cuda {}
+}
 #[cfg(feature = "cuda")]
-impl Device<f64> for crate::tensor::Cuda {}
+mod cuda {
+    use super::*;
+    impl Device<f32> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f32> for crate::tensor::Cuda {}
+    impl Device<f64> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f64> for crate::tensor::Cuda {}
+}
 
 // TODO: How can we implement this for f16 when WGSL doesn't support f16 yet?
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<f16> for crate::tensor::Webgpu {}
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<AMP<f16>> for crate::tensor::Webgpu {}
 #[cfg(feature = "webgpu")]
-impl Device<f32> for crate::tensor::Webgpu {}
+mod webgpu {
+    use super::*;
+    impl Device<f32> for crate::tensor::Webgpu {}
+    impl NonCudnnCuda<f32> for crate::tensor::Webgpu {}
+}
 
 // TODO: How can we implement this for f64 when WGSL doesn't support f64 yet?
 // #[cfg(feature = "webgpu")]

diff --git a/dfdx/examples/12-mnist.rs b/dfdx/examples/12-mnist.rs
@@ -62,9 +62,6 @@ type Mlp = (
 const BATCH_SIZE: usize = 32;
 
 fn main() {
-    // ftz substantially improves performance
-    dfdx::flush_denormals_to_zero();
-
     let mnist_path = std::env::args()
         .nth(1)
         .unwrap_or_else(|| "./datasets/MNIST/raw".to_string());