From 935810fcc59efb9761c5c2b5de5b6afa2beb0ed5 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sat, 15 Apr 2023 22:46:59 -0500
Subject: [PATCH 01/22] improve TensorCache api; move buffer conversion logic
 to CacheStorage

---
 src/tensor/cache.rs        | 121 ++++++++++++++++++++++++-------------
 src/tensor/cpu/allocate.rs |  11 +---
 src/tensor/cpu/device.rs   | 114 ++++++++++++++++------------------
 src/tensor/cuda/device.rs  |  74 ++++++++++++-----------
 4 files changed, 170 insertions(+), 150 deletions(-)
diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index e785cb647..389706777 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -33,12 +33,37 @@ pub(crate) struct AllocationKey {
 /// valid allocation. When the last value is removed from the list, the key
 /// is removed.
 #[derive(Debug)]
-pub(crate) struct TensorCache<Ptr> {
+pub(crate) struct TensorCache<Ptr: CacheStorage> {
     pub(crate) allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
     pub(crate) enabled: RwLock<bool>,
 }
 
-impl<Ptr> Default for TensorCache<Ptr> {
+pub(crate) trait CacheStorage: Sized {
+    type Output<T>: CacheStorage;
+
+    /// Unsafely converts the elements of a contiguous collection type to another type. Note:
+    /// **This function is wildly unsafe**, see implementations for details
+    unsafe fn transmute_elements<T>(self) -> Self::Output<T>;
+
+    /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. 
+    unsafe fn drop_with_alignment(self, align: usize) {
+        match align {
+            1 => drop(self.transmute_elements::<u8>()),
+            2 => drop(self.transmute_elements::<u16>()),
+            4 => drop(self.transmute_elements::<u32>()),
+            8 => drop(self.transmute_elements::<u64>()),
+            16 => drop(self.transmute_elements::<u128>()),
+            _ => panic!("Invalid alignment")
+        }
+    }
+
+    /// calls [CacheStorage::drop_with_alignment], extracting `align` from an [AllocationKey]
+    unsafe fn drop_with_key(self, key: AllocationKey) {
+        self.drop_with_alignment(key.alignment)
+    }
+}
+
+impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
     fn default() -> Self {
         Self {
             allocations: Default::default(),
@@ -47,7 +72,7 @@ impl<Ptr> Default for TensorCache<Ptr> {
     }
 }
 
-impl<Ptr> TensorCache<Ptr> {
+impl<Ptr: CacheStorage> TensorCache<Ptr> {
     /// Returns the number of allocations in the cache.
     #[allow(unused)]
     pub(crate) fn len(&self) -> usize {
@@ -102,7 +127,7 @@ impl<Ptr> TensorCache<Ptr> {
 
     /// Returns a cached allocation if one exists.
     /// Otherwise, returns `None`.
-    pub(crate) fn try_pop<E>(&self, len: usize) -> Option<Ptr> {
+    pub(crate) fn try_pop<E>(&self, len: usize) -> Option<Ptr::Output<E>> {
         if !self.is_enabled() {
             return None;
         }
@@ -142,21 +167,23 @@ impl<Ptr> TensorCache<Ptr> {
             if items.is_empty() {
                 cache.remove(&key);
             }
-            Some(allocation)
+            Some(unsafe { allocation.transmute_elements() })
         } else {
             None
         }
     }
 
     /// Inserts an allocation into the cache.
-    pub(crate) fn insert<E>(&self, len: usize, allocation: Ptr) {
+    pub(crate) fn insert<E>(&self, len: usize, allocation: Ptr::Output<E>)
+        where Ptr::Output<E>: CacheStorage<Output<u8> = Ptr>
+    {
         if !self.is_enabled() {
-            // This is a panic because it's a bug in the library.
-            panic!("Tried to insert into a disabled cache.");
+            return;
         }
 
+        let allocation = unsafe { allocation.transmute_elements::<u8>() };
         let layout = Layout::new::<E>();
-        let num_bytes = len * std::mem::size_of::<E>();
+        let num_bytes = len * layout.size();
         let key = AllocationKey {
             num_bytes,
             size: layout.size(),
@@ -181,22 +208,30 @@ impl<Ptr> TensorCache<Ptr> {
             cache.get_mut(&key).unwrap().push(allocation);
         }
     }
+
+    pub(crate) fn clear(&self) {
+        #[cfg(not(feature = "no-std"))]
+        let mut cache = self.allocations.write().unwrap();
+        #[cfg(feature = "no-std")]
+        let mut cache = self.allocations.write();
+        for (&key, allocations) in cache.iter_mut() {
+            assert!(key.num_bytes % key.size == 0);
+            assert!(key.num_bytes < isize::MAX as usize);
+            for alloc in allocations.drain(..) {
+                unsafe { alloc.drop_with_key(key) };
+            }
+        }
+        cache.clear();
+    }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
 
-    #[test]
-    #[should_panic(expected = "Tried to insert into a disabled cache.")]
-    fn test_insert_on_disabled_cache() {
-        let cache: TensorCache<usize> = Default::default();
-        cache.insert::<f32>(1, 0);
-    }
-
     #[test]
     fn test_try_pop_on_disabled_cache() {
-        let cache: TensorCache<usize> = Default::default();
+        let cache: TensorCache<Vec<u8>> = Default::default();
         cache.enable();
         assert!(cache.is_enabled());
         cache.disable();
@@ -207,7 +242,7 @@ mod test {
 
     #[test]
     fn test_try_pop_on_empty_cache() {
-        let cache: TensorCache<usize> = Default::default();
+        let cache: TensorCache<Vec<u8>> = Default::default();
         cache.enable();
         assert_eq!(cache.try_pop::<f32>(1), None);
         assert_eq!(cache.try_pop::<f32>(1), None);
@@ -215,35 +250,35 @@ mod test {
 
     #[test]
     fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() {
-        let cache: TensorCache<usize> = Default::default();
+        let cache: TensorCache<Vec<u8>> = Default::default();
         cache.enable();
-        cache.insert::<f32>(1, 0);
-        cache.insert::<f32>(1, 1);
-        cache.insert::<f32>(1, 2);
-        cache.insert::<f32>(2, 3);
-        cache.insert::<f32>(2, 4);
-        cache.insert::<f32>(2, 5);
-        cache.insert::<f64>(1, 6);
-        cache.insert::<f64>(1, 7);
-        cache.insert::<f64>(1, 8);
-        cache.insert::<f64>(2, 9);
-        cache.insert::<f64>(2, 10);
-        cache.insert::<f64>(2, 11);
-        assert_eq!(cache.try_pop::<f32>(1), Some(2));
-        assert_eq!(cache.try_pop::<f32>(1), Some(1));
-        assert_eq!(cache.try_pop::<f32>(1), Some(0));
+        cache.insert::<f32>(1, vec![0.0]);
+        cache.insert::<f32>(1, vec![1.0]);
+        cache.insert::<f32>(1, vec![2.0]);
+        cache.insert::<f32>(2, vec![3.0; 2]);
+        cache.insert::<f32>(2, vec![4.0; 2]);
+        cache.insert::<f32>(2, vec![5.0; 2]);
+        cache.insert::<f64>(1, vec![6.0]);
+        cache.insert::<f64>(1, vec![7.0]);
+        cache.insert::<f64>(1, vec![8.0]);
+        cache.insert::<f64>(2, vec![9.0; 2]);
+        cache.insert::<f64>(2, vec![10.0; 2]);
+        cache.insert::<f64>(2, vec![11.0; 2]);
+        assert_eq!(cache.try_pop::<f32>(1), Some(vec![2.0]));
+        assert_eq!(cache.try_pop::<f32>(1), Some(vec![1.0]));
+        assert_eq!(cache.try_pop::<f32>(1), Some(vec![0.0]));
         assert_eq!(cache.try_pop::<f32>(1), None);
-        assert_eq!(cache.try_pop::<f32>(2), Some(5));
-        assert_eq!(cache.try_pop::<f32>(2), Some(4));
-        assert_eq!(cache.try_pop::<f32>(2), Some(3));
+        assert_eq!(cache.try_pop::<f32>(2), Some(vec![5.0; 2]));
+        assert_eq!(cache.try_pop::<f32>(2), Some(vec![4.0; 2]));
+        assert_eq!(cache.try_pop::<f32>(2), Some(vec![3.0; 2]));
         assert_eq!(cache.try_pop::<f32>(2), None);
-        assert_eq!(cache.try_pop::<f64>(1), Some(8));
-        assert_eq!(cache.try_pop::<f64>(1), Some(7));
-        assert_eq!(cache.try_pop::<f64>(1), Some(6));
+        assert_eq!(cache.try_pop::<f64>(1), Some(vec![8.0]));
+        assert_eq!(cache.try_pop::<f64>(1), Some(vec![7.0]));
+        assert_eq!(cache.try_pop::<f64>(1), Some(vec![6.0]));
         assert_eq!(cache.try_pop::<f64>(1), None);
-        assert_eq!(cache.try_pop::<f64>(2), Some(11));
-        assert_eq!(cache.try_pop::<f64>(2), Some(10));
-        assert_eq!(cache.try_pop::<f64>(2), Some(9));
+        assert_eq!(cache.try_pop::<f64>(2), Some(vec![11.0; 2]));
+        assert_eq!(cache.try_pop::<f64>(2), Some(vec![10.0; 2]));
+        assert_eq!(cache.try_pop::<f64>(2), Some(vec![9.0; 2]));
         assert_eq!(cache.try_pop::<f64>(2), None);
     }
 }
diff --git a/src/tensor/cpu/allocate.rs b/src/tensor/cpu/allocate.rs
index e9ac9d307..3a2a742cb 100644
--- a/src/tensor/cpu/allocate.rs
+++ b/src/tensor/cpu/allocate.rs
@@ -35,16 +35,7 @@ impl Cpu {
                 data.resize(numel, elem);
                 Ok(data)
             },
-            |allocation| {
-                // SAFETY:
-                // - ✅ "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
-                // - ✅ handled by tensor cache "T needs to have the same alignment as what ptr was allocated with."
-                // - ✅ handled by tensor cache "The size of T times the capacity needs to be the same size as the pointer was allocated with."
-                // - ✅ "length needs to be less than or equal to capacity."
-                // - ✅ all the dtypes for this are builtin numbers "The first length values must be properly initialized values of type T."
-                // - ✅ "capacity needs to be the capacity that the pointer was allocated with."
-                // - ✅ "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
-                let mut data = unsafe { Vec::from_raw_parts(allocation.0 as *mut E, numel, numel) };
+            |mut data| {
                 data.fill(elem);
                 Ok(data)
             },
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 45907c232..65b434274 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -1,6 +1,7 @@
 use crate::shapes::{Shape, Unit};
-use crate::tensor::{cache::TensorCache, cpu::LendingIterator, storage_traits::*, Tensor};
+use crate::tensor::{cache::{TensorCache, CacheStorage}, cpu::LendingIterator, storage_traits::*, Tensor};
 use rand::{rngs::StdRng, Rng, SeedableRng};
+use core::alloc::Layout;
 use std::{sync::Arc, vec::Vec};
 
 #[cfg(feature = "no-std")]
@@ -15,6 +16,49 @@ pub(crate) struct BytesPtr(pub(crate) *mut u8);
 unsafe impl Send for BytesPtr {}
 unsafe impl Sync for BytesPtr {}
 
+impl<T> CacheStorage for Vec<T> {
+    type Output<T2> = Vec<T2>;
+
+    /// Unsafely converts the elements of a vector to a new type.
+    ///
+    /// # Safety
+    ///
+    /// * Has all of the potential pitfalls of slice.align_to
+    /// * If converting to a type with a different alignment, the caller must convert back to a
+    /// type with the same alignment before dropping
+    unsafe fn transmute_elements<T2>(mut self) -> Self::Output<T2> {
+        let src_layout = Layout::new::<T>().pad_to_align();
+        let dst_layout = Layout::new::<T2>().pad_to_align();
+
+        let byte_len = self.len() * src_layout.size();
+        let byte_capacity = self.capacity() * src_layout.size();
+        let ptr = self.as_mut_ptr();
+        std::mem::forget(self);
+
+        let dst_size = dst_layout.size();
+
+        assert_eq!(ptr.align_offset(dst_layout.align()), 0, "Allocation is improperly aligned");
+        assert!(byte_len % dst_size == 0 && byte_capacity % dst_size == 0, "Allocation is improperly sized");
+
+        let len = byte_len / dst_size;
+        let capacity = byte_capacity / dst_size;
+
+        // Safety:
+        // * T2 may not have the same alignment as the initial vector, it is the caller's
+        // responsiblity to ensure that the vector is converted to a type with the correct
+        // alignment before dropping
+        // * The first len values may not be correctly initialized, it is the caller's
+        // responsibility to ensure correct values before usage
+        //
+        // * ptr is allocated with the global allocator as long as self was
+        // * length is less than or equal to capacity as long as this is true of self.
+        // * capacity is the capacity the pointer was allocated with as long as this is true of
+        // self
+        // * The allocated size is less than isize::MAX as long as this is true of self
+        Vec::from_raw_parts(ptr as *mut T2, len, capacity)
+    }
+}
+
 /// A device that stores data on the heap.
 ///
 /// The [Default] impl seeds the underlying rng with seed of 0.
@@ -25,7 +69,7 @@ pub struct Cpu {
     /// A thread safe random number generator.
     pub(crate) rng: Arc<Mutex<StdRng>>,
     /// A thread safe cache of memory allocations that can be reused.
-    pub(crate) cache: Arc<TensorCache<BytesPtr>>,
+    pub(crate) cache: Arc<TensorCache<Vec<u8>>>,
 }
 
 impl Default for Cpu {
@@ -78,7 +122,7 @@ pub struct CachableVec<E> {
     /// The data stored in this vector.
     pub(crate) data: Vec<E>,
     /// A cache of memory allocations that can be reused.
-    pub(crate) cache: Arc<TensorCache<BytesPtr>>,
+    pub(crate) cache: Arc<TensorCache<Vec<u8>>>,
 }
 
 impl<E: Clone> Clone for CachableVec<E> {
@@ -89,17 +133,7 @@ impl<E: Clone> Clone for CachableVec<E> {
                 data: self.data.clone(),
                 cache: self.cache.clone(),
             },
-            |allocation| {
-                assert!(numel < isize::MAX as usize);
-                // SAFETY:
-                // - ✅ "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
-                // - ✅ handled by tensor cache "T needs to have the same alignment as what ptr was allocated with."
-                // - ✅ handled by tensor cache "The size of T times the capacity needs to be the same size as the pointer was allocated with."
-                // - ✅ "length needs to be less than or equal to capacity."
-                // - ✅ all the dtypes for this are builtin numbers "The first length values must be properly initialized values of type T."
-                // - ✅ "capacity needs to be the capacity that the pointer was allocated with."
-                // - ✅ "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
-                let mut data = unsafe { Vec::from_raw_parts(allocation.0 as *mut E, numel, numel) };
+            |mut data| {
                 data.clone_from(&self.data);
                 Self {
                     data,
@@ -112,16 +146,8 @@ impl<E: Clone> Clone for CachableVec<E> {
 
 impl<E> Drop for CachableVec<E> {
     fn drop(&mut self) {
-        if self.cache.is_enabled() {
-            let mut data = std::mem::take(&mut self.data);
-            data.shrink_to_fit();
-
-            let numel = data.len();
-            let ptr = data.as_mut_ptr() as *mut u8;
-            std::mem::forget(data);
-
-            self.cache.insert::<E>(numel, BytesPtr(ptr));
-        }
+        let data = std::mem::take(&mut self.data);
+        self.cache.insert::<E>(data.len(), data);
     }
 }
 
@@ -184,45 +210,7 @@ impl DeviceStorage for Cpu {
     }
 
     fn try_empty_cache(&self) -> Result<(), Self::Err> {
-        #[cfg(not(feature = "no-std"))]
-        let mut cache = self.cache.allocations.write().unwrap();
-        #[cfg(feature = "no-std")]
-        let mut cache = self.cache.allocations.write();
-        for (&key, allocations) in cache.iter_mut() {
-            assert!(key.num_bytes % key.size == 0);
-            assert!(key.num_bytes < isize::MAX as usize);
-            let len = key.num_bytes / key.size;
-            let cap = len;
-            for alloc in allocations.drain(..) {
-                // SAFETY:
-                // - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
-                //    - ✅ cpu uses global allocator
-                // - "T needs to have the same alignment as what ptr was allocated with."
-                //    - ✅ we are matching on the alignment below
-                // - "The size of T times the capacity needs to be the same size as the pointer was allocated with."
-                //    - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above
-                // - "length needs to be less than or equal to capacity."
-                //    - ✅ they are equal
-                // - "The first length values must be properly initialized values of type T."
-                //    - ✅ any bit pattern is valid for unsigned ints used below
-                // - "capacity needs to be the capacity that the pointer was allocated with."
-                //    - ✅ handled by assertion above (key.num_bytes % key.size == 0)
-                // - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
-                //    - ✅ handled by assertion above
-                debug_assert_eq!(std::alloc::Layout::new::<u8>().align(), 1);
-                debug_assert_eq!(std::alloc::Layout::new::<u16>().align(), 2);
-                debug_assert_eq!(std::alloc::Layout::new::<u32>().align(), 4);
-                debug_assert_eq!(std::alloc::Layout::new::<u64>().align(), 8);
-                match key.alignment {
-                    1 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u8, len, cap)) },
-                    2 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u16, len, cap)) },
-                    4 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u32, len, cap)) },
-                    8 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u64, len, cap)) },
-                    _ => unreachable!(),
-                };
-            }
-        }
-        cache.clear();
+        self.cache.clear();
         Ok(())
     }
 }
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index ae1807bdb..25f5d1fc0 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -1,19 +1,45 @@
 use crate::shapes::{Shape, Unit};
 use crate::tensor::cpu::{Cpu, CpuError};
-use crate::tensor::{cache::TensorCache, DeviceStorage, HasErr, NoneTape, Tensor};
+use crate::tensor::{cache::{TensorCache, CacheStorage}, DeviceStorage, HasErr, NoneTape, Tensor};
 
 use cudarc::driver::{DevicePtr, DevicePtrMut, DeviceRepr};
 use cudarc::{
     cublas::{result::CublasError, CudaBlas},
-    driver::sys::CUdeviceptr,
     driver::{CudaDevice, CudaSlice, CudaStream, DeviceSlice, DriverError},
 };
 
+use core::alloc::Layout;
 use std::{
     sync::{Arc, Mutex, MutexGuard},
     vec::Vec,
 };
 
+impl<T> CacheStorage for CudaSlice<T> {
+    type Output<T2> = CudaSlice<T2>;
+
+    /// Unsafely converts the elements of a CudaSlice to a new type.
+    ///
+    /// # Safety
+    ///
+    /// Assuming that `self` is a valid [CudaSlice], the main safety concern is that the output
+    /// slice should be assumed to be uninitialized
+    unsafe fn transmute_elements<T2>(self) -> Self::Output<T2> {
+        let dev = self.device();
+
+        let src_layout = Layout::new::<T>().pad_to_align();
+        let dst_layout = Layout::new::<T2>().pad_to_align();
+
+        let byte_len = self.len() * src_layout.size();
+        let ptr = self.leak();
+
+        assert_eq!(byte_len % dst_layout.size(), 0, "Allocation is improperly sized");
+
+        let len = byte_len / dst_layout.size();
+
+        dev.upgrade_device_ptr(ptr, len)
+    }
+}
+
 /// A Cuda device that enables constructing tensors on GPUs
 /// & running GPU kernels.
 #[derive(Clone, Debug)]
@@ -27,7 +53,7 @@ pub struct Cuda {
     /// A second stream for kernels to optionally execute on.
     pub(crate) par_stream: Arc<CudaStream>,
     pub(crate) workspace: Arc<Mutex<CudaSlice<u8>>>,
-    pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
+    pub(crate) cache: Arc<TensorCache<CudaSlice<u8>>>,
 }
 
 #[derive(Debug)]
@@ -111,11 +137,10 @@ impl Cuda {
         &self,
         len: usize,
     ) -> Result<CudaSlice<E>, CudaError> {
-        let data = self.cache.try_pop::<E>(len).map_or_else(
+        Ok(self.cache.try_pop::<E>(len).map_or_else(
             || self.dev.alloc::<E>(len),
-            |ptr| Ok(self.dev.upgrade_device_ptr(ptr, len)),
-        )?;
-        Ok(data)
+            Ok,
+        )?)
     }
     #[allow(unused)]
     pub(crate) unsafe fn get_workspace<E>(
@@ -152,7 +177,7 @@ pub struct CachableCudaSlice<E> {
     /// The actual data.
     pub(crate) data: CudaSlice<E>,
     /// A cache of device pointers that can be reused.
-    pub(crate) cache: Arc<TensorCache<CUdeviceptr>>,
+    pub(crate) cache: Arc<TensorCache<CudaSlice<u8>>>,
 }
 
 impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
@@ -161,11 +186,7 @@ impl<E: cudarc::driver::DeviceRepr> Clone for CachableCudaSlice<E> {
         let len = self.data.len();
         let data = self.cache.try_pop::<E>(len).map_or_else(
             || self.data.try_clone().unwrap(),
-            |ptr| {
-                // SAFETY:
-                // 1. we know that ptr is valid for `num_bytes` because it was registered for that.
-                // 2. we are about to set the memory with dtod_copy
-                let mut slice = unsafe { dev.upgrade_device_ptr(ptr, len) };
+            |mut slice| {
                 dev.dtod_copy(&self.data, &mut slice).unwrap();
                 slice
             },
@@ -224,16 +245,11 @@ impl<E> std::ops::DerefMut for CachableCudaSlice<E> {
 
 impl<E> Drop for CachableCudaSlice<E> {
     fn drop(&mut self) {
-        if self.cache.is_enabled() {
-            let dev = self.data.device();
-            // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional
-            // memory, but will give us ownership of the actual data.
-            let data = std::mem::replace(&mut self.data, dev.null().unwrap());
-            let numel = data.len();
-            // Get access to the raw pointer without freeing it.
-            let ptr = data.leak();
-            self.cache.insert::<E>(numel, ptr);
-        }
+        let dev = self.data.device();
+        // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional
+        // memory, but will give us ownership of the actual data.
+        let data = std::mem::replace(&mut self.data, dev.null().unwrap());
+        self.cache.insert::<E>(data.len(), data);
     }
 }
 
@@ -292,17 +308,7 @@ impl DeviceStorage for Cuda {
     }
 
     fn try_empty_cache(&self) -> Result<(), Self::Err> {
-        #[cfg(not(feature = "no-std"))]
-        let mut cache = self.cache.allocations.write().unwrap();
-        #[cfg(feature = "no-std")]
-        let mut cache = self.cache.allocations.write();
-        for (&key, allocations) in cache.iter_mut() {
-            for alloc in allocations.drain(..) {
-                let data = unsafe { self.dev.upgrade_device_ptr::<u8>(alloc, key.num_bytes) };
-                drop(data);
-            }
-        }
-        cache.clear();
+        self.cache.clear();
         Ok(())
     }
 }

From 0a2d7d9c83dc4524de0dd0beae5934c51dfc3ff8 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sat, 15 Apr 2023 22:56:20 -0500
Subject: [PATCH 02/22] split an assert; run cargo fmt

---
 src/tensor/cache.rs       |  7 ++++---
 src/tensor/cpu/device.rs  | 22 ++++++++++++++++++----
 src/tensor/cuda/device.rs | 19 +++++++++++++------
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 389706777..a5abd3839 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -45,7 +45,7 @@ pub(crate) trait CacheStorage: Sized {
     /// **This function is wildly unsafe**, see implementations for details
     unsafe fn transmute_elements<T>(self) -> Self::Output<T>;
 
-    /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. 
+    /// Uses transmute_elements to convert to an element type with alignment `align` before dropping.
     unsafe fn drop_with_alignment(self, align: usize) {
         match align {
             1 => drop(self.transmute_elements::<u8>()),
@@ -53,7 +53,7 @@ pub(crate) trait CacheStorage: Sized {
             4 => drop(self.transmute_elements::<u32>()),
             8 => drop(self.transmute_elements::<u64>()),
             16 => drop(self.transmute_elements::<u128>()),
-            _ => panic!("Invalid alignment")
+            _ => panic!("Invalid alignment"),
         }
     }
 
@@ -175,7 +175,8 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
 
     /// Inserts an allocation into the cache.
     pub(crate) fn insert<E>(&self, len: usize, allocation: Ptr::Output<E>)
-        where Ptr::Output<E>: CacheStorage<Output<u8> = Ptr>
+    where
+        Ptr::Output<E>: CacheStorage<Output<u8> = Ptr>,
     {
         if !self.is_enabled() {
             return;
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 65b434274..75c28a128 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -1,7 +1,12 @@
 use crate::shapes::{Shape, Unit};
-use crate::tensor::{cache::{TensorCache, CacheStorage}, cpu::LendingIterator, storage_traits::*, Tensor};
-use rand::{rngs::StdRng, Rng, SeedableRng};
+use crate::tensor::{
+    cache::{CacheStorage, TensorCache},
+    cpu::LendingIterator,
+    storage_traits::*,
+    Tensor,
+};
 use core::alloc::Layout;
+use rand::{rngs::StdRng, Rng, SeedableRng};
 use std::{sync::Arc, vec::Vec};
 
 #[cfg(feature = "no-std")]
@@ -37,8 +42,17 @@ impl<T> CacheStorage for Vec<T> {
 
         let dst_size = dst_layout.size();
 
-        assert_eq!(ptr.align_offset(dst_layout.align()), 0, "Allocation is improperly aligned");
-        assert!(byte_len % dst_size == 0 && byte_capacity % dst_size == 0, "Allocation is improperly sized");
+        assert_eq!(
+            ptr.align_offset(dst_layout.align()),
+            0,
+            "Allocation is improperly aligned"
+        );
+        assert_eq!(byte_len % dst_size, 0, "Length is improperly sized");
+        assert_eq!(
+            byte_capacity % dst_size,
+            0,
+            "Allocation is improperly sized"
+        );
 
         let len = byte_len / dst_size;
         let capacity = byte_capacity / dst_size;
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index 25f5d1fc0..a3796421a 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -1,6 +1,9 @@
 use crate::shapes::{Shape, Unit};
 use crate::tensor::cpu::{Cpu, CpuError};
-use crate::tensor::{cache::{TensorCache, CacheStorage}, DeviceStorage, HasErr, NoneTape, Tensor};
+use crate::tensor::{
+    cache::{CacheStorage, TensorCache},
+    DeviceStorage, HasErr, NoneTape, Tensor,
+};
 
 use cudarc::driver::{DevicePtr, DevicePtrMut, DeviceRepr};
 use cudarc::{
@@ -32,7 +35,11 @@ impl<T> CacheStorage for CudaSlice<T> {
         let byte_len = self.len() * src_layout.size();
         let ptr = self.leak();
 
-        assert_eq!(byte_len % dst_layout.size(), 0, "Allocation is improperly sized");
+        assert_eq!(
+            byte_len % dst_layout.size(),
+            0,
+            "Allocation is improperly sized"
+        );
 
         let len = byte_len / dst_layout.size();
 
@@ -137,10 +144,10 @@ impl Cuda {
         &self,
         len: usize,
     ) -> Result<CudaSlice<E>, CudaError> {
-        Ok(self.cache.try_pop::<E>(len).map_or_else(
-            || self.dev.alloc::<E>(len),
-            Ok,
-        )?)
+        Ok(self
+            .cache
+            .try_pop::<E>(len)
+            .map_or_else(|| self.dev.alloc::<E>(len), Ok)?)
     }
     #[allow(unused)]
     pub(crate) unsafe fn get_workspace<E>(

From 347b24bc5786a952adb5bf3847e078f0f172ed35 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 09:49:02 -0500
Subject: [PATCH 03/22] reduce visibility of internals; add safety comments

---
 build.rs                 |  4 ++--
 src/tensor/cache.rs      | 31 +++++++++++++++++++++++++------
 src/tensor/cpu/device.rs |  2 ++
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/build.rs b/build.rs
index 3860f5fb0..daf4caef5 100644
--- a/build.rs
+++ b/build.rs
@@ -93,8 +93,8 @@ mod cuda {
                 let out = out.lines().collect::<Vec<&str>>();
                 let mut codes = Vec::with_capacity(out.len());
                 for code in out {
-                    let code = code.split("_").collect::<Vec<&str>>();
-                    if code.len() != 0 && code.contains(&"sm") {
+                    let code = code.split('_').collect::<Vec<&str>>();
+                    if !code.is_empty() && code.contains(&"sm") {
                         if let Ok(num) = code[1].parse::<usize>() {
                             codes.push(num);
                         }
diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index a5abd3839..f30292b53 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -13,12 +13,12 @@ use spin::RwLock;
 /// for a hasmap, meaning we need this extra datastructure. Otherwise
 /// we could just using `(usize, Layout)` as the key.
 #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
-pub(crate) struct AllocationKey {
-    pub num_bytes: usize,
+pub struct AllocationKey {
+    num_bytes: usize,
     /// The size of the allocation in bytes - from [Layout].
-    pub size: usize,
+    size: usize,
     /// The alignment of the allocation in bytes - from [Layout].
-    pub alignment: usize,
+    alignment: usize,
 }
 
 /// A cache of allocations that can be reused.
@@ -34,8 +34,8 @@ pub(crate) struct AllocationKey {
 /// is removed.
 #[derive(Debug)]
 pub(crate) struct TensorCache<Ptr: CacheStorage> {
-    pub(crate) allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
-    pub(crate) enabled: RwLock<bool>,
+    allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
+    enabled: RwLock<bool>,
 }
 
 pub(crate) trait CacheStorage: Sized {
@@ -72,6 +72,12 @@ impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
     }
 }
 
+impl<Ptr: CacheStorage> Drop for TensorCache<Ptr> {
+    fn drop(&mut self) {
+        self.clear();
+    }
+}
+
 impl<Ptr: CacheStorage> TensorCache<Ptr> {
     /// Returns the number of allocations in the cache.
     #[allow(unused)]
@@ -167,6 +173,10 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             if items.is_empty() {
                 cache.remove(&key);
             }
+            // SAFETY:
+            //
+            // * If inserted with 'insert', this will convert allocation to an element type with
+            // the correct alignment
             Some(unsafe { allocation.transmute_elements() })
         } else {
             None
@@ -182,6 +192,11 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             return;
         }
 
+        // SAFETY:
+        //
+        // * Allocation must be converted back to an element type with the correct alignment before
+        // dropping
+        // * Allocation must not have its capacity modified
         let allocation = unsafe { allocation.transmute_elements::<u8>() };
         let layout = Layout::new::<E>();
         let num_bytes = len * layout.size();
@@ -219,6 +234,10 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             assert!(key.num_bytes % key.size == 0);
             assert!(key.num_bytes < isize::MAX as usize);
             for alloc in allocations.drain(..) {
+                // SAFETY:
+                //
+                // * If inserted with 'insert', this will convert allocation to an element type with
+                // the correct alignment before dropping.
                 unsafe { alloc.drop_with_key(key) };
             }
         }
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 75c28a128..d17709482 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -31,6 +31,8 @@ impl<T> CacheStorage for Vec<T> {
     /// * Has all of the potential pitfalls of slice.align_to
     /// * If converting to a type with a different alignment, the caller must convert back to a
     /// type with the same alignment before dropping
+    /// * If converting to a type with a different alignment, the caller must not grow or shrink
+    /// the allocation of the returned vector
     unsafe fn transmute_elements<T2>(mut self) -> Self::Output<T2> {
         let src_layout = Layout::new::<T>().pad_to_align();
         let dst_layout = Layout::new::<T2>().pad_to_align();

From f055ae54bb6e183ff748605f8ea282995ae54118 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 11:15:57 -0500
Subject: [PATCH 04/22] create safe wrapper around CacheStorage objects

---
 src/tensor/cache.rs | 69 ++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index f30292b53..5068e1c1f 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -34,7 +34,7 @@ pub struct AllocationKey {
 /// is removed.
 #[derive(Debug)]
 pub(crate) struct TensorCache<Ptr: CacheStorage> {
-    allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
+    allocations: RwLock<BTreeMap<AllocationKey, Vec<CacheWrapper<Ptr>>>>,
     enabled: RwLock<bool>,
 }
 
@@ -56,10 +56,36 @@ pub(crate) trait CacheStorage: Sized {
             _ => panic!("Invalid alignment"),
         }
     }
+}
+
+/// (Mostly) Safe wrapper around CacheStorage implementers
+#[derive(Clone, Debug)]
+struct CacheWrapper<Ptr: CacheStorage> {
+    ptr: Option<Ptr>,
+    align: usize,
+}
+
+impl<Ptr: CacheStorage> Drop for CacheWrapper<Ptr> {
+    fn drop(&mut self) {
+        if let Some(ptr) = std::mem::take(&mut self.ptr) {
+            unsafe { ptr.drop_with_alignment(self.align) }
+        }
+    }
+}
+
+impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
+    fn from_storage<T>(storage: Ptr::Output<T>) -> Self where Ptr::Output<T>: CacheStorage<Output<u8> = Ptr> {
+        Self {
+            ptr: Some(unsafe { storage.transmute_elements::<u8>() }),
+            align: Layout::new::<T>().align()
+        }
+    }
 
-    /// calls [CacheStorage::drop_with_alignment], extracting `align` from an [AllocationKey]
-    unsafe fn drop_with_key(self, key: AllocationKey) {
-        self.drop_with_alignment(key.alignment)
+    // Safety: Same as slice.align_to, but considered safe internally
+    fn into_storage<T>(mut self) -> Ptr::Output<T> {
+        assert_eq!(Layout::new::<T>().align(), self.align);
+        let ptr = std::mem::take(&mut self.ptr).unwrap();
+        unsafe { ptr.transmute_elements() }
     }
 }
 
@@ -72,12 +98,6 @@ impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
     }
 }
 
-impl<Ptr: CacheStorage> Drop for TensorCache<Ptr> {
-    fn drop(&mut self) {
-        self.clear();
-    }
-}
-
 impl<Ptr: CacheStorage> TensorCache<Ptr> {
     /// Returns the number of allocations in the cache.
     #[allow(unused)]
@@ -173,11 +193,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             if items.is_empty() {
                 cache.remove(&key);
             }
-            // SAFETY:
-            //
-            // * If inserted with 'insert', this will convert allocation to an element type with
-            // the correct alignment
-            Some(unsafe { allocation.transmute_elements() })
+            Some(allocation.into_storage())
         } else {
             None
         }
@@ -192,12 +208,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             return;
         }
 
-        // SAFETY:
-        //
-        // * Allocation must be converted back to an element type with the correct alignment before
-        // dropping
-        // * Allocation must not have its capacity modified
-        let allocation = unsafe { allocation.transmute_elements::<u8>() };
+        let allocation = CacheWrapper::from_storage(allocation);
         let layout = Layout::new::<E>();
         let num_bytes = len * layout.size();
         let key = AllocationKey {
@@ -227,21 +238,9 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
 
     pub(crate) fn clear(&self) {
         #[cfg(not(feature = "no-std"))]
-        let mut cache = self.allocations.write().unwrap();
+        self.allocations.write().unwrap().clear();
         #[cfg(feature = "no-std")]
-        let mut cache = self.allocations.write();
-        for (&key, allocations) in cache.iter_mut() {
-            assert!(key.num_bytes % key.size == 0);
-            assert!(key.num_bytes < isize::MAX as usize);
-            for alloc in allocations.drain(..) {
-                // SAFETY:
-                //
-                // * If inserted with 'insert', this will convert allocation to an element type with
-                // the correct alignment before dropping.
-                unsafe { alloc.drop_with_key(key) };
-            }
-        }
-        cache.clear();
+        self.allocations.write().clear();
     }
 }
 

From 16970020e78ed86db43c5a64e34d61a57498c16d Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 11:16:15 -0500
Subject: [PATCH 05/22] run cargo fmt

---
 src/tensor/cache.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 5068e1c1f..79c9eddb1 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -74,10 +74,13 @@ impl<Ptr: CacheStorage> Drop for CacheWrapper<Ptr> {
 }
 
 impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
-    fn from_storage<T>(storage: Ptr::Output<T>) -> Self where Ptr::Output<T>: CacheStorage<Output<u8> = Ptr> {
+    fn from_storage<T>(storage: Ptr::Output<T>) -> Self
+    where
+        Ptr::Output<T>: CacheStorage<Output<u8> = Ptr>,
+    {
         Self {
             ptr: Some(unsafe { storage.transmute_elements::<u8>() }),
-            align: Layout::new::<T>().align()
+            align: Layout::new::<T>().align(),
         }
     }
 

From 240553c7ae6e735fc14c5815c05f5b6da8a39255 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 11:40:30 -0500
Subject: [PATCH 06/22] add check_key method to ensure CacheWrappers are used
 with valid keys

---
 src/tensor/cache.rs | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 79c9eddb1..79f77f225 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -14,8 +14,9 @@ use spin::RwLock;
 /// we could just using `(usize, Layout)` as the key.
 #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
 pub struct AllocationKey {
+    /// The size of the allocation in bytes
     num_bytes: usize,
-    /// The size of the allocation in bytes - from [Layout].
+    /// The size of the type in bytes - from [Layout].
     size: usize,
     /// The alignment of the allocation in bytes - from [Layout].
     alignment: usize,
@@ -62,13 +63,14 @@ pub(crate) trait CacheStorage: Sized {
 #[derive(Clone, Debug)]
 struct CacheWrapper<Ptr: CacheStorage> {
     ptr: Option<Ptr>,
-    align: usize,
+    alignment: usize,
+    size: usize,
 }
 
 impl<Ptr: CacheStorage> Drop for CacheWrapper<Ptr> {
     fn drop(&mut self) {
         if let Some(ptr) = std::mem::take(&mut self.ptr) {
-            unsafe { ptr.drop_with_alignment(self.align) }
+            unsafe { ptr.drop_with_alignment(self.alignment) }
         }
     }
 }
@@ -78,15 +80,29 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
     where
         Ptr::Output<T>: CacheStorage<Output<u8> = Ptr>,
     {
+        let layout = Layout::new::<T>();
         Self {
             ptr: Some(unsafe { storage.transmute_elements::<u8>() }),
-            align: Layout::new::<T>().align(),
+            alignment: layout.align(),
+            size: layout.size(),
         }
     }
 
+    fn check_key(&self, key: &AllocationKey) {
+        assert_eq!(self.alignment, key.alignment, "Alignment does not match");
+        assert_eq!(self.size, key.size, "Size does not match");
+        // Implicitly assumes that T should not have any padding, but this should always be true of
+        // primitive number types.
+        assert_eq!(key.num_bytes % key.size, 0, "Key is invalid or type is padded");
+    }
+
     // Safety: Same as slice.align_to, but considered safe internally
+    // Produces storage containing uninitialized values
     fn into_storage<T>(mut self) -> Ptr::Output<T> {
-        assert_eq!(Layout::new::<T>().align(), self.align);
+        let layout = Layout::new::<T>();
+        assert_eq!(layout.align(), self.alignment);
+        assert_eq!(layout.size(), self.size);
+
         let ptr = std::mem::take(&mut self.ptr).unwrap();
         unsafe { ptr.transmute_elements() }
     }
@@ -196,6 +212,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             if items.is_empty() {
                 cache.remove(&key);
             }
+            allocation.check_key(&key);
             Some(allocation.into_storage())
         } else {
             None
@@ -219,6 +236,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             size: layout.size(),
             alignment: layout.align(),
         };
+        allocation.check_key(&key);
         #[cfg(not(feature = "no-std"))]
         let mut cache = self.allocations.write().unwrap();
         #[cfg(feature = "no-std")]

From 101497edcda6f8d8eb739919cb4060c11912e6c7 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 12:46:07 -0500
Subject: [PATCH 07/22] typo; run cargo fmt

---
 src/tensor/cache.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 79f77f225..9ca1a87ae 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -93,7 +93,11 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
         assert_eq!(self.size, key.size, "Size does not match");
         // Implicitly assumes that T should not have any padding, but this should always be true of
         // primitive number types.
-        assert_eq!(key.num_bytes % key.size, 0, "Key is invalid or type is padded");
+        assert_eq!(
+            key.num_bytes % key.size,
+            0,
+            "Key is invalid or type is padded"
+        );
     }
 
     // Safety: Same as slice.align_to, but considered safe internally
@@ -144,7 +148,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
         }
     }
 
-    /// Disables the cache.
+    /// Enables the cache.
     pub(crate) fn enable(&self) {
         #[cfg(not(feature = "no-std"))]
         {

From ea7329471f2abc0993cb5b3fba070f37d900c8ec Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 13:48:42 -0500
Subject: [PATCH 08/22] add shrink method

---
 src/tensor/cache.rs       | 197 ++++++++++++++++++++++++--------------
 src/tensor/cpu/device.rs  |   5 +
 src/tensor/cuda/device.rs |   4 +
 3 files changed, 136 insertions(+), 70 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 9ca1a87ae..3d185fb7e 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -1,4 +1,14 @@
-use std::{alloc::Layout, collections::BTreeMap, vec::Vec};
+use std::{
+    alloc::Layout,
+    collections::{BTreeMap, VecDeque},
+    vec::Vec,
+};
+
+#[cfg(not(feature = "no-std"))]
+use std::vec;
+
+#[cfg(feature = "no-std")]
+use alloc::vec;
 
 #[cfg(not(feature = "no-std"))]
 use std::sync::RwLock;
@@ -6,14 +16,40 @@ use std::sync::RwLock;
 #[cfg(feature = "no-std")]
 use spin::RwLock;
 
-/// A key for the tensor cache. Contains both number of bytes and informatino
+macro_rules! read {
+    ($x:expr) => {{
+        #[cfg(not(feature = "no-std"))]
+        {
+            $x.read().unwrap()
+        }
+        #[cfg(feature = "no-std")]
+        {
+            $x.read()
+        }
+    }};
+}
+
+macro_rules! write {
+    ($x:expr) => {{
+        #[cfg(not(feature = "no-std"))]
+        {
+            $x.write().unwrap()
+        }
+        #[cfg(feature = "no-std")]
+        {
+            $x.write()
+        }
+    }};
+}
+
+/// A key for the tensor cache. Contains both number of bytes and information
 /// about the layout of the allocation.
 ///
 /// Since [Layout] doesn't impl Ord, we can't use it directly as a key
-/// for a hasmap, meaning we need this extra datastructure. Otherwise
+/// for a hashmap, meaning we need this extra datastructure. Otherwise
 /// we could just using `(usize, Layout)` as the key.
 #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
-pub struct AllocationKey {
+struct AllocationKey {
     /// The size of the allocation in bytes
     num_bytes: usize,
     /// The size of the type in bytes - from [Layout].
@@ -22,6 +58,12 @@ pub struct AllocationKey {
     alignment: usize,
 }
 
+#[derive(Debug)]
+struct AllocationGroup<Ptr: CacheStorage> {
+    ignore_drops: usize,
+    allocations: Vec<CacheWrapper<Ptr>>,
+}
+
 /// A cache of allocations that can be reused.
 ///
 /// The key is the number of bytes in the allocation, AND the layout
@@ -30,18 +72,25 @@ pub struct AllocationKey {
 /// allocator assumes memory is allocated & deallocated with the same layout.
 /// The value is a list of allocations of that size.
 ///
-/// The prescense of a key in the map, indicates that there is *at least one*
+/// The presence of a key in the map, indicates that there is *at least one*
 /// valid allocation. When the last value is removed from the list, the key
 /// is removed.
 #[derive(Debug)]
 pub(crate) struct TensorCache<Ptr: CacheStorage> {
-    allocations: RwLock<BTreeMap<AllocationKey, Vec<CacheWrapper<Ptr>>>>,
+    allocations: RwLock<BTreeMap<AllocationKey, AllocationGroup<Ptr>>>,
     enabled: RwLock<bool>,
+
+    drop_queue: RwLock<VecDeque<AllocationKey>>,
+    size: RwLock<usize>,
+    max_size: RwLock<usize>,
 }
 
 pub(crate) trait CacheStorage: Sized {
     type Output<T>: CacheStorage;
 
+    /// returns the allocations's size in bytes
+    fn size(&self) -> usize;
+
     /// Unsafely converts the elements of a contiguous collection type to another type. Note:
     /// **This function is wildly unsafe**, see implementations for details
     unsafe fn transmute_elements<T>(self) -> Self::Output<T>;
@@ -60,7 +109,7 @@ pub(crate) trait CacheStorage: Sized {
 }
 
 /// (Mostly) Safe wrapper around CacheStorage implementers
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 struct CacheWrapper<Ptr: CacheStorage> {
     ptr: Option<Ptr>,
     alignment: usize,
@@ -100,6 +149,10 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
         );
     }
 
+    fn size(&self) -> usize {
+        self.ptr.as_ref().unwrap().size()
+    }
+
     // Safety: Same as slice.align_to, but considered safe internally
     // Produces storage containing uninitialized values
     fn into_storage<T>(mut self) -> Ptr::Output<T> {
@@ -117,6 +170,10 @@ impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
         Self {
             allocations: Default::default(),
             enabled: RwLock::new(false),
+            drop_queue: Default::default(),
+            size: RwLock::new(0),
+            // TODO: default max size
+            max_size: RwLock::new(1_000_000),
         }
     }
 }
@@ -125,52 +182,65 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
     /// Returns the number of allocations in the cache.
     #[allow(unused)]
     pub(crate) fn len(&self) -> usize {
-        #[cfg(not(feature = "no-std"))]
-        {
-            self.allocations.read().unwrap().len()
-        }
+        read!(self.allocations).len()
+    }
 
-        #[cfg(feature = "no-std")]
-        {
-            self.allocations.read().len()
-        }
+    /// Returns the number of bytes occupied by allocations in the cache.
+    #[allow(unused)]
+    pub(crate) fn size(&self) -> usize {
+        *read!(self.size)
     }
 
     /// Returns `true` if the cache is enabled.
     pub(crate) fn is_enabled(&self) -> bool {
-        #[cfg(not(feature = "no-std"))]
-        {
-            *self.enabled.read().unwrap()
-        }
-        #[cfg(feature = "no-std")]
-        {
-            *self.enabled.read()
-        }
+        *read!(self.enabled)
     }
 
     /// Enables the cache.
     pub(crate) fn enable(&self) {
-        #[cfg(not(feature = "no-std"))]
-        {
-            *self.enabled.write().unwrap() = true;
-        }
-
-        #[cfg(feature = "no-std")]
-        {
-            *self.enabled.write() = true;
-        }
+        *write!(self.enabled) = true;
     }
 
     /// Disables the cache.
     pub(crate) fn disable(&self) {
-        #[cfg(not(feature = "no-std"))]
-        {
-            *self.enabled.write().unwrap() = false;
+        *write!(self.enabled) = false;
+    }
+
+    /// Sets the maximum size of the cache
+    #[allow(unused)]
+    pub(crate) fn set_max_size(&self, size: usize) {
+        *write!(self.max_size) = size;
+
+        if size < *read!(self.size) {
+            self.shrink();
         }
+    }
 
-        #[cfg(feature = "no-std")]
-        {
-            *self.enabled.write() = false;
+    /// Shrinks the cache so its buffers contain at most `max_size` bytes
+    fn shrink(&self) {
+        let mut size = write!(self.size);
+        let max_size = read!(self.max_size);
+        let mut drop_queue = write!(self.drop_queue);
+        let mut allocations = write!(self.allocations);
+
+        while *size > *max_size {
+            let key = drop_queue
+                .pop_front()
+                .expect("ignore_drops values were set too high");
+            let Some(alloc_group) = allocations.get_mut(&key) else { continue };
+
+            if alloc_group.ignore_drops > 0 {
+                alloc_group.ignore_drops -= 1;
+            } else {
+                let allocation = alloc_group
+                    .allocations
+                    .pop()
+                    .expect("ignore_drops values were set too low");
+                if alloc_group.allocations.is_empty() {
+                    allocations.remove(&key);
+                }
+                *size -= allocation.size();
+            }
         }
     }
 
@@ -189,34 +259,27 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             alignment: layout.align(),
         };
         // Check if there is a cached allocation.
-        let reuse = {
-            #[cfg(not(feature = "no-std"))]
-            let cache = self.allocations.read().unwrap();
-            #[cfg(feature = "no-std")]
-            let cache = self.allocations.read();
-            cache.contains_key(&key)
-        };
+        let reuse = read!(self.allocations).contains_key(&key);
         // If there is, remove it from the cache.
         // Otherwise, return `None`.
         if reuse {
-            #[cfg(not(feature = "no-std"))]
-            let mut cache = self.allocations.write().unwrap();
-            #[cfg(feature = "no-std")]
-            let mut cache = self.allocations.write();
+            let mut cache = write!(self.allocations);
             // unwrap is safe because we just checked for contains key above.
             let items = cache.get_mut(&key).unwrap();
+            items.ignore_drops += 1;
             // unwrap is safe because reuse is only true if there's at least one item,
             // which is also maintained by the block directly below.
-            let allocation = items.pop().unwrap();
+            let allocation = items.allocations.pop().unwrap();
             // If there are no more cached allocations of this size,
             // remove the entry from the cache.
             // This is important for correctness, because the presence
             // of an entry in the cache indicates that there are valid
             // allocations to use. (see `let reuse = { ... }` above).
-            if items.is_empty() {
+            if items.allocations.is_empty() {
                 cache.remove(&key);
             }
             allocation.check_key(&key);
+            *write!(self.size) -= allocation.size();
             Some(allocation.into_storage())
         } else {
             None
@@ -241,31 +304,25 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             alignment: layout.align(),
         };
         allocation.check_key(&key);
-        #[cfg(not(feature = "no-std"))]
-        let mut cache = self.allocations.write().unwrap();
-        #[cfg(feature = "no-std")]
-        let mut cache = self.allocations.write();
+        *write!(self.size) += allocation.size();
+        write!(self.drop_queue).push_back(key);
+        let mut cache = write!(self.allocations);
         if let std::collections::btree_map::Entry::Vacant(e) = cache.entry(key) {
-            #[cfg(not(feature = "no-std"))]
-            {
-                e.insert(std::vec![allocation]);
-            }
-            #[cfg(feature = "no-std")]
-            {
-                let mut allocations = Vec::new();
-                allocations.push(allocation);
-                e.insert(allocations);
-            }
+            e.insert(AllocationGroup {
+                allocations: vec![allocation],
+                ignore_drops: 0,
+            });
         } else {
-            cache.get_mut(&key).unwrap().push(allocation);
+            cache.get_mut(&key).unwrap().allocations.push(allocation);
         }
+        std::mem::drop(cache);
+        self.shrink();
     }
 
     pub(crate) fn clear(&self) {
-        #[cfg(not(feature = "no-std"))]
-        self.allocations.write().unwrap().clear();
-        #[cfg(feature = "no-std")]
-        self.allocations.write().clear();
+        write!(self.allocations).clear();
+        write!(self.drop_queue).clear();
+        *write!(self.size) = 0;
     }
 }
 
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index d17709482..add9fc9c8 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -24,6 +24,11 @@ unsafe impl Sync for BytesPtr {}
 impl<T> CacheStorage for Vec<T> {
     type Output<T2> = Vec<T2>;
 
+    fn size(&self) -> usize {
+        // size in bytes of the underlying allocation
+        Layout::array::<T>(self.capacity()).unwrap().size()
+    }
+
     /// Unsafely converts the elements of a vector to a new type.
     ///
     /// # Safety
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index a3796421a..9f5e5c14a 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -20,6 +20,10 @@ use std::{
 impl<T> CacheStorage for CudaSlice<T> {
     type Output<T2> = CudaSlice<T2>;
 
+    fn size(&self) -> usize {
+        Layout::array::<T>(self.len()).unwrap().size()
+    }
+
     /// Unsafely converts the elements of a CudaSlice to a new type.
     ///
     /// # Safety

From 9efc215af93e1fc464a91afa4a27a1a71da734aa Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 17:44:47 -0500
Subject: [PATCH 09/22] document ignore_drops; fix constraint violations

---
 src/tensor/cache.rs | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 3d185fb7e..5655874d1 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -60,6 +60,9 @@ struct AllocationKey {
 
 #[derive(Debug)]
 struct AllocationGroup<Ptr: CacheStorage> {
+    // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to
+    // "remove" the next instance of the matching AllocationKey in the drop_queue, without having
+    // to run an O(n)
     ignore_drops: usize,
     allocations: Vec<CacheWrapper<Ptr>>,
 }
@@ -75,6 +78,10 @@ struct AllocationGroup<Ptr: CacheStorage> {
 /// The presence of a key in the map, indicates that there is *at least one*
 /// valid allocation. When the last value is removed from the list, the key
 /// is removed.
+///
+/// Constraint: for a given value of AllocationKey, the following must hold:
+///
+/// (instances in drop_queue) = (group.ignore_drops) + (group.allocations.len())
 #[derive(Debug)]
 pub(crate) struct TensorCache<Ptr: CacheStorage> {
     allocations: RwLock<BTreeMap<AllocationKey, AllocationGroup<Ptr>>>,
@@ -165,6 +172,12 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
     }
 }
 
+impl<Ptr: CacheStorage> AllocationGroup<Ptr> {
+    fn is_empty(&self) -> bool {
+        self.allocations.is_empty() && self.ignore_drops == 0
+    }
+}
+
 impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
     fn default() -> Self {
         Self {
@@ -173,7 +186,7 @@ impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
             drop_queue: Default::default(),
             size: RwLock::new(0),
             // TODO: default max size
-            max_size: RwLock::new(1_000_000),
+            max_size: RwLock::new(1_000_000_000),
         }
     }
 }
@@ -259,7 +272,9 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             alignment: layout.align(),
         };
         // Check if there is a cached allocation.
-        let reuse = read!(self.allocations).contains_key(&key);
+        let reuse = read!(self.allocations)
+            .get(&key)
+            .map_or(false, |group| !group.allocations.is_empty());
         // If there is, remove it from the cache.
         // Otherwise, return `None`.
         if reuse {
@@ -270,14 +285,6 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             // unwrap is safe because reuse is only true if there's at least one item,
             // which is also maintained by the block directly below.
             let allocation = items.allocations.pop().unwrap();
-            // If there are no more cached allocations of this size,
-            // remove the entry from the cache.
-            // This is important for correctness, because the presence
-            // of an entry in the cache indicates that there are valid
-            // allocations to use. (see `let reuse = { ... }` above).
-            if items.allocations.is_empty() {
-                cache.remove(&key);
-            }
             allocation.check_key(&key);
             *write!(self.size) -= allocation.size();
             Some(allocation.into_storage())

From e99a2aef402c1e8adbdbc9d9e462b19d3417ce72 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sun, 16 Apr 2023 17:55:27 -0500
Subject: [PATCH 10/22] fix another constraint violation; fix TensorCache::len

---
 src/tensor/cache.rs | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 5655874d1..aef55eaa9 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -195,7 +195,10 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
     /// Returns the number of allocations in the cache.
     #[allow(unused)]
     pub(crate) fn len(&self) -> usize {
-        read!(self.allocations).len()
+        read!(self.allocations)
+            .values()
+            .map(|group| group.allocations.len())
+            .sum()
     }
 
     /// Returns the number of bytes occupied by allocations in the cache.
@@ -236,6 +239,15 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
         let mut drop_queue = write!(self.drop_queue);
         let mut allocations = write!(self.allocations);
 
+        debug_assert_eq!(
+            *size,
+            allocations
+                .values()
+                .flat_map(|group| &group.allocations)
+                .map(|alloc| alloc.size())
+                .sum::<usize>()
+        );
+
         while *size > *max_size {
             let key = drop_queue
                 .pop_front()
@@ -249,7 +261,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
                     .allocations
                     .pop()
                     .expect("ignore_drops values were set too low");
-                if alloc_group.allocations.is_empty() {
+                if alloc_group.is_empty() {
                     allocations.remove(&key);
                 }
                 *size -= allocation.size();

From 76798c990485a6e49eb111f2711cecb90fab522d Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Mon, 17 Apr 2023 14:33:57 -0500
Subject: [PATCH 11/22] add size parameter to enable_cache and add
 set_cache_size

---
 src/tensor/cache.rs          | 14 +++++++-------
 src/tensor/cpu/device.rs     |  9 +++++++--
 src/tensor/cpu/mod.rs        |  8 ++++----
 src/tensor/cuda/device.rs    |  9 +++++++--
 src/tensor/cuda/mod.rs       |  8 ++++----
 src/tensor/storage_traits.rs | 18 +++++++++++++-----
 6 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index aef55eaa9..2b934bdf7 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -147,7 +147,7 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
     fn check_key(&self, key: &AllocationKey) {
         assert_eq!(self.alignment, key.alignment, "Alignment does not match");
         assert_eq!(self.size, key.size, "Size does not match");
-        // Implicitly assumes that T should not have any padding, but this should always be true of
+        // Implicitly assumes that T does not have any padding, but this should always be true of
         // primitive number types.
         assert_eq!(
             key.num_bytes % key.size,
@@ -185,8 +185,7 @@ impl<Ptr: CacheStorage> Default for TensorCache<Ptr> {
             enabled: RwLock::new(false),
             drop_queue: Default::default(),
             size: RwLock::new(0),
-            // TODO: default max size
-            max_size: RwLock::new(1_000_000_000),
+            max_size: RwLock::new(0),
         }
     }
 }
@@ -213,8 +212,9 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
     }
 
     /// Enables the cache.
-    pub(crate) fn enable(&self) {
+    pub(crate) fn enable(&self, size: usize) {
         *write!(self.enabled) = true;
+        *write!(self.max_size) = size;
     }
 
     /// Disables the cache.
@@ -352,7 +352,7 @@ mod test {
     #[test]
     fn test_try_pop_on_disabled_cache() {
         let cache: TensorCache<Vec<u8>> = Default::default();
-        cache.enable();
+        cache.enable(1000);
         assert!(cache.is_enabled());
         cache.disable();
         assert!(!cache.is_enabled());
@@ -363,7 +363,7 @@ mod test {
     #[test]
     fn test_try_pop_on_empty_cache() {
         let cache: TensorCache<Vec<u8>> = Default::default();
-        cache.enable();
+        cache.enable(1000);
         assert_eq!(cache.try_pop::<f32>(1), None);
         assert_eq!(cache.try_pop::<f32>(1), None);
     }
@@ -371,7 +371,7 @@ mod test {
     #[test]
     fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() {
         let cache: TensorCache<Vec<u8>> = Default::default();
-        cache.enable();
+        cache.enable(1000);
         cache.insert::<f32>(1, vec![0.0]);
         cache.insert::<f32>(1, vec![1.0]);
         cache.insert::<f32>(1, vec![2.0]);
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index add9fc9c8..501abcd7c 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -220,8 +220,8 @@ impl DeviceStorage for Cpu {
         Ok(())
     }
 
-    fn try_enable_cache(&self) -> Result<(), Self::Err> {
-        self.cache.enable();
+    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> {
+        self.cache.enable(size);
         Ok(())
     }
 
@@ -234,4 +234,9 @@ impl DeviceStorage for Cpu {
         self.cache.clear();
         Ok(())
     }
+
+    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> {
+        self.cache.set_max_size(size);
+        Ok(())
+    }
 }
diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs
index fda5306ea..79c3437a6 100644
--- a/src/tensor/cpu/mod.rs
+++ b/src/tensor/cpu/mod.rs
@@ -17,7 +17,7 @@ mod tests {
     #[test]
     fn test_empty_cache() {
         let dev: Cpu = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -28,7 +28,7 @@ mod tests {
     #[test]
     fn test_disabling_cache_empties_it() {
         let dev: Cpu = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -39,7 +39,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_new_tensor() {
         let dev: Cpu = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         assert_eq!(dev.cache.len(), 0);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         assert_eq!(dev.cache.len(), 0);
@@ -57,7 +57,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_clone_tensor() {
         let dev: Cpu = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let a: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let b: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(b); // insert allocation into cache
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index 9f5e5c14a..ed635b4fb 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -308,8 +308,8 @@ impl DeviceStorage for Cuda {
         self.dev.synchronize().map_err(CudaError::from)
     }
 
-    fn try_enable_cache(&self) -> Result<(), Self::Err> {
-        self.cache.enable();
+    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> {
+        self.cache.enable(size);
         Ok(())
     }
 
@@ -322,4 +322,9 @@ impl DeviceStorage for Cuda {
         self.cache.clear();
         Ok(())
     }
+
+    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> {
+        self.cache.set_max_size(size);
+        Ok(())
+    }
 }
diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs
index 8b91d2ab3..0f8a44302 100644
--- a/src/tensor/cuda/mod.rs
+++ b/src/tensor/cuda/mod.rs
@@ -21,7 +21,7 @@ mod tests {
     #[test]
     fn test_empty_cache() {
         let dev: Cuda = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -32,7 +32,7 @@ mod tests {
     #[test]
     fn test_disabling_cache_empties_it() {
         let dev: Cuda = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -43,7 +43,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_new_tensor() {
         let dev: Cuda = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let ptr = *tensor.data.device_ptr();
         drop(tensor); // insert allocation into cache
@@ -59,7 +59,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_clone_tensor() {
         let dev: Cuda = Default::default();
-        dev.enable_cache();
+        dev.enable_cache(1000);
         let a: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let b: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(b); // insert allocation into cache
diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs
index 7b32abf54..997ca1bff 100644
--- a/src/tensor/storage_traits.rs
+++ b/src/tensor/storage_traits.rs
@@ -43,13 +43,13 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     /// Blocks until all work on device to complete. Useful for benchmarking.
     fn try_synchronize(&self) -> Result<(), Self::Err>;
 
-    /// Enables the cache of the device.
-    fn enable_cache(&self) {
-        self.try_enable_cache().unwrap()
+    /// Enables the cache of the device, and sets the maximum size to `size`.
+    fn enable_cache(&self, size: usize) {
+        self.try_enable_cache(size).unwrap()
     }
 
-    /// Tries to enable the cache of the device.
-    fn try_enable_cache(&self) -> Result<(), Self::Err>;
+    /// Tries to enable the cache of the device, and sets the maximum size to `size`.
+    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>;
 
     /// Disables the cache of the device. This will also empty the cache
     /// if there are things in it. See [DeviceStorage::empty_cache] for
@@ -78,6 +78,14 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     /// Tries to empty the cache of the device. See [DeviceStorage::empty_cache] for
     /// details of when this is useful.
     fn try_empty_cache(&self) -> Result<(), Self::Err>;
+
+    /// Sets the maximum size of the cache in bytes, and shrinks the cache until it smaller than `size`.
+    fn set_cache_size(&self, size: usize) {
+        self.try_set_cache_size(size).unwrap()
+    }
+
+    /// Fallible version of [DeviceStorage::set_cache_size]
+    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err>;
 }
 
 /// Internal trait - Represents something that can allocate its own gradient.

From e1f4622ab2de0ba43078550b432969529d4d68a4 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Wed, 19 Apr 2023 15:42:56 -0500
Subject: [PATCH 12/22] remove len argument from TensorCache::insert; add test
 for cache

---
 src/tensor/cache.rs       | 91 +++++++++++++++++++++++++++++++++------
 src/tensor/cpu/device.rs  |  4 +-
 src/tensor/cuda/device.rs |  2 +-
 3 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 2b934bdf7..2648ebc85 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -283,6 +283,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             size: layout.size(),
             alignment: layout.align(),
         };
+        println!("{key:?}");
         // Check if there is a cached allocation.
         let reuse = read!(self.allocations)
             .get(&key)
@@ -298,6 +299,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             // which is also maintained by the block directly below.
             let allocation = items.allocations.pop().unwrap();
             allocation.check_key(&key);
+            println!("{:?} {}", allocation.size, allocation.alignment);
             *write!(self.size) -= allocation.size();
             Some(allocation.into_storage())
         } else {
@@ -306,7 +308,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
     }
 
     /// Inserts an allocation into the cache.
-    pub(crate) fn insert<E>(&self, len: usize, allocation: Ptr::Output<E>)
+    pub(crate) fn insert<E>(&self, allocation: Ptr::Output<E>)
     where
         Ptr::Output<E>: CacheStorage<Output<u8> = Ptr>,
     {
@@ -316,7 +318,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
 
         let allocation = CacheWrapper::from_storage(allocation);
         let layout = Layout::new::<E>();
-        let num_bytes = len * layout.size();
+        let num_bytes = allocation.size();
         let key = AllocationKey {
             num_bytes,
             size: layout.size(),
@@ -343,6 +345,10 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
         write!(self.drop_queue).clear();
         *write!(self.size) = 0;
     }
+
+    fn clear_check(&self) {
+        self.set_max_size(0);
+    }
 }
 
 #[cfg(test)]
@@ -372,18 +378,18 @@ mod test {
     fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() {
         let cache: TensorCache<Vec<u8>> = Default::default();
         cache.enable(1000);
-        cache.insert::<f32>(1, vec![0.0]);
-        cache.insert::<f32>(1, vec![1.0]);
-        cache.insert::<f32>(1, vec![2.0]);
-        cache.insert::<f32>(2, vec![3.0; 2]);
-        cache.insert::<f32>(2, vec![4.0; 2]);
-        cache.insert::<f32>(2, vec![5.0; 2]);
-        cache.insert::<f64>(1, vec![6.0]);
-        cache.insert::<f64>(1, vec![7.0]);
-        cache.insert::<f64>(1, vec![8.0]);
-        cache.insert::<f64>(2, vec![9.0; 2]);
-        cache.insert::<f64>(2, vec![10.0; 2]);
-        cache.insert::<f64>(2, vec![11.0; 2]);
+        cache.insert::<f32>(vec![0.0]);
+        cache.insert::<f32>(vec![1.0]);
+        cache.insert::<f32>(vec![2.0]);
+        cache.insert::<f32>(vec![3.0; 2]);
+        cache.insert::<f32>(vec![4.0; 2]);
+        cache.insert::<f32>(vec![5.0; 2]);
+        cache.insert::<f64>(vec![6.0]);
+        cache.insert::<f64>(vec![7.0]);
+        cache.insert::<f64>(vec![8.0]);
+        cache.insert::<f64>(vec![9.0; 2]);
+        cache.insert::<f64>(vec![10.0; 2]);
+        cache.insert::<f64>(vec![11.0; 2]);
         assert_eq!(cache.try_pop::<f32>(1), Some(vec![2.0]));
         assert_eq!(cache.try_pop::<f32>(1), Some(vec![1.0]));
         assert_eq!(cache.try_pop::<f32>(1), Some(vec![0.0]));
@@ -400,5 +406,62 @@ mod test {
         assert_eq!(cache.try_pop::<f64>(2), Some(vec![10.0; 2]));
         assert_eq!(cache.try_pop::<f64>(2), Some(vec![9.0; 2]));
         assert_eq!(cache.try_pop::<f64>(2), None);
+        cache.clear_check();
+    }
+
+    #[test]
+    fn test_shrink() {
+        let cache: TensorCache<Vec<u8>> = Default::default();
+        cache.enable(16);
+        cache.insert::<u8>(vec![1; 1]);
+        cache.insert::<u8>(vec![2; 1]);
+        cache.insert::<u8>(vec![1; 2]);
+        cache.insert::<u8>(vec![1; 4]);
+        cache.insert::<u8>(vec![1; 8]);
+        assert_eq!(cache.len(), 5);
+        assert_eq!(cache.size(), 16);
+        cache.insert::<u8>(vec![2; 8]);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.size(), 16);
+        cache.insert::<u8>(vec![3; 1]);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.size(), 9);
+        cache.insert::<u8>(vec![1; 12]);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.size(), 13);
+        cache.clear_check();
+    }
+
+
+    #[test]
+    fn test_pop_and_shrink() {
+        let cache: TensorCache<Vec<u8>> = Default::default();
+        cache.enable(16);
+        cache.insert::<u8>(vec![1; 1]);
+        cache.insert::<u8>(vec![2; 1]);
+        cache.insert::<u8>(vec![1; 2]);
+        cache.insert::<u8>(vec![1; 4]);
+        cache.insert::<u8>(vec![1; 8]);
+        assert_eq!(cache.len(), 5);
+        assert_eq!(cache.size(), 16);
+        
+        assert_eq!(cache.try_pop::<u8>(1), Some(vec![2]));
+        assert_eq!(cache.try_pop::<u8>(2), Some(vec![1; 2]));
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.size(), 13);
+        
+        cache.insert::<u8>(vec![2; 8]);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.size(), 16);
+        
+        assert_eq!(cache.try_pop::<u8>(8), Some(vec![2; 8]));
+        assert_eq!(cache.len(), 1);
+        assert_eq!(cache.size(), 8);
+        
+        cache.insert::<u8>(vec![2; 4]);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.size(), 12);
+        
+        cache.clear_check();
     }
 }
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 501abcd7c..f8f2e74e5 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -26,7 +26,7 @@ impl<T> CacheStorage for Vec<T> {
 
     fn size(&self) -> usize {
         // size in bytes of the underlying allocation
-        Layout::array::<T>(self.capacity()).unwrap().size()
+        Layout::array::<T>(self.len()).unwrap().size()
     }
 
     /// Unsafely converts the elements of a vector to a new type.
@@ -168,7 +168,7 @@ impl<E: Clone> Clone for CachableVec<E> {
 impl<E> Drop for CachableVec<E> {
     fn drop(&mut self) {
         let data = std::mem::take(&mut self.data);
-        self.cache.insert::<E>(data.len(), data);
+        self.cache.insert::<E>(data);
     }
 }
 
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index ed635b4fb..28e25bb7b 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -260,7 +260,7 @@ impl<E> Drop for CachableCudaSlice<E> {
         // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional
         // memory, but will give us ownership of the actual data.
         let data = std::mem::replace(&mut self.data, dev.null().unwrap());
-        self.cache.insert::<E>(data.len(), data);
+        self.cache.insert::<E>(data);
     }
 }
 

From 7476b808aaefd8dcd4c45bfbb9a5a5ad45be0f26 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Wed, 19 Apr 2023 15:45:41 -0500
Subject: [PATCH 13/22] run cargo fmt

---
 src/tensor/cache.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 2648ebc85..217741653 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -432,7 +432,6 @@ mod test {
         cache.clear_check();
     }
 
-
     #[test]
     fn test_pop_and_shrink() {
         let cache: TensorCache<Vec<u8>> = Default::default();
@@ -444,24 +443,24 @@ mod test {
         cache.insert::<u8>(vec![1; 8]);
         assert_eq!(cache.len(), 5);
         assert_eq!(cache.size(), 16);
-        
+
         assert_eq!(cache.try_pop::<u8>(1), Some(vec![2]));
         assert_eq!(cache.try_pop::<u8>(2), Some(vec![1; 2]));
         assert_eq!(cache.len(), 3);
         assert_eq!(cache.size(), 13);
-        
+
         cache.insert::<u8>(vec![2; 8]);
         assert_eq!(cache.len(), 2);
         assert_eq!(cache.size(), 16);
-        
+
         assert_eq!(cache.try_pop::<u8>(8), Some(vec![2; 8]));
         assert_eq!(cache.len(), 1);
         assert_eq!(cache.size(), 8);
-        
+
         cache.insert::<u8>(vec![2; 4]);
         assert_eq!(cache.len(), 2);
         assert_eq!(cache.size(), 12);
-        
+
         cache.clear_check();
     }
 }

From 80c20e789968761ae79a00703b4abfa263040f2d Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Wed, 19 Apr 2023 16:00:34 -0500
Subject: [PATCH 14/22] fix clippy error

---
 src/tensor/cache.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 217741653..cbe8df20c 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -346,6 +346,7 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
         *write!(self.size) = 0;
     }
 
+    #[allow(unused)]
     fn clear_check(&self) {
         self.set_max_size(0);
     }

From 3d443ba0da7f89b0482b54f6820d2699f7e4e889 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Wed, 19 Apr 2023 21:03:39 -0500
Subject: [PATCH 15/22] remove debug print statements

---
 src/tensor/cache.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index cbe8df20c..46262524a 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -283,7 +283,6 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             size: layout.size(),
             alignment: layout.align(),
         };
-        println!("{key:?}");
         // Check if there is a cached allocation.
         let reuse = read!(self.allocations)
             .get(&key)
@@ -299,7 +298,6 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
             // which is also maintained by the block directly below.
             let allocation = items.allocations.pop().unwrap();
             allocation.check_key(&key);
-            println!("{:?} {}", allocation.size, allocation.alignment);
             *write!(self.size) -= allocation.size();
             Some(allocation.into_storage())
         } else {

From 9761e416927ff45f696fcae60e9ca1353b603efd Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Fri, 21 Apr 2023 11:35:08 -0500
Subject: [PATCH 16/22] improve documentation

---
 src/tensor/cache.rs | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index 46262524a..aa91183e6 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -62,7 +62,7 @@ struct AllocationKey {
 struct AllocationGroup<Ptr: CacheStorage> {
     // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to
     // "remove" the next instance of the matching AllocationKey in the drop_queue, without having
-    // to run an O(n)
+    // to run an O(n) operation to actually remove the key.
     ignore_drops: usize,
     allocations: Vec<CacheWrapper<Ptr>>,
 }
@@ -79,9 +79,10 @@ struct AllocationGroup<Ptr: CacheStorage> {
 /// valid allocation. When the last value is removed from the list, the key
 /// is removed.
 ///
-/// Constraint: for a given value of AllocationKey, the following must hold:
+/// Constraint: for a given value of AllocationKey, the following must hold for each key value in
+/// `allocations`:
 ///
-/// (instances in drop_queue) = (group.ignore_drops) + (group.allocations.len())
+/// (instances of key in drop_queue) = allocations[key].ignore_drops + allocations[key].allocations.len()
 #[derive(Debug)]
 pub(crate) struct TensorCache<Ptr: CacheStorage> {
     allocations: RwLock<BTreeMap<AllocationKey, AllocationGroup<Ptr>>>,
@@ -103,6 +104,7 @@ pub(crate) trait CacheStorage: Sized {
     unsafe fn transmute_elements<T>(self) -> Self::Output<T>;
 
     /// Uses transmute_elements to convert to an element type with alignment `align` before dropping.
+    /// This **must** be a memory safe way to drop self, given the correct alignment
     unsafe fn drop_with_alignment(self, align: usize) {
         match align {
             1 => drop(self.transmute_elements::<u8>()),
@@ -126,17 +128,24 @@ struct CacheWrapper<Ptr: CacheStorage> {
 impl<Ptr: CacheStorage> Drop for CacheWrapper<Ptr> {
     fn drop(&mut self) {
         if let Some(ptr) = std::mem::take(&mut self.ptr) {
+            // Safety: This operation is memory safe because ptr is guaranteed to have elements
+            // with the correct alignment before being dropped. This is ensured by the CacheWrapper
+            // being constructed with from_storage.
             unsafe { ptr.drop_with_alignment(self.alignment) }
         }
     }
 }
 
 impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
+    /// Safety: Storage must be valid to drop, and Ptr::transmute_elements must produce a storage
+    /// that is valid to drop after being converted to an element type with the same alignment as T
     fn from_storage<T>(storage: Ptr::Output<T>) -> Self
     where
         Ptr::Output<T>: CacheStorage<Output<u8> = Ptr>,
     {
         let layout = Layout::new::<T>();
+        // Safety: Ptr must be converted to an element typw with the correct alignment before
+        // it is dropped. Ptr might not be valid data after this operation
         Self {
             ptr: Some(unsafe { storage.transmute_elements::<u8>() }),
             alignment: layout.align(),
@@ -160,14 +169,17 @@ impl<Ptr: CacheStorage> CacheWrapper<Ptr> {
         self.ptr.as_ref().unwrap().size()
     }
 
-    // Safety: Same as slice.align_to, but considered safe internally
-    // Produces storage containing uninitialized values
+    /// Safety: Same as slice.align_to, but considered safe internally
+    /// Produces storage containing uninitialized values
     fn into_storage<T>(mut self) -> Ptr::Output<T> {
         let layout = Layout::new::<T>();
         assert_eq!(layout.align(), self.alignment);
         assert_eq!(layout.size(), self.size);
 
         let ptr = std::mem::take(&mut self.ptr).unwrap();
+
+        // Safety: This will always construct a storage with correct alignment and element size if
+        // this CacheWrapper was constructed with from_storage.
         unsafe { ptr.transmute_elements() }
     }
 }

From aef3be90301ce5978ae9c4dcdab6269049c2d1da Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Fri, 21 Apr 2023 11:58:22 -0500
Subject: [PATCH 17/22] add 'in bytes' to enable_cache documentation

---
 src/tensor/storage_traits.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs
index 997ca1bff..f4be94eca 100644
--- a/src/tensor/storage_traits.rs
+++ b/src/tensor/storage_traits.rs
@@ -43,12 +43,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     /// Blocks until all work on device to complete. Useful for benchmarking.
     fn try_synchronize(&self) -> Result<(), Self::Err>;
 
-    /// Enables the cache of the device, and sets the maximum size to `size`.
+    /// Enables the cache of the device, and sets the maximum size in bytes to `size`.
     fn enable_cache(&self, size: usize) {
         self.try_enable_cache(size).unwrap()
     }
 
-    /// Tries to enable the cache of the device, and sets the maximum size to `size`.
+    /// Tries to enable the cache of the device, and sets the maximum size in bytes to `size`.
     fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>;
 
     /// Disables the cache of the device. This will also empty the cache

From 59ed3b159ce8068abd573f8f55532644b351b1c6 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sat, 22 Apr 2023 15:30:47 -0500
Subject: [PATCH 18/22] add CacheSize enum

---
 src/tensor/cpu/device.rs     |  8 ++++----
 src/tensor/cpu/mod.rs        | 10 +++++-----
 src/tensor/cuda/device.rs    |  9 +++++----
 src/tensor/cuda/mod.rs       | 10 +++++-----
 src/tensor/storage_traits.rs | 36 ++++++++++++++++++++++++++++++++----
 5 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index f8f2e74e5..7440099c6 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -220,8 +220,8 @@ impl DeviceStorage for Cpu {
         Ok(())
     }
 
-    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> {
-        self.cache.enable(size);
+    fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err> {
+        self.cache.enable(size.to_num_bytes());
         Ok(())
     }
 
@@ -235,8 +235,8 @@ impl DeviceStorage for Cpu {
         Ok(())
     }
 
-    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> {
-        self.cache.set_max_size(size);
+    fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err> {
+        self.cache.set_max_size(size.to_num_bytes());
         Ok(())
     }
 }
diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs
index 79c3437a6..360c9ae55 100644
--- a/src/tensor/cpu/mod.rs
+++ b/src/tensor/cpu/mod.rs
@@ -12,12 +12,12 @@ pub use device::{Cpu, CpuError};
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{shapes::*, tensor::*};
+    use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize};
 
     #[test]
     fn test_empty_cache() {
         let dev: Cpu = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -28,7 +28,7 @@ mod tests {
     #[test]
     fn test_disabling_cache_empties_it() {
         let dev: Cpu = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -39,7 +39,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_new_tensor() {
         let dev: Cpu = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         assert_eq!(dev.cache.len(), 0);
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         assert_eq!(dev.cache.len(), 0);
@@ -57,7 +57,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_clone_tensor() {
         let dev: Cpu = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let a: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let b: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(b); // insert allocation into cache
diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs
index 28e25bb7b..acc393a67 100644
--- a/src/tensor/cuda/device.rs
+++ b/src/tensor/cuda/device.rs
@@ -1,3 +1,4 @@
+use crate::prelude::storage_traits::CacheSize;
 use crate::shapes::{Shape, Unit};
 use crate::tensor::cpu::{Cpu, CpuError};
 use crate::tensor::{
@@ -308,8 +309,8 @@ impl DeviceStorage for Cuda {
         self.dev.synchronize().map_err(CudaError::from)
     }
 
-    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> {
-        self.cache.enable(size);
+    fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err> {
+        self.cache.enable(size.to_num_bytes());
         Ok(())
     }
 
@@ -323,8 +324,8 @@ impl DeviceStorage for Cuda {
         Ok(())
     }
 
-    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> {
-        self.cache.set_max_size(size);
+    fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err> {
+        self.cache.set_max_size(size.to_num_bytes());
         Ok(())
     }
 }
diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs
index 0f8a44302..f8a4f6fb6 100644
--- a/src/tensor/cuda/mod.rs
+++ b/src/tensor/cuda/mod.rs
@@ -15,13 +15,13 @@ pub(crate) fn launch_cfg<const NUM_THREADS: u32>(n: u32) -> cudarc::driver::Laun
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{shapes::*, tensor::*};
+    use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize};
     use cudarc::driver::DevicePtr;
 
     #[test]
     fn test_empty_cache() {
         let dev: Cuda = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -32,7 +32,7 @@ mod tests {
     #[test]
     fn test_disabling_cache_empties_it() {
         let dev: Cuda = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(tensor); // insert allocation into cache
         assert_eq!(dev.cache.len(), 1);
@@ -43,7 +43,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_new_tensor() {
         let dev: Cuda = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let tensor: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let ptr = *tensor.data.device_ptr();
         drop(tensor); // insert allocation into cache
@@ -59,7 +59,7 @@ mod tests {
     #[test]
     fn test_reuse_allocation_on_clone_tensor() {
         let dev: Cuda = Default::default();
-        dev.enable_cache(1000);
+        dev.enable_cache(CacheSize::KB(1));
         let a: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         let b: Tensor<Rank2<2, 3>, f32, _> = dev.zeros();
         drop(b); // insert allocation into cache
diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs
index f4be94eca..3764b968f 100644
--- a/src/tensor/storage_traits.rs
+++ b/src/tensor/storage_traits.rs
@@ -16,6 +16,34 @@ pub trait AsVec<E> {
     fn as_vec(&self) -> std::vec::Vec<E>;
 }
 
+/// Expresses the size of the cache in human readable units.
+#[derive(Debug, Clone, Copy)]
+pub enum CacheSize {
+    /// Bytes 
+    Bytes(usize),
+    /// Kilobytes (10 ^ 3 bytes)
+    KB(usize),
+    /// Megabytes (10 ^ 6 bytes)
+    MB(usize),
+    /// Gigabytes (10 ^ 9 bytes)
+    GB(usize),
+    /// Terabytes (10 ^ 12 bytes)
+    TB(usize),
+}
+
+impl CacheSize {
+    /// Returns the number of bytes this CacheSize represents
+    pub fn to_num_bytes(self) -> usize {
+        match self {
+            CacheSize::Bytes(b) => b,
+            CacheSize::KB(b) => b * 1e3 as usize,
+            CacheSize::MB(b) => b * 1e6 as usize,
+            CacheSize::GB(b) => b * 1e9 as usize,
+            CacheSize::TB(b) => b * 1e12 as usize,
+        }
+    }
+}
+
 /// Something that can store nd arrays for a given [Shape] and [Dtype]
 pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     /// Generic storage type
@@ -44,12 +72,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     fn try_synchronize(&self) -> Result<(), Self::Err>;
 
     /// Enables the cache of the device, and sets the maximum size in bytes to `size`.
-    fn enable_cache(&self, size: usize) {
+    fn enable_cache(&self, size: CacheSize) {
         self.try_enable_cache(size).unwrap()
     }
 
     /// Tries to enable the cache of the device, and sets the maximum size in bytes to `size`.
-    fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>;
+    fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err>;
 
     /// Disables the cache of the device. This will also empty the cache
     /// if there are things in it. See [DeviceStorage::empty_cache] for
@@ -80,12 +108,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr {
     fn try_empty_cache(&self) -> Result<(), Self::Err>;
 
     /// Sets the maximum size of the cache in bytes, and shrinks the cache until it smaller than `size`.
-    fn set_cache_size(&self, size: usize) {
+    fn set_cache_size(&self, size: CacheSize) {
         self.try_set_cache_size(size).unwrap()
     }
 
     /// Fallible version of [DeviceStorage::set_cache_size]
-    fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err>;
+    fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err>;
 }
 
 /// Internal trait - Represents something that can allocate its own gradient.

From 902529e282536cb2e7e3811fc5449b8a3ea1c40f Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Sat, 22 Apr 2023 15:33:47 -0500
Subject: [PATCH 19/22] run cargo fmt

---
 src/tensor/cpu/mod.rs        | 2 +-
 src/tensor/cuda/mod.rs       | 2 +-
 src/tensor/storage_traits.rs | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs
index 360c9ae55..5a5a5e392 100644
--- a/src/tensor/cpu/mod.rs
+++ b/src/tensor/cpu/mod.rs
@@ -12,7 +12,7 @@ pub use device::{Cpu, CpuError};
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize};
+    use crate::{prelude::storage_traits::CacheSize, shapes::*, tensor::*};
 
     #[test]
     fn test_empty_cache() {
diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs
index f8a4f6fb6..b3d095f12 100644
--- a/src/tensor/cuda/mod.rs
+++ b/src/tensor/cuda/mod.rs
@@ -15,7 +15,7 @@ pub(crate) fn launch_cfg<const NUM_THREADS: u32>(n: u32) -> cudarc::driver::Laun
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize};
+    use crate::{prelude::storage_traits::CacheSize, shapes::*, tensor::*};
     use cudarc::driver::DevicePtr;
 
     #[test]
diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs
index 3764b968f..acf24a5ac 100644
--- a/src/tensor/storage_traits.rs
+++ b/src/tensor/storage_traits.rs
@@ -19,7 +19,7 @@ pub trait AsVec<E> {
 /// Expresses the size of the cache in human readable units.
 #[derive(Debug, Clone, Copy)]
 pub enum CacheSize {
-    /// Bytes 
+    /// Bytes
     Bytes(usize),
     /// Kilobytes (10 ^ 3 bytes)
     KB(usize),

From a2a565d982d27ea4ed230a6289ad6ac43eda65ce Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Mon, 24 Apr 2023 11:03:09 -0500
Subject: [PATCH 20/22] further clarify comment; remove align to 16 check

---
 src/tensor/cache.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index aa91183e6..f8169d29d 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -62,7 +62,7 @@ struct AllocationKey {
 struct AllocationGroup<Ptr: CacheStorage> {
     // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to
     // "remove" the next instance of the matching AllocationKey in the drop_queue, without having
-    // to run an O(n) operation to actually remove the key.
+    // to run an O(n) operation to actually remove the key from the queue.
     ignore_drops: usize,
     allocations: Vec<CacheWrapper<Ptr>>,
 }
@@ -111,7 +111,6 @@ pub(crate) trait CacheStorage: Sized {
             2 => drop(self.transmute_elements::<u16>()),
             4 => drop(self.transmute_elements::<u32>()),
             8 => drop(self.transmute_elements::<u64>()),
-            16 => drop(self.transmute_elements::<u128>()),
             _ => panic!("Invalid alignment"),
         }
     }

From 58518e25e153105c30753b6678e1989a5a43e149 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Mon, 24 Apr 2023 11:27:22 -0500
Subject: [PATCH 21/22] add check to minimize RwLock operations in shrink;
 export CacheSize

---
 src/tensor/cache.rs | 5 +++++
 src/tensor/mod.rs   | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs
index f8169d29d..dc5e3d9bd 100644
--- a/src/tensor/cache.rs
+++ b/src/tensor/cache.rs
@@ -247,6 +247,11 @@ impl<Ptr: CacheStorage> TensorCache<Ptr> {
     fn shrink(&self) {
         let mut size = write!(self.size);
         let max_size = read!(self.max_size);
+
+        if *size < *max_size {
+            return;
+        }
+
         let mut drop_queue = write!(self.drop_queue);
         let mut allocations = write!(self.allocations);
 
diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs
index e9eb2f9f3..da19e00e3 100644
--- a/src/tensor/mod.rs
+++ b/src/tensor/mod.rs
@@ -169,7 +169,7 @@ pub use cuda::{Cuda, CudaError};
 pub type AutoDevice = Cuda;
 
 pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec};
-pub use storage_traits::{DeviceStorage, HasErr};
+pub use storage_traits::{DeviceStorage, HasErr, CacheSize};
 pub use storage_traits::{OnesTensor, SampleTensor, TriangleTensor, ZerosTensor};
 
 pub use tensor_impls::{PutTape, SplitTape, Tensor, Trace, WithEmptyTape};

From 5a03b08187fcd41be8daeffb7d4097026a509961 Mon Sep 17 00:00:00 2001
From: Nathan Koppel <nathankoppel0@gmail.com>
Date: Mon, 24 Apr 2023 12:00:31 -0500
Subject: [PATCH 22/22] run cargo fmt

---
 src/tensor/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs
index da19e00e3..55e1f43f3 100644
--- a/src/tensor/mod.rs
+++ b/src/tensor/mod.rs
@@ -169,7 +169,7 @@ pub use cuda::{Cuda, CudaError};
 pub type AutoDevice = Cuda;
 
 pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec};
-pub use storage_traits::{DeviceStorage, HasErr, CacheSize};
+pub use storage_traits::{CacheSize, DeviceStorage, HasErr};
 pub use storage_traits::{OnesTensor, SampleTensor, TriangleTensor, ZerosTensor};
 
 pub use tensor_impls::{PutTape, SplitTape, Tensor, Trace, WithEmptyTape};