From 935810fcc59efb9761c5c2b5de5b6afa2beb0ed5 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 15 Apr 2023 22:46:59 -0500 Subject: [PATCH 01/22] improve TensorCache api; move buffer conversion logic to CacheStorage --- src/tensor/cache.rs | 121 ++++++++++++++++++++++++------------- src/tensor/cpu/allocate.rs | 11 +--- src/tensor/cpu/device.rs | 114 ++++++++++++++++------------------ src/tensor/cuda/device.rs | 74 ++++++++++++----------- 4 files changed, 170 insertions(+), 150 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index e785cb647..389706777 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -33,12 +33,37 @@ pub(crate) struct AllocationKey { /// valid allocation. When the last value is removed from the list, the key /// is removed. #[derive(Debug)] -pub(crate) struct TensorCache { +pub(crate) struct TensorCache { pub(crate) allocations: RwLock>>, pub(crate) enabled: RwLock, } -impl Default for TensorCache { +pub(crate) trait CacheStorage: Sized { + type Output: CacheStorage; + + /// Unsafely converts the elements of a contiguous collection type to another type. Note: + /// **This function is wildly unsafe**, see implementations for details + unsafe fn transmute_elements(self) -> Self::Output; + + /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. + unsafe fn drop_with_alignment(self, align: usize) { + match align { + 1 => drop(self.transmute_elements::()), + 2 => drop(self.transmute_elements::()), + 4 => drop(self.transmute_elements::()), + 8 => drop(self.transmute_elements::()), + 16 => drop(self.transmute_elements::()), + _ => panic!("Invalid alignment") + } + } + + /// calls [CacheStorage::drop_with_alignment], extracting `align` from an [AllocationKey] + unsafe fn drop_with_key(self, key: AllocationKey) { + self.drop_with_alignment(key.alignment) + } +} + +impl Default for TensorCache { fn default() -> Self { Self { allocations: Default::default(), @@ -47,7 +72,7 @@ impl Default for TensorCache { } } -impl TensorCache { +impl TensorCache { /// Returns the number of allocations in the cache. #[allow(unused)] pub(crate) fn len(&self) -> usize { @@ -102,7 +127,7 @@ impl TensorCache { /// Returns a cached allocation if one exists. /// Otherwise, returns `None`. - pub(crate) fn try_pop(&self, len: usize) -> Option { + pub(crate) fn try_pop(&self, len: usize) -> Option> { if !self.is_enabled() { return None; } @@ -142,21 +167,23 @@ impl TensorCache { if items.is_empty() { cache.remove(&key); } - Some(allocation) + Some(unsafe { allocation.transmute_elements() }) } else { None } } /// Inserts an allocation into the cache. - pub(crate) fn insert(&self, len: usize, allocation: Ptr) { + pub(crate) fn insert(&self, len: usize, allocation: Ptr::Output) + where Ptr::Output: CacheStorage = Ptr> + { if !self.is_enabled() { - // This is a panic because it's a bug in the library. - panic!("Tried to insert into a disabled cache."); + return; } + let allocation = unsafe { allocation.transmute_elements::() }; let layout = Layout::new::(); - let num_bytes = len * std::mem::size_of::(); + let num_bytes = len * layout.size(); let key = AllocationKey { num_bytes, size: layout.size(), @@ -181,22 +208,30 @@ impl TensorCache { cache.get_mut(&key).unwrap().push(allocation); } } + + pub(crate) fn clear(&self) { + #[cfg(not(feature = "no-std"))] + let mut cache = self.allocations.write().unwrap(); + #[cfg(feature = "no-std")] + let mut cache = self.allocations.write(); + for (&key, allocations) in cache.iter_mut() { + assert!(key.num_bytes % key.size == 0); + assert!(key.num_bytes < isize::MAX as usize); + for alloc in allocations.drain(..) { + unsafe { alloc.drop_with_key(key) }; + } + } + cache.clear(); + } } #[cfg(test)] mod test { use super::*; - #[test] - #[should_panic(expected = "Tried to insert into a disabled cache.")] - fn test_insert_on_disabled_cache() { - let cache: TensorCache = Default::default(); - cache.insert::(1, 0); - } - #[test] fn test_try_pop_on_disabled_cache() { - let cache: TensorCache = Default::default(); + let cache: TensorCache> = Default::default(); cache.enable(); assert!(cache.is_enabled()); cache.disable(); @@ -207,7 +242,7 @@ mod test { #[test] fn test_try_pop_on_empty_cache() { - let cache: TensorCache = Default::default(); + let cache: TensorCache> = Default::default(); cache.enable(); assert_eq!(cache.try_pop::(1), None); assert_eq!(cache.try_pop::(1), None); @@ -215,35 +250,35 @@ mod test { #[test] fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() { - let cache: TensorCache = Default::default(); + let cache: TensorCache> = Default::default(); cache.enable(); - cache.insert::(1, 0); - cache.insert::(1, 1); - cache.insert::(1, 2); - cache.insert::(2, 3); - cache.insert::(2, 4); - cache.insert::(2, 5); - cache.insert::(1, 6); - cache.insert::(1, 7); - cache.insert::(1, 8); - cache.insert::(2, 9); - cache.insert::(2, 10); - cache.insert::(2, 11); - assert_eq!(cache.try_pop::(1), Some(2)); - assert_eq!(cache.try_pop::(1), Some(1)); - assert_eq!(cache.try_pop::(1), Some(0)); + cache.insert::(1, vec![0.0]); + cache.insert::(1, vec![1.0]); + cache.insert::(1, vec![2.0]); + cache.insert::(2, vec![3.0; 2]); + cache.insert::(2, vec![4.0; 2]); + cache.insert::(2, vec![5.0; 2]); + cache.insert::(1, vec![6.0]); + cache.insert::(1, vec![7.0]); + cache.insert::(1, vec![8.0]); + cache.insert::(2, vec![9.0; 2]); + cache.insert::(2, vec![10.0; 2]); + cache.insert::(2, vec![11.0; 2]); + assert_eq!(cache.try_pop::(1), Some(vec![2.0])); + assert_eq!(cache.try_pop::(1), Some(vec![1.0])); + assert_eq!(cache.try_pop::(1), Some(vec![0.0])); assert_eq!(cache.try_pop::(1), None); - assert_eq!(cache.try_pop::(2), Some(5)); - assert_eq!(cache.try_pop::(2), Some(4)); - assert_eq!(cache.try_pop::(2), Some(3)); + assert_eq!(cache.try_pop::(2), Some(vec![5.0; 2])); + assert_eq!(cache.try_pop::(2), Some(vec![4.0; 2])); + assert_eq!(cache.try_pop::(2), Some(vec![3.0; 2])); assert_eq!(cache.try_pop::(2), None); - assert_eq!(cache.try_pop::(1), Some(8)); - assert_eq!(cache.try_pop::(1), Some(7)); - assert_eq!(cache.try_pop::(1), Some(6)); + assert_eq!(cache.try_pop::(1), Some(vec![8.0])); + assert_eq!(cache.try_pop::(1), Some(vec![7.0])); + assert_eq!(cache.try_pop::(1), Some(vec![6.0])); assert_eq!(cache.try_pop::(1), None); - assert_eq!(cache.try_pop::(2), Some(11)); - assert_eq!(cache.try_pop::(2), Some(10)); - assert_eq!(cache.try_pop::(2), Some(9)); + assert_eq!(cache.try_pop::(2), Some(vec![11.0; 2])); + assert_eq!(cache.try_pop::(2), Some(vec![10.0; 2])); + assert_eq!(cache.try_pop::(2), Some(vec![9.0; 2])); assert_eq!(cache.try_pop::(2), None); } } diff --git a/src/tensor/cpu/allocate.rs b/src/tensor/cpu/allocate.rs index e9ac9d307..3a2a742cb 100644 --- a/src/tensor/cpu/allocate.rs +++ b/src/tensor/cpu/allocate.rs @@ -35,16 +35,7 @@ impl Cpu { data.resize(numel, elem); Ok(data) }, - |allocation| { - // SAFETY: - // - ✅ "ptr must have been allocated using the global allocator, such as via the alloc::alloc function." - // - ✅ handled by tensor cache "T needs to have the same alignment as what ptr was allocated with." - // - ✅ handled by tensor cache "The size of T times the capacity needs to be the same size as the pointer was allocated with." - // - ✅ "length needs to be less than or equal to capacity." - // - ✅ all the dtypes for this are builtin numbers "The first length values must be properly initialized values of type T." - // - ✅ "capacity needs to be the capacity that the pointer was allocated with." - // - ✅ "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset." - let mut data = unsafe { Vec::from_raw_parts(allocation.0 as *mut E, numel, numel) }; + |mut data| { data.fill(elem); Ok(data) }, diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index 45907c232..65b434274 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -1,6 +1,7 @@ use crate::shapes::{Shape, Unit}; -use crate::tensor::{cache::TensorCache, cpu::LendingIterator, storage_traits::*, Tensor}; +use crate::tensor::{cache::{TensorCache, CacheStorage}, cpu::LendingIterator, storage_traits::*, Tensor}; use rand::{rngs::StdRng, Rng, SeedableRng}; +use core::alloc::Layout; use std::{sync::Arc, vec::Vec}; #[cfg(feature = "no-std")] @@ -15,6 +16,49 @@ pub(crate) struct BytesPtr(pub(crate) *mut u8); unsafe impl Send for BytesPtr {} unsafe impl Sync for BytesPtr {} +impl CacheStorage for Vec { + type Output = Vec; + + /// Unsafely converts the elements of a vector to a new type. + /// + /// # Safety + /// + /// * Has all of the potential pitfalls of slice.align_to + /// * If converting to a type with a different alignment, the caller must convert back to a + /// type with the same alignment before dropping + unsafe fn transmute_elements(mut self) -> Self::Output { + let src_layout = Layout::new::().pad_to_align(); + let dst_layout = Layout::new::().pad_to_align(); + + let byte_len = self.len() * src_layout.size(); + let byte_capacity = self.capacity() * src_layout.size(); + let ptr = self.as_mut_ptr(); + std::mem::forget(self); + + let dst_size = dst_layout.size(); + + assert_eq!(ptr.align_offset(dst_layout.align()), 0, "Allocation is improperly aligned"); + assert!(byte_len % dst_size == 0 && byte_capacity % dst_size == 0, "Allocation is improperly sized"); + + let len = byte_len / dst_size; + let capacity = byte_capacity / dst_size; + + // Safety: + // * T2 may not have the same alignment as the initial vector, it is the caller's + // responsiblity to ensure that the vector is converted to a type with the correct + // alignment before dropping + // * The first len values may not be correctly initialized, it is the caller's + // responsibility to ensure correct values before usage + // + // * ptr is allocated with the global allocator as long as self was + // * length is less than or equal to capacity as long as this is true of self. + // * capacity is the capacity the pointer was allocated with as long as this is true of + // self + // * The allocated size is less than isize::MAX as long as this is true of self + Vec::from_raw_parts(ptr as *mut T2, len, capacity) + } +} + /// A device that stores data on the heap. /// /// The [Default] impl seeds the underlying rng with seed of 0. @@ -25,7 +69,7 @@ pub struct Cpu { /// A thread safe random number generator. pub(crate) rng: Arc>, /// A thread safe cache of memory allocations that can be reused. - pub(crate) cache: Arc>, + pub(crate) cache: Arc>>, } impl Default for Cpu { @@ -78,7 +122,7 @@ pub struct CachableVec { /// The data stored in this vector. pub(crate) data: Vec, /// A cache of memory allocations that can be reused. - pub(crate) cache: Arc>, + pub(crate) cache: Arc>>, } impl Clone for CachableVec { @@ -89,17 +133,7 @@ impl Clone for CachableVec { data: self.data.clone(), cache: self.cache.clone(), }, - |allocation| { - assert!(numel < isize::MAX as usize); - // SAFETY: - // - ✅ "ptr must have been allocated using the global allocator, such as via the alloc::alloc function." - // - ✅ handled by tensor cache "T needs to have the same alignment as what ptr was allocated with." - // - ✅ handled by tensor cache "The size of T times the capacity needs to be the same size as the pointer was allocated with." - // - ✅ "length needs to be less than or equal to capacity." - // - ✅ all the dtypes for this are builtin numbers "The first length values must be properly initialized values of type T." - // - ✅ "capacity needs to be the capacity that the pointer was allocated with." - // - ✅ "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset." - let mut data = unsafe { Vec::from_raw_parts(allocation.0 as *mut E, numel, numel) }; + |mut data| { data.clone_from(&self.data); Self { data, @@ -112,16 +146,8 @@ impl Clone for CachableVec { impl Drop for CachableVec { fn drop(&mut self) { - if self.cache.is_enabled() { - let mut data = std::mem::take(&mut self.data); - data.shrink_to_fit(); - - let numel = data.len(); - let ptr = data.as_mut_ptr() as *mut u8; - std::mem::forget(data); - - self.cache.insert::(numel, BytesPtr(ptr)); - } + let data = std::mem::take(&mut self.data); + self.cache.insert::(data.len(), data); } } @@ -184,45 +210,7 @@ impl DeviceStorage for Cpu { } fn try_empty_cache(&self) -> Result<(), Self::Err> { - #[cfg(not(feature = "no-std"))] - let mut cache = self.cache.allocations.write().unwrap(); - #[cfg(feature = "no-std")] - let mut cache = self.cache.allocations.write(); - for (&key, allocations) in cache.iter_mut() { - assert!(key.num_bytes % key.size == 0); - assert!(key.num_bytes < isize::MAX as usize); - let len = key.num_bytes / key.size; - let cap = len; - for alloc in allocations.drain(..) { - // SAFETY: - // - "ptr must have been allocated using the global allocator, such as via the alloc::alloc function." - // - ✅ cpu uses global allocator - // - "T needs to have the same alignment as what ptr was allocated with." - // - ✅ we are matching on the alignment below - // - "The size of T times the capacity needs to be the same size as the pointer was allocated with." - // - ✅ covered by `key.num_bytes / key.size` and the `key.num_bytes % key.size == 0` assertion above - // - "length needs to be less than or equal to capacity." - // - ✅ they are equal - // - "The first length values must be properly initialized values of type T." - // - ✅ any bit pattern is valid for unsigned ints used below - // - "capacity needs to be the capacity that the pointer was allocated with." - // - ✅ handled by assertion above (key.num_bytes % key.size == 0) - // - "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset." - // - ✅ handled by assertion above - debug_assert_eq!(std::alloc::Layout::new::().align(), 1); - debug_assert_eq!(std::alloc::Layout::new::().align(), 2); - debug_assert_eq!(std::alloc::Layout::new::().align(), 4); - debug_assert_eq!(std::alloc::Layout::new::().align(), 8); - match key.alignment { - 1 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u8, len, cap)) }, - 2 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u16, len, cap)) }, - 4 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u32, len, cap)) }, - 8 => unsafe { drop(Vec::from_raw_parts(alloc.0 as *mut u64, len, cap)) }, - _ => unreachable!(), - }; - } - } - cache.clear(); + self.cache.clear(); Ok(()) } } diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index ae1807bdb..25f5d1fc0 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -1,19 +1,45 @@ use crate::shapes::{Shape, Unit}; use crate::tensor::cpu::{Cpu, CpuError}; -use crate::tensor::{cache::TensorCache, DeviceStorage, HasErr, NoneTape, Tensor}; +use crate::tensor::{cache::{TensorCache, CacheStorage}, DeviceStorage, HasErr, NoneTape, Tensor}; use cudarc::driver::{DevicePtr, DevicePtrMut, DeviceRepr}; use cudarc::{ cublas::{result::CublasError, CudaBlas}, - driver::sys::CUdeviceptr, driver::{CudaDevice, CudaSlice, CudaStream, DeviceSlice, DriverError}, }; +use core::alloc::Layout; use std::{ sync::{Arc, Mutex, MutexGuard}, vec::Vec, }; +impl CacheStorage for CudaSlice { + type Output = CudaSlice; + + /// Unsafely converts the elements of a CudaSlice to a new type. + /// + /// # Safety + /// + /// Assuming that `self` is a valid [CudaSlice], the main safety concern is that the output + /// slice should be assumed to be uninitialized + unsafe fn transmute_elements(self) -> Self::Output { + let dev = self.device(); + + let src_layout = Layout::new::().pad_to_align(); + let dst_layout = Layout::new::().pad_to_align(); + + let byte_len = self.len() * src_layout.size(); + let ptr = self.leak(); + + assert_eq!(byte_len % dst_layout.size(), 0, "Allocation is improperly sized"); + + let len = byte_len / dst_layout.size(); + + dev.upgrade_device_ptr(ptr, len) + } +} + /// A Cuda device that enables constructing tensors on GPUs /// & running GPU kernels. #[derive(Clone, Debug)] @@ -27,7 +53,7 @@ pub struct Cuda { /// A second stream for kernels to optionally execute on. pub(crate) par_stream: Arc, pub(crate) workspace: Arc>>, - pub(crate) cache: Arc>, + pub(crate) cache: Arc>>, } #[derive(Debug)] @@ -111,11 +137,10 @@ impl Cuda { &self, len: usize, ) -> Result, CudaError> { - let data = self.cache.try_pop::(len).map_or_else( + Ok(self.cache.try_pop::(len).map_or_else( || self.dev.alloc::(len), - |ptr| Ok(self.dev.upgrade_device_ptr(ptr, len)), - )?; - Ok(data) + Ok, + )?) } #[allow(unused)] pub(crate) unsafe fn get_workspace( @@ -152,7 +177,7 @@ pub struct CachableCudaSlice { /// The actual data. pub(crate) data: CudaSlice, /// A cache of device pointers that can be reused. - pub(crate) cache: Arc>, + pub(crate) cache: Arc>>, } impl Clone for CachableCudaSlice { @@ -161,11 +186,7 @@ impl Clone for CachableCudaSlice { let len = self.data.len(); let data = self.cache.try_pop::(len).map_or_else( || self.data.try_clone().unwrap(), - |ptr| { - // SAFETY: - // 1. we know that ptr is valid for `num_bytes` because it was registered for that. - // 2. we are about to set the memory with dtod_copy - let mut slice = unsafe { dev.upgrade_device_ptr(ptr, len) }; + |mut slice| { dev.dtod_copy(&self.data, &mut slice).unwrap(); slice }, @@ -224,16 +245,11 @@ impl std::ops::DerefMut for CachableCudaSlice { impl Drop for CachableCudaSlice { fn drop(&mut self) { - if self.cache.is_enabled() { - let dev = self.data.device(); - // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional - // memory, but will give us ownership of the actual data. - let data = std::mem::replace(&mut self.data, dev.null().unwrap()); - let numel = data.len(); - // Get access to the raw pointer without freeing it. - let ptr = data.leak(); - self.cache.insert::(numel, ptr); - } + let dev = self.data.device(); + // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional + // memory, but will give us ownership of the actual data. + let data = std::mem::replace(&mut self.data, dev.null().unwrap()); + self.cache.insert::(data.len(), data); } } @@ -292,17 +308,7 @@ impl DeviceStorage for Cuda { } fn try_empty_cache(&self) -> Result<(), Self::Err> { - #[cfg(not(feature = "no-std"))] - let mut cache = self.cache.allocations.write().unwrap(); - #[cfg(feature = "no-std")] - let mut cache = self.cache.allocations.write(); - for (&key, allocations) in cache.iter_mut() { - for alloc in allocations.drain(..) { - let data = unsafe { self.dev.upgrade_device_ptr::(alloc, key.num_bytes) }; - drop(data); - } - } - cache.clear(); + self.cache.clear(); Ok(()) } } From 0a2d7d9c83dc4524de0dd0beae5934c51dfc3ff8 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 15 Apr 2023 22:56:20 -0500 Subject: [PATCH 02/22] split an assert; run cargo fmt --- src/tensor/cache.rs | 7 ++++--- src/tensor/cpu/device.rs | 22 ++++++++++++++++++---- src/tensor/cuda/device.rs | 19 +++++++++++++------ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 389706777..a5abd3839 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -45,7 +45,7 @@ pub(crate) trait CacheStorage: Sized { /// **This function is wildly unsafe**, see implementations for details unsafe fn transmute_elements(self) -> Self::Output; - /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. + /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. unsafe fn drop_with_alignment(self, align: usize) { match align { 1 => drop(self.transmute_elements::()), @@ -53,7 +53,7 @@ pub(crate) trait CacheStorage: Sized { 4 => drop(self.transmute_elements::()), 8 => drop(self.transmute_elements::()), 16 => drop(self.transmute_elements::()), - _ => panic!("Invalid alignment") + _ => panic!("Invalid alignment"), } } @@ -175,7 +175,8 @@ impl TensorCache { /// Inserts an allocation into the cache. pub(crate) fn insert(&self, len: usize, allocation: Ptr::Output) - where Ptr::Output: CacheStorage = Ptr> + where + Ptr::Output: CacheStorage = Ptr>, { if !self.is_enabled() { return; diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index 65b434274..75c28a128 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -1,7 +1,12 @@ use crate::shapes::{Shape, Unit}; -use crate::tensor::{cache::{TensorCache, CacheStorage}, cpu::LendingIterator, storage_traits::*, Tensor}; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use crate::tensor::{ + cache::{CacheStorage, TensorCache}, + cpu::LendingIterator, + storage_traits::*, + Tensor, +}; use core::alloc::Layout; +use rand::{rngs::StdRng, Rng, SeedableRng}; use std::{sync::Arc, vec::Vec}; #[cfg(feature = "no-std")] @@ -37,8 +42,17 @@ impl CacheStorage for Vec { let dst_size = dst_layout.size(); - assert_eq!(ptr.align_offset(dst_layout.align()), 0, "Allocation is improperly aligned"); - assert!(byte_len % dst_size == 0 && byte_capacity % dst_size == 0, "Allocation is improperly sized"); + assert_eq!( + ptr.align_offset(dst_layout.align()), + 0, + "Allocation is improperly aligned" + ); + assert_eq!(byte_len % dst_size, 0, "Length is improperly sized"); + assert_eq!( + byte_capacity % dst_size, + 0, + "Allocation is improperly sized" + ); let len = byte_len / dst_size; let capacity = byte_capacity / dst_size; diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index 25f5d1fc0..a3796421a 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -1,6 +1,9 @@ use crate::shapes::{Shape, Unit}; use crate::tensor::cpu::{Cpu, CpuError}; -use crate::tensor::{cache::{TensorCache, CacheStorage}, DeviceStorage, HasErr, NoneTape, Tensor}; +use crate::tensor::{ + cache::{CacheStorage, TensorCache}, + DeviceStorage, HasErr, NoneTape, Tensor, +}; use cudarc::driver::{DevicePtr, DevicePtrMut, DeviceRepr}; use cudarc::{ @@ -32,7 +35,11 @@ impl CacheStorage for CudaSlice { let byte_len = self.len() * src_layout.size(); let ptr = self.leak(); - assert_eq!(byte_len % dst_layout.size(), 0, "Allocation is improperly sized"); + assert_eq!( + byte_len % dst_layout.size(), + 0, + "Allocation is improperly sized" + ); let len = byte_len / dst_layout.size(); @@ -137,10 +144,10 @@ impl Cuda { &self, len: usize, ) -> Result, CudaError> { - Ok(self.cache.try_pop::(len).map_or_else( - || self.dev.alloc::(len), - Ok, - )?) + Ok(self + .cache + .try_pop::(len) + .map_or_else(|| self.dev.alloc::(len), Ok)?) } #[allow(unused)] pub(crate) unsafe fn get_workspace( From 347b24bc5786a952adb5bf3847e078f0f172ed35 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 09:49:02 -0500 Subject: [PATCH 03/22] reduce visibility of internals; add safety comments --- build.rs | 4 ++-- src/tensor/cache.rs | 31 +++++++++++++++++++++++++------ src/tensor/cpu/device.rs | 2 ++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/build.rs b/build.rs index 3860f5fb0..daf4caef5 100644 --- a/build.rs +++ b/build.rs @@ -93,8 +93,8 @@ mod cuda { let out = out.lines().collect::>(); let mut codes = Vec::with_capacity(out.len()); for code in out { - let code = code.split("_").collect::>(); - if code.len() != 0 && code.contains(&"sm") { + let code = code.split('_').collect::>(); + if !code.is_empty() && code.contains(&"sm") { if let Ok(num) = code[1].parse::() { codes.push(num); } diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index a5abd3839..f30292b53 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -13,12 +13,12 @@ use spin::RwLock; /// for a hasmap, meaning we need this extra datastructure. Otherwise /// we could just using `(usize, Layout)` as the key. #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] -pub(crate) struct AllocationKey { - pub num_bytes: usize, +pub struct AllocationKey { + num_bytes: usize, /// The size of the allocation in bytes - from [Layout]. - pub size: usize, + size: usize, /// The alignment of the allocation in bytes - from [Layout]. - pub alignment: usize, + alignment: usize, } /// A cache of allocations that can be reused. @@ -34,8 +34,8 @@ pub(crate) struct AllocationKey { /// is removed. #[derive(Debug)] pub(crate) struct TensorCache { - pub(crate) allocations: RwLock>>, - pub(crate) enabled: RwLock, + allocations: RwLock>>, + enabled: RwLock, } pub(crate) trait CacheStorage: Sized { @@ -72,6 +72,12 @@ impl Default for TensorCache { } } +impl Drop for TensorCache { + fn drop(&mut self) { + self.clear(); + } +} + impl TensorCache { /// Returns the number of allocations in the cache. #[allow(unused)] @@ -167,6 +173,10 @@ impl TensorCache { if items.is_empty() { cache.remove(&key); } + // SAFETY: + // + // * If inserted with 'insert', this will convert allocation to an element type with + // the correct alignment Some(unsafe { allocation.transmute_elements() }) } else { None @@ -182,6 +192,11 @@ impl TensorCache { return; } + // SAFETY: + // + // * Allocation must be converted back to an element type with the correct alignment before + // dropping + // * Allocation must not have its capacity modified let allocation = unsafe { allocation.transmute_elements::() }; let layout = Layout::new::(); let num_bytes = len * layout.size(); @@ -219,6 +234,10 @@ impl TensorCache { assert!(key.num_bytes % key.size == 0); assert!(key.num_bytes < isize::MAX as usize); for alloc in allocations.drain(..) { + // SAFETY: + // + // * If inserted with 'insert', this will convert allocation to an element type with + // the correct alignment before dropping. unsafe { alloc.drop_with_key(key) }; } } diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index 75c28a128..d17709482 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -31,6 +31,8 @@ impl CacheStorage for Vec { /// * Has all of the potential pitfalls of slice.align_to /// * If converting to a type with a different alignment, the caller must convert back to a /// type with the same alignment before dropping + /// * If converting to a type with a different alignment, the caller must not grow or shrink + /// the allocation of the returned vector unsafe fn transmute_elements(mut self) -> Self::Output { let src_layout = Layout::new::().pad_to_align(); let dst_layout = Layout::new::().pad_to_align(); From f055ae54bb6e183ff748605f8ea282995ae54118 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 11:15:57 -0500 Subject: [PATCH 04/22] create safe wrapper around CacheStorage objects --- src/tensor/cache.rs | 69 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index f30292b53..5068e1c1f 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -34,7 +34,7 @@ pub struct AllocationKey { /// is removed. #[derive(Debug)] pub(crate) struct TensorCache { - allocations: RwLock>>, + allocations: RwLock>>>, enabled: RwLock, } @@ -56,10 +56,36 @@ pub(crate) trait CacheStorage: Sized { _ => panic!("Invalid alignment"), } } +} + +/// (Mostly) Safe wrapper around CacheStorage implementers +#[derive(Clone, Debug)] +struct CacheWrapper { + ptr: Option, + align: usize, +} + +impl Drop for CacheWrapper { + fn drop(&mut self) { + if let Some(ptr) = std::mem::take(&mut self.ptr) { + unsafe { ptr.drop_with_alignment(self.align) } + } + } +} + +impl CacheWrapper { + fn from_storage(storage: Ptr::Output) -> Self where Ptr::Output: CacheStorage = Ptr> { + Self { + ptr: Some(unsafe { storage.transmute_elements::() }), + align: Layout::new::().align() + } + } - /// calls [CacheStorage::drop_with_alignment], extracting `align` from an [AllocationKey] - unsafe fn drop_with_key(self, key: AllocationKey) { - self.drop_with_alignment(key.alignment) + // Safety: Same as slice.align_to, but considered safe internally + fn into_storage(mut self) -> Ptr::Output { + assert_eq!(Layout::new::().align(), self.align); + let ptr = std::mem::take(&mut self.ptr).unwrap(); + unsafe { ptr.transmute_elements() } } } @@ -72,12 +98,6 @@ impl Default for TensorCache { } } -impl Drop for TensorCache { - fn drop(&mut self) { - self.clear(); - } -} - impl TensorCache { /// Returns the number of allocations in the cache. #[allow(unused)] @@ -173,11 +193,7 @@ impl TensorCache { if items.is_empty() { cache.remove(&key); } - // SAFETY: - // - // * If inserted with 'insert', this will convert allocation to an element type with - // the correct alignment - Some(unsafe { allocation.transmute_elements() }) + Some(allocation.into_storage()) } else { None } @@ -192,12 +208,7 @@ impl TensorCache { return; } - // SAFETY: - // - // * Allocation must be converted back to an element type with the correct alignment before - // dropping - // * Allocation must not have its capacity modified - let allocation = unsafe { allocation.transmute_elements::() }; + let allocation = CacheWrapper::from_storage(allocation); let layout = Layout::new::(); let num_bytes = len * layout.size(); let key = AllocationKey { @@ -227,21 +238,9 @@ impl TensorCache { pub(crate) fn clear(&self) { #[cfg(not(feature = "no-std"))] - let mut cache = self.allocations.write().unwrap(); + self.allocations.write().unwrap().clear(); #[cfg(feature = "no-std")] - let mut cache = self.allocations.write(); - for (&key, allocations) in cache.iter_mut() { - assert!(key.num_bytes % key.size == 0); - assert!(key.num_bytes < isize::MAX as usize); - for alloc in allocations.drain(..) { - // SAFETY: - // - // * If inserted with 'insert', this will convert allocation to an element type with - // the correct alignment before dropping. - unsafe { alloc.drop_with_key(key) }; - } - } - cache.clear(); + self.allocations.write().clear(); } } From 16970020e78ed86db43c5a64e34d61a57498c16d Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 11:16:15 -0500 Subject: [PATCH 05/22] run cargo fmt --- src/tensor/cache.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 5068e1c1f..79c9eddb1 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -74,10 +74,13 @@ impl Drop for CacheWrapper { } impl CacheWrapper { - fn from_storage(storage: Ptr::Output) -> Self where Ptr::Output: CacheStorage = Ptr> { + fn from_storage(storage: Ptr::Output) -> Self + where + Ptr::Output: CacheStorage = Ptr>, + { Self { ptr: Some(unsafe { storage.transmute_elements::() }), - align: Layout::new::().align() + align: Layout::new::().align(), } } From 240553c7ae6e735fc14c5815c05f5b6da8a39255 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 11:40:30 -0500 Subject: [PATCH 06/22] add check_key method to ensure CacheWrappers are used with valid keys --- src/tensor/cache.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 79c9eddb1..79f77f225 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -14,8 +14,9 @@ use spin::RwLock; /// we could just using `(usize, Layout)` as the key. #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] pub struct AllocationKey { + /// The size of the allocation in bytes num_bytes: usize, - /// The size of the allocation in bytes - from [Layout]. + /// The size of the type in bytes - from [Layout]. size: usize, /// The alignment of the allocation in bytes - from [Layout]. alignment: usize, @@ -62,13 +63,14 @@ pub(crate) trait CacheStorage: Sized { #[derive(Clone, Debug)] struct CacheWrapper { ptr: Option, - align: usize, + alignment: usize, + size: usize, } impl Drop for CacheWrapper { fn drop(&mut self) { if let Some(ptr) = std::mem::take(&mut self.ptr) { - unsafe { ptr.drop_with_alignment(self.align) } + unsafe { ptr.drop_with_alignment(self.alignment) } } } } @@ -78,15 +80,29 @@ impl CacheWrapper { where Ptr::Output: CacheStorage = Ptr>, { + let layout = Layout::new::(); Self { ptr: Some(unsafe { storage.transmute_elements::() }), - align: Layout::new::().align(), + alignment: layout.align(), + size: layout.size(), } } + fn check_key(&self, key: &AllocationKey) { + assert_eq!(self.alignment, key.alignment, "Alignment does not match"); + assert_eq!(self.size, key.size, "Size does not match"); + // Implicitly assumes that T should not have any padding, but this should always be true of + // primitive number types. + assert_eq!(key.num_bytes % key.size, 0, "Key is invalid or type is padded"); + } + // Safety: Same as slice.align_to, but considered safe internally + // Produces storage containing uninitialized values fn into_storage(mut self) -> Ptr::Output { - assert_eq!(Layout::new::().align(), self.align); + let layout = Layout::new::(); + assert_eq!(layout.align(), self.alignment); + assert_eq!(layout.size(), self.size); + let ptr = std::mem::take(&mut self.ptr).unwrap(); unsafe { ptr.transmute_elements() } } @@ -196,6 +212,7 @@ impl TensorCache { if items.is_empty() { cache.remove(&key); } + allocation.check_key(&key); Some(allocation.into_storage()) } else { None @@ -219,6 +236,7 @@ impl TensorCache { size: layout.size(), alignment: layout.align(), }; + allocation.check_key(&key); #[cfg(not(feature = "no-std"))] let mut cache = self.allocations.write().unwrap(); #[cfg(feature = "no-std")] From 101497edcda6f8d8eb739919cb4060c11912e6c7 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 12:46:07 -0500 Subject: [PATCH 07/22] typo; run cargo fmt --- src/tensor/cache.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 79f77f225..9ca1a87ae 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -93,7 +93,11 @@ impl CacheWrapper { assert_eq!(self.size, key.size, "Size does not match"); // Implicitly assumes that T should not have any padding, but this should always be true of // primitive number types. - assert_eq!(key.num_bytes % key.size, 0, "Key is invalid or type is padded"); + assert_eq!( + key.num_bytes % key.size, + 0, + "Key is invalid or type is padded" + ); } // Safety: Same as slice.align_to, but considered safe internally @@ -144,7 +148,7 @@ impl TensorCache { } } - /// Disables the cache. + /// Enables the cache. pub(crate) fn enable(&self) { #[cfg(not(feature = "no-std"))] { From ea7329471f2abc0993cb5b3fba070f37d900c8ec Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 13:48:42 -0500 Subject: [PATCH 08/22] add shrink method --- src/tensor/cache.rs | 197 ++++++++++++++++++++++++-------------- src/tensor/cpu/device.rs | 5 + src/tensor/cuda/device.rs | 4 + 3 files changed, 136 insertions(+), 70 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 9ca1a87ae..3d185fb7e 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -1,4 +1,14 @@ -use std::{alloc::Layout, collections::BTreeMap, vec::Vec}; +use std::{ + alloc::Layout, + collections::{BTreeMap, VecDeque}, + vec::Vec, +}; + +#[cfg(not(feature = "no-std"))] +use std::vec; + +#[cfg(feature = "no-std")] +use alloc::vec; #[cfg(not(feature = "no-std"))] use std::sync::RwLock; @@ -6,14 +16,40 @@ use std::sync::RwLock; #[cfg(feature = "no-std")] use spin::RwLock; -/// A key for the tensor cache. Contains both number of bytes and informatino +macro_rules! read { + ($x:expr) => {{ + #[cfg(not(feature = "no-std"))] + { + $x.read().unwrap() + } + #[cfg(feature = "no-std")] + { + $x.read() + } + }}; +} + +macro_rules! write { + ($x:expr) => {{ + #[cfg(not(feature = "no-std"))] + { + $x.write().unwrap() + } + #[cfg(feature = "no-std")] + { + $x.write() + } + }}; +} + +/// A key for the tensor cache. Contains both number of bytes and information /// about the layout of the allocation. /// /// Since [Layout] doesn't impl Ord, we can't use it directly as a key -/// for a hasmap, meaning we need this extra datastructure. Otherwise +/// for a hashmap, meaning we need this extra datastructure. Otherwise /// we could just using `(usize, Layout)` as the key. #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] -pub struct AllocationKey { +struct AllocationKey { /// The size of the allocation in bytes num_bytes: usize, /// The size of the type in bytes - from [Layout]. @@ -22,6 +58,12 @@ pub struct AllocationKey { alignment: usize, } +#[derive(Debug)] +struct AllocationGroup { + ignore_drops: usize, + allocations: Vec>, +} + /// A cache of allocations that can be reused. /// /// The key is the number of bytes in the allocation, AND the layout @@ -30,18 +72,25 @@ pub struct AllocationKey { /// allocator assumes memory is allocated & deallocated with the same layout. /// The value is a list of allocations of that size. /// -/// The prescense of a key in the map, indicates that there is *at least one* +/// The presence of a key in the map, indicates that there is *at least one* /// valid allocation. When the last value is removed from the list, the key /// is removed. #[derive(Debug)] pub(crate) struct TensorCache { - allocations: RwLock>>>, + allocations: RwLock>>, enabled: RwLock, + + drop_queue: RwLock>, + size: RwLock, + max_size: RwLock, } pub(crate) trait CacheStorage: Sized { type Output: CacheStorage; + /// returns the allocations's size in bytes + fn size(&self) -> usize; + /// Unsafely converts the elements of a contiguous collection type to another type. Note: /// **This function is wildly unsafe**, see implementations for details unsafe fn transmute_elements(self) -> Self::Output; @@ -60,7 +109,7 @@ pub(crate) trait CacheStorage: Sized { } /// (Mostly) Safe wrapper around CacheStorage implementers -#[derive(Clone, Debug)] +#[derive(Debug)] struct CacheWrapper { ptr: Option, alignment: usize, @@ -100,6 +149,10 @@ impl CacheWrapper { ); } + fn size(&self) -> usize { + self.ptr.as_ref().unwrap().size() + } + // Safety: Same as slice.align_to, but considered safe internally // Produces storage containing uninitialized values fn into_storage(mut self) -> Ptr::Output { @@ -117,6 +170,10 @@ impl Default for TensorCache { Self { allocations: Default::default(), enabled: RwLock::new(false), + drop_queue: Default::default(), + size: RwLock::new(0), + // TODO: default max size + max_size: RwLock::new(1_000_000), } } } @@ -125,52 +182,65 @@ impl TensorCache { /// Returns the number of allocations in the cache. #[allow(unused)] pub(crate) fn len(&self) -> usize { - #[cfg(not(feature = "no-std"))] - { - self.allocations.read().unwrap().len() - } + read!(self.allocations).len() + } - #[cfg(feature = "no-std")] - { - self.allocations.read().len() - } + /// Returns the number of bytes occupied by allocations in the cache. + #[allow(unused)] + pub(crate) fn size(&self) -> usize { + *read!(self.size) } /// Returns `true` if the cache is enabled. pub(crate) fn is_enabled(&self) -> bool { - #[cfg(not(feature = "no-std"))] - { - *self.enabled.read().unwrap() - } - #[cfg(feature = "no-std")] - { - *self.enabled.read() - } + *read!(self.enabled) } /// Enables the cache. pub(crate) fn enable(&self) { - #[cfg(not(feature = "no-std"))] - { - *self.enabled.write().unwrap() = true; - } - - #[cfg(feature = "no-std")] - { - *self.enabled.write() = true; - } + *write!(self.enabled) = true; } /// Disables the cache. pub(crate) fn disable(&self) { - #[cfg(not(feature = "no-std"))] - { - *self.enabled.write().unwrap() = false; + *write!(self.enabled) = false; + } + + /// Sets the maximum size of the cache + #[allow(unused)] + pub(crate) fn set_max_size(&self, size: usize) { + *write!(self.max_size) = size; + + if size < *read!(self.size) { + self.shrink(); } + } - #[cfg(feature = "no-std")] - { - *self.enabled.write() = false; + /// Shrinks the cache so its buffers contain at most `max_size` bytes + fn shrink(&self) { + let mut size = write!(self.size); + let max_size = read!(self.max_size); + let mut drop_queue = write!(self.drop_queue); + let mut allocations = write!(self.allocations); + + while *size > *max_size { + let key = drop_queue + .pop_front() + .expect("ignore_drops values were set too high"); + let Some(alloc_group) = allocations.get_mut(&key) else { continue }; + + if alloc_group.ignore_drops > 0 { + alloc_group.ignore_drops -= 1; + } else { + let allocation = alloc_group + .allocations + .pop() + .expect("ignore_drops values were set too low"); + if alloc_group.allocations.is_empty() { + allocations.remove(&key); + } + *size -= allocation.size(); + } } } @@ -189,34 +259,27 @@ impl TensorCache { alignment: layout.align(), }; // Check if there is a cached allocation. - let reuse = { - #[cfg(not(feature = "no-std"))] - let cache = self.allocations.read().unwrap(); - #[cfg(feature = "no-std")] - let cache = self.allocations.read(); - cache.contains_key(&key) - }; + let reuse = read!(self.allocations).contains_key(&key); // If there is, remove it from the cache. // Otherwise, return `None`. if reuse { - #[cfg(not(feature = "no-std"))] - let mut cache = self.allocations.write().unwrap(); - #[cfg(feature = "no-std")] - let mut cache = self.allocations.write(); + let mut cache = write!(self.allocations); // unwrap is safe because we just checked for contains key above. let items = cache.get_mut(&key).unwrap(); + items.ignore_drops += 1; // unwrap is safe because reuse is only true if there's at least one item, // which is also maintained by the block directly below. - let allocation = items.pop().unwrap(); + let allocation = items.allocations.pop().unwrap(); // If there are no more cached allocations of this size, // remove the entry from the cache. // This is important for correctness, because the presence // of an entry in the cache indicates that there are valid // allocations to use. (see `let reuse = { ... }` above). - if items.is_empty() { + if items.allocations.is_empty() { cache.remove(&key); } allocation.check_key(&key); + *write!(self.size) -= allocation.size(); Some(allocation.into_storage()) } else { None @@ -241,31 +304,25 @@ impl TensorCache { alignment: layout.align(), }; allocation.check_key(&key); - #[cfg(not(feature = "no-std"))] - let mut cache = self.allocations.write().unwrap(); - #[cfg(feature = "no-std")] - let mut cache = self.allocations.write(); + *write!(self.size) += allocation.size(); + write!(self.drop_queue).push_back(key); + let mut cache = write!(self.allocations); if let std::collections::btree_map::Entry::Vacant(e) = cache.entry(key) { - #[cfg(not(feature = "no-std"))] - { - e.insert(std::vec![allocation]); - } - #[cfg(feature = "no-std")] - { - let mut allocations = Vec::new(); - allocations.push(allocation); - e.insert(allocations); - } + e.insert(AllocationGroup { + allocations: vec![allocation], + ignore_drops: 0, + }); } else { - cache.get_mut(&key).unwrap().push(allocation); + cache.get_mut(&key).unwrap().allocations.push(allocation); } + std::mem::drop(cache); + self.shrink(); } pub(crate) fn clear(&self) { - #[cfg(not(feature = "no-std"))] - self.allocations.write().unwrap().clear(); - #[cfg(feature = "no-std")] - self.allocations.write().clear(); + write!(self.allocations).clear(); + write!(self.drop_queue).clear(); + *write!(self.size) = 0; } } diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index d17709482..add9fc9c8 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -24,6 +24,11 @@ unsafe impl Sync for BytesPtr {} impl CacheStorage for Vec { type Output = Vec; + fn size(&self) -> usize { + // size in bytes of the underlying allocation + Layout::array::(self.capacity()).unwrap().size() + } + /// Unsafely converts the elements of a vector to a new type. /// /// # Safety diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index a3796421a..9f5e5c14a 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -20,6 +20,10 @@ use std::{ impl CacheStorage for CudaSlice { type Output = CudaSlice; + fn size(&self) -> usize { + Layout::array::(self.len()).unwrap().size() + } + /// Unsafely converts the elements of a CudaSlice to a new type. /// /// # Safety From 9efc215af93e1fc464a91afa4a27a1a71da734aa Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 17:44:47 -0500 Subject: [PATCH 09/22] document ignore_drops; fix constraint violations --- src/tensor/cache.rs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 3d185fb7e..5655874d1 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -60,6 +60,9 @@ struct AllocationKey { #[derive(Debug)] struct AllocationGroup { + // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to + // "remove" the next instance of the matching AllocationKey in the drop_queue, without having + // to run an O(n) ignore_drops: usize, allocations: Vec>, } @@ -75,6 +78,10 @@ struct AllocationGroup { /// The presence of a key in the map, indicates that there is *at least one* /// valid allocation. When the last value is removed from the list, the key /// is removed. +/// +/// Constraint: for a given value of AllocationKey, the following must hold: +/// +/// (instances in drop_queue) = (group.ignore_drops) + (group.allocations.len()) #[derive(Debug)] pub(crate) struct TensorCache { allocations: RwLock>>, @@ -165,6 +172,12 @@ impl CacheWrapper { } } +impl AllocationGroup { + fn is_empty(&self) -> bool { + self.allocations.is_empty() && self.ignore_drops == 0 + } +} + impl Default for TensorCache { fn default() -> Self { Self { @@ -173,7 +186,7 @@ impl Default for TensorCache { drop_queue: Default::default(), size: RwLock::new(0), // TODO: default max size - max_size: RwLock::new(1_000_000), + max_size: RwLock::new(1_000_000_000), } } } @@ -259,7 +272,9 @@ impl TensorCache { alignment: layout.align(), }; // Check if there is a cached allocation. - let reuse = read!(self.allocations).contains_key(&key); + let reuse = read!(self.allocations) + .get(&key) + .map_or(false, |group| !group.allocations.is_empty()); // If there is, remove it from the cache. // Otherwise, return `None`. if reuse { @@ -270,14 +285,6 @@ impl TensorCache { // unwrap is safe because reuse is only true if there's at least one item, // which is also maintained by the block directly below. let allocation = items.allocations.pop().unwrap(); - // If there are no more cached allocations of this size, - // remove the entry from the cache. - // This is important for correctness, because the presence - // of an entry in the cache indicates that there are valid - // allocations to use. (see `let reuse = { ... }` above). - if items.allocations.is_empty() { - cache.remove(&key); - } allocation.check_key(&key); *write!(self.size) -= allocation.size(); Some(allocation.into_storage()) From e99a2aef402c1e8adbdbc9d9e462b19d3417ce72 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 16 Apr 2023 17:55:27 -0500 Subject: [PATCH 10/22] fix another constraint violation; fix TensorCache::len --- src/tensor/cache.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 5655874d1..aef55eaa9 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -195,7 +195,10 @@ impl TensorCache { /// Returns the number of allocations in the cache. #[allow(unused)] pub(crate) fn len(&self) -> usize { - read!(self.allocations).len() + read!(self.allocations) + .values() + .map(|group| group.allocations.len()) + .sum() } /// Returns the number of bytes occupied by allocations in the cache. @@ -236,6 +239,15 @@ impl TensorCache { let mut drop_queue = write!(self.drop_queue); let mut allocations = write!(self.allocations); + debug_assert_eq!( + *size, + allocations + .values() + .flat_map(|group| &group.allocations) + .map(|alloc| alloc.size()) + .sum::() + ); + while *size > *max_size { let key = drop_queue .pop_front() @@ -249,7 +261,7 @@ impl TensorCache { .allocations .pop() .expect("ignore_drops values were set too low"); - if alloc_group.allocations.is_empty() { + if alloc_group.is_empty() { allocations.remove(&key); } *size -= allocation.size(); From 76798c990485a6e49eb111f2711cecb90fab522d Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 17 Apr 2023 14:33:57 -0500 Subject: [PATCH 11/22] add size parameter to enable_cache and add set_cache_size --- src/tensor/cache.rs | 14 +++++++------- src/tensor/cpu/device.rs | 9 +++++++-- src/tensor/cpu/mod.rs | 8 ++++---- src/tensor/cuda/device.rs | 9 +++++++-- src/tensor/cuda/mod.rs | 8 ++++---- src/tensor/storage_traits.rs | 18 +++++++++++++----- 6 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index aef55eaa9..2b934bdf7 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -147,7 +147,7 @@ impl CacheWrapper { fn check_key(&self, key: &AllocationKey) { assert_eq!(self.alignment, key.alignment, "Alignment does not match"); assert_eq!(self.size, key.size, "Size does not match"); - // Implicitly assumes that T should not have any padding, but this should always be true of + // Implicitly assumes that T does not have any padding, but this should always be true of // primitive number types. assert_eq!( key.num_bytes % key.size, @@ -185,8 +185,7 @@ impl Default for TensorCache { enabled: RwLock::new(false), drop_queue: Default::default(), size: RwLock::new(0), - // TODO: default max size - max_size: RwLock::new(1_000_000_000), + max_size: RwLock::new(0), } } } @@ -213,8 +212,9 @@ impl TensorCache { } /// Enables the cache. - pub(crate) fn enable(&self) { + pub(crate) fn enable(&self, size: usize) { *write!(self.enabled) = true; + *write!(self.max_size) = size; } /// Disables the cache. @@ -352,7 +352,7 @@ mod test { #[test] fn test_try_pop_on_disabled_cache() { let cache: TensorCache> = Default::default(); - cache.enable(); + cache.enable(1000); assert!(cache.is_enabled()); cache.disable(); assert!(!cache.is_enabled()); @@ -363,7 +363,7 @@ mod test { #[test] fn test_try_pop_on_empty_cache() { let cache: TensorCache> = Default::default(); - cache.enable(); + cache.enable(1000); assert_eq!(cache.try_pop::(1), None); assert_eq!(cache.try_pop::(1), None); } @@ -371,7 +371,7 @@ mod test { #[test] fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() { let cache: TensorCache> = Default::default(); - cache.enable(); + cache.enable(1000); cache.insert::(1, vec![0.0]); cache.insert::(1, vec![1.0]); cache.insert::(1, vec![2.0]); diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index add9fc9c8..501abcd7c 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -220,8 +220,8 @@ impl DeviceStorage for Cpu { Ok(()) } - fn try_enable_cache(&self) -> Result<(), Self::Err> { - self.cache.enable(); + fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> { + self.cache.enable(size); Ok(()) } @@ -234,4 +234,9 @@ impl DeviceStorage for Cpu { self.cache.clear(); Ok(()) } + + fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> { + self.cache.set_max_size(size); + Ok(()) + } } diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs index fda5306ea..79c3437a6 100644 --- a/src/tensor/cpu/mod.rs +++ b/src/tensor/cpu/mod.rs @@ -17,7 +17,7 @@ mod tests { #[test] fn test_empty_cache() { let dev: Cpu = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -28,7 +28,7 @@ mod tests { #[test] fn test_disabling_cache_empties_it() { let dev: Cpu = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -39,7 +39,7 @@ mod tests { #[test] fn test_reuse_allocation_on_new_tensor() { let dev: Cpu = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); assert_eq!(dev.cache.len(), 0); let tensor: Tensor, f32, _> = dev.zeros(); assert_eq!(dev.cache.len(), 0); @@ -57,7 +57,7 @@ mod tests { #[test] fn test_reuse_allocation_on_clone_tensor() { let dev: Cpu = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let a: Tensor, f32, _> = dev.zeros(); let b: Tensor, f32, _> = dev.zeros(); drop(b); // insert allocation into cache diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index 9f5e5c14a..ed635b4fb 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -308,8 +308,8 @@ impl DeviceStorage for Cuda { self.dev.synchronize().map_err(CudaError::from) } - fn try_enable_cache(&self) -> Result<(), Self::Err> { - self.cache.enable(); + fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> { + self.cache.enable(size); Ok(()) } @@ -322,4 +322,9 @@ impl DeviceStorage for Cuda { self.cache.clear(); Ok(()) } + + fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> { + self.cache.set_max_size(size); + Ok(()) + } } diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs index 8b91d2ab3..0f8a44302 100644 --- a/src/tensor/cuda/mod.rs +++ b/src/tensor/cuda/mod.rs @@ -21,7 +21,7 @@ mod tests { #[test] fn test_empty_cache() { let dev: Cuda = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -32,7 +32,7 @@ mod tests { #[test] fn test_disabling_cache_empties_it() { let dev: Cuda = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -43,7 +43,7 @@ mod tests { #[test] fn test_reuse_allocation_on_new_tensor() { let dev: Cuda = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let tensor: Tensor, f32, _> = dev.zeros(); let ptr = *tensor.data.device_ptr(); drop(tensor); // insert allocation into cache @@ -59,7 +59,7 @@ mod tests { #[test] fn test_reuse_allocation_on_clone_tensor() { let dev: Cuda = Default::default(); - dev.enable_cache(); + dev.enable_cache(1000); let a: Tensor, f32, _> = dev.zeros(); let b: Tensor, f32, _> = dev.zeros(); drop(b); // insert allocation into cache diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs index 7b32abf54..997ca1bff 100644 --- a/src/tensor/storage_traits.rs +++ b/src/tensor/storage_traits.rs @@ -43,13 +43,13 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { /// Blocks until all work on device to complete. Useful for benchmarking. fn try_synchronize(&self) -> Result<(), Self::Err>; - /// Enables the cache of the device. - fn enable_cache(&self) { - self.try_enable_cache().unwrap() + /// Enables the cache of the device, and sets the maximum size to `size`. + fn enable_cache(&self, size: usize) { + self.try_enable_cache(size).unwrap() } - /// Tries to enable the cache of the device. - fn try_enable_cache(&self) -> Result<(), Self::Err>; + /// Tries to enable the cache of the device, and sets the maximum size to `size`. + fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>; /// Disables the cache of the device. This will also empty the cache /// if there are things in it. See [DeviceStorage::empty_cache] for @@ -78,6 +78,14 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { /// Tries to empty the cache of the device. See [DeviceStorage::empty_cache] for /// details of when this is useful. fn try_empty_cache(&self) -> Result<(), Self::Err>; + + /// Sets the maximum size of the cache in bytes, and shrinks the cache until it smaller than `size`. + fn set_cache_size(&self, size: usize) { + self.try_set_cache_size(size).unwrap() + } + + /// Fallible version of [DeviceStorage::set_cache_size] + fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err>; } /// Internal trait - Represents something that can allocate its own gradient. From e1f4622ab2de0ba43078550b432969529d4d68a4 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 19 Apr 2023 15:42:56 -0500 Subject: [PATCH 12/22] remove len argument from TensorCache::insert; add test for cache --- src/tensor/cache.rs | 91 +++++++++++++++++++++++++++++++++------ src/tensor/cpu/device.rs | 4 +- src/tensor/cuda/device.rs | 2 +- 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 2b934bdf7..2648ebc85 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -283,6 +283,7 @@ impl TensorCache { size: layout.size(), alignment: layout.align(), }; + println!("{key:?}"); // Check if there is a cached allocation. let reuse = read!(self.allocations) .get(&key) @@ -298,6 +299,7 @@ impl TensorCache { // which is also maintained by the block directly below. let allocation = items.allocations.pop().unwrap(); allocation.check_key(&key); + println!("{:?} {}", allocation.size, allocation.alignment); *write!(self.size) -= allocation.size(); Some(allocation.into_storage()) } else { @@ -306,7 +308,7 @@ impl TensorCache { } /// Inserts an allocation into the cache. - pub(crate) fn insert(&self, len: usize, allocation: Ptr::Output) + pub(crate) fn insert(&self, allocation: Ptr::Output) where Ptr::Output: CacheStorage = Ptr>, { @@ -316,7 +318,7 @@ impl TensorCache { let allocation = CacheWrapper::from_storage(allocation); let layout = Layout::new::(); - let num_bytes = len * layout.size(); + let num_bytes = allocation.size(); let key = AllocationKey { num_bytes, size: layout.size(), @@ -343,6 +345,10 @@ impl TensorCache { write!(self.drop_queue).clear(); *write!(self.size) = 0; } + + fn clear_check(&self) { + self.set_max_size(0); + } } #[cfg(test)] @@ -372,18 +378,18 @@ mod test { fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() { let cache: TensorCache> = Default::default(); cache.enable(1000); - cache.insert::(1, vec![0.0]); - cache.insert::(1, vec![1.0]); - cache.insert::(1, vec![2.0]); - cache.insert::(2, vec![3.0; 2]); - cache.insert::(2, vec![4.0; 2]); - cache.insert::(2, vec![5.0; 2]); - cache.insert::(1, vec![6.0]); - cache.insert::(1, vec![7.0]); - cache.insert::(1, vec![8.0]); - cache.insert::(2, vec![9.0; 2]); - cache.insert::(2, vec![10.0; 2]); - cache.insert::(2, vec![11.0; 2]); + cache.insert::(vec![0.0]); + cache.insert::(vec![1.0]); + cache.insert::(vec![2.0]); + cache.insert::(vec![3.0; 2]); + cache.insert::(vec![4.0; 2]); + cache.insert::(vec![5.0; 2]); + cache.insert::(vec![6.0]); + cache.insert::(vec![7.0]); + cache.insert::(vec![8.0]); + cache.insert::(vec![9.0; 2]); + cache.insert::(vec![10.0; 2]); + cache.insert::(vec![11.0; 2]); assert_eq!(cache.try_pop::(1), Some(vec![2.0])); assert_eq!(cache.try_pop::(1), Some(vec![1.0])); assert_eq!(cache.try_pop::(1), Some(vec![0.0])); @@ -400,5 +406,62 @@ mod test { assert_eq!(cache.try_pop::(2), Some(vec![10.0; 2])); assert_eq!(cache.try_pop::(2), Some(vec![9.0; 2])); assert_eq!(cache.try_pop::(2), None); + cache.clear_check(); + } + + #[test] + fn test_shrink() { + let cache: TensorCache> = Default::default(); + cache.enable(16); + cache.insert::(vec![1; 1]); + cache.insert::(vec![2; 1]); + cache.insert::(vec![1; 2]); + cache.insert::(vec![1; 4]); + cache.insert::(vec![1; 8]); + assert_eq!(cache.len(), 5); + assert_eq!(cache.size(), 16); + cache.insert::(vec![2; 8]); + assert_eq!(cache.len(), 2); + assert_eq!(cache.size(), 16); + cache.insert::(vec![3; 1]); + assert_eq!(cache.len(), 2); + assert_eq!(cache.size(), 9); + cache.insert::(vec![1; 12]); + assert_eq!(cache.len(), 2); + assert_eq!(cache.size(), 13); + cache.clear_check(); + } + + + #[test] + fn test_pop_and_shrink() { + let cache: TensorCache> = Default::default(); + cache.enable(16); + cache.insert::(vec![1; 1]); + cache.insert::(vec![2; 1]); + cache.insert::(vec![1; 2]); + cache.insert::(vec![1; 4]); + cache.insert::(vec![1; 8]); + assert_eq!(cache.len(), 5); + assert_eq!(cache.size(), 16); + + assert_eq!(cache.try_pop::(1), Some(vec![2])); + assert_eq!(cache.try_pop::(2), Some(vec![1; 2])); + assert_eq!(cache.len(), 3); + assert_eq!(cache.size(), 13); + + cache.insert::(vec![2; 8]); + assert_eq!(cache.len(), 2); + assert_eq!(cache.size(), 16); + + assert_eq!(cache.try_pop::(8), Some(vec![2; 8])); + assert_eq!(cache.len(), 1); + assert_eq!(cache.size(), 8); + + cache.insert::(vec![2; 4]); + assert_eq!(cache.len(), 2); + assert_eq!(cache.size(), 12); + + cache.clear_check(); } } diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index 501abcd7c..f8f2e74e5 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -26,7 +26,7 @@ impl CacheStorage for Vec { fn size(&self) -> usize { // size in bytes of the underlying allocation - Layout::array::(self.capacity()).unwrap().size() + Layout::array::(self.len()).unwrap().size() } /// Unsafely converts the elements of a vector to a new type. @@ -168,7 +168,7 @@ impl Clone for CachableVec { impl Drop for CachableVec { fn drop(&mut self) { let data = std::mem::take(&mut self.data); - self.cache.insert::(data.len(), data); + self.cache.insert::(data); } } diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index ed635b4fb..28e25bb7b 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -260,7 +260,7 @@ impl Drop for CachableCudaSlice { // Replaces the CudaSlice with a 0 length CudaSlice. This won't take additional // memory, but will give us ownership of the actual data. let data = std::mem::replace(&mut self.data, dev.null().unwrap()); - self.cache.insert::(data.len(), data); + self.cache.insert::(data); } } From 7476b808aaefd8dcd4c45bfbb9a5a5ad45be0f26 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 19 Apr 2023 15:45:41 -0500 Subject: [PATCH 13/22] run cargo fmt --- src/tensor/cache.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 2648ebc85..217741653 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -432,7 +432,6 @@ mod test { cache.clear_check(); } - #[test] fn test_pop_and_shrink() { let cache: TensorCache> = Default::default(); @@ -444,24 +443,24 @@ mod test { cache.insert::(vec![1; 8]); assert_eq!(cache.len(), 5); assert_eq!(cache.size(), 16); - + assert_eq!(cache.try_pop::(1), Some(vec![2])); assert_eq!(cache.try_pop::(2), Some(vec![1; 2])); assert_eq!(cache.len(), 3); assert_eq!(cache.size(), 13); - + cache.insert::(vec![2; 8]); assert_eq!(cache.len(), 2); assert_eq!(cache.size(), 16); - + assert_eq!(cache.try_pop::(8), Some(vec![2; 8])); assert_eq!(cache.len(), 1); assert_eq!(cache.size(), 8); - + cache.insert::(vec![2; 4]); assert_eq!(cache.len(), 2); assert_eq!(cache.size(), 12); - + cache.clear_check(); } } From 80c20e789968761ae79a00703b4abfa263040f2d Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 19 Apr 2023 16:00:34 -0500 Subject: [PATCH 14/22] fix clippy error --- src/tensor/cache.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 217741653..cbe8df20c 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -346,6 +346,7 @@ impl TensorCache { *write!(self.size) = 0; } + #[allow(unused)] fn clear_check(&self) { self.set_max_size(0); } From 3d443ba0da7f89b0482b54f6820d2699f7e4e889 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 19 Apr 2023 21:03:39 -0500 Subject: [PATCH 15/22] remove debug print statements --- src/tensor/cache.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index cbe8df20c..46262524a 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -283,7 +283,6 @@ impl TensorCache { size: layout.size(), alignment: layout.align(), }; - println!("{key:?}"); // Check if there is a cached allocation. let reuse = read!(self.allocations) .get(&key) @@ -299,7 +298,6 @@ impl TensorCache { // which is also maintained by the block directly below. let allocation = items.allocations.pop().unwrap(); allocation.check_key(&key); - println!("{:?} {}", allocation.size, allocation.alignment); *write!(self.size) -= allocation.size(); Some(allocation.into_storage()) } else { From 9761e416927ff45f696fcae60e9ca1353b603efd Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Fri, 21 Apr 2023 11:35:08 -0500 Subject: [PATCH 16/22] improve documentation --- src/tensor/cache.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index 46262524a..aa91183e6 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -62,7 +62,7 @@ struct AllocationKey { struct AllocationGroup { // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to // "remove" the next instance of the matching AllocationKey in the drop_queue, without having - // to run an O(n) + // to run an O(n) operation to actually remove the key. ignore_drops: usize, allocations: Vec>, } @@ -79,9 +79,10 @@ struct AllocationGroup { /// valid allocation. When the last value is removed from the list, the key /// is removed. /// -/// Constraint: for a given value of AllocationKey, the following must hold: +/// Constraint: for a given value of AllocationKey, the following must hold for each key value in +/// `allocations`: /// -/// (instances in drop_queue) = (group.ignore_drops) + (group.allocations.len()) +/// (instances of key in drop_queue) = allocations[key].ignore_drops + allocations[key].allocations.len() #[derive(Debug)] pub(crate) struct TensorCache { allocations: RwLock>>, @@ -103,6 +104,7 @@ pub(crate) trait CacheStorage: Sized { unsafe fn transmute_elements(self) -> Self::Output; /// Uses transmute_elements to convert to an element type with alignment `align` before dropping. + /// This **must** be a memory safe way to drop self, given the correct alignment unsafe fn drop_with_alignment(self, align: usize) { match align { 1 => drop(self.transmute_elements::()), @@ -126,17 +128,24 @@ struct CacheWrapper { impl Drop for CacheWrapper { fn drop(&mut self) { if let Some(ptr) = std::mem::take(&mut self.ptr) { + // Safety: This operation is memory safe because ptr is guaranteed to have elements + // with the correct alignment before being dropped. This is ensured by the CacheWrapper + // being constructed with from_storage. unsafe { ptr.drop_with_alignment(self.alignment) } } } } impl CacheWrapper { + /// Safety: Storage must be valid to drop, and Ptr::transmute_elements must produce a storage + /// that is valid to drop after being converted to an element type with the same alignment as T fn from_storage(storage: Ptr::Output) -> Self where Ptr::Output: CacheStorage = Ptr>, { let layout = Layout::new::(); + // Safety: Ptr must be converted to an element typw with the correct alignment before + // it is dropped. Ptr might not be valid data after this operation Self { ptr: Some(unsafe { storage.transmute_elements::() }), alignment: layout.align(), @@ -160,14 +169,17 @@ impl CacheWrapper { self.ptr.as_ref().unwrap().size() } - // Safety: Same as slice.align_to, but considered safe internally - // Produces storage containing uninitialized values + /// Safety: Same as slice.align_to, but considered safe internally + /// Produces storage containing uninitialized values fn into_storage(mut self) -> Ptr::Output { let layout = Layout::new::(); assert_eq!(layout.align(), self.alignment); assert_eq!(layout.size(), self.size); let ptr = std::mem::take(&mut self.ptr).unwrap(); + + // Safety: This will always construct a storage with correct alignment and element size if + // this CacheWrapper was constructed with from_storage. unsafe { ptr.transmute_elements() } } } From aef3be90301ce5978ae9c4dcdab6269049c2d1da Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Fri, 21 Apr 2023 11:58:22 -0500 Subject: [PATCH 17/22] add 'in bytes' to enable_cache documentation --- src/tensor/storage_traits.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs index 997ca1bff..f4be94eca 100644 --- a/src/tensor/storage_traits.rs +++ b/src/tensor/storage_traits.rs @@ -43,12 +43,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { /// Blocks until all work on device to complete. Useful for benchmarking. fn try_synchronize(&self) -> Result<(), Self::Err>; - /// Enables the cache of the device, and sets the maximum size to `size`. + /// Enables the cache of the device, and sets the maximum size in bytes to `size`. fn enable_cache(&self, size: usize) { self.try_enable_cache(size).unwrap() } - /// Tries to enable the cache of the device, and sets the maximum size to `size`. + /// Tries to enable the cache of the device, and sets the maximum size in bytes to `size`. fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>; /// Disables the cache of the device. This will also empty the cache From 59ed3b159ce8068abd573f8f55532644b351b1c6 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 22 Apr 2023 15:30:47 -0500 Subject: [PATCH 18/22] add CacheSize enum --- src/tensor/cpu/device.rs | 8 ++++---- src/tensor/cpu/mod.rs | 10 +++++----- src/tensor/cuda/device.rs | 9 +++++---- src/tensor/cuda/mod.rs | 10 +++++----- src/tensor/storage_traits.rs | 36 ++++++++++++++++++++++++++++++++---- 5 files changed, 51 insertions(+), 22 deletions(-) diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs index f8f2e74e5..7440099c6 100644 --- a/src/tensor/cpu/device.rs +++ b/src/tensor/cpu/device.rs @@ -220,8 +220,8 @@ impl DeviceStorage for Cpu { Ok(()) } - fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> { - self.cache.enable(size); + fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err> { + self.cache.enable(size.to_num_bytes()); Ok(()) } @@ -235,8 +235,8 @@ impl DeviceStorage for Cpu { Ok(()) } - fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> { - self.cache.set_max_size(size); + fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err> { + self.cache.set_max_size(size.to_num_bytes()); Ok(()) } } diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs index 79c3437a6..360c9ae55 100644 --- a/src/tensor/cpu/mod.rs +++ b/src/tensor/cpu/mod.rs @@ -12,12 +12,12 @@ pub use device::{Cpu, CpuError}; #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*}; + use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize}; #[test] fn test_empty_cache() { let dev: Cpu = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -28,7 +28,7 @@ mod tests { #[test] fn test_disabling_cache_empties_it() { let dev: Cpu = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -39,7 +39,7 @@ mod tests { #[test] fn test_reuse_allocation_on_new_tensor() { let dev: Cpu = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); assert_eq!(dev.cache.len(), 0); let tensor: Tensor, f32, _> = dev.zeros(); assert_eq!(dev.cache.len(), 0); @@ -57,7 +57,7 @@ mod tests { #[test] fn test_reuse_allocation_on_clone_tensor() { let dev: Cpu = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let a: Tensor, f32, _> = dev.zeros(); let b: Tensor, f32, _> = dev.zeros(); drop(b); // insert allocation into cache diff --git a/src/tensor/cuda/device.rs b/src/tensor/cuda/device.rs index 28e25bb7b..acc393a67 100644 --- a/src/tensor/cuda/device.rs +++ b/src/tensor/cuda/device.rs @@ -1,3 +1,4 @@ +use crate::prelude::storage_traits::CacheSize; use crate::shapes::{Shape, Unit}; use crate::tensor::cpu::{Cpu, CpuError}; use crate::tensor::{ @@ -308,8 +309,8 @@ impl DeviceStorage for Cuda { self.dev.synchronize().map_err(CudaError::from) } - fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err> { - self.cache.enable(size); + fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err> { + self.cache.enable(size.to_num_bytes()); Ok(()) } @@ -323,8 +324,8 @@ impl DeviceStorage for Cuda { Ok(()) } - fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err> { - self.cache.set_max_size(size); + fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err> { + self.cache.set_max_size(size.to_num_bytes()); Ok(()) } } diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs index 0f8a44302..f8a4f6fb6 100644 --- a/src/tensor/cuda/mod.rs +++ b/src/tensor/cuda/mod.rs @@ -15,13 +15,13 @@ pub(crate) fn launch_cfg(n: u32) -> cudarc::driver::Laun #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*}; + use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize}; use cudarc::driver::DevicePtr; #[test] fn test_empty_cache() { let dev: Cuda = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -32,7 +32,7 @@ mod tests { #[test] fn test_disabling_cache_empties_it() { let dev: Cuda = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let tensor: Tensor, f32, _> = dev.zeros(); drop(tensor); // insert allocation into cache assert_eq!(dev.cache.len(), 1); @@ -43,7 +43,7 @@ mod tests { #[test] fn test_reuse_allocation_on_new_tensor() { let dev: Cuda = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let tensor: Tensor, f32, _> = dev.zeros(); let ptr = *tensor.data.device_ptr(); drop(tensor); // insert allocation into cache @@ -59,7 +59,7 @@ mod tests { #[test] fn test_reuse_allocation_on_clone_tensor() { let dev: Cuda = Default::default(); - dev.enable_cache(1000); + dev.enable_cache(CacheSize::KB(1)); let a: Tensor, f32, _> = dev.zeros(); let b: Tensor, f32, _> = dev.zeros(); drop(b); // insert allocation into cache diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs index f4be94eca..3764b968f 100644 --- a/src/tensor/storage_traits.rs +++ b/src/tensor/storage_traits.rs @@ -16,6 +16,34 @@ pub trait AsVec { fn as_vec(&self) -> std::vec::Vec; } +/// Expresses the size of the cache in human readable units. +#[derive(Debug, Clone, Copy)] +pub enum CacheSize { + /// Bytes + Bytes(usize), + /// Kilobytes (10 ^ 3 bytes) + KB(usize), + /// Megabytes (10 ^ 6 bytes) + MB(usize), + /// Gigabytes (10 ^ 9 bytes) + GB(usize), + /// Terabytes (10 ^ 12 bytes) + TB(usize), +} + +impl CacheSize { + /// Returns the number of bytes this CacheSize represents + pub fn to_num_bytes(self) -> usize { + match self { + CacheSize::Bytes(b) => b, + CacheSize::KB(b) => b * 1e3 as usize, + CacheSize::MB(b) => b * 1e6 as usize, + CacheSize::GB(b) => b * 1e9 as usize, + CacheSize::TB(b) => b * 1e12 as usize, + } + } +} + /// Something that can store nd arrays for a given [Shape] and [Dtype] pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { /// Generic storage type @@ -44,12 +72,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { fn try_synchronize(&self) -> Result<(), Self::Err>; /// Enables the cache of the device, and sets the maximum size in bytes to `size`. - fn enable_cache(&self, size: usize) { + fn enable_cache(&self, size: CacheSize) { self.try_enable_cache(size).unwrap() } /// Tries to enable the cache of the device, and sets the maximum size in bytes to `size`. - fn try_enable_cache(&self, size: usize) -> Result<(), Self::Err>; + fn try_enable_cache(&self, size: CacheSize) -> Result<(), Self::Err>; /// Disables the cache of the device. This will also empty the cache /// if there are things in it. See [DeviceStorage::empty_cache] for @@ -80,12 +108,12 @@ pub trait DeviceStorage: 'static + std::fmt::Debug + Default + Clone + HasErr { fn try_empty_cache(&self) -> Result<(), Self::Err>; /// Sets the maximum size of the cache in bytes, and shrinks the cache until it smaller than `size`. - fn set_cache_size(&self, size: usize) { + fn set_cache_size(&self, size: CacheSize) { self.try_set_cache_size(size).unwrap() } /// Fallible version of [DeviceStorage::set_cache_size] - fn try_set_cache_size(&self, size: usize) -> Result<(), Self::Err>; + fn try_set_cache_size(&self, size: CacheSize) -> Result<(), Self::Err>; } /// Internal trait - Represents something that can allocate its own gradient. From 902529e282536cb2e7e3811fc5449b8a3ea1c40f Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 22 Apr 2023 15:33:47 -0500 Subject: [PATCH 19/22] run cargo fmt --- src/tensor/cpu/mod.rs | 2 +- src/tensor/cuda/mod.rs | 2 +- src/tensor/storage_traits.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs index 360c9ae55..5a5a5e392 100644 --- a/src/tensor/cpu/mod.rs +++ b/src/tensor/cpu/mod.rs @@ -12,7 +12,7 @@ pub use device::{Cpu, CpuError}; #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize}; + use crate::{prelude::storage_traits::CacheSize, shapes::*, tensor::*}; #[test] fn test_empty_cache() { diff --git a/src/tensor/cuda/mod.rs b/src/tensor/cuda/mod.rs index f8a4f6fb6..b3d095f12 100644 --- a/src/tensor/cuda/mod.rs +++ b/src/tensor/cuda/mod.rs @@ -15,7 +15,7 @@ pub(crate) fn launch_cfg(n: u32) -> cudarc::driver::Laun #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*, prelude::storage_traits::CacheSize}; + use crate::{prelude::storage_traits::CacheSize, shapes::*, tensor::*}; use cudarc::driver::DevicePtr; #[test] diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs index 3764b968f..acf24a5ac 100644 --- a/src/tensor/storage_traits.rs +++ b/src/tensor/storage_traits.rs @@ -19,7 +19,7 @@ pub trait AsVec { /// Expresses the size of the cache in human readable units. #[derive(Debug, Clone, Copy)] pub enum CacheSize { - /// Bytes + /// Bytes Bytes(usize), /// Kilobytes (10 ^ 3 bytes) KB(usize), From a2a565d982d27ea4ed230a6289ad6ac43eda65ce Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 24 Apr 2023 11:03:09 -0500 Subject: [PATCH 20/22] further clarify comment; remove align to 16 check --- src/tensor/cache.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index aa91183e6..f8169d29d 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -62,7 +62,7 @@ struct AllocationKey { struct AllocationGroup { // Tracks the number of matching 'AllocationKey's in drop_queue to ignore. This is used to // "remove" the next instance of the matching AllocationKey in the drop_queue, without having - // to run an O(n) operation to actually remove the key. + // to run an O(n) operation to actually remove the key from the queue. ignore_drops: usize, allocations: Vec>, } @@ -111,7 +111,6 @@ pub(crate) trait CacheStorage: Sized { 2 => drop(self.transmute_elements::()), 4 => drop(self.transmute_elements::()), 8 => drop(self.transmute_elements::()), - 16 => drop(self.transmute_elements::()), _ => panic!("Invalid alignment"), } } From 58518e25e153105c30753b6678e1989a5a43e149 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 24 Apr 2023 11:27:22 -0500 Subject: [PATCH 21/22] add check to minimize RwLock operations in shrink; export CacheSize --- src/tensor/cache.rs | 5 +++++ src/tensor/mod.rs | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tensor/cache.rs b/src/tensor/cache.rs index f8169d29d..dc5e3d9bd 100644 --- a/src/tensor/cache.rs +++ b/src/tensor/cache.rs @@ -247,6 +247,11 @@ impl TensorCache { fn shrink(&self) { let mut size = write!(self.size); let max_size = read!(self.max_size); + + if *size < *max_size { + return; + } + let mut drop_queue = write!(self.drop_queue); let mut allocations = write!(self.allocations); diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs index e9eb2f9f3..da19e00e3 100644 --- a/src/tensor/mod.rs +++ b/src/tensor/mod.rs @@ -169,7 +169,7 @@ pub use cuda::{Cuda, CudaError}; pub type AutoDevice = Cuda; pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec}; -pub use storage_traits::{DeviceStorage, HasErr}; +pub use storage_traits::{DeviceStorage, HasErr, CacheSize}; pub use storage_traits::{OnesTensor, SampleTensor, TriangleTensor, ZerosTensor}; pub use tensor_impls::{PutTape, SplitTape, Tensor, Trace, WithEmptyTape}; From 5a03b08187fcd41be8daeffb7d4097026a509961 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 24 Apr 2023 12:00:31 -0500 Subject: [PATCH 22/22] run cargo fmt --- src/tensor/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs index da19e00e3..55e1f43f3 100644 --- a/src/tensor/mod.rs +++ b/src/tensor/mod.rs @@ -169,7 +169,7 @@ pub use cuda::{Cuda, CudaError}; pub type AutoDevice = Cuda; pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec}; -pub use storage_traits::{DeviceStorage, HasErr, CacheSize}; +pub use storage_traits::{CacheSize, DeviceStorage, HasErr}; pub use storage_traits::{OnesTensor, SampleTensor, TriangleTensor, ZerosTensor}; pub use tensor_impls::{PutTape, SplitTape, Tensor, Trace, WithEmptyTape};