From 94a9942cc48fd630059b3e0d4d879eb837f151b2 Mon Sep 17 00:00:00 2001
From: Corey Lowman <clowman1993@gmail.com>
Date: Sat, 8 Apr 2023 12:49:18 -0400
Subject: [PATCH 1/2] Simplify upscale cuda kernels

---
 src/tensor_ops/upscale2d/cuda_kernel.rs | 35 +++++++++----------------
 src/tensor_ops/upscale2d/upscale2d.cu   | 12 ++-------
 2 files changed, 15 insertions(+), 32 deletions(-)
diff --git a/src/tensor_ops/upscale2d/cuda_kernel.rs b/src/tensor_ops/upscale2d/cuda_kernel.rs
index ea5ab70e2..2d2013617 100644
--- a/src/tensor_ops/upscale2d/cuda_kernel.rs
+++ b/src/tensor_ops/upscale2d/cuda_kernel.rs
@@ -1,11 +1,11 @@
 use crate::{
     shapes::*,
-    tensor::{Cuda, Tensor},
+    tensor::{launch_cfg, Cuda, Tensor},
 };
 
 use std::sync::Arc;
 
-use cudarc::driver::{DeviceRepr, LaunchAsync, LaunchConfig};
+use cudarc::driver::{DeviceRepr, LaunchAsync};
 
 use super::{Bilinear, NearestNeighbor, UpscaleMethod};
 
@@ -13,9 +13,9 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/upscale2d.ptx"));
 
 unsafe impl DeviceRepr for super::Upscale2DOp {}
 
-fn make_4d<S: Shape>(strides: S::Concrete, pad: usize) -> [usize; 4] {
+fn make_4d<S: Shape>(strides: S::Concrete) -> [usize; 4] {
     match S::NUM_DIMS {
-        3 => [pad, strides[0], strides[1], strides[2]],
+        3 => [0, strides[0], strides[1], strides[2]],
         4 => [strides[0], strides[1], strides[2], strides[3]],
         _ => panic!("Only implemented for 3d & 4d arrays"),
     }
@@ -56,16 +56,14 @@ where
                 .load_ptx(PTX_SRC.into(), Self::FWD, &[Self::FWD, Self::BWD])?;
         }
 
-        let inp_strides = self.dev.htod_copy(make_4d::<I>(inp.strides, 0).into())?;
-        let out_strides = self.dev.htod_copy(make_4d::<O>(out.strides, 0).into())?;
+        let strides = self.dev.htod_copy(make_4d::<I>(inp.strides).into())?;
         let fwd_fn = self.dev.get_func(Self::FWD, Self::FWD).unwrap();
-        let cfg = LaunchConfig::for_num_elems(out.shape().num_elements() as u32);
+        let cfg = launch_cfg(out.shape().num_elements() as u32);
         let params = (
-            op,                           // const Pool2dOp op,
-            &inp_strides,                 // const size_t *inp_strides,
-            &out_strides,                 // const size_t *out_strides,
-            inp.data.as_ref(),            // const float *inp,
-            Arc::make_mut(&mut out.data), // float *out
+            op,
+            &strides,
+            inp.data.as_ref(),
+            Arc::make_mut(&mut out.data),
         );
         unsafe { fwd_fn.launch(cfg, params) }?;
         Ok(())
@@ -78,17 +76,10 @@ where
         out: &Tensor<O, E, Self>,
         grad_out: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
-        let inp_strides = self.dev.htod_copy(make_4d::<I>(inp.strides, 0).into())?;
-        let out_strides = self.dev.htod_copy(make_4d::<O>(out.strides, 0).into())?;
+        let strides = self.dev.htod_copy(make_4d::<I>(inp.strides).into())?;
         let bwd_fn = self.dev.get_func(Self::FWD, Self::BWD).unwrap();
-        let cfg = LaunchConfig::for_num_elems(out.shape().num_elements() as u32);
-        let params = (
-            op,           // const Pool2dOp op,
-            &inp_strides, // const size_t *inp_strides,
-            &out_strides, // const size_t *out_strides,
-            grad_inp,     // float *grad_inp,
-            grad_out,     // const float *grad_out
-        );
+        let cfg = launch_cfg(out.shape().num_elements() as u32);
+        let params = (op, &strides, grad_inp, grad_out);
         unsafe { bwd_fn.launch(cfg, params) }?;
         Ok(())
     }
diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu
index 39786f4e5..33f337d42 100644
--- a/src/tensor_ops/upscale2d/upscale2d.cu
+++ b/src/tensor_ops/upscale2d/upscale2d.cu
@@ -13,7 +13,6 @@ template<typename T>
 __device__ void nearest_upscale2d_fwd(
     const Upscale2dOp op,
     const size_t *inp_strides,
-    const size_t *out_strides,
     const T *inp, // 4d (Batch, Channels, Height, Width)
     T *out // 4d (Batch, Channels, HeightOut, WidthOut)
 ) {
@@ -46,7 +45,6 @@ template<typename T>
 __device__ void nearest_upscale2d_bwd(
     const Upscale2dOp op,
     const size_t *inp_strides,
-    const size_t *out_strides,
     T *grad_inp,
     const T *grad_out // 4d (Batch, Channels, HeightOut, WidthOut)
 ) {
@@ -78,7 +76,6 @@ template<typename T>
 __device__ void bilinear_upscale2d_fwd(
     const Upscale2dOp op,
     const size_t *inp_strides,
-    const size_t *out_strides,
     const T *inp, // 4d (Batch, Channels, Height, Width)
     T *out // 4d (Batch, Channels, HeightOut, WidthOut)
 ) {
@@ -98,7 +95,6 @@ __device__ void bilinear_upscale2d_fwd(
     const size_t c = idx % op.chan;
     idx /= op.chan;
     const size_t b = idx % op.batch;
-    idx /= op.batch;
 
     size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
     size_t y1 = min(y0 + 1, op.h_out - 1);
@@ -122,7 +118,6 @@ template<typename T>
 __device__ void bilinear_upscale2d_bwd(
     const Upscale2dOp op,
     const size_t *inp_strides,
-    const size_t *out_strides,
     T *grad_inp, // 4d (Batch, Channels, Height, Width)
     const T *grad_out // 4d (Batch, Channels, HeightOut, WidthOut)
 ) {
@@ -142,7 +137,6 @@ __device__ void bilinear_upscale2d_bwd(
     const size_t c = idx % op.chan;
     idx /= op.chan;
     const size_t b = idx % op.batch;
-    idx /= op.batch;
 
     size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
     size_t y1 = min(y0 + 1, op.h_out - 1);
@@ -166,20 +160,18 @@ __device__ void bilinear_upscale2d_bwd(
 extern "C" __global__ void fwd( \
     const Upscale2dOp op, \
     const size_t *inp_strides, \
-    const size_t *out_strides, \
     const TYPENAME *inp, \
     TYPENAME *out \
 ) { \
-    fwd_FN(op, inp_strides, out_strides, inp, out); \
+    fwd_FN(op, inp_strides, inp, out); \
 } \
 extern "C" __global__ void bwd( \
     const Upscale2dOp op, \
     const size_t *inp_strides, \
-    const size_t *out_strides, \
     TYPENAME *grad_inp, \
     const TYPENAME *grad_out \
 ) { \
-    bwd_FN(op, inp_strides, out_strides, grad_inp, grad_out); \
+    bwd_FN(op, inp_strides, grad_inp, grad_out); \
 }
 
 UPSCALE_OP(

From 5751dc4d5200c98f6919c548fc5907c54a8f30a0 Mon Sep 17 00:00:00 2001
From: Corey Lowman <clowman1993@gmail.com>
Date: Sat, 8 Apr 2023 19:16:42 +0000
Subject: [PATCH 2/2] Fixing bounds checks in upscale kernel

---
 src/tensor_ops/upscale2d/upscale2d.cu | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu
index 33f337d42..49ad1b91a 100644
--- a/src/tensor_ops/upscale2d/upscale2d.cu
+++ b/src/tensor_ops/upscale2d/upscale2d.cu
@@ -33,8 +33,8 @@ __device__ void nearest_upscale2d_fwd(
     idx /= op.chan;
     const size_t b = idx % op.batch;
 
-    size_t ih = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
-    size_t iw = min(static_cast<size_t>(w_scale * ow), op.w_out - 1);
+    size_t ih = min(static_cast<size_t>(h_scale * oh), op.h_in - 1);
+    size_t iw = min(static_cast<size_t>(w_scale * ow), op.w_in - 1);
 
     size_t inp_i = b * inp_strides[0] + c * inp_strides[1] + ih * inp_strides[2] + iw * inp_strides[3];
     
@@ -65,8 +65,8 @@ __device__ void nearest_upscale2d_bwd(
     idx /= op.chan;
     const size_t b = idx % op.batch;
 
-    size_t ih = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
-    size_t iw = min(static_cast<size_t>(w_scale * ow), op.w_out - 1);
+    size_t ih = min(static_cast<size_t>(h_scale * oh), op.h_in - 1);
+    size_t iw = min(static_cast<size_t>(w_scale * ow), op.w_in - 1);
 
     size_t inp_i = b * inp_strides[0] + c * inp_strides[1] + ih * inp_strides[2] + iw * inp_strides[3];
     atomicAdd(grad_inp + inp_i, grad_out[i]);
@@ -96,10 +96,10 @@ __device__ void bilinear_upscale2d_fwd(
     idx /= op.chan;
     const size_t b = idx % op.batch;
 
-    size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
-    size_t y1 = min(y0 + 1, op.h_out - 1);
-    size_t x0 = min(static_cast<size_t>(w_scale * ow), op.w_out - 1);
-    size_t x1 = min(x0 + 1, op.w_out - 1);
+    size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_in - 1);
+    size_t y1 = min(y0 + 1, op.h_in - 1);
+    size_t x0 = min(static_cast<size_t>(w_scale * ow), op.w_in - 1);
+    size_t x1 = min(x0 + 1, op.w_in - 1);
 
     T hs = h_scale * oh - y0;
     T ws = w_scale * ow - x0;
@@ -138,10 +138,10 @@ __device__ void bilinear_upscale2d_bwd(
     idx /= op.chan;
     const size_t b = idx % op.batch;
 
-    size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_out - 1);
-    size_t y1 = min(y0 + 1, op.h_out - 1);
-    size_t x0 = min(static_cast<size_t>(w_scale * ow), op.w_out - 1);
-    size_t x1 = min(x0 + 1, op.w_out - 1);
+    size_t y0 = min(static_cast<size_t>(h_scale * oh), op.h_in - 1);
+    size_t y1 = min(y0 + 1, op.h_in - 1);
+    size_t x0 = min(static_cast<size_t>(w_scale * ow), op.w_in - 1);
+    size_t x1 = min(x0 + 1, op.w_in - 1);
 
     T hs = h_scale * oh - y0;
     T ws = w_scale * ow - x0;