From 94a9942cc48fd630059b3e0d4d879eb837f151b2 Mon Sep 17 00:00:00 2001 From: Corey Lowman Date: Sat, 8 Apr 2023 12:49:18 -0400 Subject: [PATCH 1/2] Simplify upscale cuda kernels --- src/tensor_ops/upscale2d/cuda_kernel.rs | 35 +++++++++---------------- src/tensor_ops/upscale2d/upscale2d.cu | 12 ++------- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/src/tensor_ops/upscale2d/cuda_kernel.rs b/src/tensor_ops/upscale2d/cuda_kernel.rs index ea5ab70e2..2d2013617 100644 --- a/src/tensor_ops/upscale2d/cuda_kernel.rs +++ b/src/tensor_ops/upscale2d/cuda_kernel.rs @@ -1,11 +1,11 @@ use crate::{ shapes::*, - tensor::{Cuda, Tensor}, + tensor::{launch_cfg, Cuda, Tensor}, }; use std::sync::Arc; -use cudarc::driver::{DeviceRepr, LaunchAsync, LaunchConfig}; +use cudarc::driver::{DeviceRepr, LaunchAsync}; use super::{Bilinear, NearestNeighbor, UpscaleMethod}; @@ -13,9 +13,9 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/upscale2d.ptx")); unsafe impl DeviceRepr for super::Upscale2DOp {} -fn make_4d(strides: S::Concrete, pad: usize) -> [usize; 4] { +fn make_4d(strides: S::Concrete) -> [usize; 4] { match S::NUM_DIMS { - 3 => [pad, strides[0], strides[1], strides[2]], + 3 => [0, strides[0], strides[1], strides[2]], 4 => [strides[0], strides[1], strides[2], strides[3]], _ => panic!("Only implemented for 3d & 4d arrays"), } @@ -56,16 +56,14 @@ where .load_ptx(PTX_SRC.into(), Self::FWD, &[Self::FWD, Self::BWD])?; } - let inp_strides = self.dev.htod_copy(make_4d::(inp.strides, 0).into())?; - let out_strides = self.dev.htod_copy(make_4d::(out.strides, 0).into())?; + let strides = self.dev.htod_copy(make_4d::(inp.strides).into())?; let fwd_fn = self.dev.get_func(Self::FWD, Self::FWD).unwrap(); - let cfg = LaunchConfig::for_num_elems(out.shape().num_elements() as u32); + let cfg = launch_cfg(out.shape().num_elements() as u32); let params = ( - op, // const Pool2dOp op, - &inp_strides, // const size_t *inp_strides, - &out_strides, // const size_t *out_strides, - inp.data.as_ref(), // const float *inp, - Arc::make_mut(&mut out.data), // float *out + op, + &strides, + inp.data.as_ref(), + Arc::make_mut(&mut out.data), ); unsafe { fwd_fn.launch(cfg, params) }?; Ok(()) @@ -78,17 +76,10 @@ where out: &Tensor, grad_out: &Self::Vec, ) -> Result<(), Self::Err> { - let inp_strides = self.dev.htod_copy(make_4d::(inp.strides, 0).into())?; - let out_strides = self.dev.htod_copy(make_4d::(out.strides, 0).into())?; + let strides = self.dev.htod_copy(make_4d::(inp.strides).into())?; let bwd_fn = self.dev.get_func(Self::FWD, Self::BWD).unwrap(); - let cfg = LaunchConfig::for_num_elems(out.shape().num_elements() as u32); - let params = ( - op, // const Pool2dOp op, - &inp_strides, // const size_t *inp_strides, - &out_strides, // const size_t *out_strides, - grad_inp, // float *grad_inp, - grad_out, // const float *grad_out - ); + let cfg = launch_cfg(out.shape().num_elements() as u32); + let params = (op, &strides, grad_inp, grad_out); unsafe { bwd_fn.launch(cfg, params) }?; Ok(()) } diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu index 39786f4e5..33f337d42 100644 --- a/src/tensor_ops/upscale2d/upscale2d.cu +++ b/src/tensor_ops/upscale2d/upscale2d.cu @@ -13,7 +13,6 @@ template __device__ void nearest_upscale2d_fwd( const Upscale2dOp op, const size_t *inp_strides, - const size_t *out_strides, const T *inp, // 4d (Batch, Channels, Height, Width) T *out // 4d (Batch, Channels, HeightOut, WidthOut) ) { @@ -46,7 +45,6 @@ template __device__ void nearest_upscale2d_bwd( const Upscale2dOp op, const size_t *inp_strides, - const size_t *out_strides, T *grad_inp, const T *grad_out // 4d (Batch, Channels, HeightOut, WidthOut) ) { @@ -78,7 +76,6 @@ template __device__ void bilinear_upscale2d_fwd( const Upscale2dOp op, const size_t *inp_strides, - const size_t *out_strides, const T *inp, // 4d (Batch, Channels, Height, Width) T *out // 4d (Batch, Channels, HeightOut, WidthOut) ) { @@ -98,7 +95,6 @@ __device__ void bilinear_upscale2d_fwd( const size_t c = idx % op.chan; idx /= op.chan; const size_t b = idx % op.batch; - idx /= op.batch; size_t y0 = min(static_cast(h_scale * oh), op.h_out - 1); size_t y1 = min(y0 + 1, op.h_out - 1); @@ -122,7 +118,6 @@ template __device__ void bilinear_upscale2d_bwd( const Upscale2dOp op, const size_t *inp_strides, - const size_t *out_strides, T *grad_inp, // 4d (Batch, Channels, Height, Width) const T *grad_out // 4d (Batch, Channels, HeightOut, WidthOut) ) { @@ -142,7 +137,6 @@ __device__ void bilinear_upscale2d_bwd( const size_t c = idx % op.chan; idx /= op.chan; const size_t b = idx % op.batch; - idx /= op.batch; size_t y0 = min(static_cast(h_scale * oh), op.h_out - 1); size_t y1 = min(y0 + 1, op.h_out - 1); @@ -166,20 +160,18 @@ __device__ void bilinear_upscale2d_bwd( extern "C" __global__ void fwd( \ const Upscale2dOp op, \ const size_t *inp_strides, \ - const size_t *out_strides, \ const TYPENAME *inp, \ TYPENAME *out \ ) { \ - fwd_FN(op, inp_strides, out_strides, inp, out); \ + fwd_FN(op, inp_strides, inp, out); \ } \ extern "C" __global__ void bwd( \ const Upscale2dOp op, \ const size_t *inp_strides, \ - const size_t *out_strides, \ TYPENAME *grad_inp, \ const TYPENAME *grad_out \ ) { \ - bwd_FN(op, inp_strides, out_strides, grad_inp, grad_out); \ + bwd_FN(op, inp_strides, grad_inp, grad_out); \ } UPSCALE_OP( From 5751dc4d5200c98f6919c548fc5907c54a8f30a0 Mon Sep 17 00:00:00 2001 From: Corey Lowman Date: Sat, 8 Apr 2023 19:16:42 +0000 Subject: [PATCH 2/2] Fixing bounds checks in upscale kernel --- src/tensor_ops/upscale2d/upscale2d.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu index 33f337d42..49ad1b91a 100644 --- a/src/tensor_ops/upscale2d/upscale2d.cu +++ b/src/tensor_ops/upscale2d/upscale2d.cu @@ -33,8 +33,8 @@ __device__ void nearest_upscale2d_fwd( idx /= op.chan; const size_t b = idx % op.batch; - size_t ih = min(static_cast(h_scale * oh), op.h_out - 1); - size_t iw = min(static_cast(w_scale * ow), op.w_out - 1); + size_t ih = min(static_cast(h_scale * oh), op.h_in - 1); + size_t iw = min(static_cast(w_scale * ow), op.w_in - 1); size_t inp_i = b * inp_strides[0] + c * inp_strides[1] + ih * inp_strides[2] + iw * inp_strides[3]; @@ -65,8 +65,8 @@ __device__ void nearest_upscale2d_bwd( idx /= op.chan; const size_t b = idx % op.batch; - size_t ih = min(static_cast(h_scale * oh), op.h_out - 1); - size_t iw = min(static_cast(w_scale * ow), op.w_out - 1); + size_t ih = min(static_cast(h_scale * oh), op.h_in - 1); + size_t iw = min(static_cast(w_scale * ow), op.w_in - 1); size_t inp_i = b * inp_strides[0] + c * inp_strides[1] + ih * inp_strides[2] + iw * inp_strides[3]; atomicAdd(grad_inp + inp_i, grad_out[i]); @@ -96,10 +96,10 @@ __device__ void bilinear_upscale2d_fwd( idx /= op.chan; const size_t b = idx % op.batch; - size_t y0 = min(static_cast(h_scale * oh), op.h_out - 1); - size_t y1 = min(y0 + 1, op.h_out - 1); - size_t x0 = min(static_cast(w_scale * ow), op.w_out - 1); - size_t x1 = min(x0 + 1, op.w_out - 1); + size_t y0 = min(static_cast(h_scale * oh), op.h_in - 1); + size_t y1 = min(y0 + 1, op.h_in - 1); + size_t x0 = min(static_cast(w_scale * ow), op.w_in - 1); + size_t x1 = min(x0 + 1, op.w_in - 1); T hs = h_scale * oh - y0; T ws = w_scale * ow - x0; @@ -138,10 +138,10 @@ __device__ void bilinear_upscale2d_bwd( idx /= op.chan; const size_t b = idx % op.batch; - size_t y0 = min(static_cast(h_scale * oh), op.h_out - 1); - size_t y1 = min(y0 + 1, op.h_out - 1); - size_t x0 = min(static_cast(w_scale * ow), op.w_out - 1); - size_t x1 = min(x0 + 1, op.w_out - 1); + size_t y0 = min(static_cast(h_scale * oh), op.h_in - 1); + size_t y1 = min(y0 + 1, op.h_in - 1); + size_t x0 = min(static_cast(w_scale * ow), op.w_in - 1); + size_t x1 = min(x0 + 1, op.w_in - 1); T hs = h_scale * oh - y0; T ws = w_scale * ow - x0;