Using 128 threads by default for cuda kernels #599

Merged (1 commit, Mar 22, 2023)
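This change replaces the cudarc LaunchConfig::for_num_elems calls in the optimizer and tensor-op CUDA kernels with a new crate-internal launch_cfg helper in src/tensor/cuda/mod.rs, so each kernel is launched with blocks of 128 threads and a grid of ceil(numel / 128) blocks rather than the block size chosen by for_num_elems.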
11 changes: 8 additions & 3 deletions src/optim/adam/cuda_kernel.rs
@@ -1,5 +1,10 @@
-use crate::{optim::optimizer::*, shapes::*, tensor::Cuda};
-use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig};
+use crate::{
+optim::optimizer::*,
+shapes::*,
+tensor::{launch_cfg, Cuda},
+};
+
+use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};

#[repr(C)]
struct CudaAdamConfig<E> {
@@ -63,7 +68,7 @@ where
let opt_cfg = adam_config_to_cuda(cfg);
let numel = param.len();
let func = self.dev.get_func(Self::MOD, Self::FWD).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let t = <E>::from_i32(t).unwrap();
let params = (opt_cfg, numel, t, param, moment1, moment2, grad);
unsafe { func.launch(cfg, params) }?;
11 changes: 8 additions & 3 deletions src/optim/rmsprop/cuda_kernel.rs
@@ -1,6 +1,11 @@
use super::RMSpropConfig;
-use crate::{optim::optimizer::*, shapes::*, tensor::Cuda};
-use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig};
+use crate::{
+optim::optimizer::*,
+shapes::*,
+tensor::{launch_cfg, Cuda},
+};
+
+use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};

#[repr(C)]
struct CudaRMSpropConfig<E> {
@@ -73,7 +78,7 @@ where
let opt_cfg = rmsprop_config_to_cuda(cfg);
let numel = param.len();
let func = self.dev.get_func(Self::MOD, Self::FWD).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (opt_cfg, numel, param, momentum, square_avg, grad_avg, grad);
unsafe { func.launch(cfg, params) }?;
Ok(())
11 changes: 8 additions & 3 deletions src/optim/sgd/cuda_kernel.rs
@@ -1,6 +1,11 @@
use super::SgdConfig;
-use crate::{optim::optimizer::*, shapes::*, tensor::Cuda};
-use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig};
+use crate::{
+optim::optimizer::*,
+shapes::*,
+tensor::{launch_cfg, Cuda},
+};
+
+use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};

#[repr(C)]
struct CudaSgdConfig<E> {
@@ -61,7 +66,7 @@ where
let opt_cfg = sgd_config_to_cuda(cfg);
let numel = param.len();
let func = self.dev.get_func(Self::MOD, Self::FWD).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
unsafe { func.launch(cfg, (opt_cfg, numel, param, velocity, grad)) }?;
Ok(())
}
10 changes: 10 additions & 0 deletions src/tensor/cuda/mod.rs
@@ -2,3 +2,13 @@ mod allocate;
mod device;

pub use device::{Cuda, CudaError};
+
+pub(crate) fn launch_cfg(n: u32) -> cudarc::driver::LaunchConfig {
+const NUM_THREADS: u32 = 128;
+let num_blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
+cudarc::driver::LaunchConfig {
+grid_dim: (num_blocks, 1, 1),
+block_dim: (NUM_THREADS, 1, 1),
+shared_mem_bytes: 0,
+}
+}
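As a quick sanity check on the grid sizing above (not part of this PR; the test name and values are illustrative worked arithmetic only), a crate-internal test along these lines should pass. Any surplus threads in the final block are presumably skipped by a bounds check inside the kernels, which all receive numel as a parameter.

#[test]
fn launch_cfg_uses_128_thread_blocks() {
    // 1_000_000 elements -> ceil(1_000_000 / 128) = 7813 blocks of 128 threads each.
    let cfg = launch_cfg(1_000_000);
    assert_eq!(cfg.block_dim, (128, 1, 1));
    assert_eq!(cfg.grid_dim, (7813, 1, 1));
    // An exact multiple of 128 adds no partial block: 256 / 128 = 2.
    assert_eq!(launch_cfg(256).grid_dim, (2, 1, 1));
}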
2 changes: 2 additions & 0 deletions src/tensor/mod.rs
@@ -143,6 +143,8 @@ pub use cpu::{Cpu, CpuError};
#[cfg(not(feature = "cuda"))]
pub type AutoDevice = Cpu;

+#[cfg(feature = "cuda")]
+pub(crate) use cuda::launch_cfg;
#[cfg(feature = "cuda")]
pub use cuda::{Cuda, CudaError};
#[cfg(feature = "cuda")]
4 changes: 2 additions & 2 deletions src/tensor_ops/attention_reshape/cuda_kernel.rs
@@ -1,6 +1,6 @@
use super::*;
use crate::tensor::cuda::Cuda;
-use cudarc::driver::{DeviceRepr, LaunchAsync, LaunchConfig};
+use cudarc::driver::{DeviceRepr, LaunchAsync};

const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/attention_reshape.ptx"));

@@ -72,7 +72,7 @@ where
sequence_length,
past_length,
};
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
op,
qkv.data.as_ref(),
9 changes: 6 additions & 3 deletions src/tensor_ops/axpy/cuda_kernel.rs
@@ -1,6 +1,9 @@
-use crate::{shapes::*, tensor::Cuda};
+use crate::{
+shapes::*,
+tensor::{launch_cfg, Cuda},
+};

-use cudarc::driver::{DeviceSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{DeviceSlice, LaunchAsync};

const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/axpy.ptx"));

@@ -30,7 +33,7 @@ where
}
let numel = a.len();
let fwd_fn = self.dev.get_func(Self::FN, Self::FN).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
unsafe { fwd_fn.launch(cfg, (numel, a, alpha, b, beta)) }?;
Ok(())
}
9 changes: 3 additions & 6 deletions src/tensor_ops/boolean/cuda_kernels.rs
@@ -1,10 +1,7 @@
use super::BooleanKernel;
use crate::{
shapes::Shape,
-tensor::{
-cuda::{Cuda, CudaError},
-Tensor,
-},
+tensor::{launch_cfg, Cuda, CudaError, Tensor},
};
use cudarc::driver::*;

@@ -35,7 +32,7 @@ impl Cuda {
let rhs_strides: CudaSlice<usize> = self.dev.htod_copy(rhs.strides.into())?;

let fwd_fn = self.dev.get_func(MODULE_NAME, fn_name).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
S::NUM_DIMS, // const size_t num_dims,
@@ -65,7 +62,7 @@ impl BooleanKernel for Cuda {
let mut storage = unsafe { self.dev.alloc(numel) }?;

let fwd_fn = self.dev.get_func(MODULE_NAME, "boolean_not").unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
inp.data.as_ref(), // const bool *inp,
8 changes: 4 additions & 4 deletions src/tensor_ops/choose/cuda_kernel.rs
@@ -1,8 +1,8 @@
use crate::{
shapes::*,
-tensor::{Cuda, Tensor},
+tensor::{launch_cfg, Cuda, Tensor},
};
-use cudarc::driver::{CudaSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{CudaSlice, LaunchAsync};

const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/choose.ptx"));

@@ -47,7 +47,7 @@ where
let rhs_strides: CudaSlice<usize> = self.dev.htod_copy(rhs.strides.into())?;

let fwd_fn = self.dev.get_func(Self::MOD, Self::FNS[0]).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
S::NUM_DIMS, // const size_t num_dims,
@@ -81,7 +81,7 @@ where
let cond_strides: CudaSlice<usize> = self.dev.htod_copy(cond.strides.into())?;
let rhs_strides: CudaSlice<usize> = self.dev.htod_copy(rhs.strides.into())?;

-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
S::NUM_DIMS, // const size_t num_dims,
8 changes: 4 additions & 4 deletions src/tensor_ops/cmp/cuda_kernels.rs
@@ -1,8 +1,8 @@
use crate::{
shapes::{Shape, Unit},
-tensor::{Cuda, Tensor},
+tensor::{launch_cfg, Cuda, Tensor},
};
-use cudarc::driver::{CudaSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{CudaSlice, LaunchAsync};

use super::{
CmpKernel, EqKernelOp, GeKernelOp, GtKernelOp, LeKernelOp, LtKernelOp, NeKernelOp,
@@ -56,7 +56,7 @@ impl<E: Unit, Op: CmpOpCudaKernel<E>> CmpKernel<Op, E> for Cuda {
let out_strides: CudaSlice<usize> = self.dev.htod_copy(strides.into())?;

let fwd_fn = self.dev.get_func(Op::MODULE_NAME, Op::FWD_FN_NAME).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
S::NUM_DIMS, // const size_t num_dims,
@@ -95,7 +95,7 @@ impl<E: Unit, Op: ScalarCmpOpCudaKernel<E>> ScalarCmpKernel<Op, E> for Cuda {
let out_strides: CudaSlice<usize> = self.dev.htod_copy(strides.into())?;

let fwd_fn = self.dev.get_func(Op::MODULE_NAME, Op::FWD_FN_NAME).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (
numel, // const size_t numel,
S::NUM_DIMS, // const size_t num_dims,
8 changes: 4 additions & 4 deletions src/tensor_ops/concat/cuda_kernel.rs
@@ -1,8 +1,8 @@
use crate::{
shapes::*,
-tensor::{unique_id, Cuda, Tensor},
+tensor::{launch_cfg, unique_id, Cuda, Tensor},
};
-use cudarc::driver::{DeviceSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{DeviceSlice, LaunchAsync};

const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/concat.ptx"));

@@ -67,14 +67,14 @@ where
{
let f = self.dev.get_func(Self::BWD_FN, Self::BWD_FN).unwrap();
let numel = grad_a.len();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
unsafe { f.launch(cfg, (numel, &grad_out.slice(0..numel), grad_a)) }?;
offset += numel;
}
{
let f = self.dev.get_func(Self::BWD_FN, Self::BWD_FN).unwrap();
let numel = grad_b.len();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
unsafe { f.launch(cfg, (numel, &grad_out.slice(offset..), grad_b)) }?;
}
Ok(())
12 changes: 6 additions & 6 deletions src/tensor_ops/conv2d/cuda_kernel.rs
@@ -1,10 +1,10 @@
use cudarc::cublas::{CudaBlas, Gemm};
-use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig, ValidAsZeroBits};
+use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync, ValidAsZeroBits};

use crate::tensor_ops::matmul::cuda_kernel::sgemm_batch;
use crate::{
shapes::*,
-tensor::{Cuda, Tensor},
+tensor::{launch_cfg, Cuda, Tensor},
};

use std::sync::Arc;
@@ -66,7 +66,7 @@ where
let mut patches = self.dev.alloc_zeros::<E>(patches_numel)?;
let img_strides = self.dev.htod_copy(make_4d::<L>(lhs.strides).into())?;
let unfold_fn = self.dev.get_func(Self::MOD, Self::FNS[0]).unwrap();
-let cfg = LaunchConfig::for_num_elems(patches.len() as u32);
+let cfg = launch_cfg(patches.len() as u32);
let params = (op, lhs.data.as_ref(), &img_strides, &mut patches);
unsafe { unfold_fn.launch(cfg, params) }?;

@@ -108,7 +108,7 @@
{
// unfold grad_out into patches
let unfold_fn = self.dev.get_func(Self::MOD, Self::FNS[1]).unwrap();
-let cfg = LaunchConfig::for_num_elems(patches_numel as u32);
+let cfg = launch_cfg(patches_numel as u32);
unsafe { unfold_fn.launch(cfg, (op, grad_out, &mut patches)) }?;
}

@@ -121,7 +121,7 @@ where
// prepare filters for backward operations by
// swapping dims 0 and 1 and adding a batch dimension
let tr_fn = self.dev.get_func(Self::MOD, Self::FNS[2]).unwrap();
-let cfg = LaunchConfig::for_num_elems(rhs.shape.num_elements() as u32);
+let cfg = launch_cfg(rhs.shape.num_elements() as u32);
unsafe { tr_fn.launch(cfg, (op, rhs.data.as_ref(), &f_strides, &mut f_b1023)) }?;
}

@@ -171,7 +171,7 @@ where
// sum all the gradients collected in our broadcasted grad_f
// into grad_rhs
let sum_fn = self.dev.get_func(Self::MOD, Self::FNS[3]).unwrap();
-let cfg = LaunchConfig::for_num_elems(rhs.shape.num_elements() as u32);
+let cfg = launch_cfg(rhs.shape.num_elements() as u32);
unsafe { sum_fn.launch(cfg, (op, &grad_f_b1023, grad_rhs, &f_strides)) }?;
}

8 changes: 4 additions & 4 deletions src/tensor_ops/dropout/cuda_kernel.rs
@@ -1,11 +1,11 @@
use crate::{
shapes::*,
-tensor::{cuda::Cuda, Tensor},
+tensor::{launch_cfg, Cuda, Tensor},
};

use std::vec::Vec;

-use cudarc::driver::{DeviceSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{DeviceSlice, LaunchAsync};

use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Standard};
@@ -52,7 +52,7 @@ where
let mut storage = unsafe { self.dev.alloc::<E>(numel) }?;

let fwd_fn = self.dev.get_func(Self::MOD, Self::FNS[0]).unwrap();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (op.prob, numel, inp.data.as_ref(), &noise, &mut storage);
unsafe { fwd_fn.launch(cfg, params) }?;
Ok(self.build_tensor(inp.shape, inp.strides, storage))
@@ -72,7 +72,7 @@ where
}?;
let bwd_fn = self.dev.get_func(Self::MOD, Self::FNS[1]).unwrap();
let numel = inp.data.len();
-let cfg = LaunchConfig::for_num_elems(numel as u32);
+let cfg = launch_cfg(numel as u32);
let params = (op.prob, numel, &noise, grad_inp, grad_out);
unsafe { bwd_fn.launch(cfg, params) }?;
Ok(())
10 changes: 5 additions & 5 deletions src/tensor_ops/max_to/cuda_kernel.rs
@@ -1,10 +1,10 @@
use crate::{
shapes::*,
-tensor::{cuda::Cuda, Tensor},
+tensor::{launch_cfg, Cuda, Tensor},
tensor_ops::reduction_utils::*,
};

-use cudarc::driver::{CudaSlice, DeviceSlice, LaunchAsync, LaunchConfig};
+use cudarc::driver::{CudaSlice, DeviceSlice, LaunchAsync};

const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/max_to.ptx"));

@@ -46,7 +46,7 @@ where
let mut storage = unsafe {
let mut storage = self.dev.alloc::<E>(dst.num_elements())?;
fill_fn.launch(
-LaunchConfig::for_num_elems(dst.num_elements() as u32),
+launch_cfg(dst.num_elements() as u32),
(&mut storage, Self::INIT, dst.num_elements()),
)?;
storage
@@ -63,7 +63,7 @@ where
reduction_output_strides::<Ax, Src, Dst>(inp.strides, dst);
let chunk_len = physical_numel / dst_physical_numel;

-let cfg = LaunchConfig::for_num_elems(physical_numel as u32);
+let cfg = launch_cfg(physical_numel as u32);
let params = (
physical_numel, // const size_t numel,
dims.len(), // const size_t num_dims,
@@ -103,7 +103,7 @@ where
))
.unwrap();

-let cfg = LaunchConfig::for_num_elems(physical_numel as u32);
+let cfg = launch_cfg(physical_numel as u32);
let params = (
physical_numel, // const size_t numel,
Src::NUM_DIMS, // const size_t num_dims,