
Commit

Fix Linear layer bias gradient computation; add size checks to CUDA functions (#170)

* Assert the correct tensor sizes in copy() and gemm(); fix related Linear logic

* Check output matrix dims in GEMM; fix corresponding Linear layer logic

* Update coaster-blas/src/frameworks/cuda/helper.rs

Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>

Co-authored-by: Mikhail Balakhno <{ID}+{username}@users.noreply.github.com>
Co-authored-by: Bernhard Schuster <bernhard@ahoi.io>
3 people committed Aug 15, 2022
1 parent 53cf97b commit 6952a49
Showing 2 changed files with 61 additions and 18 deletions.
55 changes: 40 additions & 15 deletions coaster-blas/src/frameworks/cuda/helper.rs
@@ -106,6 +106,7 @@ macro_rules! iblas_copy_for_cuda {
x: &SharedTensor<$t>,
y: &mut SharedTensor<$t>,
) -> Result<(), ::coaster::error::Error> {
assert_eq!(x.desc().size(), y.desc().size());
let n = x.desc().size() as i32;
let x_mem = read!(x, self);
let y_mem = write_only!(y, self);
@@ -250,32 +251,56 @@ macro_rules! iblas_gemm_for_cuda {
beta: &SharedTensor<$t>,
c: &mut SharedTensor<$t>,
) -> Result<(), ::coaster::error::Error> {
let c_desc = c.desc().clone();
let alpha_mem = read!(alpha, self);
let beta_mem = read!(beta, self);
let a_mem = read!(a, self);
let b_mem = read!(b, self);
let c_mem = write_only!(c, self);
use Transpose as T;

// Determine the dimensions of all the matrices.
// We always treat the first dimension as the number of rows and all
// the subsequent dimensions combined as the "columns".
let a_0 = a.desc()[0] as i32;
let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let b_0 = b.desc()[0] as i32;
let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let n = match bt {
Transpose::NoTrans => b_1,
_ => b_0,
};
let (m, k) = match at {
Transpose::NoTrans => (a_0, a_1),
_ => (a_1, a_0),
let c_0 = c.desc()[0] as i32;
let c_1 = c.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;

let (m, n, k) = match (at, bt) {
(T::NoTrans, T::NoTrans) => {
assert_eq!(a_1, b_0);
(a_0, b_1, a_1)
}
(T::NoTrans, T::Trans | T::ConjTrans) => {
assert_eq!(a_1, b_1);
(a_0, b_0, a_1)
}
(T::Trans | T::ConjTrans, T::NoTrans) => {
assert_eq!(a_0, b_0);
(a_1, b_1, a_0)
}
(T::Trans | T::ConjTrans, T::Trans | T::ConjTrans) => {
assert_eq!(a_0, b_1);
(a_1, b_0, a_0)
}
};

// Verify that C dimensions match.
assert_eq!(c_0, m);
assert_eq!(c_1, n);

let lda = a_1;
let ldb = b_1;
let ldc = c_1;

let ctx: &cublas::Context = self.framework().cublas();

let alpha_mem = read!(alpha, self);
let beta_mem = read!(beta, self);
let a_mem = read!(a, self);
let b_mem = read!(b, self);
let c_mem = write_only!(c, self);

// cuBLAS uses column-major matrix format, while SharedTensor is row-major.
// To compute AxB = C, we instead compute BᵀxAᵀ = Cᵀ and treat the transposed
// column-major matrix as a normal (non-transposed) row-major one.
exec!(
gemm,
(*ctx).gemm(
@@ -285,7 +310,7 @@ macro_rules! iblas_gemm_for_cuda {
m,
k,
trans!(alpha_mem, $t),
trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout.
trans!(b_mem, $t),
ldb,
trans!(a_mem, $t),
lda,
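To make the new checks concrete, here is a small self-contained sketch (plain Rust, no coaster or cuBLAS types; the gemm_dims helper and the local Transpose enum are illustrative, not part of the crate) of how the patched helper derives (m, n, k) from the transpose flags and asserts that the inner dimensions of A and B and the shape of C agree before the kernel is launched:

#[allow(dead_code)]
#[derive(Clone, Copy)]
enum Transpose {
    NoTrans,
    Trans,
    ConjTrans,
}

/// Derive (m, n, k) for C (m x n) = op(A) * op(B), where a_dims/b_dims/c_dims are
/// row-major shapes flattened to (rows, columns). Panics on a size mismatch,
/// mirroring the assert_eq! checks added in helper.rs.
fn gemm_dims(
    at: Transpose,
    a_dims: (usize, usize),
    bt: Transpose,
    b_dims: (usize, usize),
    c_dims: (usize, usize),
) -> (usize, usize, usize) {
    let (a_0, a_1) = a_dims;
    let (b_0, b_1) = b_dims;

    let (m, n, k) = match (at, bt) {
        (Transpose::NoTrans, Transpose::NoTrans) => {
            // Inner dimensions: columns of A must equal rows of B.
            assert_eq!(a_1, b_0);
            (a_0, b_1, a_1)
        }
        (Transpose::NoTrans, _) => {
            assert_eq!(a_1, b_1);
            (a_0, b_0, a_1)
        }
        (_, Transpose::NoTrans) => {
            assert_eq!(a_0, b_0);
            (a_1, b_1, a_0)
        }
        (_, _) => {
            assert_eq!(a_0, b_1);
            (a_1, b_0, a_0)
        }
    };

    // The output tensor must already have the right shape; the old code never checked this.
    assert_eq!(c_dims.0, m);
    assert_eq!(c_dims.1, n);
    (m, n, k)
}

fn main() {
    // A: 2x3, B: 3x4  ->  C must be 2x4.
    assert_eq!(
        gemm_dims(Transpose::NoTrans, (2, 3), Transpose::NoTrans, (3, 4), (2, 4)),
        (2, 4, 3)
    );
    // A stored as 3x2 but used transposed, B: 3x4  ->  C is again 2x4.
    assert_eq!(
        gemm_dims(Transpose::Trans, (3, 2), Transpose::NoTrans, (3, 4), (2, 4)),
        (2, 4, 3)
    );
}

In the actual helper, A and B are then passed to cuBLAS in swapped order: computing Bᵀ x Aᵀ = Cᵀ in cuBLAS's column-major layout is equivalent to computing A x B = C on the row-major SharedTensor data, as the comment in the diff above explains.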
24 changes: 21 additions & 3 deletions juice/src/layers/common/linear.rs
@@ -35,19 +35,23 @@ pub struct Linear {

one: SharedTensor<f32>,
zero: SharedTensor<f32>,
ones_row: SharedTensor<f32>,
}

impl Linear {
/// Create a Linear layer from a LinearConfig.
pub fn from_config(config: &LinearConfig) -> Linear {
let one = native_scalar(1f32);
let zero = native_scalar(0f32);
let mut ones_row = SharedTensor::new(&vec![1]);
FillerType::fill_constant(&mut ones_row, 1.0);

Linear {
output_size: config.output_size,

one,
zero,
ones_row,
}
}

@@ -83,6 +87,7 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
output_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
) {
let input = input_data[0].read().unwrap();
let batch_size = input.desc()[0];
// reshape top
let output_shape = self.calculate_output_shape(input.desc());
output_data[0].write().unwrap().resize(&output_shape).unwrap();
@@ -116,6 +121,10 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
if let Some(weight) = weights_gradient.get(1) {
weight.write().unwrap().resize(&(1, self.output_size)).unwrap();
}

// Reshape the row of 1s which is used to compute the bias gradient.
self.ones_row.resize(&vec![1, batch_size]).unwrap();
FillerType::fill_constant(&mut self.ones_row, 1.0);
}

fn exact_num_output_blobs(&self) -> Option<usize> {
@@ -215,10 +224,19 @@ impl<B: IBackend + LayerOps<f32>> ComputeParametersGradient<f32, B> for Linear {
.unwrap();

// gradient w.r.t bias
// Technically, the gradient of vector b of length n to itself is the I_n identity matrix,
// so instead we'll just copy the output_gradient[0] vector into
// The output depends on the bias b through the identity (b is added to every row of the output),
// so the bias gradient is output_gradients[0] summed over the batch, i.e. multiplied by a vector of 1s.
// Since parameters_gradients[1] is a row-vector, the whole operation is transposed:
// ones_row (1 x batch_size) x output_gradients[0] (batch_size x output_size).
backend
.copy(&output_gradients[0], &mut parameters_gradients[1])
.gemm(
&self.one,
Transpose::NoTrans,
&self.ones_row,
Transpose::NoTrans,
output_gradients[0],
&self.zero,
parameters_gradients[1],
)
.unwrap();
}
}
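For context on the bias fix: the old code copied output_gradients[0] (batch_size x output_size) directly into the (1 x output_size) bias gradient, which only lines up when the batch size is 1. Below is a minimal sketch (plain Rust, no juice or coaster types; bias_grad_via_ones_row is a made-up helper for illustration) of the arithmetic behind the replacement GEMM call: multiplying the 1 x batch_size ones_row by the batch_size x output_size output gradient sums the gradient over the batch.

fn bias_grad_via_ones_row(output_grad: &[Vec<f32>]) -> Vec<f32> {
    let batch_size = output_grad.len();
    let output_size = output_grad[0].len();
    // Corresponds to the `ones_row` tensor of shape [1, batch_size].
    let ones_row = vec![1.0f32; batch_size];

    // (1 x batch_size) * (batch_size x output_size) = (1 x output_size),
    // i.e. the column-wise sum of the output gradient over the batch.
    let mut bias_grad = vec![0.0f32; output_size];
    for (j, g) in bias_grad.iter_mut().enumerate() {
        for i in 0..batch_size {
            *g += ones_row[i] * output_grad[i][j];
        }
    }
    bias_grad
}

fn main() {
    // Two samples in the batch, three outputs each.
    let output_grad = vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]];
    let bias_grad = bias_grad_via_ones_row(&output_grad);
    // The bias gradient is the per-column sum: [0.5, 0.7, 0.9].
    assert!((bias_grad[0] - 0.5).abs() < 1e-6);
    assert!((bias_grad[1] - 0.7).abs() < 1e-6);
    assert!((bias_grad[2] - 0.9).abs() < 1e-6);
}

With the new size assert in copy(), the old copy-based version would now trip on any batch larger than one, while the GEMM form produces the correct summed gradient for any batch size.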
