#include "dynet/tensor-eigen.h"
#include "dynet/nodes-contract.h"
#include <limits>
#include <cmath>
#include <stdexcept>
#include "dynet/nodes-impl-macros.h"
// This file takes a long time to compile on GPU. This line skips the CUDA contraction implementations; comment it out to enable them.
#define DYNET_SKIP_CUDA_CONTRACTIONS
#if defined(__CUDACC__) && !defined(DYNET_SKIP_CUDA_CONTRACTIONS)
#include "dynet/cuda.h"
#include "dynet/gpu-ops.h"
#include "dynet/matrix-multiply.h"
#endif
using namespace std;
namespace dynet {
// ************* InnerProduct3D_1D *************
#ifndef __CUDACC__
string InnerProduct3D_1D::as_string(const vector<string>& arg_names) const {
ostringstream s;
s << "dot(" << arg_names[0] << "," << arg_names[1] << ')';
if (arg_names.size() == 3) s << " + " << arg_names[2];
return s.str();
}
Dim InnerProduct3D_1D::dim_forward(const vector<Dim>& xs) const {
if (xs.size() != 2 && xs.size() != 3)
throw std::invalid_argument("Expected two or three arguments in InnerProduct3D_1D");
if (xs[0].ndims() != 3 ||
!LooksLikeVector(xs[1]) ||
xs[0].size(2) != xs[1].size(0)) {
ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D: " << xs;
throw std::invalid_argument(s.str());
}
Dim d({xs[0].size(0), xs[0].size(1)}, max(xs[0].bd, xs[1].bd));
if (xs.size() == 3) d.bd = max(d.bd, xs[2].bd);
if (xs.size() == 3 && xs[2].single_batch() != d.single_batch()) {
ostringstream s; s << "Bad bias dimensions in InnerProduct3D_1D: " << xs;
throw std::invalid_argument(s.str());
}
return d;
}
#endif
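// Shape sketch (hypothetical sizes): with A = {3, 4, 5} and b = {5},
// dim_forward above yields Y = {3, 4}; an optional bias C must then also
// have single-batch shape {3, 4}.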
// Y_ij = A_ijk * B_k (+ C_ij)
template<class MyDevice>
void InnerProduct3D_1D::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
#if defined(__CUDACC__) && defined(DYNET_SKIP_CUDA_CONTRACTIONS)
throw std::runtime_error("InnerProduct3D_1D::forward_dev_impl disabled on CUDA. Comment out DYNET_SKIP_CUDA_CONTRACTIONS in nodes-contract.cc to enable this function.");
#else
typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims({{DimPair(2, 0)}});
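// DimPair(2, 0) pairs A's third index (k) with b's first, so the Eigen
// contractions below accumulate fx(i, j) += sum_k A(i, j, k) * b(k),
// matching the Y_ij = A_ijk * B_k comment above.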
// Handle the optional bias
if (xs.size() == 3) {
auto C = tb<2>(*xs[2]);
Eigen::array<ptrdiff_t, 3> bcast_C = {1, 1, (xs[2]->d.bd == 1 ? fx.d.bd : 1)};
tb<2>(fx).device(*dev.edevice) = C.broadcast(bcast_C);
}
#if defined(__CUDACC__) && !defined(DYNET_SKIP_CUDA_CONTRACTIONS)
// Use CUDA if available
// Reshape xs[0] to a matrix
Dim new_xs0_d({xs[0]->d[0] * xs[0]->d[1], xs[0]->d[2]}, xs[0]->d.bd);
const Tensor new_xs0(new_xs0_d, xs[0]->v, xs[0]->device, xs[0]->mem_pool);
// Reshape fx to a vector
Dim new_fx_d({fx.d[0] * fx.d[1]}, fx.d.bd);
Tensor new_fx(new_fx_d, fx.v, fx.device, fx.mem_pool);
// CUDA matrix multiply ftw
MatrixMultiply(dev, new_xs0, *xs[1], new_fx, dev.kSCALAR_ONE);
#else
// Otherwise use Eigen tensor contraction.
// TODO: on CPU, broadcasting may not be as effective as looping?
if (xs[0]->d.bd == 1) {
// A is a 3 tensor
Eigen::array<ptrdiff_t, 2> bcast_b = {1, (xs[1]->d.bd == 1 ? fx.d.bd : 1)};
auto b = tb<1>(*xs[1]);
auto A = t<3>(*xs[0]);
tb<2>(fx).device(*dev.edevice) += A.contract(b.broadcast(bcast_b), dims);
} else {
// A is batched (a 4 tensor); loop over the batch dimension only if b is batched too
auto A = tb<3>(*xs[0]);
if (xs[1]->d.bd == 1) { // b is a 1 tensor
auto b = t<1>(*xs[1]);
tb<2>(fx).device(*dev.edevice) += A.contract(b, dims);
} else {
// If both A and b are batched, loop over the batches
auto b = tb<1>(*xs[1]);
for (unsigned i = 0; i < fx.d.bd; ++i) {
auto b_ = b.chip<1>(i);
tb<2>(fx).chip<2>(i).device(*dev.edevice) += A.chip<3>(i).contract(b_, dims);
}
}
}
#endif
#endif
}
template<class MyDevice>
void InnerProduct3D_1D::backward_dev_impl(const MyDevice & dev,
const vector<const Tensor*>& xs,
const Tensor& fx,
const Tensor& dEdf,
unsigned i,
Tensor& dEdxi) const {
#if defined(__CUDACC__) && defined(DYNET_SKIP_CUDA_CONTRACTIONS)
throw std::runtime_error("InnerProduct3D_1D::backward_dev_impl disabled on CUDA. Comment out DYNET_SKIP_CUDA_CONTRACTIONS in nodes-contract.cc to enable this function.");
#else
auto tdEdf = tb<2>(dEdf); // 2 tensor
typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
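// i selects which argument's gradient is accumulated:
// i == 0 -> dEdA, i == 1 -> dEdb, i == 2 -> dEd(bias C)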
if (i == 0) { // dEdA
#if defined(__CUDACC__) && !defined(DYNET_SKIP_CUDA_CONTRACTIONS)
if (dEdxi.d.bd == 1 && dEdf.d.bd == xs[1]->d.bd) {
// Basically here dEdxi_ijk = \sum_b dEdf_ijb * B_kb
// Which we do as matrix multiplication dEdxi_(i*j)k = \sum_b dEdf_(i*j)b * B^T_bk
// CUDA matrix multiply ftw
CUBLAS_CHECK(cublasSgemm(dev.cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T,
dEdxi.d[0] * dEdxi.d[1], dEdxi.d[2] , dEdf.d.bd,
dev.kSCALAR_ONE,
dEdf.v, dEdf.d.batch_size(),
xs[1]->v, dEdxi.d[2],
dev.kSCALAR_ONE, dEdxi.v, dEdxi.d[0] * dEdxi.d[1]));
} else {
// In this case dEdxi is batched and b isn't, or neither dEdxi nor b is
// batched but dEdf is (i.e. C is batched).
// Iterate over the batches of dEdf, do an outer product between the
// flattened dEdf and b, and accumulate the result in dEdxi.
float* dEdAv = dEdxi.v;
for (unsigned b = 0; b < dEdf.d.bd; b++) {
if (dEdxi.d.bd == dEdf.d.bd) // If A is batched
dEdAv = dEdxi.batch_ptr(b);
CUBLAS_CHECK(cublasSger(dev.cublas_handle,
dEdxi.d[0] * dEdxi.d[1], dEdxi.d[2] ,
dev.kSCALAR_ONE,
dEdf.batch_ptr(b), 1,
xs[1]->v, 1,
dEdAv, dEdxi.d[0] * dEdxi.d[1]));
}
}
#else
if (xs[0]->d.bd == 1) { // A is a 3 tensor
// dEdA_ijk = \sum_b dEdf_ijb * b_kb (b broadcast along the batch if unbatched)
auto b = tb<1>(*xs[1]);
Eigen::array<ptrdiff_t, 2> bcast_b = {1, (xs[1]->d.bd == 1 ? fx.d.bd : 1)};
Eigen::array<DimPair, 1> dims({{DimPair(2, 1)}});
t<3>(dEdxi).device(*dev.edevice) += tdEdf.contract(b.broadcast(bcast_b), dims);
} else {
// For now if A is batched the CUDA version is not implemented
if (xs[1]->d.bd == 1) {
// auto b = t<1>(*xs[1]);
// Eigen::array<ptrdiff_t, 4> morph {dEdf.d[0], dEdf.d[1], xs[1]->d[0], dEdf.d.bd};
// tb<3>(dEdxi).device(*dev.edevice) += tdEdf.contract(b, Eigen::array<DimPair, 0> {{}}).reshape(morph);
auto b = t<1>(*xs[1]);
for (unsigned i = 0; i < fx.d.bd; ++i) {
tb<3>(dEdxi).chip<3>(i).device(*dev.edevice) += tdEdf.chip<2>(i).contract(b, Eigen::array<DimPair, 0> {{}});
}
} else {
auto b = tb<1>(*xs[1]);
for (unsigned i = 0; i < fx.d.bd; ++i) {
tb<3>(dEdxi).chip<3>(i).device(*dev.edevice) += tdEdf.chip<2>(i).contract(b.chip<1>(i), Eigen::array<DimPair, 0> {{}});
}
}
}
#endif
} else if (i == 1) { // dEdb
// dEdb_k = \sum_ij A_ijk dEdf_ij
#if defined(__CUDACC__) && !defined(DYNET_SKIP_CUDA_CONTRACTIONS)
if (xs[0]->d.bd == 1 && dEdf.d.bd == xs[1]->d.bd) {
// If b has the same batch dimension as dEdf and A is not batched,
// the double contraction can be done as a (transposed) matrix product
// dEdxi_kb = \sum_ij A_ijk dEdf_ijb = \sum_(i*j) A^T_k(i*j) dEdf_(i*j)b
CUBLAS_CHECK(cublasSgemm(dev.cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
dEdxi.d.rows(), dEdxi.d.batch_elems(), dEdf.d.batch_size(),
dev.kSCALAR_ONE,
xs[0]->v, xs[0]->d[0] * xs[0]->d[1],
dEdf.v, dEdf.d.batch_size(),
dev.kSCALAR_ONE, dEdxi.v, dEdxi.d.rows()));
} else {
// Here dEdf is batched, so we iterate over it and, depending on whether A or b
// is batched, take the corresponding slice and accumulate.
// If everything is batched: dEdb_k = \sum_(i*j) A^T_k(i*j) dEdf_(i*j) for each batch element
float* dEdbv = dEdxi.v;
const float* Av = xs[0]->v;
for (unsigned b = 0; b < dEdf.d.bd; ++b) {
if (dEdxi.d.bd > 1) {
dEdbv = dEdxi.batch_ptr(b);
}
if (xs[0]->d.bd > 1) {
Av = xs[0]->batch_ptr(b);
}
CUBLAS_CHECK(cublasSgemv(dev.cublas_handle, CUBLAS_OP_T,
dEdf.d.batch_size(), dEdxi.d.rows(),
dev.kSCALAR_ONE,
Av, dEdf.d.batch_size(),
dEdf.batch_ptr(b), 1,
dev.kSCALAR_ONE, dEdbv, 1));
}
}
#else
// On CPU we use Eigen contractions
if (xs[1]->d.bd == 1) { // b is a 1 tensor
if (xs[0]->d.bd == 1) {
auto A = t<3>(*xs[0]); // A is 3 tensor
Eigen::array<ptrdiff_t, 1> red_axis = {0};
Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
t<1>(dEdxi).device(*dev.edevice) += tdEdf.contract(A, dims).sum(red_axis);
} else {
auto A = tb<3>(*xs[0]); // A is 4 tensor
Eigen::array<DimPair, 3> dims({{DimPair(0, 0), DimPair(1, 1), DimPair(2, 3)}});
t<1>(dEdxi).device(*dev.edevice) += tdEdf.contract(A, dims);
}
} else { // b is a 2 tensor
if (xs[0]->d.bd == 1) {
auto A = t<3>(*xs[0]); // A is 3 tensor
Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
tb<1>(dEdxi).device(*dev.edevice) += A.contract(tdEdf, dims);
} else {
auto A = tb<3>(*xs[0]); // A is 4 tensor
Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
for (unsigned i = 0; i < fx.d.bd; ++i) {
tb<1>(dEdxi).chip<1>(i).device(*dev.edevice) += tdEdf.chip<2>(i).contract(A.chip<3>(i), dims);
}
}
}
#endif
} else if (i == 2) { // dEdC
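// If the bias C is unbatched, its gradient sums dEdf over the batch
// dimension (axis 2); otherwise the gradient accumulates batch-wise.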
if (xs[2]->d.bd == 1) {
Eigen::array<ptrdiff_t, 1> red_axis = {2};
t<2>(dEdxi).device(*dev.edevice) += tdEdf.sum(red_axis);
} else {
tb<2>(dEdxi).device(*dev.edevice) += tdEdf;
}
} else {
throw std::runtime_error("Illegal configuration in InnerProduct3D");
}
#endif
}
DYNET_NODE_INST_DEV_IMPL(InnerProduct3D_1D)
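// A minimal usage sketch, assuming this node is reached through the
// expression API (dynet::contract3d_1d in expr.h); p_A and p_b are
// hypothetical Parameter handles:
//   ComputationGraph cg;
//   Expression A = parameter(cg, p_A);   // shape {i, j, k}
//   Expression b = parameter(cg, p_b);   // shape {k}
//   Expression Y = contract3d_1d(A, b);  // shape {i, j}, Y_ij = sum_k A_ijk * b_k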
// ************* InnerProduct3D_1D_1D *************
#ifndef __CUDACC__
string InnerProduct3D_1D_1D::as_string(const vector<string>& arg_names) const {
ostringstream s;
s << "dotdot(" << arg_names[0] << "," << arg_names[1] << "," << arg_names[2] << ')';
if (arg_names.size() == 4) s << " + " << arg_names[3];
return s.str();
}
Dim InnerProduct3D_1D_1D::dim_forward(const vector<Dim>& xs) const {
if (xs.size() != 3 && xs.size() != 4)
throw std::invalid_argument("Expected three or four arguments in InnerProduct3D_1D");
if (xs[0].ndims() != 3 ||
!LooksLikeVector(xs[1]) ||
!LooksLikeVector(xs[2])) {
// TODO: also check that A's contracted dimensions match b and c
ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
throw std::invalid_argument(s.str());
}
Dim d({xs[0].size(0)}, max(max(xs[0].bd, xs[1].bd), xs[2].bd));
if (xs.size() == 4) d.bd = max(d.bd, xs[3].bd);
if (xs.size() == 4 && xs[3] != d) {
ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
throw std::invalid_argument(s.str());
}
return d;
}
#endif
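// Shape sketch (hypothetical sizes): with A = {3, 4, 5}, b = {5} and c = {4},
// dim_forward above yields y = {3}; an optional bias d must then also be {3}.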
// Y_i = A_ijk * B_k * C_j (+ D_i)
template<class MyDevice>
void InnerProduct3D_1D_1D::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
#if defined(__CUDACC__) && defined(DYNET_SKIP_CUDA_CONTRACTIONS)
throw std::runtime_error("InnerProduct3D_1D_1D::forward_dev_impl disabled on CUDA. Comment out DYNET_SKIP_CUDA_CONTRACTIONS in nodes-contract.cc to enable this function.");
#else
auto A = t<3>(*xs[0]);
auto b = t<1>(*xs[1]);
auto c = t<1>(*xs[2]);
typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims({{DimPair(2, 0)}});
Eigen::array<DimPair, 1> dims2({{DimPair(1, 0)}});
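// Index-wise, the chained contraction below computes
// fx(i) = sum_j sum_k A(i, j, k) * b(k) * c(j):
// dims pairs A's k with b, dims2 pairs the remaining j with c.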
if (xs.size() == 3) {
t<1>(fx).device(*dev.edevice) = A.contract(b, dims).contract(c, dims2);
} else {
auto d = t<1>(*xs[3]);
t<1>(fx).device(*dev.edevice) = A.contract(b, dims).contract(c, dims2) + d;
}
#endif
}
template<class MyDevice>
void InnerProduct3D_1D_1D::backward_dev_impl(const MyDevice & dev,
const vector<const Tensor*>& xs,
const Tensor& fx,
const Tensor& dEdf,
unsigned i,
Tensor& dEdxi) const {
#if defined(__CUDACC__) && defined(DYNET_SKIP_CUDA_CONTRACTIONS)
throw std::runtime_error("InnerProduct3D_1D_1D::backward_dev_impl disabled on CUDA. Comment out DYNET_SKIP_CUDA_CONTRACTIONS in nodes-contract.cc to enable this function.");
#else
auto tdEdf = t<1>(dEdf); // vector
typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
if (i == 0) { // 3 tensor
// double outer product: dEdA_ijk = dEdf_i * c_j * b_k
auto b = t<1>(*xs[1]);
auto c = t<1>(*xs[2]);
t<3>(dEdxi).device(*dev.edevice) += tdEdf.contract(c, Eigen::array<DimPair, 0> {{}}).contract(b, Eigen::array<DimPair, 0> {{}});
} else if (i == 1) { // vector 1
// TODO: these should be reorganized so the contraction is first with tdEdf and
// then with c or b. In theory that intermediate result could be cached (although
// DyNet doesn't support this). The fact that this part of the product is redone
// when i == 1 and again when i == 2 is probably why this is slower
// (or maybe it's the contract implementation?)
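// dEdb_k = \sum_i dEdf_i \sum_j A_ijk c_j (via the two contractions below)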
Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
Eigen::array<DimPair, 1> dims2({{DimPair(0, 0)}});
auto A = t<3>(*xs[0]);
auto c = t<1>(*xs[2]);
t<1>(dEdxi).device(*dev.edevice) += A.contract(c, dims).contract(tdEdf, dims2);
} else if (i == 2) { // vector 2
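// dEdc_j = \sum_i dEdf_i \sum_k A_ijk b_k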
Eigen::array<DimPair, 1> dims({{DimPair(2, 0)}});
Eigen::array<DimPair, 1> dims2({{DimPair(0, 0)}});
auto A = t<3>(*xs[0]);
auto b = t<1>(*xs[1]);
t<1>(dEdxi).device(*dev.edevice) += A.contract(b, dims).contract(tdEdf, dims2);
} else if (i == 3) { // vector bias
t<1>(dEdxi).device(*dev.edevice) += tdEdf;
} else {
throw std::runtime_error("Illegal configuration in InnerProduct3D");
}
#endif
}
DYNET_NODE_INST_DEV_IMPL(InnerProduct3D_1D_1D)
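// A minimal usage sketch, assuming the expression API exposes this node as
// dynet::contract3d_1d_1d (expr.h):
//   Expression y = contract3d_1d_1d(A, b, c);  // y_i = sum_jk A_ijk * b_k * c_j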
} // namespace dynet