Initial commit to resolve merge conflicts
deven-amd committed Dec 6, 2021
1 parent 9efea3e commit b3d2b2f
Showing 5 changed files with 57 additions and 182 deletions.
@@ -262,17 +262,11 @@ extern "C" void* _mlir_ciface_tf_jit_compile(
ctx->op_device_context()->stream()->GetCudaComputeCapability();
architectures.push_back(absl::StrCat("sm_", cc.major, cc.minor));
#elif defined(TENSORFLOW_USE_ROCM)
<<<<<<< HEAD
architectures.push_back(
ctx->op_device_context()->stream()->parent()
->GetDeviceDescription().rocm_amdgpu_gcn_arch_name());
=======
architectures.push_back(ctx->op_device_context()
->stream()
->parent()
->GetDeviceDescription()
.rocm_amdgpu_gcn_arch_name());
>>>>>>> google_upstream/master
#endif

// Construct `SmallVector`s from arguments.
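Both sides of this conflict are functionally identical: each pushes the device's ROCm GCN architecture string, and they differ only in clang-format line breaking. A minimal sketch of the resolved `#elif` branch, assuming the upstream formatting is the side that survives the merge:

#elif defined(TENSORFLOW_USE_ROCM)
  // Record the GCN architecture name (for example "gfx906") reported by
  // the device description as the JIT compilation target.
  architectures.push_back(ctx->op_device_context()
                              ->stream()
                              ->parent()
                              ->GetDeviceDescription()
                              .rocm_amdgpu_gcn_arch_name());
#endif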
7 changes: 0 additions & 7 deletions tensorflow/core/kernels/linalg/qr_op_impl.h
@@ -237,16 +237,9 @@ class QrOpGpu : public AsyncOpKernel {
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(
context,
<<<<<<< HEAD
solver->Geam(transa, transb, n,
full_matrices_ ? m : min_size, &alpha,
&input_transposed_reshaped(batch, 0, 0), m, &beta,
dummy, n, &r_reshaped(batch, 0, 0), n),
=======
solver->Geam(transa, transb, n, full_matrices_ ? m : min_size,
&alpha, &input_transposed_reshaped(batch, 0, 0), m,
&beta, dummy, n, &r_reshaped(batch, 0, 0), n),
>>>>>>> google_upstream/master
done);
}
}
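The conflict in this hunk is whitespace-only as well; both sides issue the same solver->Geam call. With alpha = 1 and beta = 0 (both are set outside the lines shown), Geam reduces to an out-of-place transpose, which is how the loop moves each factored matrix into r_reshaped. A tiny self-contained illustration of that special case (plain C++, illustrative only, not the solver API):

#include <array>
#include <cstdio>

int main() {
  // 2x3 column-major A; with alpha = 1 and beta = 0, C = A^T (3x2).
  const int m = 2, n = 3;
  std::array<double, 6> a{1, 4, 2, 5, 3, 6};  // A = [[1,2,3],[4,5,6]]
  std::array<double, 6> c{};
  for (int j = 0; j < m; ++j)
    for (int i = 0; i < n; ++i) c[i + j * n] = a[j + i * m];
  std::printf("%g %g\n", c[0], c[3]);  // prints "1 4": first row of A^T
}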
57 changes: 9 additions & 48 deletions tensorflow/core/util/gpu_solvers.h
@@ -102,12 +102,8 @@ inline typename ROCmComplexT<T>::type* ROCmComplex(T* p) {
// Template to give the Rocblas adjoint operation for real and complex types.
template <typename T>
rocblas_operation RocblasAdjointOp() {
<<<<<<< HEAD
return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose : rocblas_operation_transpose;
=======
return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose
: rocblas_operation_transpose;
>>>>>>> google_upstream/master
}
#endif

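For reference, RocblasAdjointOp lets type-generic solver code ask for "the adjoint" without branching on the scalar type. A short sketch of what it resolves to, per the definition above:

// Illustrative only: resolution happens at compile time via
// Eigen::NumTraits<T>::IsComplex.
rocblas_operation real_op = RocblasAdjointOp<float>();
// real_op == rocblas_operation_transpose
rocblas_operation cplx_op = RocblasAdjointOp<std::complex<double>>();
// cplx_op == rocblas_operation_conjugate_transpose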
@@ -272,6 +268,14 @@ class GpuSolver {
const Scalar* const host_a_inverse_dev_ptrs[], int ldainv,
DeviceLapackInfo* dev_lapack_info, int batch_size);

// Computes matrix inverses for a batch of small matrices with size n < 32.
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
const Scalar* const host_a_inverse_dev_ptrs[],
int ldainv, DeviceLapackInfo* dev_lapack_info,
int batch_size);

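The hoisted declaration above takes host-side arrays of device pointers, one per matrix in the batch. A hedged usage sketch (the helper and buffer names are hypothetical; an initialized GpuSolver and populated device buffers are assumed):

// Invert a batch of small n x n matrices (n < 32 per the comment above).
// host_a_ptrs / host_ainv_ptrs are HOST arrays holding DEVICE pointers.
template <typename Scalar>
Status SmallBatchedInverse(GpuSolver* solver, int n,
                           const Scalar* const host_a_ptrs[],
                           const Scalar* const host_ainv_ptrs[],
                           DeviceLapackInfo* dev_info, int batch_size) {
  return solver->MatInvBatched(n, host_a_ptrs, /*lda=*/n, host_ainv_ptrs,
                               /*ldainv=*/n, dev_info, batch_size);
}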
// Cholesky factorization
// Computes the Cholesky factorization A = L * L^H for a batch of small
// matrices.
@@ -287,80 +291,36 @@

// QR factorization.
// Computes QR factorization A = Q * R.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-geqrf
=======
>>>>>>> google_upstream/master
template <typename Scalar>
Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau,
int* dev_lapack_info);

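Geqrf follows the LAPACK geqrf convention: on return dev_A holds R in and above the diagonal, the Householder vectors below it, and dev_tau the scalar reflector coefficients. A hedged call sketch (helper name hypothetical; dev_a is an m x n column-major device buffer):

// Compute a packed QR factorization in place.
template <typename Scalar>
Status PackedQr(GpuSolver* solver, int m, int n, Scalar* dev_a,
                Scalar* dev_tau, int* dev_info) {
  // On success: upper triangle of dev_a = R, strict lower part =
  // Householder vectors, dev_tau = min(m, n) tau factors.
  return solver->Geqrf(m, n, dev_a, /*lda=*/m, dev_tau, dev_info);
}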
<<<<<<< HEAD

// This function performs the matrix-matrix addition/transposition
// C = alpha * op(A) + beta * op(B).
// Returns Status::OK() if the kernel was launched successfully. See:
// http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam
// NOTE(ebrevdo): Does not support in-place transpose of non-square
// matrices.
=======
// This function performs the matrix-matrix addition/transposition
// C = alpha * op(A) + beta * op(B).
>>>>>>> google_upstream/master
template <typename Scalar>
Status Geam(rocblas_operation transa, rocblas_operation transb, int m, int n,
const Scalar* alpha, /* host or device pointer */
const Scalar* A, int lda,
const Scalar* beta, /* host or device pointer */
<<<<<<< HEAD
const Scalar* B, int ldb, Scalar* C,
int ldc);
=======
const Scalar* B, int ldb, Scalar* C, int ldc);
>>>>>>> google_upstream/master

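A reference implementation of the documented semantics, e.g. for cross-checking the GPU result in a test. Plain C++ with column-major indexing; an illustrative stand-in, not the rocBLAS entry point:

#include <vector>

// C = alpha * A + beta * B for the no-transpose case, m x n column-major.
template <typename T>
void GeamNoTransRef(int m, int n, T alpha, const std::vector<T>& a, int lda,
                    T beta, const std::vector<T>& b, int ldb,
                    std::vector<T>& c, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i)
      c[i + j * ldc] = alpha * a[i + j * lda] + beta * b[i + j * ldb];
}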
// Overwrite matrix C by product of C and the unitary Householder matrix Q.
// The Householder matrix Q is represented by the output from Geqrf in dev_a
// and dev_tau.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n,
int k, const Scalar* dev_a, int lda, const Scalar* dev_tau,
=======
template <typename Scalar>
Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n, int k,
const Scalar* dev_a, int lda, const Scalar* dev_tau,
>>>>>>> google_upstream/master
Scalar* dev_c, int ldc, int* dev_lapack_info);

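Unmqr applies the implicit Q (or its adjoint) from Geqrf to another matrix without materializing Q, for example forming Q^H * C at the start of a least-squares solve. A hedged sketch (helper name hypothetical; it reuses the RocblasAdjointOp helper defined earlier in this file):

// Overwrite dev_c (m x n) with Q^H * C, where Q is the implicit factor
// held in dev_a / dev_tau (k reflectors) from a prior Geqrf call.
template <typename Scalar>
Status ApplyQAdjoint(GpuSolver* solver, int m, int n, int k,
                     const Scalar* dev_a, const Scalar* dev_tau,
                     Scalar* dev_c, int* dev_info) {
  return solver->Unmqr(rocblas_side_left, RocblasAdjointOp<Scalar>(), m, n,
                       k, dev_a, /*lda=*/m, dev_tau, dev_c, /*ldc=*/m,
                       dev_info);
}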
// Overwrites QR factorization produced by Geqrf by the unitary Householder
// matrix Q. On input, the Householder matrix Q is represented by the output
// from Geqrf in dev_a and dev_tau. On output, dev_a is overwritten with the
// first n columns of Q. Requires m >= n >= 0.
<<<<<<< HEAD
// Returns Status::OK() if the kernel was launched successfully.
=======
>>>>>>> google_upstream/master
template <typename Scalar>
Status Ungqr(int m, int n, int k, Scalar* dev_a, int lda,
const Scalar* dev_tau, int* dev_lapack_info);

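Ungqr is the companion that materializes Q explicitly, overwriting the packed factorization with the first n columns of Q (hence the m >= n >= 0 requirement above). A hedged sketch chaining it after Geqrf (helper name hypothetical):

// After Geqrf, overwrite dev_a with the explicit thin Q (its first n
// columns). k is the reflector count, normally min(m, n).
template <typename Scalar>
Status FormExplicitQ(GpuSolver* solver, int m, int n, int k, Scalar* dev_a,
                     const Scalar* dev_tau, int* dev_info) {
  return solver->Ungqr(m, n, k, dev_a, /*lda=*/m, dev_tau, dev_info);
}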
<<<<<<< HEAD

// Computes matrix inverses for a batch of small matrices with size n < 32.
// Returns Status::OK() if the kernel was launched successfully.
template <typename Scalar>
Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
const Scalar* const host_a_inverse_dev_ptrs[],
int ldainv, DeviceLapackInfo* dev_lapack_info,
int batch_size);

#else //GOOGLE_CUDA
=======
#else // GOOGLE_CUDA
>>>>>>> google_upstream/master
// ====================================================================
// Wrappers for cuSolverDN and cuBlas solvers start here.
//
@@ -531,6 +491,7 @@ class GpuSolver {
const Scalar* const dev_Aarray[], int lda,
Scalar* dev_Barray[], int ldb, int batch_size);
#endif

private:
OpKernelContext* context_; // not owned.
#if GOOGLE_CUDA
163 changes: 47 additions & 116 deletions tensorflow/core/util/rocm_solvers.cc
@@ -222,12 +222,8 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(
#define TF_CALL_LAPACK_TYPES(m) \
m(float, s) m(double, d) m(std::complex<float>, c) m(std::complex<double>, z)
#define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, s) m(double, d)
<<<<<<< HEAD
#define TF_CALL_LAPACK_TYPES_NO_REAL(m) m(std::complex<float>, c) m(std::complex<double>, z)
=======
#define TF_CALL_LAPACK_TYPES_NO_REAL(m) \
m(std::complex<float>, c) m(std::complex<double>, z)
>>>>>>> google_upstream/master

#define BLAS_SOLVER_FN(method, type_prefix) \
wrap::rocblas##_##type_prefix##method
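The TF_CALL_LAPACK_TYPES family are X-macros: each *_INSTANCE definition below is stamped out once per scalar-type / rocBLAS-prefix pair. A self-contained sketch of the expansion mechanics (it mirrors the macro defined above):

#include <complex>
#include <iostream>

#define TF_CALL_LAPACK_TYPES(m) \
  m(float, s) m(double, d) m(std::complex<float>, c) m(std::complex<double>, z)

// A toy "instance" macro: print each scalar type and its prefix.
#define PRINT_INSTANCE(Scalar, type_prefix) \
  std::cout << #Scalar << " -> " << #type_prefix << "\n";

int main() {
  TF_CALL_LAPACK_TYPES(PRINT_INSTANCE)
  // Output: float -> s, double -> d, std::complex<float> -> c,
  //         std::complex<double> -> z
}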
@@ -249,81 +245,16 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(

TF_CALL_LAPACK_TYPES(GETRF_INSTANCE);

#define GEQRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau, \
int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(geqrf, type_prefix)( \
rocm_blas_handle_, m, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau))); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GEQRF_INSTANCE);

#define UMMQR_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Unmqr(rocblas_side side, rocblas_operation trans, int m, int n, \
int k, const Scalar* dev_a, int lda, const Scalar* dev_tau, \
Scalar* dev_c, int ldc, int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) * m*k, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_copy.mutable_data(), dev_a, \
dev_a_copy.bytes())) { \
return errors::Internal("Unmqr: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_tau_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau, \
dev_tau_copy.bytes())) { \
return errors::Internal("Unmqr: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(unmqr, type_prefix)( \
rocm_blas_handle_,side,trans, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a_copy.mutable_data()), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()),reinterpret_cast<ROCmScalar*>(dev_c), ldc)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES_NO_REAL(UMMQR_INSTANCE);

#define UNGQR_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Ungqr(int m, int n, int k, Scalar* dev_a, int lda, \
const Scalar* dev_tau, int* dev_lapack_info){ \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_tau_copy = \
this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "", \
/*on host */ false); \
if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau, \
dev_tau_copy.bytes())) { \
return errors::Internal("Ungqr: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(ungqr, type_prefix)( \
rocm_blas_handle_, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a), lda, \
reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()))); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES_NO_REAL(UNGQR_INSTANCE);


#define POTRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A, \
int lda, int* dev_lapack_info) { \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)( \
rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), \
lda, dev_lapack_info)); \
return Status::OK(); \
#define POTRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A, \
int lda, int* dev_lapack_info) { \
mutex_lock lock(handle_map_mutex); \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)( \
rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
dev_lapack_info)); \
return Status::OK(); \
}

#define GEQRF_INSTANCE(Scalar, type_prefix) \
@@ -441,43 +372,6 @@ TF_CALL_LAPACK_TYPES(GETRS_INSTANCE);

TF_CALL_LAPACK_TYPES(GETRF_BATCHED_INSTANCE);

#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::GetriBatched<Scalar>( \
int n, const Scalar* const host_a_dev_ptrs[], int lda, \
const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[], \
int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) { \
mutex_lock lock(handle_map_mutex); \
rocblas_stride stride = n; \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs, \
dev_a.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(), host_a_inverse_dev_ptrs, \
dev_a_inverse.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots, \
pivots.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)( \
rocm_blas_handle_, n, \
reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda, \
reinterpret_cast<int*>(pivots.mutable_data()), \
stride, dev_lapack_info->mutable_data(), batch_size)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);

#define POTRF_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::PotrfBatched<Scalar>( \
@@ -530,6 +424,43 @@ TF_CALL_LAPACK_TYPES(POTRF_BATCHED_INSTANCE);

TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);

#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status GpuSolver::GetriBatched<Scalar>( \
int n, const Scalar* const host_a_dev_ptrs[], int lda, \
const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[], \
int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) { \
mutex_lock lock(handle_map_mutex); \
rocblas_stride stride = n; \
using ROCmScalar = typename ROCmComplexT<Scalar>::type; \
ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs, \
dev_a.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(), \
host_a_inverse_dev_ptrs, dev_a_inverse.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>( \
sizeof(ROCmScalar*) * batch_size, "", /*on host */ false); \
if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots, \
pivots.bytes())) { \
return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
} \
TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)( \
rocm_blas_handle_, n, \
reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda, \
reinterpret_cast<int*>(pivots.mutable_data()), stride, \
dev_lapack_info->mutable_data(), batch_size)); \
return Status::OK(); \
}

TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);

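GetriBatched consumes the pivots produced by an LU factorization, so the usual flow is GetrfBatched followed by GetriBatched. A hedged sketch (GetrfBatched's exact signature is not shown in this diff and is assumed; the helper and buffer names are hypothetical):

// Invert a batch of n x n device matrices via LU factorization.
template <typename Scalar>
Status BatchedLuInverse(GpuSolver* solver, int n,
                        Scalar* const host_a_ptrs[],
                        const Scalar* const host_ainv_ptrs[],
                        int* dev_pivots, DeviceLapackInfo* dev_info,
                        int batch_size) {
  // LU-factor every matrix in place, recording pivots on the device.
  TF_RETURN_IF_ERROR(solver->GetrfBatched(n, host_a_ptrs, /*lda=*/n,
                                          dev_pivots, dev_info, batch_size));
  // Back-substitute each LU factor into its inverse.
  return solver->GetriBatched(n, host_a_ptrs, /*lda=*/n, dev_pivots,
                              host_ainv_ptrs, /*ldainv=*/n, dev_info,
                              batch_size);
}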
// Allocates a temporary tensor. The GpuSolver object maintains a
// TensorReference to the underlying Tensor to prevent it from being deallocated
// prematurely.